diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65238 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998274969811973, + "eval_steps": 500, + "global_step": 4347, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006900120752113162, + "grad_norm": 20.955894470214844, + "learning_rate": 1.9164430816404755e-09, + "logits/chosen": 3.9654581546783447, + "logits/rejected": 4.114469528198242, + "logps/chosen": -167.3888397216797, + "logps/rejected": -176.84234619140625, + "loss": 0.6875, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.960953712463379, + "rewards/margins": 0.9470504522323608, + "rewards/rejected": -12.908003807067871, + "step": 1 + }, + { + "epoch": 0.0013800241504226323, + "grad_norm": 1.8053655624389648, + "learning_rate": 3.832886163280951e-09, + "logits/chosen": 3.630797863006592, + "logits/rejected": 3.8480708599090576, + "logps/chosen": -158.61602783203125, + "logps/rejected": -181.46742248535156, + "loss": 0.4524, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.010522842407227, + "rewards/margins": 2.3291540145874023, + "rewards/rejected": -13.339676856994629, + "step": 2 + }, + { + "epoch": 0.0020700362256339485, + "grad_norm": 0.3223591148853302, + "learning_rate": 5.749329244921426e-09, + "logits/chosen": 3.73079252243042, + "logits/rejected": 3.9610674381256104, + "logps/chosen": -174.32089233398438, + "logps/rejected": -194.45828247070312, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.724654197692871, + "rewards/margins": 2.0578179359436035, + "rewards/rejected": -14.782471656799316, + "step": 3 + }, + { + "epoch": 0.0027600483008452647, + "grad_norm": 37.72278594970703, + "learning_rate": 7.665772326561902e-09, + "logits/chosen": 3.8161613941192627, + "logits/rejected": 3.8257155418395996, + "logps/chosen": -167.58670043945312, + "logps/rejected": -182.06349182128906, + "loss": 1.0369, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.940882682800293, + "rewards/margins": 1.5024067163467407, + "rewards/rejected": -13.443288803100586, + "step": 4 + }, + { + "epoch": 0.003450060376056581, + "grad_norm": 2.0203702449798584, + "learning_rate": 9.582215408202378e-09, + "logits/chosen": 3.509491443634033, + "logits/rejected": 3.627190351486206, + "logps/chosen": -161.29473876953125, + "logps/rejected": -171.41151428222656, + "loss": 0.5394, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.2756986618042, + "rewards/margins": 1.0838329792022705, + "rewards/rejected": -12.35953140258789, + "step": 5 + }, + { + "epoch": 0.004140072451267897, + "grad_norm": 0.30293455719947815, + "learning_rate": 1.1498658489842852e-08, + "logits/chosen": 3.97316837310791, + "logits/rejected": 3.97316837310791, + "logps/chosen": -186.8819580078125, + "logps/rejected": -186.8819580078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.933112144470215, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.933112144470215, + "step": 6 + }, + { + "epoch": 0.004830084526479214, + "grad_norm": 0.34308815002441406, + "learning_rate": 1.3415101571483328e-08, + "logits/chosen": 3.835020065307617, + "logits/rejected": 3.835020065307617, + "logps/chosen": -155.2503662109375, + "logps/rejected": -155.2503662109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.82938003540039, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.82938003540039, + "step": 7 + }, + { + "epoch": 0.005520096601690529, + "grad_norm": 0.46687737107276917, + "learning_rate": 1.5331544653123804e-08, + "logits/chosen": 3.844010829925537, + "logits/rejected": 4.084088325500488, + "logps/chosen": -172.8306884765625, + "logps/rejected": -189.60659790039062, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.47307014465332, + "rewards/margins": 1.7099103927612305, + "rewards/rejected": -14.182979583740234, + "step": 8 + }, + { + "epoch": 0.006210108676901846, + "grad_norm": 0.31873977184295654, + "learning_rate": 1.7247987734764278e-08, + "logits/chosen": 4.01571798324585, + "logits/rejected": 4.173055171966553, + "logps/chosen": -165.92916870117188, + "logps/rejected": -172.55250549316406, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.876453399658203, + "rewards/margins": 0.6676046848297119, + "rewards/rejected": -12.544057846069336, + "step": 9 + }, + { + "epoch": 0.006900120752113162, + "grad_norm": 0.5344240665435791, + "learning_rate": 1.9164430816404756e-08, + "logits/chosen": 3.484137773513794, + "logits/rejected": 3.5830698013305664, + "logps/chosen": -161.95843505859375, + "logps/rejected": -172.42428588867188, + "loss": 0.5227, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.284912109375, + "rewards/margins": 1.195345163345337, + "rewards/rejected": -12.480257034301758, + "step": 10 + }, + { + "epoch": 0.007590132827324478, + "grad_norm": 0.38461536169052124, + "learning_rate": 2.108087389804523e-08, + "logits/chosen": 4.145138740539551, + "logits/rejected": 4.256324291229248, + "logps/chosen": -177.89181518554688, + "logps/rejected": -183.92852783203125, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.979063034057617, + "rewards/margins": 0.6249732971191406, + "rewards/rejected": -13.604036331176758, + "step": 11 + }, + { + "epoch": 0.008280144902535794, + "grad_norm": 5.5035481452941895, + "learning_rate": 2.2997316979685704e-08, + "logits/chosen": 4.141195297241211, + "logits/rejected": 4.134566307067871, + "logps/chosen": -183.44895935058594, + "logps/rejected": -183.36724853515625, + "loss": 0.6702, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.491649627685547, + "rewards/margins": 0.051138103008270264, + "rewards/rejected": -13.542787551879883, + "step": 12 + }, + { + "epoch": 0.00897015697774711, + "grad_norm": 0.3910250663757324, + "learning_rate": 2.491376006132618e-08, + "logits/chosen": 4.049118518829346, + "logits/rejected": 4.256428241729736, + "logps/chosen": -163.0498046875, + "logps/rejected": -177.18728637695312, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.418907165527344, + "rewards/margins": 1.494235873222351, + "rewards/rejected": -12.913143157958984, + "step": 13 + }, + { + "epoch": 0.009660169052958427, + "grad_norm": 0.7451475262641907, + "learning_rate": 2.6830203142966656e-08, + "logits/chosen": 3.711667776107788, + "logits/rejected": 3.6630699634552, + "logps/chosen": -169.31817626953125, + "logps/rejected": -172.3978271484375, + "loss": 0.6264, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.11435317993164, + "rewards/margins": 0.2194344401359558, + "rewards/rejected": -12.33378791809082, + "step": 14 + }, + { + "epoch": 0.010350181128169742, + "grad_norm": 0.4903412461280823, + "learning_rate": 2.8746646224607133e-08, + "logits/chosen": 4.064061164855957, + "logits/rejected": 4.064061164855957, + "logps/chosen": -177.3813018798828, + "logps/rejected": -177.3813018798828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.038660049438477, + "rewards/margins": 0.0, + "rewards/rejected": -13.038660049438477, + "step": 15 + }, + { + "epoch": 0.011040193203381059, + "grad_norm": 0.42276227474212646, + "learning_rate": 3.066308930624761e-08, + "logits/chosen": 3.5569980144500732, + "logits/rejected": 4.0002970695495605, + "logps/chosen": -158.10057067871094, + "logps/rejected": -181.07833862304688, + "loss": 0.4367, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.946258544921875, + "rewards/margins": 2.3559446334838867, + "rewards/rejected": -13.302204132080078, + "step": 16 + }, + { + "epoch": 0.011730205278592375, + "grad_norm": 0.3118950426578522, + "learning_rate": 3.257953238788808e-08, + "logits/chosen": 4.350334167480469, + "logits/rejected": 4.350334167480469, + "logps/chosen": -192.59339904785156, + "logps/rejected": -192.59339904785156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.452496528625488, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.452496528625488, + "step": 17 + }, + { + "epoch": 0.012420217353803692, + "grad_norm": 0.3975149393081665, + "learning_rate": 3.4495975469528556e-08, + "logits/chosen": 3.933147430419922, + "logits/rejected": 4.105241775512695, + "logps/chosen": -160.9093780517578, + "logps/rejected": -186.0137939453125, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.491897583007812, + "rewards/margins": 2.4882283210754395, + "rewards/rejected": -13.980125427246094, + "step": 18 + }, + { + "epoch": 0.013110229429015009, + "grad_norm": 0.9089391827583313, + "learning_rate": 3.6412418551169034e-08, + "logits/chosen": 3.652235269546509, + "logits/rejected": 3.698359727859497, + "logps/chosen": -150.89813232421875, + "logps/rejected": -161.08163452148438, + "loss": 0.5245, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.428153991699219, + "rewards/margins": 1.0005251169204712, + "rewards/rejected": -11.428678512573242, + "step": 19 + }, + { + "epoch": 0.013800241504226323, + "grad_norm": 0.301924467086792, + "learning_rate": 3.832886163280951e-08, + "logits/chosen": 3.7901220321655273, + "logits/rejected": 4.0904459953308105, + "logps/chosen": -162.47756958007812, + "logps/rejected": -189.1368408203125, + "loss": 0.4336, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.564027786254883, + "rewards/margins": 2.73394775390625, + "rewards/rejected": -14.297975540161133, + "step": 20 + }, + { + "epoch": 0.01449025357943764, + "grad_norm": 0.35328540205955505, + "learning_rate": 4.024530471444998e-08, + "logits/chosen": 3.9479899406433105, + "logits/rejected": 3.9479899406433105, + "logps/chosen": -187.19264221191406, + "logps/rejected": -187.19265747070312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.907034873962402, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.907034873962402, + "step": 21 + }, + { + "epoch": 0.015180265654648957, + "grad_norm": 0.6325603127479553, + "learning_rate": 4.216174779609046e-08, + "logits/chosen": 4.374211311340332, + "logits/rejected": 4.374211311340332, + "logps/chosen": -181.22183227539062, + "logps/rejected": -181.22183227539062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.207663536071777, + "rewards/margins": 0.0, + "rewards/rejected": -13.207663536071777, + "step": 22 + }, + { + "epoch": 0.01587027772986027, + "grad_norm": 0.43281832337379456, + "learning_rate": 4.407819087773093e-08, + "logits/chosen": 3.9768123626708984, + "logits/rejected": 3.9768123626708984, + "logps/chosen": -184.15789794921875, + "logps/rejected": -184.15789794921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.649845123291016, + "rewards/margins": 0.0, + "rewards/rejected": -13.649845123291016, + "step": 23 + }, + { + "epoch": 0.016560289805071588, + "grad_norm": 0.43453484773635864, + "learning_rate": 4.599463395937141e-08, + "logits/chosen": 3.938511371612549, + "logits/rejected": 3.997133731842041, + "logps/chosen": -172.57772827148438, + "logps/rejected": -178.3443145751953, + "loss": 0.6078, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.540776252746582, + "rewards/margins": 0.5751376152038574, + "rewards/rejected": -13.115913391113281, + "step": 24 + }, + { + "epoch": 0.017250301880282905, + "grad_norm": 0.8792168498039246, + "learning_rate": 4.7911077041011885e-08, + "logits/chosen": 3.78745698928833, + "logits/rejected": 3.827889919281006, + "logps/chosen": -166.38833618164062, + "logps/rejected": -182.206787109375, + "loss": 0.524, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.801326751708984, + "rewards/margins": 1.557956337928772, + "rewards/rejected": -13.359284400939941, + "step": 25 + }, + { + "epoch": 0.01794031395549422, + "grad_norm": 0.4631957709789276, + "learning_rate": 4.982752012265236e-08, + "logits/chosen": 3.6513211727142334, + "logits/rejected": 3.8209893703460693, + "logps/chosen": -150.24732971191406, + "logps/rejected": -163.1312713623047, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.375011444091797, + "rewards/margins": 1.2957658767700195, + "rewards/rejected": -11.670778274536133, + "step": 26 + }, + { + "epoch": 0.018630326030705538, + "grad_norm": 0.37629738450050354, + "learning_rate": 5.174396320429284e-08, + "logits/chosen": 4.043480396270752, + "logits/rejected": 4.043480396270752, + "logps/chosen": -173.75387573242188, + "logps/rejected": -173.75387573242188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.46938419342041, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.46938419342041, + "step": 27 + }, + { + "epoch": 0.019320338105916855, + "grad_norm": 0.35369938611984253, + "learning_rate": 5.366040628593331e-08, + "logits/chosen": 4.26887321472168, + "logits/rejected": 4.26887321472168, + "logps/chosen": -181.7294464111328, + "logps/rejected": -181.7294464111328, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.744085311889648, + "rewards/margins": 0.0, + "rewards/rejected": -13.744085311889648, + "step": 28 + }, + { + "epoch": 0.02001035018112817, + "grad_norm": 1.6800479888916016, + "learning_rate": 5.557684936757379e-08, + "logits/chosen": 3.78863525390625, + "logits/rejected": 3.8216776847839355, + "logps/chosen": -165.74179077148438, + "logps/rejected": -168.77906799316406, + "loss": 0.6149, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.818138122558594, + "rewards/margins": 0.33411353826522827, + "rewards/rejected": -12.152252197265625, + "step": 29 + }, + { + "epoch": 0.020700362256339484, + "grad_norm": 7.052005290985107, + "learning_rate": 5.7493292449214267e-08, + "logits/chosen": 3.589214563369751, + "logits/rejected": 3.7438509464263916, + "logps/chosen": -166.47059631347656, + "logps/rejected": -180.48690795898438, + "loss": 0.5598, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.83026123046875, + "rewards/margins": 1.3540911674499512, + "rewards/rejected": -13.18435287475586, + "step": 30 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 0.4110473096370697, + "learning_rate": 5.9409735530854744e-08, + "logits/chosen": 3.887430191040039, + "logits/rejected": 3.887430191040039, + "logps/chosen": -164.5235595703125, + "logps/rejected": -164.5235595703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.540007591247559, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.540007591247559, + "step": 31 + }, + { + "epoch": 0.022080386406762118, + "grad_norm": 0.35257548093795776, + "learning_rate": 6.132617861249522e-08, + "logits/chosen": 3.7484617233276367, + "logits/rejected": 3.7484617233276367, + "logps/chosen": -179.9328155517578, + "logps/rejected": -179.9328155517578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.301275253295898, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.301275253295898, + "step": 32 + }, + { + "epoch": 0.022770398481973434, + "grad_norm": 0.2840827703475952, + "learning_rate": 6.324262169413569e-08, + "logits/chosen": 3.5373382568359375, + "logits/rejected": 3.8265085220336914, + "logps/chosen": -165.8881072998047, + "logps/rejected": -185.47586059570312, + "loss": 0.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.78973388671875, + "rewards/margins": 1.9899263381958008, + "rewards/rejected": -13.779659271240234, + "step": 33 + }, + { + "epoch": 0.02346041055718475, + "grad_norm": 0.3852826952934265, + "learning_rate": 6.515906477577616e-08, + "logits/chosen": 3.824913263320923, + "logits/rejected": 3.824913263320923, + "logps/chosen": -181.32810974121094, + "logps/rejected": -181.32810974121094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.248322486877441, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.248322486877441, + "step": 34 + }, + { + "epoch": 0.024150422632396067, + "grad_norm": 0.4075649082660675, + "learning_rate": 6.707550785741664e-08, + "logits/chosen": 3.807910919189453, + "logits/rejected": 3.807910919189453, + "logps/chosen": -168.09426879882812, + "logps/rejected": -168.09425354003906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.85145378112793, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -11.851452827453613, + "step": 35 + }, + { + "epoch": 0.024840434707607384, + "grad_norm": 0.2985926866531372, + "learning_rate": 6.899195093905711e-08, + "logits/chosen": 4.075528621673584, + "logits/rejected": 4.075528621673584, + "logps/chosen": -190.6187286376953, + "logps/rejected": -190.6187286376953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.121664047241211, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -14.121662139892578, + "step": 36 + }, + { + "epoch": 0.0255304467828187, + "grad_norm": 0.2376803755760193, + "learning_rate": 7.09083940206976e-08, + "logits/chosen": 3.623880386352539, + "logits/rejected": 4.028255462646484, + "logps/chosen": -144.7654571533203, + "logps/rejected": -177.70883178710938, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.876742362976074, + "rewards/margins": 3.2544448375701904, + "rewards/rejected": -13.131187438964844, + "step": 37 + }, + { + "epoch": 0.026220458858030017, + "grad_norm": 0.6533240675926208, + "learning_rate": 7.282483710233807e-08, + "logits/chosen": 3.81518292427063, + "logits/rejected": 4.028363227844238, + "logps/chosen": -159.96051025390625, + "logps/rejected": -176.10784912109375, + "loss": 0.525, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.038688659667969, + "rewards/margins": 1.6652376651763916, + "rewards/rejected": -12.703926086425781, + "step": 38 + }, + { + "epoch": 0.02691047093324133, + "grad_norm": 1.9406898021697998, + "learning_rate": 7.474128018397854e-08, + "logits/chosen": 3.7337026596069336, + "logits/rejected": 3.754815101623535, + "logps/chosen": -152.4678955078125, + "logps/rejected": -155.57723999023438, + "loss": 0.6146, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.429882049560547, + "rewards/margins": 0.33744341135025024, + "rewards/rejected": -10.767325401306152, + "step": 39 + }, + { + "epoch": 0.027600483008452647, + "grad_norm": 0.307609498500824, + "learning_rate": 7.665772326561902e-08, + "logits/chosen": 3.6087756156921387, + "logits/rejected": 3.754815101623535, + "logps/chosen": -155.50021362304688, + "logps/rejected": -173.986083984375, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.50242805480957, + "rewards/margins": 1.960037112236023, + "rewards/rejected": -12.462465286254883, + "step": 40 + }, + { + "epoch": 0.028290495083663963, + "grad_norm": 0.3584468960762024, + "learning_rate": 7.857416634725948e-08, + "logits/chosen": 3.8065884113311768, + "logits/rejected": 3.944950819015503, + "logps/chosen": -172.03517150878906, + "logps/rejected": -185.38174438476562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.566292762756348, + "rewards/margins": 1.3845751285552979, + "rewards/rejected": -13.950867652893066, + "step": 41 + }, + { + "epoch": 0.02898050715887528, + "grad_norm": 0.40441352128982544, + "learning_rate": 8.049060942889996e-08, + "logits/chosen": 3.8702902793884277, + "logits/rejected": 4.003049373626709, + "logps/chosen": -163.39089965820312, + "logps/rejected": -179.12913513183594, + "loss": 0.5221, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.450457572937012, + "rewards/margins": 1.578500747680664, + "rewards/rejected": -13.028958320617676, + "step": 42 + }, + { + "epoch": 0.029670519234086597, + "grad_norm": 0.4841839075088501, + "learning_rate": 8.240705251054043e-08, + "logits/chosen": 3.7668654918670654, + "logits/rejected": 3.7668654918670654, + "logps/chosen": -185.56552124023438, + "logps/rejected": -185.56552124023438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.712247848510742, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.712247848510742, + "step": 43 + }, + { + "epoch": 0.030360531309297913, + "grad_norm": 0.29689016938209534, + "learning_rate": 8.432349559218092e-08, + "logits/chosen": 3.7734475135803223, + "logits/rejected": 3.844658851623535, + "logps/chosen": -168.70962524414062, + "logps/rejected": -183.4956512451172, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.161144256591797, + "rewards/margins": 1.446109652519226, + "rewards/rejected": -13.60725212097168, + "step": 44 + }, + { + "epoch": 0.03105054338450923, + "grad_norm": 0.349777489900589, + "learning_rate": 8.623993867382139e-08, + "logits/chosen": 3.9710421562194824, + "logits/rejected": 3.9710421562194824, + "logps/chosen": -179.85134887695312, + "logps/rejected": -179.85134887695312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.076904296875, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.076904296875, + "step": 45 + }, + { + "epoch": 0.03174055545972054, + "grad_norm": 0.3918284475803375, + "learning_rate": 8.815638175546186e-08, + "logits/chosen": 3.660889148712158, + "logits/rejected": 3.694937229156494, + "logps/chosen": -177.2576141357422, + "logps/rejected": -187.57041931152344, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.922846794128418, + "rewards/margins": 1.0416573286056519, + "rewards/rejected": -13.964503288269043, + "step": 46 + }, + { + "epoch": 0.03243056753493186, + "grad_norm": 0.32722407579421997, + "learning_rate": 9.007282483710234e-08, + "logits/chosen": 4.091784954071045, + "logits/rejected": 4.091784954071045, + "logps/chosen": -187.51361083984375, + "logps/rejected": -187.51361083984375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.930334091186523, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.930334091186523, + "step": 47 + }, + { + "epoch": 0.033120579610143176, + "grad_norm": 0.367316335439682, + "learning_rate": 9.198926791874282e-08, + "logits/chosen": 3.676713466644287, + "logits/rejected": 3.860173463821411, + "logps/chosen": -166.41751098632812, + "logps/rejected": -178.30108642578125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.715490341186523, + "rewards/margins": 1.1421626806259155, + "rewards/rejected": -12.857654571533203, + "step": 48 + }, + { + "epoch": 0.033810591685354496, + "grad_norm": 0.3517066538333893, + "learning_rate": 9.39057110003833e-08, + "logits/chosen": 4.118497371673584, + "logits/rejected": 4.161655426025391, + "logps/chosen": -163.40548706054688, + "logps/rejected": -178.31980895996094, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.52076530456543, + "rewards/margins": 1.5027704238891602, + "rewards/rejected": -13.023534774780273, + "step": 49 + }, + { + "epoch": 0.03450060376056581, + "grad_norm": 0.8730058670043945, + "learning_rate": 9.582215408202377e-08, + "logits/chosen": 3.885134696960449, + "logits/rejected": 4.040589332580566, + "logps/chosen": -158.46469116210938, + "logps/rejected": -175.82984924316406, + "loss": 0.5239, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.29660415649414, + "rewards/margins": 1.6317464113235474, + "rewards/rejected": -12.928350448608398, + "step": 50 + }, + { + "epoch": 0.03519061583577713, + "grad_norm": 0.3376832902431488, + "learning_rate": 9.773859716366424e-08, + "logits/chosen": 3.7844748497009277, + "logits/rejected": 3.7844748497009277, + "logps/chosen": -175.23416137695312, + "logps/rejected": -175.23416137695312, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.94976806640625, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.949769020080566, + "step": 51 + }, + { + "epoch": 0.03588062791098844, + "grad_norm": 12.944125175476074, + "learning_rate": 9.965504024530473e-08, + "logits/chosen": 4.083037376403809, + "logits/rejected": 4.142056465148926, + "logps/chosen": -170.31219482421875, + "logps/rejected": -186.1625213623047, + "loss": 1.1304, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.222467422485352, + "rewards/margins": 1.5481274127960205, + "rewards/rejected": -13.770593643188477, + "step": 52 + }, + { + "epoch": 0.036570639986199756, + "grad_norm": 0.31569233536720276, + "learning_rate": 1.015714833269452e-07, + "logits/chosen": 3.467911720275879, + "logits/rejected": 3.8307738304138184, + "logps/chosen": -146.68063354492188, + "logps/rejected": -165.88768005371094, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.91257381439209, + "rewards/margins": 1.981831669807434, + "rewards/rejected": -11.89440631866455, + "step": 53 + }, + { + "epoch": 0.037260652061411076, + "grad_norm": 0.3378657102584839, + "learning_rate": 1.0348792640858568e-07, + "logits/chosen": 4.187561988830566, + "logits/rejected": 4.187561988830566, + "logps/chosen": -186.41107177734375, + "logps/rejected": -186.41107177734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.737068176269531, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.737068176269531, + "step": 54 + }, + { + "epoch": 0.03795066413662239, + "grad_norm": 0.4417838752269745, + "learning_rate": 1.0540436949022615e-07, + "logits/chosen": 4.007149696350098, + "logits/rejected": 4.0971479415893555, + "logps/chosen": -166.75022888183594, + "logps/rejected": -184.1231689453125, + "loss": 0.5218, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.900541305541992, + "rewards/margins": 1.629082202911377, + "rewards/rejected": -13.529623031616211, + "step": 55 + }, + { + "epoch": 0.03864067621183371, + "grad_norm": 0.270698219537735, + "learning_rate": 1.0732081257186662e-07, + "logits/chosen": 3.9037392139434814, + "logits/rejected": 4.076130390167236, + "logps/chosen": -163.78866577148438, + "logps/rejected": -182.61666870117188, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.494552612304688, + "rewards/margins": 1.9568185806274414, + "rewards/rejected": -13.451371192932129, + "step": 56 + }, + { + "epoch": 0.03933068828704502, + "grad_norm": 0.362991064786911, + "learning_rate": 1.0923725565350711e-07, + "logits/chosen": 3.955893039703369, + "logits/rejected": 3.955893039703369, + "logps/chosen": -173.83523559570312, + "logps/rejected": -173.83523559570312, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.612198829650879, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.612199783325195, + "step": 57 + }, + { + "epoch": 0.04002070036225634, + "grad_norm": 0.28137123584747314, + "learning_rate": 1.1115369873514758e-07, + "logits/chosen": 3.6568236351013184, + "logits/rejected": 3.9791266918182373, + "logps/chosen": -147.67306518554688, + "logps/rejected": -169.46836853027344, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.117817878723145, + "rewards/margins": 2.098172426223755, + "rewards/rejected": -12.21599006652832, + "step": 58 + }, + { + "epoch": 0.040710712437467655, + "grad_norm": 0.38625895977020264, + "learning_rate": 1.1307014181678805e-07, + "logits/chosen": 3.9869065284729004, + "logits/rejected": 3.9869065284729004, + "logps/chosen": -179.48851013183594, + "logps/rejected": -179.48851013183594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.203496932983398, + "rewards/margins": 0.0, + "rewards/rejected": -13.203496932983398, + "step": 59 + }, + { + "epoch": 0.04140072451267897, + "grad_norm": 0.2313198447227478, + "learning_rate": 1.1498658489842853e-07, + "logits/chosen": 3.6818599700927734, + "logits/rejected": 3.846540689468384, + "logps/chosen": -143.9743194580078, + "logps/rejected": -176.19557189941406, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.754254341125488, + "rewards/margins": 3.215677499771118, + "rewards/rejected": -12.969931602478027, + "step": 60 + }, + { + "epoch": 0.04209073658789029, + "grad_norm": 0.35599616169929504, + "learning_rate": 1.16903027980069e-07, + "logits/chosen": 3.644808769226074, + "logits/rejected": 3.6976962089538574, + "logps/chosen": -179.36807250976562, + "logps/rejected": -186.63555908203125, + "loss": 0.6069, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.094310760498047, + "rewards/margins": 0.7292821407318115, + "rewards/rejected": -13.823593139648438, + "step": 61 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 23.062101364135742, + "learning_rate": 1.1881947106170949e-07, + "logits/chosen": 3.697634696960449, + "logits/rejected": 3.9904379844665527, + "logps/chosen": -176.20166015625, + "logps/rejected": -189.87957763671875, + "loss": 1.0229, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.777608871459961, + "rewards/margins": 1.3845880031585693, + "rewards/rejected": -14.162199020385742, + "step": 62 + }, + { + "epoch": 0.04347076073831292, + "grad_norm": 0.3016614317893982, + "learning_rate": 1.2073591414334996e-07, + "logits/chosen": 3.94990873336792, + "logits/rejected": 3.94990873336792, + "logps/chosen": -198.95147705078125, + "logps/rejected": -198.95147705078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -15.023240089416504, + "rewards/margins": 0.0, + "rewards/rejected": -15.023240089416504, + "step": 63 + }, + { + "epoch": 0.044160772813524235, + "grad_norm": 0.3537074327468872, + "learning_rate": 1.2265235722499043e-07, + "logits/chosen": 3.516343593597412, + "logits/rejected": 3.622861862182617, + "logps/chosen": -178.82073974609375, + "logps/rejected": -185.18081665039062, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.174943923950195, + "rewards/margins": 0.6865929365158081, + "rewards/rejected": -13.86153793334961, + "step": 64 + }, + { + "epoch": 0.044850784888735555, + "grad_norm": 0.6564141511917114, + "learning_rate": 1.245688003066309e-07, + "logits/chosen": 3.9733614921569824, + "logits/rejected": 4.0328240394592285, + "logps/chosen": -171.07354736328125, + "logps/rejected": -185.48280334472656, + "loss": 0.5229, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.469482421875, + "rewards/margins": 1.4193470478057861, + "rewards/rejected": -13.888830184936523, + "step": 65 + }, + { + "epoch": 0.04554079696394687, + "grad_norm": 0.3813719153404236, + "learning_rate": 1.2648524338827137e-07, + "logits/chosen": 3.76322078704834, + "logits/rejected": 3.76322078704834, + "logps/chosen": -189.62042236328125, + "logps/rejected": -189.62042236328125, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.293164253234863, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.29316520690918, + "step": 66 + }, + { + "epoch": 0.04623080903915819, + "grad_norm": 0.4140745997428894, + "learning_rate": 1.2840168646991184e-07, + "logits/chosen": 3.7688212394714355, + "logits/rejected": 3.7688212394714355, + "logps/chosen": -170.58087158203125, + "logps/rejected": -170.58087158203125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.234166145324707, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.234166145324707, + "step": 67 + }, + { + "epoch": 0.0469208211143695, + "grad_norm": 0.5323903560638428, + "learning_rate": 1.3031812955155231e-07, + "logits/chosen": 4.264087677001953, + "logits/rejected": 4.242564678192139, + "logps/chosen": -184.23085021972656, + "logps/rejected": -189.0679473876953, + "loss": 0.6087, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.63438606262207, + "rewards/margins": 0.5021853446960449, + "rewards/rejected": -14.136571884155273, + "step": 68 + }, + { + "epoch": 0.047610833189580815, + "grad_norm": 2.196309804916382, + "learning_rate": 1.322345726331928e-07, + "logits/chosen": 4.145041465759277, + "logits/rejected": 4.1357269287109375, + "logps/chosen": -179.9483642578125, + "logps/rejected": -183.70443725585938, + "loss": 0.6129, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.302711486816406, + "rewards/margins": 0.3693277835845947, + "rewards/rejected": -13.672039031982422, + "step": 69 + }, + { + "epoch": 0.048300845264792135, + "grad_norm": 0.392334520816803, + "learning_rate": 1.3415101571483328e-07, + "logits/chosen": 4.214783668518066, + "logits/rejected": 4.214783668518066, + "logps/chosen": -178.38951110839844, + "logps/rejected": -178.38951110839844, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.063689231872559, + "rewards/margins": 0.0, + "rewards/rejected": -13.063689231872559, + "step": 70 + }, + { + "epoch": 0.04899085734000345, + "grad_norm": 0.28031250834465027, + "learning_rate": 1.3606745879647375e-07, + "logits/chosen": 3.6318674087524414, + "logits/rejected": 3.7979187965393066, + "logps/chosen": -173.99017333984375, + "logps/rejected": -180.86880493164062, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.5177001953125, + "rewards/margins": 0.6822555065155029, + "rewards/rejected": -13.199954986572266, + "step": 71 + }, + { + "epoch": 0.04968086941521477, + "grad_norm": 0.2851777672767639, + "learning_rate": 1.3798390187811422e-07, + "logits/chosen": 3.5012238025665283, + "logits/rejected": 3.5012238025665283, + "logps/chosen": -164.36526489257812, + "logps/rejected": -164.36526489257812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.559993743896484, + "rewards/margins": 0.0, + "rewards/rejected": -11.559993743896484, + "step": 72 + }, + { + "epoch": 0.05037088149042608, + "grad_norm": 0.3632601797580719, + "learning_rate": 1.399003449597547e-07, + "logits/chosen": 3.626513957977295, + "logits/rejected": 3.809216022491455, + "logps/chosen": -167.9180145263672, + "logps/rejected": -181.09547424316406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.180133819580078, + "rewards/margins": 1.352436900138855, + "rewards/rejected": -13.532569885253906, + "step": 73 + }, + { + "epoch": 0.0510608935656374, + "grad_norm": 0.35516586899757385, + "learning_rate": 1.418167880413952e-07, + "logits/chosen": 4.011384010314941, + "logits/rejected": 4.011384010314941, + "logps/chosen": -179.07431030273438, + "logps/rejected": -179.07431030273438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.126554489135742, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.126555442810059, + "step": 74 + }, + { + "epoch": 0.051750905640848714, + "grad_norm": 1.7714030742645264, + "learning_rate": 1.4373323112303566e-07, + "logits/chosen": 3.8600125312805176, + "logits/rejected": 3.878448009490967, + "logps/chosen": -170.68472290039062, + "logps/rejected": -174.78524780273438, + "loss": 0.6119, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.517295837402344, + "rewards/margins": 0.39036643505096436, + "rewards/rejected": -12.907661437988281, + "step": 75 + }, + { + "epoch": 0.052440917716060034, + "grad_norm": 0.3206137418746948, + "learning_rate": 1.4564967420467613e-07, + "logits/chosen": 4.073652267456055, + "logits/rejected": 4.180914878845215, + "logps/chosen": -184.2132568359375, + "logps/rejected": -194.3201904296875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.691658020019531, + "rewards/margins": 1.0337915420532227, + "rewards/rejected": -14.72545051574707, + "step": 76 + }, + { + "epoch": 0.05313092979127135, + "grad_norm": 0.34602969884872437, + "learning_rate": 1.475661172863166e-07, + "logits/chosen": 3.906959295272827, + "logits/rejected": 4.145905494689941, + "logps/chosen": -173.17979431152344, + "logps/rejected": -182.98130798339844, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.520613670349121, + "rewards/margins": 0.9700199961662292, + "rewards/rejected": -13.490633010864258, + "step": 77 + }, + { + "epoch": 0.05382094186648266, + "grad_norm": 0.38226670026779175, + "learning_rate": 1.4948256036795708e-07, + "logits/chosen": 4.1268439292907715, + "logits/rejected": 4.2858099937438965, + "logps/chosen": -166.74966430664062, + "logps/rejected": -177.8331756591797, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.930452346801758, + "rewards/margins": 1.0806952714920044, + "rewards/rejected": -13.011146545410156, + "step": 78 + }, + { + "epoch": 0.05451095394169398, + "grad_norm": 0.39894604682922363, + "learning_rate": 1.5139900344959755e-07, + "logits/chosen": 3.273808240890503, + "logits/rejected": 3.4086742401123047, + "logps/chosen": -149.5101776123047, + "logps/rejected": -159.44483947753906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.357515335083008, + "rewards/margins": 1.011832356452942, + "rewards/rejected": -11.369346618652344, + "step": 79 + }, + { + "epoch": 0.055200966016905294, + "grad_norm": 0.3252900242805481, + "learning_rate": 1.5331544653123804e-07, + "logits/chosen": 3.987269639968872, + "logits/rejected": 3.987269639968872, + "logps/chosen": -191.22064208984375, + "logps/rejected": -191.22064208984375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.281012535095215, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.281012535095215, + "step": 80 + }, + { + "epoch": 0.055890978092116614, + "grad_norm": 0.35121479630470276, + "learning_rate": 1.5523188961287852e-07, + "logits/chosen": 3.9690144062042236, + "logits/rejected": 3.9690144062042236, + "logps/chosen": -167.34608459472656, + "logps/rejected": -167.34608459472656, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.867776870727539, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.867777824401855, + "step": 81 + }, + { + "epoch": 0.05658099016732793, + "grad_norm": 0.4938546419143677, + "learning_rate": 1.5714833269451896e-07, + "logits/chosen": 3.527217388153076, + "logits/rejected": 3.7463202476501465, + "logps/chosen": -150.1567840576172, + "logps/rejected": -175.1078338623047, + "loss": 0.4365, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.232585906982422, + "rewards/margins": 2.4538626670837402, + "rewards/rejected": -12.68644905090332, + "step": 82 + }, + { + "epoch": 0.05727100224253925, + "grad_norm": 0.4336704909801483, + "learning_rate": 1.5906477577615946e-07, + "logits/chosen": 3.885532855987549, + "logits/rejected": 3.885532855987549, + "logps/chosen": -168.469482421875, + "logps/rejected": -168.469482421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.116385459899902, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.116385459899902, + "step": 83 + }, + { + "epoch": 0.05796101431775056, + "grad_norm": 0.34912556409835815, + "learning_rate": 1.6098121885779993e-07, + "logits/chosen": 3.825300693511963, + "logits/rejected": 3.825300693511963, + "logps/chosen": -167.46078491210938, + "logps/rejected": -167.4607696533203, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.009830474853516, + "rewards/margins": -8.940696716308594e-07, + "rewards/rejected": -12.0098295211792, + "step": 84 + }, + { + "epoch": 0.05865102639296188, + "grad_norm": 0.3946368098258972, + "learning_rate": 1.628976619394404e-07, + "logits/chosen": 3.9875779151916504, + "logits/rejected": 4.03257417678833, + "logps/chosen": -171.31979370117188, + "logps/rejected": -179.65347290039062, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.403787612915039, + "rewards/margins": 0.7670051455497742, + "rewards/rejected": -13.170793533325195, + "step": 85 + }, + { + "epoch": 0.05934103846817319, + "grad_norm": 0.5254521369934082, + "learning_rate": 1.6481410502108087e-07, + "logits/chosen": 3.969043254852295, + "logits/rejected": 3.969043254852295, + "logps/chosen": -171.2100830078125, + "logps/rejected": -171.2100830078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.272745132446289, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.272745132446289, + "step": 86 + }, + { + "epoch": 0.060031050543384507, + "grad_norm": 0.3475699722766876, + "learning_rate": 1.6673054810272134e-07, + "logits/chosen": 3.6619653701782227, + "logits/rejected": 3.7563891410827637, + "logps/chosen": -152.28099060058594, + "logps/rejected": -169.2995147705078, + "loss": 0.5208, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.41816520690918, + "rewards/margins": 1.862404227256775, + "rewards/rejected": -12.280570030212402, + "step": 87 + }, + { + "epoch": 0.06072106261859583, + "grad_norm": 0.3956550657749176, + "learning_rate": 1.6864699118436184e-07, + "logits/chosen": 4.119354248046875, + "logits/rejected": 4.119354248046875, + "logps/chosen": -174.62403869628906, + "logps/rejected": -174.62405395507812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.689831733703613, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.689831733703613, + "step": 88 + }, + { + "epoch": 0.06141107469380714, + "grad_norm": 0.25100407004356384, + "learning_rate": 1.705634342660023e-07, + "logits/chosen": 3.8849263191223145, + "logits/rejected": 4.169778823852539, + "logps/chosen": -157.6252899169922, + "logps/rejected": -184.5730438232422, + "loss": 0.4337, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.104158401489258, + "rewards/margins": 2.606387138366699, + "rewards/rejected": -13.71054458618164, + "step": 89 + }, + { + "epoch": 0.06210108676901846, + "grad_norm": 0.3049978017807007, + "learning_rate": 1.7247987734764278e-07, + "logits/chosen": 3.8211166858673096, + "logits/rejected": 3.876741886138916, + "logps/chosen": -150.372802734375, + "logps/rejected": -160.59381103515625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.116925239562988, + "rewards/margins": 1.0439409017562866, + "rewards/rejected": -11.160865783691406, + "step": 90 + }, + { + "epoch": 0.06279109884422977, + "grad_norm": 0.5299607515335083, + "learning_rate": 1.7439632042928325e-07, + "logits/chosen": 3.8292031288146973, + "logits/rejected": 3.953152656555176, + "logps/chosen": -158.14791870117188, + "logps/rejected": -171.90309143066406, + "loss": 0.5244, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.106134414672852, + "rewards/margins": 1.3541327714920044, + "rewards/rejected": -12.460268020629883, + "step": 91 + }, + { + "epoch": 0.06348111091944109, + "grad_norm": 0.3629153370857239, + "learning_rate": 1.7631276351092372e-07, + "logits/chosen": 3.642857074737549, + "logits/rejected": 3.686919927597046, + "logps/chosen": -149.87554931640625, + "logps/rejected": -158.9801025390625, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.01669692993164, + "rewards/margins": 0.9452404975891113, + "rewards/rejected": -10.961936950683594, + "step": 92 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 0.3260781764984131, + "learning_rate": 1.7822920659256422e-07, + "logits/chosen": 4.16462516784668, + "logits/rejected": 4.16462516784668, + "logps/chosen": -180.247802734375, + "logps/rejected": -180.247802734375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.211207389831543, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.211207389831543, + "step": 93 + }, + { + "epoch": 0.06486113506986373, + "grad_norm": 0.41267767548561096, + "learning_rate": 1.801456496742047e-07, + "logits/chosen": 4.126120567321777, + "logits/rejected": 4.126120567321777, + "logps/chosen": -186.07638549804688, + "logps/rejected": -186.07638549804688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.898558616638184, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.8985595703125, + "step": 94 + }, + { + "epoch": 0.06555114714507504, + "grad_norm": 0.36979883909225464, + "learning_rate": 1.8206209275584516e-07, + "logits/chosen": 4.115845203399658, + "logits/rejected": 4.264064788818359, + "logps/chosen": -177.38035583496094, + "logps/rejected": -192.30113220214844, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.803505897521973, + "rewards/margins": 1.5115820169448853, + "rewards/rejected": -14.31508731842041, + "step": 95 + }, + { + "epoch": 0.06624115922028635, + "grad_norm": 0.28559526801109314, + "learning_rate": 1.8397853583748563e-07, + "logits/chosen": 3.603606939315796, + "logits/rejected": 3.752472400665283, + "logps/chosen": -161.44180297851562, + "logps/rejected": -179.11878967285156, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.259331703186035, + "rewards/margins": 1.8032293319702148, + "rewards/rejected": -13.06256103515625, + "step": 96 + }, + { + "epoch": 0.06693117129549767, + "grad_norm": 12.510151863098145, + "learning_rate": 1.858949789191261e-07, + "logits/chosen": 3.9546308517456055, + "logits/rejected": 3.8087997436523438, + "logps/chosen": -167.14120483398438, + "logps/rejected": -171.65869140625, + "loss": 1.3003, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.980365753173828, + "rewards/margins": 0.3792550563812256, + "rewards/rejected": -12.35962200164795, + "step": 97 + }, + { + "epoch": 0.06762118337070899, + "grad_norm": 7.497704029083252, + "learning_rate": 1.878114220007666e-07, + "logits/chosen": 3.7736551761627197, + "logits/rejected": 3.9008281230926514, + "logps/chosen": -174.71238708496094, + "logps/rejected": -185.37998962402344, + "loss": 0.5554, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.59337043762207, + "rewards/margins": 1.0976080894470215, + "rewards/rejected": -13.69097900390625, + "step": 98 + }, + { + "epoch": 0.0683111954459203, + "grad_norm": 13.732841491699219, + "learning_rate": 1.8972786508240707e-07, + "logits/chosen": 3.7895331382751465, + "logits/rejected": 3.8168740272521973, + "logps/chosen": -184.3809051513672, + "logps/rejected": -182.76344299316406, + "loss": 0.8143, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.651607513427734, + "rewards/margins": -0.18148422241210938, + "rewards/rejected": -13.470123291015625, + "step": 99 + }, + { + "epoch": 0.06900120752113162, + "grad_norm": 0.40537458658218384, + "learning_rate": 1.9164430816404754e-07, + "logits/chosen": 4.090005874633789, + "logits/rejected": 4.090005874633789, + "logps/chosen": -188.6317596435547, + "logps/rejected": -188.6317596435547, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.955214500427246, + "rewards/margins": 0.0, + "rewards/rejected": -13.955214500427246, + "step": 100 + }, + { + "epoch": 0.06969121959634293, + "grad_norm": 0.3165909945964813, + "learning_rate": 1.93560751245688e-07, + "logits/chosen": 3.901989221572876, + "logits/rejected": 4.106201171875, + "logps/chosen": -167.69821166992188, + "logps/rejected": -177.31393432617188, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.093584060668945, + "rewards/margins": 0.9500320553779602, + "rewards/rejected": -13.043615341186523, + "step": 101 + }, + { + "epoch": 0.07038123167155426, + "grad_norm": 0.3245113492012024, + "learning_rate": 1.9547719432732848e-07, + "logits/chosen": 3.9173550605773926, + "logits/rejected": 4.022019863128662, + "logps/chosen": -155.03433227539062, + "logps/rejected": -174.35971069335938, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.725296020507812, + "rewards/margins": 1.9528565406799316, + "rewards/rejected": -12.678152084350586, + "step": 102 + }, + { + "epoch": 0.07107124374676557, + "grad_norm": 0.3404594659805298, + "learning_rate": 1.9739363740896898e-07, + "logits/chosen": 3.836334705352783, + "logits/rejected": 3.9291372299194336, + "logps/chosen": -178.40087890625, + "logps/rejected": -188.88172912597656, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.966500282287598, + "rewards/margins": 1.0595766305923462, + "rewards/rejected": -14.026077270507812, + "step": 103 + }, + { + "epoch": 0.07176125582197689, + "grad_norm": 0.24263162910938263, + "learning_rate": 1.9931008049060945e-07, + "logits/chosen": 3.7339823246002197, + "logits/rejected": 3.755171060562134, + "logps/chosen": -158.42926025390625, + "logps/rejected": -178.3647003173828, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.063694953918457, + "rewards/margins": 2.0316624641418457, + "rewards/rejected": -13.095357894897461, + "step": 104 + }, + { + "epoch": 0.0724512678971882, + "grad_norm": 23.491445541381836, + "learning_rate": 2.0122652357224992e-07, + "logits/chosen": 3.825438976287842, + "logits/rejected": 3.6961119174957275, + "logps/chosen": -164.79830932617188, + "logps/rejected": -166.2101593017578, + "loss": 1.8282, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.667235374450684, + "rewards/margins": 0.25043827295303345, + "rewards/rejected": -11.91767406463623, + "step": 105 + }, + { + "epoch": 0.07314127997239951, + "grad_norm": 0.3783701956272125, + "learning_rate": 2.031429666538904e-07, + "logits/chosen": 4.259041786193848, + "logits/rejected": 4.259041786193848, + "logps/chosen": -184.47950744628906, + "logps/rejected": -184.47950744628906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.625654220581055, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.625654220581055, + "step": 106 + }, + { + "epoch": 0.07383129204761084, + "grad_norm": 0.43063491582870483, + "learning_rate": 2.0505940973553086e-07, + "logits/chosen": 3.80926775932312, + "logits/rejected": 3.9555320739746094, + "logps/chosen": -164.53411865234375, + "logps/rejected": -175.82196044921875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.847484588623047, + "rewards/margins": 1.115272045135498, + "rewards/rejected": -12.962757110595703, + "step": 107 + }, + { + "epoch": 0.07452130412282215, + "grad_norm": 0.41867437958717346, + "learning_rate": 2.0697585281717136e-07, + "logits/chosen": 4.022878646850586, + "logits/rejected": 4.022878646850586, + "logps/chosen": -176.06210327148438, + "logps/rejected": -176.0620880126953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.95574951171875, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.95574951171875, + "step": 108 + }, + { + "epoch": 0.07521131619803346, + "grad_norm": 27.110477447509766, + "learning_rate": 2.0889229589881183e-07, + "logits/chosen": 3.8291268348693848, + "logits/rejected": 3.843191623687744, + "logps/chosen": -167.48602294921875, + "logps/rejected": -171.9195556640625, + "loss": 1.2829, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.993877410888672, + "rewards/margins": 0.37826597690582275, + "rewards/rejected": -12.372142791748047, + "step": 109 + }, + { + "epoch": 0.07590132827324478, + "grad_norm": 0.33766815066337585, + "learning_rate": 2.108087389804523e-07, + "logits/chosen": 3.5725085735321045, + "logits/rejected": 3.8028154373168945, + "logps/chosen": -151.00685119628906, + "logps/rejected": -168.51077270507812, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.254400253295898, + "rewards/margins": 1.7701401710510254, + "rewards/rejected": -12.024540901184082, + "step": 110 + }, + { + "epoch": 0.07659134034845609, + "grad_norm": 0.35393014550209045, + "learning_rate": 2.1272518206209278e-07, + "logits/chosen": 4.083166122436523, + "logits/rejected": 4.083166122436523, + "logps/chosen": -191.1388397216797, + "logps/rejected": -191.1388397216797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.20280647277832, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.20280647277832, + "step": 111 + }, + { + "epoch": 0.07728135242366742, + "grad_norm": 0.40315112471580505, + "learning_rate": 2.1464162514373325e-07, + "logits/chosen": 3.7555642127990723, + "logits/rejected": 3.974264621734619, + "logps/chosen": -167.74267578125, + "logps/rejected": -177.9791259765625, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.898524284362793, + "rewards/margins": 1.0663816928863525, + "rewards/rejected": -12.964905738830566, + "step": 112 + }, + { + "epoch": 0.07797136449887873, + "grad_norm": 0.2857323884963989, + "learning_rate": 2.1655806822537372e-07, + "logits/chosen": 3.945939064025879, + "logits/rejected": 4.054264545440674, + "logps/chosen": -177.01844787597656, + "logps/rejected": -193.79322814941406, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.835189819335938, + "rewards/margins": 1.7307982444763184, + "rewards/rejected": -14.565988540649414, + "step": 113 + }, + { + "epoch": 0.07866137657409004, + "grad_norm": 0.465252161026001, + "learning_rate": 2.1847451130701421e-07, + "logits/chosen": 3.874260425567627, + "logits/rejected": 3.951447010040283, + "logps/chosen": -171.0720672607422, + "logps/rejected": -176.3419647216797, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.410463333129883, + "rewards/margins": 0.566879153251648, + "rewards/rejected": -12.97734260559082, + "step": 114 + }, + { + "epoch": 0.07935138864930136, + "grad_norm": 18.67799949645996, + "learning_rate": 2.2039095438865469e-07, + "logits/chosen": 4.1904778480529785, + "logits/rejected": 4.371285438537598, + "logps/chosen": -165.69253540039062, + "logps/rejected": -173.74191284179688, + "loss": 1.0921, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.765519142150879, + "rewards/margins": 0.7228598594665527, + "rewards/rejected": -12.488378524780273, + "step": 115 + }, + { + "epoch": 0.08004140072451268, + "grad_norm": 0.32273492217063904, + "learning_rate": 2.2230739747029516e-07, + "logits/chosen": 3.973423480987549, + "logits/rejected": 4.1463470458984375, + "logps/chosen": -162.6868896484375, + "logps/rejected": -182.97311401367188, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.489083290100098, + "rewards/margins": 2.074314594268799, + "rewards/rejected": -13.563397407531738, + "step": 116 + }, + { + "epoch": 0.080731412799724, + "grad_norm": 0.8191843628883362, + "learning_rate": 2.2422384055193563e-07, + "logits/chosen": 4.210084915161133, + "logits/rejected": 4.2576189041137695, + "logps/chosen": -176.7487335205078, + "logps/rejected": -180.46121215820312, + "loss": 0.6107, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.860275268554688, + "rewards/margins": 0.42216330766677856, + "rewards/rejected": -13.282438278198242, + "step": 117 + }, + { + "epoch": 0.08142142487493531, + "grad_norm": 15.082287788391113, + "learning_rate": 2.261402836335761e-07, + "logits/chosen": 3.7886605262756348, + "logits/rejected": 3.7167701721191406, + "logps/chosen": -160.2227783203125, + "logps/rejected": -157.07373046875, + "loss": 0.977, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.391368865966797, + "rewards/margins": -0.2946825623512268, + "rewards/rejected": -11.096687316894531, + "step": 118 + }, + { + "epoch": 0.08211143695014662, + "grad_norm": 0.465023010969162, + "learning_rate": 2.280567267152166e-07, + "logits/chosen": 3.493441104888916, + "logits/rejected": 3.8447165489196777, + "logps/chosen": -156.69212341308594, + "logps/rejected": -194.21615600585938, + "loss": 0.3481, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.860068321228027, + "rewards/margins": 3.780362844467163, + "rewards/rejected": -14.640430450439453, + "step": 119 + }, + { + "epoch": 0.08280144902535794, + "grad_norm": 0.3765423595905304, + "learning_rate": 2.2997316979685707e-07, + "logits/chosen": 3.7096803188323975, + "logits/rejected": 3.783391237258911, + "logps/chosen": -153.77830505371094, + "logps/rejected": -163.92013549804688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.499885559082031, + "rewards/margins": 1.0441235303878784, + "rewards/rejected": -11.544008255004883, + "step": 120 + }, + { + "epoch": 0.08349146110056926, + "grad_norm": 0.3782792091369629, + "learning_rate": 2.3188961287849754e-07, + "logits/chosen": 3.969428300857544, + "logits/rejected": 3.969428300857544, + "logps/chosen": -183.24478149414062, + "logps/rejected": -183.24478149414062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.67805004119873, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.678050994873047, + "step": 121 + }, + { + "epoch": 0.08418147317578058, + "grad_norm": 21.2424259185791, + "learning_rate": 2.33806055960138e-07, + "logits/chosen": 3.8056015968322754, + "logits/rejected": 3.813539981842041, + "logps/chosen": -168.91004943847656, + "logps/rejected": -187.55889892578125, + "loss": 0.8399, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.075498580932617, + "rewards/margins": 1.813797950744629, + "rewards/rejected": -13.889297485351562, + "step": 122 + }, + { + "epoch": 0.08487148525099189, + "grad_norm": 0.35438621044158936, + "learning_rate": 2.3572249904177848e-07, + "logits/chosen": 3.799365282058716, + "logits/rejected": 3.954369306564331, + "logps/chosen": -137.4912872314453, + "logps/rejected": -151.51304626464844, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.404111862182617, + "rewards/margins": 1.3693212270736694, + "rewards/rejected": -10.773433685302734, + "step": 123 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 0.387854665517807, + "learning_rate": 2.3763894212341898e-07, + "logits/chosen": 3.894768238067627, + "logits/rejected": 3.9685475826263428, + "logps/chosen": -178.65805053710938, + "logps/rejected": -186.568359375, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.975942611694336, + "rewards/margins": 0.770717978477478, + "rewards/rejected": -13.746661186218262, + "step": 124 + }, + { + "epoch": 0.08625150940141453, + "grad_norm": 0.3378782868385315, + "learning_rate": 2.395553852050594e-07, + "logits/chosen": 3.874943256378174, + "logits/rejected": 3.874943256378174, + "logps/chosen": -178.6158905029297, + "logps/rejected": -178.6158905029297, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.0656099319458, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.065610885620117, + "step": 125 + }, + { + "epoch": 0.08694152147662584, + "grad_norm": 0.25248315930366516, + "learning_rate": 2.414718282866999e-07, + "logits/chosen": 3.752624988555908, + "logits/rejected": 3.832967519760132, + "logps/chosen": -175.88717651367188, + "logps/rejected": -197.27362060546875, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.805017471313477, + "rewards/margins": 2.1230242252349854, + "rewards/rejected": -14.928041458129883, + "step": 126 + }, + { + "epoch": 0.08763153355183716, + "grad_norm": 0.33022385835647583, + "learning_rate": 2.433882713683404e-07, + "logits/chosen": 4.0411858558654785, + "logits/rejected": 4.0411858558654785, + "logps/chosen": -181.634765625, + "logps/rejected": -181.634765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.55565357208252, + "rewards/margins": 0.0, + "rewards/rejected": -13.55565357208252, + "step": 127 + }, + { + "epoch": 0.08832154562704847, + "grad_norm": 10.600749015808105, + "learning_rate": 2.4530471444998086e-07, + "logits/chosen": 3.920074462890625, + "logits/rejected": 4.068134307861328, + "logps/chosen": -182.79039001464844, + "logps/rejected": -182.57371520996094, + "loss": 0.7287, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.368148803710938, + "rewards/margins": -0.06324642896652222, + "rewards/rejected": -13.304901123046875, + "step": 128 + }, + { + "epoch": 0.08901155770225978, + "grad_norm": 17.036277770996094, + "learning_rate": 2.4722115753162136e-07, + "logits/chosen": 3.954768180847168, + "logits/rejected": 3.839926242828369, + "logps/chosen": -164.9195098876953, + "logps/rejected": -153.09130859375, + "loss": 1.7835, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.854615211486816, + "rewards/margins": -1.1769905090332031, + "rewards/rejected": -10.677624702453613, + "step": 129 + }, + { + "epoch": 0.08970156977747111, + "grad_norm": 0.3400668501853943, + "learning_rate": 2.491376006132618e-07, + "logits/chosen": 4.052361011505127, + "logits/rejected": 4.052361011505127, + "logps/chosen": -177.69119262695312, + "logps/rejected": -177.69117736816406, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.056215286254883, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.056215286254883, + "step": 130 + }, + { + "epoch": 0.09039158185268242, + "grad_norm": 0.3169371485710144, + "learning_rate": 2.5105404369490225e-07, + "logits/chosen": 3.8966927528381348, + "logits/rejected": 3.8966927528381348, + "logps/chosen": -182.258056640625, + "logps/rejected": -182.258056640625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.338689804077148, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.338689804077148, + "step": 131 + }, + { + "epoch": 0.09108159392789374, + "grad_norm": 0.45364049077033997, + "learning_rate": 2.5297048677654274e-07, + "logits/chosen": 3.651520252227783, + "logits/rejected": 3.651520252227783, + "logps/chosen": -166.8703155517578, + "logps/rejected": -166.8703155517578, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.024989128112793, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.024989128112793, + "step": 132 + }, + { + "epoch": 0.09177160600310505, + "grad_norm": 0.36682096123695374, + "learning_rate": 2.5488692985818324e-07, + "logits/chosen": 3.7663662433624268, + "logits/rejected": 3.7663662433624268, + "logps/chosen": -168.17898559570312, + "logps/rejected": -168.17898559570312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.058778762817383, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.058777809143066, + "step": 133 + }, + { + "epoch": 0.09246161807831638, + "grad_norm": 0.36921343207359314, + "learning_rate": 2.568033729398237e-07, + "logits/chosen": 4.021571636199951, + "logits/rejected": 4.082857608795166, + "logps/chosen": -171.10147094726562, + "logps/rejected": -181.541015625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.328736305236816, + "rewards/margins": 1.0695620775222778, + "rewards/rejected": -13.398298263549805, + "step": 134 + }, + { + "epoch": 0.09315163015352769, + "grad_norm": 4.654562950134277, + "learning_rate": 2.587198160214642e-07, + "logits/chosen": 3.2584755420684814, + "logits/rejected": 3.4426817893981934, + "logps/chosen": -125.73645782470703, + "logps/rejected": -153.66555786132812, + "loss": 0.4597, + "rewards/accuracies": 0.375, + "rewards/chosen": -7.810571193695068, + "rewards/margins": 2.8041937351226807, + "rewards/rejected": -10.614765167236328, + "step": 135 + }, + { + "epoch": 0.093841642228739, + "grad_norm": 0.27380913496017456, + "learning_rate": 2.6063625910310463e-07, + "logits/chosen": 4.150867462158203, + "logits/rejected": 4.249088287353516, + "logps/chosen": -177.96731567382812, + "logps/rejected": -194.76226806640625, + "loss": 0.5203, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.021340370178223, + "rewards/margins": 1.6473532915115356, + "rewards/rejected": -14.668694496154785, + "step": 136 + }, + { + "epoch": 0.09453165430395032, + "grad_norm": 0.3594914972782135, + "learning_rate": 2.625527021847451e-07, + "logits/chosen": 3.7631893157958984, + "logits/rejected": 3.7631893157958984, + "logps/chosen": -166.05235290527344, + "logps/rejected": -166.05235290527344, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.656765937805176, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.656766891479492, + "step": 137 + }, + { + "epoch": 0.09522166637916163, + "grad_norm": 0.4537081718444824, + "learning_rate": 2.644691452663856e-07, + "logits/chosen": 4.416791915893555, + "logits/rejected": 4.416791915893555, + "logps/chosen": -181.707275390625, + "logps/rejected": -181.707275390625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.251684188842773, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.251684188842773, + "step": 138 + }, + { + "epoch": 0.09591167845437296, + "grad_norm": 0.5293512344360352, + "learning_rate": 2.6638558834802607e-07, + "logits/chosen": 3.8663294315338135, + "logits/rejected": 3.963618516921997, + "logps/chosen": -146.34890747070312, + "logps/rejected": -164.99749755859375, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.037443161010742, + "rewards/margins": 1.7987390756607056, + "rewards/rejected": -11.836182594299316, + "step": 139 + }, + { + "epoch": 0.09660169052958427, + "grad_norm": 0.38292670249938965, + "learning_rate": 2.6830203142966656e-07, + "logits/chosen": 4.020331382751465, + "logits/rejected": 4.020331382751465, + "logps/chosen": -191.8922119140625, + "logps/rejected": -191.8922119140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.485140800476074, + "rewards/margins": 0.0, + "rewards/rejected": -14.485140800476074, + "step": 140 + }, + { + "epoch": 0.09729170260479558, + "grad_norm": 1.0227693319320679, + "learning_rate": 2.70218474511307e-07, + "logits/chosen": 3.991957187652588, + "logits/rejected": 4.351632118225098, + "logps/chosen": -168.67398071289062, + "logps/rejected": -198.64089965820312, + "loss": 0.3514, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.832783699035645, + "rewards/margins": 3.1039743423461914, + "rewards/rejected": -14.936758041381836, + "step": 141 + }, + { + "epoch": 0.0979817146800069, + "grad_norm": 0.5137650966644287, + "learning_rate": 2.721349175929475e-07, + "logits/chosen": 3.9516055583953857, + "logits/rejected": 3.9516055583953857, + "logps/chosen": -172.90792846679688, + "logps/rejected": -172.90792846679688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.463582038879395, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.463582038879395, + "step": 142 + }, + { + "epoch": 0.09867172675521822, + "grad_norm": 12.362342834472656, + "learning_rate": 2.74051360674588e-07, + "logits/chosen": 4.142726421356201, + "logits/rejected": 4.143903732299805, + "logps/chosen": -172.5127410888672, + "logps/rejected": -174.97108459472656, + "loss": 0.6637, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.370565414428711, + "rewards/margins": 0.23278820514678955, + "rewards/rejected": -12.603353500366211, + "step": 143 + }, + { + "epoch": 0.09936173883042954, + "grad_norm": 0.37791627645492554, + "learning_rate": 2.7596780375622845e-07, + "logits/chosen": 4.108306407928467, + "logits/rejected": 4.108306407928467, + "logps/chosen": -170.3270721435547, + "logps/rejected": -170.3270721435547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.426453590393066, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.426453590393066, + "step": 144 + }, + { + "epoch": 0.10005175090564085, + "grad_norm": 0.38743069767951965, + "learning_rate": 2.7788424683786895e-07, + "logits/chosen": 3.9174556732177734, + "logits/rejected": 3.9174556732177734, + "logps/chosen": -183.79766845703125, + "logps/rejected": -183.79766845703125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.584037780761719, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.584038734436035, + "step": 145 + }, + { + "epoch": 0.10074176298085216, + "grad_norm": 0.35825735330581665, + "learning_rate": 2.798006899195094e-07, + "logits/chosen": 3.8532400131225586, + "logits/rejected": 3.8532400131225586, + "logps/chosen": -178.39808654785156, + "logps/rejected": -178.39808654785156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.157971382141113, + "rewards/margins": 0.0, + "rewards/rejected": -13.157971382141113, + "step": 146 + }, + { + "epoch": 0.10143177505606348, + "grad_norm": 0.3770557641983032, + "learning_rate": 2.817171330011499e-07, + "logits/chosen": 3.9615540504455566, + "logits/rejected": 4.159236431121826, + "logps/chosen": -183.93545532226562, + "logps/rejected": -193.70684814453125, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.71642780303955, + "rewards/margins": 0.9920672178268433, + "rewards/rejected": -14.708495140075684, + "step": 147 + }, + { + "epoch": 0.1021217871312748, + "grad_norm": 0.36908024549484253, + "learning_rate": 2.836335760827904e-07, + "logits/chosen": 3.8794937133789062, + "logits/rejected": 4.034075736999512, + "logps/chosen": -170.54373168945312, + "logps/rejected": -182.40283203125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.210285186767578, + "rewards/margins": 1.2062363624572754, + "rewards/rejected": -13.416520118713379, + "step": 148 + }, + { + "epoch": 0.10281179920648612, + "grad_norm": 0.3744489550590515, + "learning_rate": 2.8555001916443083e-07, + "logits/chosen": 4.030801296234131, + "logits/rejected": 4.030801296234131, + "logps/chosen": -173.88526916503906, + "logps/rejected": -173.88526916503906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.485076904296875, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.485076904296875, + "step": 149 + }, + { + "epoch": 0.10350181128169743, + "grad_norm": 2.98359751701355, + "learning_rate": 2.874664622460713e-07, + "logits/chosen": 3.7344536781311035, + "logits/rejected": 3.6788864135742188, + "logps/chosen": -165.18902587890625, + "logps/rejected": -166.9334259033203, + "loss": 0.6551, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.838939666748047, + "rewards/margins": 0.09310007095336914, + "rewards/rejected": -11.932039260864258, + "step": 150 + }, + { + "epoch": 0.10419182335690874, + "grad_norm": 0.3148576021194458, + "learning_rate": 2.8938290532771177e-07, + "logits/chosen": 3.6114211082458496, + "logits/rejected": 3.695362091064453, + "logps/chosen": -151.401123046875, + "logps/rejected": -161.54518127441406, + "loss": 0.6066, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.255488395690918, + "rewards/margins": 0.9734227657318115, + "rewards/rejected": -11.228911399841309, + "step": 151 + }, + { + "epoch": 0.10488183543212007, + "grad_norm": 0.3378719389438629, + "learning_rate": 2.9129934840935227e-07, + "logits/chosen": 3.9887382984161377, + "logits/rejected": 3.9887382984161377, + "logps/chosen": -197.51654052734375, + "logps/rejected": -197.51654052734375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.905176162719727, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.905177116394043, + "step": 152 + }, + { + "epoch": 0.10557184750733138, + "grad_norm": 0.3092643916606903, + "learning_rate": 2.9321579149099277e-07, + "logits/chosen": 3.8921563625335693, + "logits/rejected": 4.15000057220459, + "logps/chosen": -156.20156860351562, + "logps/rejected": -181.23434448242188, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.639278411865234, + "rewards/margins": 2.3862218856811523, + "rewards/rejected": -13.025500297546387, + "step": 153 + }, + { + "epoch": 0.1062618595825427, + "grad_norm": 0.40292245149612427, + "learning_rate": 2.951322345726332e-07, + "logits/chosen": 3.607805013656616, + "logits/rejected": 3.607805013656616, + "logps/chosen": -172.1619873046875, + "logps/rejected": -172.1619873046875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.312145233154297, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.312145233154297, + "step": 154 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 0.3251154124736786, + "learning_rate": 2.970486776542737e-07, + "logits/chosen": 4.004019260406494, + "logits/rejected": 4.004019260406494, + "logps/chosen": -180.81707763671875, + "logps/rejected": -180.81707763671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.326959609985352, + "rewards/margins": 0.0, + "rewards/rejected": -13.326959609985352, + "step": 155 + }, + { + "epoch": 0.10764188373296532, + "grad_norm": 2.745258092880249, + "learning_rate": 2.9896512073591415e-07, + "logits/chosen": 3.7674074172973633, + "logits/rejected": 4.028616905212402, + "logps/chosen": -167.50955200195312, + "logps/rejected": -175.37355041503906, + "loss": 0.5662, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.998801231384277, + "rewards/margins": 0.8862197399139404, + "rewards/rejected": -12.885021209716797, + "step": 156 + }, + { + "epoch": 0.10833189580817665, + "grad_norm": 0.4249623119831085, + "learning_rate": 3.0088156381755465e-07, + "logits/chosen": 3.697268009185791, + "logits/rejected": 3.697268009185791, + "logps/chosen": -154.52560424804688, + "logps/rejected": -154.5255889892578, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.777521133422852, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.777521133422852, + "step": 157 + }, + { + "epoch": 0.10902190788338796, + "grad_norm": 0.6763598322868347, + "learning_rate": 3.027980068991951e-07, + "logits/chosen": 4.159446716308594, + "logits/rejected": 4.257099628448486, + "logps/chosen": -171.65679931640625, + "logps/rejected": -176.16714477539062, + "loss": 0.6107, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.199361801147461, + "rewards/margins": 0.4227789640426636, + "rewards/rejected": -12.622140884399414, + "step": 158 + }, + { + "epoch": 0.10971191995859927, + "grad_norm": 0.40709227323532104, + "learning_rate": 3.047144499808356e-07, + "logits/chosen": 3.9664883613586426, + "logits/rejected": 3.9664883613586426, + "logps/chosen": -179.48587036132812, + "logps/rejected": -179.48587036132812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.308549880981445, + "rewards/margins": 0.0, + "rewards/rejected": -13.308549880981445, + "step": 159 + }, + { + "epoch": 0.11040193203381059, + "grad_norm": 1.1058140993118286, + "learning_rate": 3.066308930624761e-07, + "logits/chosen": 3.7493295669555664, + "logits/rejected": 3.770050048828125, + "logps/chosen": -166.3879852294922, + "logps/rejected": -170.18728637695312, + "loss": 0.6144, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.992879867553711, + "rewards/margins": 0.3412069082260132, + "rewards/rejected": -12.334087371826172, + "step": 160 + }, + { + "epoch": 0.11109194410902191, + "grad_norm": 1.5544601678848267, + "learning_rate": 3.0854733614411653e-07, + "logits/chosen": 4.222209930419922, + "logits/rejected": 4.277759552001953, + "logps/chosen": -168.38677978515625, + "logps/rejected": -174.66180419921875, + "loss": 0.5512, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.90723991394043, + "rewards/margins": 0.5837541818618774, + "rewards/rejected": -12.490994453430176, + "step": 161 + }, + { + "epoch": 0.11178195618423323, + "grad_norm": 0.3513641953468323, + "learning_rate": 3.1046377922575703e-07, + "logits/chosen": 4.049437999725342, + "logits/rejected": 4.049437999725342, + "logps/chosen": -179.13882446289062, + "logps/rejected": -179.13882446289062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.13159465789795, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.13159465789795, + "step": 162 + }, + { + "epoch": 0.11247196825944454, + "grad_norm": 6.411181926727295, + "learning_rate": 3.123802223073975e-07, + "logits/chosen": 3.2119948863983154, + "logits/rejected": 3.664874792098999, + "logps/chosen": -148.40194702148438, + "logps/rejected": -179.13519287109375, + "loss": 0.312, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.25178337097168, + "rewards/margins": 3.030575752258301, + "rewards/rejected": -13.282360076904297, + "step": 163 + }, + { + "epoch": 0.11316198033465585, + "grad_norm": 0.37451115250587463, + "learning_rate": 3.142966653890379e-07, + "logits/chosen": 3.9905383586883545, + "logits/rejected": 3.9905383586883545, + "logps/chosen": -178.638427734375, + "logps/rejected": -178.638427734375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.13221263885498, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -13.132213592529297, + "step": 164 + }, + { + "epoch": 0.11385199240986717, + "grad_norm": 0.3163132071495056, + "learning_rate": 3.162131084706784e-07, + "logits/chosen": 3.7646102905273438, + "logits/rejected": 3.8894424438476562, + "logps/chosen": -172.6793975830078, + "logps/rejected": -181.97012329101562, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.560579299926758, + "rewards/margins": 0.8965723514556885, + "rewards/rejected": -13.457152366638184, + "step": 165 + }, + { + "epoch": 0.1145420044850785, + "grad_norm": 0.3913365304470062, + "learning_rate": 3.181295515523189e-07, + "logits/chosen": 3.6840858459472656, + "logits/rejected": 3.677652359008789, + "logps/chosen": -167.06285095214844, + "logps/rejected": -172.03465270996094, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.040901184082031, + "rewards/margins": 0.5460063219070435, + "rewards/rejected": -12.586908340454102, + "step": 166 + }, + { + "epoch": 0.11523201656028981, + "grad_norm": 0.45868784189224243, + "learning_rate": 3.2004599463395936e-07, + "logits/chosen": 3.9275436401367188, + "logits/rejected": 4.1274847984313965, + "logps/chosen": -174.31405639648438, + "logps/rejected": -183.31597900390625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.575494766235352, + "rewards/margins": 0.872138500213623, + "rewards/rejected": -13.447632789611816, + "step": 167 + }, + { + "epoch": 0.11592202863550112, + "grad_norm": 0.371080607175827, + "learning_rate": 3.2196243771559986e-07, + "logits/chosen": 3.682044744491577, + "logits/rejected": 3.7494735717773438, + "logps/chosen": -156.82041931152344, + "logps/rejected": -164.63046264648438, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.978231430053711, + "rewards/margins": 0.7819192409515381, + "rewards/rejected": -11.760150909423828, + "step": 168 + }, + { + "epoch": 0.11661204071071243, + "grad_norm": 0.35560229420661926, + "learning_rate": 3.238788807972403e-07, + "logits/chosen": 4.260080337524414, + "logits/rejected": 4.359574317932129, + "logps/chosen": -171.90890502929688, + "logps/rejected": -183.3494873046875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.436738967895508, + "rewards/margins": 1.0962904691696167, + "rewards/rejected": -13.533029556274414, + "step": 169 + }, + { + "epoch": 0.11730205278592376, + "grad_norm": 0.39544034004211426, + "learning_rate": 3.257953238788808e-07, + "logits/chosen": 3.471740484237671, + "logits/rejected": 3.7597882747650146, + "logps/chosen": -148.90841674804688, + "logps/rejected": -172.18629455566406, + "loss": 0.4344, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.207141876220703, + "rewards/margins": 2.4486982822418213, + "rewards/rejected": -12.655839920043945, + "step": 170 + }, + { + "epoch": 0.11799206486113507, + "grad_norm": 0.41122880578041077, + "learning_rate": 3.277117669605213e-07, + "logits/chosen": 3.9997153282165527, + "logits/rejected": 4.066824436187744, + "logps/chosen": -177.82354736328125, + "logps/rejected": -184.3211212158203, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.09919261932373, + "rewards/margins": 0.6549749374389648, + "rewards/rejected": -13.754167556762695, + "step": 171 + }, + { + "epoch": 0.11868207693634639, + "grad_norm": 0.4290090799331665, + "learning_rate": 3.2962821004216174e-07, + "logits/chosen": 3.7109012603759766, + "logits/rejected": 3.9260101318359375, + "logps/chosen": -171.31475830078125, + "logps/rejected": -176.91082763671875, + "loss": 0.6084, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.350421905517578, + "rewards/margins": 0.5213929414749146, + "rewards/rejected": -12.871814727783203, + "step": 172 + }, + { + "epoch": 0.1193720890115577, + "grad_norm": 0.9216614365577698, + "learning_rate": 3.3154465312380224e-07, + "logits/chosen": 3.767822265625, + "logits/rejected": 3.8564136028289795, + "logps/chosen": -157.26060485839844, + "logps/rejected": -175.23477172851562, + "loss": 0.5259, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.874814987182617, + "rewards/margins": 1.8047922849655151, + "rewards/rejected": -12.679607391357422, + "step": 173 + }, + { + "epoch": 0.12006210108676901, + "grad_norm": 0.31435221433639526, + "learning_rate": 3.334610962054427e-07, + "logits/chosen": 3.782940626144409, + "logits/rejected": 3.8507461547851562, + "logps/chosen": -162.5492706298828, + "logps/rejected": -175.80941772460938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.59505558013916, + "rewards/margins": 1.2639284133911133, + "rewards/rejected": -12.858983993530273, + "step": 174 + }, + { + "epoch": 0.12075211316198034, + "grad_norm": 0.3893524706363678, + "learning_rate": 3.353775392870832e-07, + "logits/chosen": 3.9717278480529785, + "logits/rejected": 3.9717278480529785, + "logps/chosen": -186.70449829101562, + "logps/rejected": -186.70449829101562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.916532516479492, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.916532516479492, + "step": 175 + }, + { + "epoch": 0.12144212523719165, + "grad_norm": 0.2938712537288666, + "learning_rate": 3.372939823687237e-07, + "logits/chosen": 3.6893539428710938, + "logits/rejected": 3.804659605026245, + "logps/chosen": -166.64410400390625, + "logps/rejected": -186.760498046875, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.92405891418457, + "rewards/margins": 1.9847923517227173, + "rewards/rejected": -13.90885066986084, + "step": 176 + }, + { + "epoch": 0.12213213731240297, + "grad_norm": 0.35326895117759705, + "learning_rate": 3.392104254503641e-07, + "logits/chosen": 4.049084186553955, + "logits/rejected": 4.233971118927002, + "logps/chosen": -186.3397674560547, + "logps/rejected": -195.2509765625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.81511402130127, + "rewards/margins": 0.9339369535446167, + "rewards/rejected": -14.74905014038086, + "step": 177 + }, + { + "epoch": 0.12282214938761428, + "grad_norm": 0.31750616431236267, + "learning_rate": 3.411268685320046e-07, + "logits/chosen": 3.640845775604248, + "logits/rejected": 3.7622017860412598, + "logps/chosen": -159.34017944335938, + "logps/rejected": -175.15921020507812, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.372411727905273, + "rewards/margins": 1.5497708320617676, + "rewards/rejected": -12.9221830368042, + "step": 178 + }, + { + "epoch": 0.12351216146282559, + "grad_norm": 0.37109455466270447, + "learning_rate": 3.4304331161364506e-07, + "logits/chosen": 4.020460605621338, + "logits/rejected": 4.0988264083862305, + "logps/chosen": -171.41238403320312, + "logps/rejected": -179.92221069335938, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.315720558166504, + "rewards/margins": 0.8161699771881104, + "rewards/rejected": -13.131891250610352, + "step": 179 + }, + { + "epoch": 0.12420217353803692, + "grad_norm": 57.44184494018555, + "learning_rate": 3.4495975469528556e-07, + "logits/chosen": 3.578404426574707, + "logits/rejected": 3.791811466217041, + "logps/chosen": -160.73309326171875, + "logps/rejected": -167.12742614746094, + "loss": 0.9228, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.330400466918945, + "rewards/margins": 0.633480429649353, + "rewards/rejected": -11.96388053894043, + "step": 180 + }, + { + "epoch": 0.12489218561324823, + "grad_norm": 0.38865309953689575, + "learning_rate": 3.4687619777692606e-07, + "logits/chosen": 3.6209330558776855, + "logits/rejected": 3.6281323432922363, + "logps/chosen": -156.46817016601562, + "logps/rejected": -162.37570190429688, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.990018844604492, + "rewards/margins": 0.6045101284980774, + "rewards/rejected": -11.594528198242188, + "step": 181 + }, + { + "epoch": 0.12558219768845955, + "grad_norm": 0.39334598183631897, + "learning_rate": 3.487926408585665e-07, + "logits/chosen": 4.055758476257324, + "logits/rejected": 4.055758476257324, + "logps/chosen": -176.40878295898438, + "logps/rejected": -176.40878295898438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.849325180053711, + "rewards/margins": 0.0, + "rewards/rejected": -12.849325180053711, + "step": 182 + }, + { + "epoch": 0.12627220976367087, + "grad_norm": 0.3021175265312195, + "learning_rate": 3.50709083940207e-07, + "logits/chosen": 4.132734298706055, + "logits/rejected": 4.132734298706055, + "logps/chosen": -194.5938262939453, + "logps/rejected": -194.5938262939453, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.449888229370117, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.449888229370117, + "step": 183 + }, + { + "epoch": 0.12696222183888217, + "grad_norm": 0.5258086323738098, + "learning_rate": 3.5262552702184744e-07, + "logits/chosen": 3.7804768085479736, + "logits/rejected": 4.111393928527832, + "logps/chosen": -164.0008544921875, + "logps/rejected": -176.53164672851562, + "loss": 0.5227, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.60481071472168, + "rewards/margins": 1.239156723022461, + "rewards/rejected": -12.843968391418457, + "step": 184 + }, + { + "epoch": 0.1276522339140935, + "grad_norm": 0.34750378131866455, + "learning_rate": 3.5454197010348794e-07, + "logits/chosen": 3.7538013458251953, + "logits/rejected": 4.098012924194336, + "logps/chosen": -150.59544372558594, + "logps/rejected": -185.02809143066406, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.201292991638184, + "rewards/margins": 3.449481248855591, + "rewards/rejected": -13.650773048400879, + "step": 185 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 15.012555122375488, + "learning_rate": 3.5645841318512844e-07, + "logits/chosen": 4.004265785217285, + "logits/rejected": 3.93485164642334, + "logps/chosen": -166.66799926757812, + "logps/rejected": -164.12313842773438, + "loss": 0.95, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.280284881591797, + "rewards/margins": -0.335219144821167, + "rewards/rejected": -11.945066452026367, + "step": 186 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 11.724466323852539, + "learning_rate": 3.583748562667689e-07, + "logits/chosen": 3.681802988052368, + "logits/rejected": 3.819581985473633, + "logps/chosen": -168.6154327392578, + "logps/rejected": -180.38050842285156, + "loss": 0.5877, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.137328147888184, + "rewards/margins": 1.2548415660858154, + "rewards/rejected": -13.392169952392578, + "step": 187 + }, + { + "epoch": 0.12972227013972745, + "grad_norm": 0.39948517084121704, + "learning_rate": 3.602912993484094e-07, + "logits/chosen": 3.7473440170288086, + "logits/rejected": 3.820068836212158, + "logps/chosen": -161.77517700195312, + "logps/rejected": -180.16238403320312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.667622566223145, + "rewards/margins": 1.6265997886657715, + "rewards/rejected": -13.294222831726074, + "step": 188 + }, + { + "epoch": 0.13041228221493875, + "grad_norm": 0.3435823917388916, + "learning_rate": 3.622077424300498e-07, + "logits/chosen": 3.827962875366211, + "logits/rejected": 3.977895736694336, + "logps/chosen": -174.73316955566406, + "logps/rejected": -186.63430786132812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.659975051879883, + "rewards/margins": 1.1089718341827393, + "rewards/rejected": -13.76894760131836, + "step": 189 + }, + { + "epoch": 0.13110229429015008, + "grad_norm": 19.788095474243164, + "learning_rate": 3.641241855116903e-07, + "logits/chosen": 3.456777334213257, + "logits/rejected": 3.5467677116394043, + "logps/chosen": -158.7410125732422, + "logps/rejected": -166.88107299804688, + "loss": 0.8268, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.055269241333008, + "rewards/margins": 0.8026773929595947, + "rewards/rejected": -11.857946395874023, + "step": 190 + }, + { + "epoch": 0.1317923063653614, + "grad_norm": 0.4482075870037079, + "learning_rate": 3.660406285933308e-07, + "logits/chosen": 3.7465248107910156, + "logits/rejected": 3.7740345001220703, + "logps/chosen": -178.4246826171875, + "logps/rejected": -184.08074951171875, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.077027320861816, + "rewards/margins": 0.6050982475280762, + "rewards/rejected": -13.68212604522705, + "step": 191 + }, + { + "epoch": 0.1324823184405727, + "grad_norm": 0.37577036023139954, + "learning_rate": 3.6795707167497126e-07, + "logits/chosen": 4.115001201629639, + "logits/rejected": 4.115001201629639, + "logps/chosen": -188.80287170410156, + "logps/rejected": -188.80287170410156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.179994583129883, + "rewards/margins": 0.0, + "rewards/rejected": -14.179994583129883, + "step": 192 + }, + { + "epoch": 0.13317233051578403, + "grad_norm": 0.3138216733932495, + "learning_rate": 3.6987351475661176e-07, + "logits/chosen": 3.9965338706970215, + "logits/rejected": 3.9965338706970215, + "logps/chosen": -188.5076446533203, + "logps/rejected": -188.5076446533203, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.99299430847168, + "rewards/margins": 0.0, + "rewards/rejected": -13.99299430847168, + "step": 193 + }, + { + "epoch": 0.13386234259099533, + "grad_norm": 0.3035406470298767, + "learning_rate": 3.717899578382522e-07, + "logits/chosen": 3.9509754180908203, + "logits/rejected": 3.9509754180908203, + "logps/chosen": -190.93032836914062, + "logps/rejected": -190.93032836914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.185712814331055, + "rewards/margins": 0.0, + "rewards/rejected": -14.185712814331055, + "step": 194 + }, + { + "epoch": 0.13455235466620666, + "grad_norm": 0.47172316908836365, + "learning_rate": 3.737064009198927e-07, + "logits/chosen": 3.782028913497925, + "logits/rejected": 3.782028913497925, + "logps/chosen": -171.89053344726562, + "logps/rejected": -171.89053344726562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.270442962646484, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.270442962646484, + "step": 195 + }, + { + "epoch": 0.13524236674141799, + "grad_norm": 0.25725486874580383, + "learning_rate": 3.756228440015332e-07, + "logits/chosen": 3.5578813552856445, + "logits/rejected": 3.770456314086914, + "logps/chosen": -155.33682250976562, + "logps/rejected": -173.62054443359375, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.541692733764648, + "rewards/margins": 1.8745505809783936, + "rewards/rejected": -12.416242599487305, + "step": 196 + }, + { + "epoch": 0.13593237881662928, + "grad_norm": 0.4060191512107849, + "learning_rate": 3.7753928708317364e-07, + "logits/chosen": 4.104308605194092, + "logits/rejected": 4.104308605194092, + "logps/chosen": -174.49224853515625, + "logps/rejected": -174.4922332763672, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.677001953125, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.677001953125, + "step": 197 + }, + { + "epoch": 0.1366223908918406, + "grad_norm": 0.3245529234409332, + "learning_rate": 3.7945573016481414e-07, + "logits/chosen": 3.999969959259033, + "logits/rejected": 4.058833122253418, + "logps/chosen": -174.33837890625, + "logps/rejected": -188.73904418945312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.582172393798828, + "rewards/margins": 1.4057128429412842, + "rewards/rejected": -13.987885475158691, + "step": 198 + }, + { + "epoch": 0.1373124029670519, + "grad_norm": 0.4004806578159332, + "learning_rate": 3.813721732464546e-07, + "logits/chosen": 3.9494552612304688, + "logits/rejected": 4.02940034866333, + "logps/chosen": -162.8669891357422, + "logps/rejected": -175.78271484375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.504061698913574, + "rewards/margins": 1.2362529039382935, + "rewards/rejected": -12.740314483642578, + "step": 199 + }, + { + "epoch": 0.13800241504226324, + "grad_norm": 1.625029444694519, + "learning_rate": 3.832886163280951e-07, + "logits/chosen": 3.6351606845855713, + "logits/rejected": 3.7606236934661865, + "logps/chosen": -163.65419006347656, + "logps/rejected": -174.81602478027344, + "loss": 0.5562, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.595407485961914, + "rewards/margins": 1.1101423501968384, + "rewards/rejected": -12.705549240112305, + "step": 200 + }, + { + "epoch": 0.13869242711747456, + "grad_norm": 0.3972361087799072, + "learning_rate": 3.852050594097356e-07, + "logits/chosen": 3.865206241607666, + "logits/rejected": 3.865206241607666, + "logps/chosen": -171.3946533203125, + "logps/rejected": -171.3946533203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.33617115020752, + "rewards/margins": 0.0, + "rewards/rejected": -12.33617115020752, + "step": 201 + }, + { + "epoch": 0.13938243919268586, + "grad_norm": 0.24798229336738586, + "learning_rate": 3.87121502491376e-07, + "logits/chosen": 3.638762950897217, + "logits/rejected": 3.762960433959961, + "logps/chosen": -168.97230529785156, + "logps/rejected": -190.115966796875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.052746772766113, + "rewards/margins": 2.0970752239227295, + "rewards/rejected": -14.149821281433105, + "step": 202 + }, + { + "epoch": 0.1400724512678972, + "grad_norm": 0.37775716185569763, + "learning_rate": 3.890379455730165e-07, + "logits/chosen": 3.907238006591797, + "logits/rejected": 3.907238006591797, + "logps/chosen": -183.814453125, + "logps/rejected": -183.814453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.61143684387207, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.61143684387207, + "step": 203 + }, + { + "epoch": 0.14076246334310852, + "grad_norm": 13.705789566040039, + "learning_rate": 3.9095438865465697e-07, + "logits/chosen": 4.210761547088623, + "logits/rejected": 4.184969425201416, + "logps/chosen": -185.80294799804688, + "logps/rejected": -182.19979858398438, + "loss": 1.0236, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.729080200195312, + "rewards/margins": -0.412614643573761, + "rewards/rejected": -13.316465377807617, + "step": 204 + }, + { + "epoch": 0.14145247541831982, + "grad_norm": 0.38210394978523254, + "learning_rate": 3.9287083173629747e-07, + "logits/chosen": 4.03826379776001, + "logits/rejected": 4.03826379776001, + "logps/chosen": -181.52291870117188, + "logps/rejected": -181.52291870117188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.274534225463867, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -13.27453327178955, + "step": 205 + }, + { + "epoch": 0.14214248749353114, + "grad_norm": 0.32654261589050293, + "learning_rate": 3.9478727481793796e-07, + "logits/chosen": 3.5850443840026855, + "logits/rejected": 3.6851940155029297, + "logps/chosen": -159.824462890625, + "logps/rejected": -170.68783569335938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.961864471435547, + "rewards/margins": 1.1234824657440186, + "rewards/rejected": -12.085346221923828, + "step": 206 + }, + { + "epoch": 0.14283249956874244, + "grad_norm": 0.4209830164909363, + "learning_rate": 3.967037178995784e-07, + "logits/chosen": 4.029435157775879, + "logits/rejected": 4.029435157775879, + "logps/chosen": -180.982177734375, + "logps/rejected": -180.982177734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.384469985961914, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.384469985961914, + "step": 207 + }, + { + "epoch": 0.14352251164395377, + "grad_norm": 11.078187942504883, + "learning_rate": 3.986201609812189e-07, + "logits/chosen": 3.899111270904541, + "logits/rejected": 3.8586788177490234, + "logps/chosen": -167.35055541992188, + "logps/rejected": -172.44302368164062, + "loss": 0.6668, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.983070373535156, + "rewards/margins": 0.5739836692810059, + "rewards/rejected": -12.557053565979004, + "step": 208 + }, + { + "epoch": 0.1442125237191651, + "grad_norm": 0.32792553305625916, + "learning_rate": 4.0053660406285935e-07, + "logits/chosen": 4.32331657409668, + "logits/rejected": 4.32331657409668, + "logps/chosen": -188.29779052734375, + "logps/rejected": -188.29779052734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.028562545776367, + "rewards/margins": 0.0, + "rewards/rejected": -14.028562545776367, + "step": 209 + }, + { + "epoch": 0.1449025357943764, + "grad_norm": 1.640384554862976, + "learning_rate": 4.0245304714449985e-07, + "logits/chosen": 3.962019205093384, + "logits/rejected": 4.089582443237305, + "logps/chosen": -169.8790740966797, + "logps/rejected": -179.33676147460938, + "loss": 0.5266, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.243635177612305, + "rewards/margins": 0.9503495693206787, + "rewards/rejected": -13.193984985351562, + "step": 210 + }, + { + "epoch": 0.14559254786958772, + "grad_norm": 0.3096751868724823, + "learning_rate": 4.0436949022614034e-07, + "logits/chosen": 3.8546640872955322, + "logits/rejected": 4.004922389984131, + "logps/chosen": -157.51316833496094, + "logps/rejected": -173.737548828125, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.079120635986328, + "rewards/margins": 1.6247223615646362, + "rewards/rejected": -12.70384407043457, + "step": 211 + }, + { + "epoch": 0.14628255994479902, + "grad_norm": 0.3551698923110962, + "learning_rate": 4.062859333077808e-07, + "logits/chosen": 3.6580147743225098, + "logits/rejected": 3.7712059020996094, + "logps/chosen": -173.87258911132812, + "logps/rejected": -193.86358642578125, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.490012168884277, + "rewards/margins": 2.07161808013916, + "rewards/rejected": -14.561630249023438, + "step": 212 + }, + { + "epoch": 0.14697257202001035, + "grad_norm": 24.50393295288086, + "learning_rate": 4.082023763894213e-07, + "logits/chosen": 3.8087635040283203, + "logits/rejected": 3.9053187370300293, + "logps/chosen": -173.56961059570312, + "logps/rejected": -175.1015167236328, + "loss": 1.0742, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.661375045776367, + "rewards/margins": 0.1113249659538269, + "rewards/rejected": -12.772701263427734, + "step": 213 + }, + { + "epoch": 0.14766258409522168, + "grad_norm": 2.407498598098755, + "learning_rate": 4.1011881947106173e-07, + "logits/chosen": 3.6396255493164062, + "logits/rejected": 3.8734652996063232, + "logps/chosen": -156.7728729248047, + "logps/rejected": -167.271484375, + "loss": 0.5428, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.824806213378906, + "rewards/margins": 1.0762614011764526, + "rewards/rejected": -11.901067733764648, + "step": 214 + }, + { + "epoch": 0.14835259617043298, + "grad_norm": 7.593843460083008, + "learning_rate": 4.1203526255270223e-07, + "logits/chosen": 3.578061580657959, + "logits/rejected": 3.790421962738037, + "logps/chosen": -163.98239135742188, + "logps/rejected": -175.88294982910156, + "loss": 0.5695, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.735363006591797, + "rewards/margins": 1.168565273284912, + "rewards/rejected": -12.90392780303955, + "step": 215 + }, + { + "epoch": 0.1490426082456443, + "grad_norm": 0.4500364363193512, + "learning_rate": 4.139517056343427e-07, + "logits/chosen": 4.108406066894531, + "logits/rejected": 4.108406066894531, + "logps/chosen": -181.6544647216797, + "logps/rejected": -181.6544647216797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.2437105178833, + "rewards/margins": 0.0, + "rewards/rejected": -13.2437105178833, + "step": 216 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 0.3471899926662445, + "learning_rate": 4.1586814871598317e-07, + "logits/chosen": 3.9735469818115234, + "logits/rejected": 4.047122955322266, + "logps/chosen": -184.9503631591797, + "logps/rejected": -194.93011474609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.542062759399414, + "rewards/margins": 1.0930527448654175, + "rewards/rejected": -14.635114669799805, + "step": 217 + }, + { + "epoch": 0.15042263239606693, + "grad_norm": 0.3437124788761139, + "learning_rate": 4.1778459179762367e-07, + "logits/chosen": 3.629568099975586, + "logits/rejected": 3.629568099975586, + "logps/chosen": -164.71466064453125, + "logps/rejected": -164.71466064453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.650861740112305, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.650861740112305, + "step": 218 + }, + { + "epoch": 0.15111264447127826, + "grad_norm": 0.47724008560180664, + "learning_rate": 4.197010348792641e-07, + "logits/chosen": 3.6878747940063477, + "logits/rejected": 3.6878747940063477, + "logps/chosen": -169.20132446289062, + "logps/rejected": -169.20132446289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.191110610961914, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.191110610961914, + "step": 219 + }, + { + "epoch": 0.15180265654648956, + "grad_norm": 0.31058618426322937, + "learning_rate": 4.216174779609046e-07, + "logits/chosen": 3.4311203956604004, + "logits/rejected": 3.548222541809082, + "logps/chosen": -174.1872100830078, + "logps/rejected": -186.0226593017578, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.527786254882812, + "rewards/margins": 1.142269253730774, + "rewards/rejected": -13.670055389404297, + "step": 220 + }, + { + "epoch": 0.15249266862170088, + "grad_norm": 0.33576154708862305, + "learning_rate": 4.235339210425451e-07, + "logits/chosen": 4.148375511169434, + "logits/rejected": 4.148375511169434, + "logps/chosen": -201.1121063232422, + "logps/rejected": -201.1121063232422, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -15.217966079711914, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -15.217966079711914, + "step": 221 + }, + { + "epoch": 0.15318268069691218, + "grad_norm": 0.37310591340065, + "learning_rate": 4.2545036412418555e-07, + "logits/chosen": 3.8143064975738525, + "logits/rejected": 3.8143064975738525, + "logps/chosen": -153.8453369140625, + "logps/rejected": -153.8453369140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.666213035583496, + "rewards/margins": 0.0, + "rewards/rejected": -10.666213035583496, + "step": 222 + }, + { + "epoch": 0.1538726927721235, + "grad_norm": 0.4323681890964508, + "learning_rate": 4.2736680720582605e-07, + "logits/chosen": 3.8723936080932617, + "logits/rejected": 3.8723936080932617, + "logps/chosen": -195.12794494628906, + "logps/rejected": -195.12796020507812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.774356842041016, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.774356842041016, + "step": 223 + }, + { + "epoch": 0.15456270484733484, + "grad_norm": 0.45105648040771484, + "learning_rate": 4.292832502874665e-07, + "logits/chosen": 3.6430156230926514, + "logits/rejected": 3.737945079803467, + "logps/chosen": -162.0599365234375, + "logps/rejected": -172.77053833007812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.660882949829102, + "rewards/margins": 1.0755990743637085, + "rewards/rejected": -12.736481666564941, + "step": 224 + }, + { + "epoch": 0.15525271692254614, + "grad_norm": 0.3278805613517761, + "learning_rate": 4.31199693369107e-07, + "logits/chosen": 3.9931392669677734, + "logits/rejected": 4.11300802230835, + "logps/chosen": -161.8638153076172, + "logps/rejected": -173.67601013183594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.52792739868164, + "rewards/margins": 1.1373566389083862, + "rewards/rejected": -12.665283203125, + "step": 225 + }, + { + "epoch": 0.15594272899775746, + "grad_norm": 0.4640181362628937, + "learning_rate": 4.3311613645074743e-07, + "logits/chosen": 4.019922256469727, + "logits/rejected": 4.05990743637085, + "logps/chosen": -174.08628845214844, + "logps/rejected": -186.8079376220703, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.628702163696289, + "rewards/margins": 1.2097824811935425, + "rewards/rejected": -13.838485717773438, + "step": 226 + }, + { + "epoch": 0.1566327410729688, + "grad_norm": 5.576011657714844, + "learning_rate": 4.3503257953238793e-07, + "logits/chosen": 3.4939842224121094, + "logits/rejected": 3.9145591259002686, + "logps/chosen": -144.07864379882812, + "logps/rejected": -176.06011962890625, + "loss": 0.3029, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.732322692871094, + "rewards/margins": 3.164653778076172, + "rewards/rejected": -12.896976470947266, + "step": 227 + }, + { + "epoch": 0.1573227531481801, + "grad_norm": 0.2693938612937927, + "learning_rate": 4.3694902261402843e-07, + "logits/chosen": 3.3695931434631348, + "logits/rejected": 3.8048229217529297, + "logps/chosen": -164.83599853515625, + "logps/rejected": -195.4136962890625, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.750131607055664, + "rewards/margins": 3.081641674041748, + "rewards/rejected": -14.831771850585938, + "step": 228 + }, + { + "epoch": 0.15801276522339142, + "grad_norm": 1.3909368515014648, + "learning_rate": 4.3886546569566887e-07, + "logits/chosen": 3.7936644554138184, + "logits/rejected": 3.7803897857666016, + "logps/chosen": -169.6132354736328, + "logps/rejected": -172.07327270507812, + "loss": 0.6254, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.267406463623047, + "rewards/margins": 0.22660213708877563, + "rewards/rejected": -12.49400806427002, + "step": 229 + }, + { + "epoch": 0.15870277729860272, + "grad_norm": 3.082085609436035, + "learning_rate": 4.4078190877730937e-07, + "logits/chosen": 4.003514289855957, + "logits/rejected": 4.060214042663574, + "logps/chosen": -173.6118621826172, + "logps/rejected": -176.07861328125, + "loss": 0.6241, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.636665344238281, + "rewards/margins": 0.23644793033599854, + "rewards/rejected": -12.873113632202148, + "step": 230 + }, + { + "epoch": 0.15939278937381404, + "grad_norm": 23.08545684814453, + "learning_rate": 4.426983518589498e-07, + "logits/chosen": 3.965087890625, + "logits/rejected": 3.9080796241760254, + "logps/chosen": -161.93685913085938, + "logps/rejected": -164.739990234375, + "loss": 1.2083, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.341156005859375, + "rewards/margins": 0.27297067642211914, + "rewards/rejected": -11.614126205444336, + "step": 231 + }, + { + "epoch": 0.16008280144902537, + "grad_norm": 0.26151418685913086, + "learning_rate": 4.446147949405903e-07, + "logits/chosen": 4.005407810211182, + "logits/rejected": 4.168929100036621, + "logps/chosen": -178.87429809570312, + "logps/rejected": -191.26498413085938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.103803634643555, + "rewards/margins": 1.2462952136993408, + "rewards/rejected": -14.350099563598633, + "step": 232 + }, + { + "epoch": 0.16077281352423667, + "grad_norm": 1.5348246097564697, + "learning_rate": 4.465312380222308e-07, + "logits/chosen": 4.239727973937988, + "logits/rejected": 4.195718765258789, + "logps/chosen": -173.10595703125, + "logps/rejected": -182.4237060546875, + "loss": 0.529, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.548380851745605, + "rewards/margins": 1.017569661140442, + "rewards/rejected": -13.565950393676758, + "step": 233 + }, + { + "epoch": 0.161462825599448, + "grad_norm": 17.099361419677734, + "learning_rate": 4.4844768110387125e-07, + "logits/chosen": 4.073119163513184, + "logits/rejected": 4.096502304077148, + "logps/chosen": -179.18161010742188, + "logps/rejected": -193.34597778320312, + "loss": 1.0694, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.06879997253418, + "rewards/margins": 1.572197437286377, + "rewards/rejected": -14.640996932983398, + "step": 234 + }, + { + "epoch": 0.1621528376746593, + "grad_norm": 0.3451100289821625, + "learning_rate": 4.5036412418551175e-07, + "logits/chosen": 3.87176513671875, + "logits/rejected": 3.9879746437072754, + "logps/chosen": -177.3440399169922, + "logps/rejected": -184.4565887451172, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.854764938354492, + "rewards/margins": 0.7844562530517578, + "rewards/rejected": -13.63922119140625, + "step": 235 + }, + { + "epoch": 0.16284284974987062, + "grad_norm": 0.4657173752784729, + "learning_rate": 4.522805672671522e-07, + "logits/chosen": 4.188790321350098, + "logits/rejected": 4.2131242752075195, + "logps/chosen": -176.282470703125, + "logps/rejected": -180.83599853515625, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.796001434326172, + "rewards/margins": 0.5450886487960815, + "rewards/rejected": -13.341090202331543, + "step": 236 + }, + { + "epoch": 0.16353286182508195, + "grad_norm": 0.40683072805404663, + "learning_rate": 4.541970103487927e-07, + "logits/chosen": 3.7852180004119873, + "logits/rejected": 4.09987211227417, + "logps/chosen": -164.02984619140625, + "logps/rejected": -184.89605712890625, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.687934875488281, + "rewards/margins": 2.1435279846191406, + "rewards/rejected": -13.831461906433105, + "step": 237 + }, + { + "epoch": 0.16422287390029325, + "grad_norm": 11.249533653259277, + "learning_rate": 4.561134534304332e-07, + "logits/chosen": 3.8084371089935303, + "logits/rejected": 3.7834808826446533, + "logps/chosen": -171.76516723632812, + "logps/rejected": -171.06259155273438, + "loss": 0.6962, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.444649696350098, + "rewards/margins": -0.006093025207519531, + "rewards/rejected": -12.438557624816895, + "step": 238 + }, + { + "epoch": 0.16491288597550458, + "grad_norm": 1.0879862308502197, + "learning_rate": 4.5802989651207364e-07, + "logits/chosen": 3.825571060180664, + "logits/rejected": 3.930972099304199, + "logps/chosen": -162.90475463867188, + "logps/rejected": -181.27392578125, + "loss": 0.525, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.406722068786621, + "rewards/margins": 1.7498003244400024, + "rewards/rejected": -13.156522750854492, + "step": 239 + }, + { + "epoch": 0.16560289805071587, + "grad_norm": 0.40778693556785583, + "learning_rate": 4.5994633959371413e-07, + "logits/chosen": 3.946753740310669, + "logits/rejected": 3.946753740310669, + "logps/chosen": -179.51625061035156, + "logps/rejected": -179.5162353515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.171734809875488, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.171733856201172, + "step": 240 + }, + { + "epoch": 0.1662929101259272, + "grad_norm": 0.3038535714149475, + "learning_rate": 4.618627826753546e-07, + "logits/chosen": 3.7266383171081543, + "logits/rejected": 3.93277645111084, + "logps/chosen": -147.66796875, + "logps/rejected": -173.8125, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.952195167541504, + "rewards/margins": 2.6958601474761963, + "rewards/rejected": -12.648055076599121, + "step": 241 + }, + { + "epoch": 0.16698292220113853, + "grad_norm": 0.38052791357040405, + "learning_rate": 4.637792257569951e-07, + "logits/chosen": 4.050795555114746, + "logits/rejected": 4.050795555114746, + "logps/chosen": -166.29336547851562, + "logps/rejected": -166.29336547851562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.768083572387695, + "rewards/margins": 0.0, + "rewards/rejected": -11.768083572387695, + "step": 242 + }, + { + "epoch": 0.16767293427634983, + "grad_norm": 0.4285120964050293, + "learning_rate": 4.6569566883863557e-07, + "logits/chosen": 4.158690929412842, + "logits/rejected": 4.158690929412842, + "logps/chosen": -189.4462432861328, + "logps/rejected": -189.4462432861328, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.135557174682617, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -14.135557174682617, + "step": 243 + }, + { + "epoch": 0.16836294635156115, + "grad_norm": 0.37588346004486084, + "learning_rate": 4.67612111920276e-07, + "logits/chosen": 3.74542236328125, + "logits/rejected": 3.74542236328125, + "logps/chosen": -163.845458984375, + "logps/rejected": -163.845458984375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.530536651611328, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -11.530536651611328, + "step": 244 + }, + { + "epoch": 0.16905295842677248, + "grad_norm": 20.06930160522461, + "learning_rate": 4.695285550019165e-07, + "logits/chosen": 3.6760077476501465, + "logits/rejected": 3.754271984100342, + "logps/chosen": -156.31130981445312, + "logps/rejected": -155.42874145507812, + "loss": 0.756, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.870647430419922, + "rewards/margins": -0.10441362857818604, + "rewards/rejected": -10.766233444213867, + "step": 245 + }, + { + "epoch": 0.16974297050198378, + "grad_norm": 0.2850953936576843, + "learning_rate": 4.7144499808355696e-07, + "logits/chosen": 3.7489376068115234, + "logits/rejected": 3.886322498321533, + "logps/chosen": -175.46519470214844, + "logps/rejected": -185.06202697753906, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.850099563598633, + "rewards/margins": 0.9715667366981506, + "rewards/rejected": -13.82166576385498, + "step": 246 + }, + { + "epoch": 0.1704329825771951, + "grad_norm": 0.4798845648765564, + "learning_rate": 4.7336144116519746e-07, + "logits/chosen": 3.7500174045562744, + "logits/rejected": 3.7500174045562744, + "logps/chosen": -168.72657775878906, + "logps/rejected": -168.72657775878906, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.215822219848633, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.215822219848633, + "step": 247 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 0.30977389216423035, + "learning_rate": 4.7527788424683795e-07, + "logits/chosen": 3.6225857734680176, + "logits/rejected": 3.816579580307007, + "logps/chosen": -168.85731506347656, + "logps/rejected": -177.89999389648438, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.1965913772583, + "rewards/margins": 0.8544291853904724, + "rewards/rejected": -13.051021575927734, + "step": 248 + }, + { + "epoch": 0.17181300672761773, + "grad_norm": 0.3818824589252472, + "learning_rate": 4.771943273284785e-07, + "logits/chosen": 4.075075149536133, + "logits/rejected": 4.075075149536133, + "logps/chosen": -178.5846405029297, + "logps/rejected": -178.5846405029297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.090214729309082, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.090214729309082, + "step": 249 + }, + { + "epoch": 0.17250301880282906, + "grad_norm": 0.3252604305744171, + "learning_rate": 4.791107704101188e-07, + "logits/chosen": 4.003413677215576, + "logits/rejected": 4.055771350860596, + "logps/chosen": -173.60943603515625, + "logps/rejected": -183.70040893554688, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.717453956604004, + "rewards/margins": 1.0512497425079346, + "rewards/rejected": -13.76870346069336, + "step": 250 + }, + { + "epoch": 0.17319303087804036, + "grad_norm": 0.39795413613319397, + "learning_rate": 4.810272134917593e-07, + "logits/chosen": 3.6547675132751465, + "logits/rejected": 3.8038506507873535, + "logps/chosen": -178.74913024902344, + "logps/rejected": -185.06788635253906, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.102039337158203, + "rewards/margins": 0.621198296546936, + "rewards/rejected": -13.723237037658691, + "step": 251 + }, + { + "epoch": 0.1738830429532517, + "grad_norm": 0.34394773840904236, + "learning_rate": 4.829436565733998e-07, + "logits/chosen": 4.125033378601074, + "logits/rejected": 4.2549567222595215, + "logps/chosen": -178.937744140625, + "logps/rejected": -188.31663513183594, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.943517684936523, + "rewards/margins": 0.9020028114318848, + "rewards/rejected": -13.845520973205566, + "step": 252 + }, + { + "epoch": 0.174573055028463, + "grad_norm": 0.3770841062068939, + "learning_rate": 4.848600996550403e-07, + "logits/chosen": 3.961052894592285, + "logits/rejected": 3.961052894592285, + "logps/chosen": -192.80545043945312, + "logps/rejected": -192.80545043945312, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.57866382598877, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -14.578664779663086, + "step": 253 + }, + { + "epoch": 0.1752630671036743, + "grad_norm": 0.330710768699646, + "learning_rate": 4.867765427366808e-07, + "logits/chosen": 3.8245294094085693, + "logits/rejected": 4.012722492218018, + "logps/chosen": -173.51898193359375, + "logps/rejected": -191.30227661132812, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.410252571105957, + "rewards/margins": 1.814591884613037, + "rewards/rejected": -14.224844932556152, + "step": 254 + }, + { + "epoch": 0.17595307917888564, + "grad_norm": 11.215513229370117, + "learning_rate": 4.886929858183212e-07, + "logits/chosen": 3.813009262084961, + "logits/rejected": 3.8041932582855225, + "logps/chosen": -153.5599365234375, + "logps/rejected": -170.1527557373047, + "loss": 1.1772, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.616848945617676, + "rewards/margins": 1.5285732746124268, + "rewards/rejected": -12.145421981811523, + "step": 255 + }, + { + "epoch": 0.17664309125409694, + "grad_norm": 0.41319867968559265, + "learning_rate": 4.906094288999617e-07, + "logits/chosen": 3.835491180419922, + "logits/rejected": 3.9137463569641113, + "logps/chosen": -165.71511840820312, + "logps/rejected": -174.70643615722656, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.78524112701416, + "rewards/margins": 0.8895859718322754, + "rewards/rejected": -12.674827575683594, + "step": 256 + }, + { + "epoch": 0.17733310332930827, + "grad_norm": 0.5400515198707581, + "learning_rate": 4.925258719816022e-07, + "logits/chosen": 3.6585586071014404, + "logits/rejected": 3.6585586071014404, + "logps/chosen": -171.7253875732422, + "logps/rejected": -171.72540283203125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.326761245727539, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.326763153076172, + "step": 257 + }, + { + "epoch": 0.17802311540451957, + "grad_norm": 0.3943483829498291, + "learning_rate": 4.944423150632427e-07, + "logits/chosen": 3.591732978820801, + "logits/rejected": 3.618314266204834, + "logps/chosen": -174.22500610351562, + "logps/rejected": -181.46063232421875, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.507545471191406, + "rewards/margins": 0.7925291061401367, + "rewards/rejected": -13.300073623657227, + "step": 258 + }, + { + "epoch": 0.1787131274797309, + "grad_norm": 0.41925889253616333, + "learning_rate": 4.963587581448832e-07, + "logits/chosen": 3.5197677612304688, + "logits/rejected": 3.5651655197143555, + "logps/chosen": -178.33697509765625, + "logps/rejected": -186.71456909179688, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.017461776733398, + "rewards/margins": 0.8285014629364014, + "rewards/rejected": -13.845963478088379, + "step": 259 + }, + { + "epoch": 0.17940313955494222, + "grad_norm": 0.28740522265434265, + "learning_rate": 4.982752012265236e-07, + "logits/chosen": 3.8972673416137695, + "logits/rejected": 4.150703430175781, + "logps/chosen": -166.97705078125, + "logps/rejected": -187.56744384765625, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.75822925567627, + "rewards/margins": 2.128561019897461, + "rewards/rejected": -13.88679027557373, + "step": 260 + }, + { + "epoch": 0.18009315163015352, + "grad_norm": 0.30473747849464417, + "learning_rate": 5.001916443081641e-07, + "logits/chosen": 4.169797897338867, + "logits/rejected": 4.273223400115967, + "logps/chosen": -188.26235961914062, + "logps/rejected": -198.4112548828125, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.069221496582031, + "rewards/margins": 1.0346605777740479, + "rewards/rejected": -15.1038818359375, + "step": 261 + }, + { + "epoch": 0.18078316370536485, + "grad_norm": 0.30542656779289246, + "learning_rate": 5.021080873898045e-07, + "logits/chosen": 4.2542572021484375, + "logits/rejected": 4.2542572021484375, + "logps/chosen": -191.88681030273438, + "logps/rejected": -191.88681030273438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.344158172607422, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.344158172607422, + "step": 262 + }, + { + "epoch": 0.18147317578057617, + "grad_norm": 18.746484756469727, + "learning_rate": 5.04024530471445e-07, + "logits/chosen": 4.3212409019470215, + "logits/rejected": 4.318099021911621, + "logps/chosen": -180.63372802734375, + "logps/rejected": -175.39483642578125, + "loss": 1.1962, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.388310432434082, + "rewards/margins": -0.5885540246963501, + "rewards/rejected": -12.79975700378418, + "step": 263 + }, + { + "epoch": 0.18216318785578747, + "grad_norm": 7.976447105407715, + "learning_rate": 5.059409735530855e-07, + "logits/chosen": 3.8807573318481445, + "logits/rejected": 3.8918964862823486, + "logps/chosen": -171.519287109375, + "logps/rejected": -172.63999938964844, + "loss": 0.6505, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.38197135925293, + "rewards/margins": 0.10785496234893799, + "rewards/rejected": -12.489828109741211, + "step": 264 + }, + { + "epoch": 0.1828531999309988, + "grad_norm": 0.2776941657066345, + "learning_rate": 5.07857416634726e-07, + "logits/chosen": 3.903608560562134, + "logits/rejected": 4.196962356567383, + "logps/chosen": -176.818603515625, + "logps/rejected": -194.09921264648438, + "loss": 0.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.815094947814941, + "rewards/margins": 1.7525800466537476, + "rewards/rejected": -14.56767463684082, + "step": 265 + }, + { + "epoch": 0.1835432120062101, + "grad_norm": 0.5846889615058899, + "learning_rate": 5.097738597163665e-07, + "logits/chosen": 3.7784249782562256, + "logits/rejected": 3.9577736854553223, + "logps/chosen": -162.9578399658203, + "logps/rejected": -178.96026611328125, + "loss": 0.5277, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.454822540283203, + "rewards/margins": 1.693995714187622, + "rewards/rejected": -13.148818016052246, + "step": 266 + }, + { + "epoch": 0.18423322408142143, + "grad_norm": 0.41411420702934265, + "learning_rate": 5.116903027980069e-07, + "logits/chosen": 3.810293197631836, + "logits/rejected": 3.810293197631836, + "logps/chosen": -179.2377166748047, + "logps/rejected": -179.2377166748047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.991254806518555, + "rewards/margins": 0.0, + "rewards/rejected": -12.991254806518555, + "step": 267 + }, + { + "epoch": 0.18492323615663275, + "grad_norm": 0.3806070387363434, + "learning_rate": 5.136067458796474e-07, + "logits/chosen": 4.134654998779297, + "logits/rejected": 4.069896221160889, + "logps/chosen": -175.20103454589844, + "logps/rejected": -181.37570190429688, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.578387260437012, + "rewards/margins": 0.5629615187644958, + "rewards/rejected": -13.141348838806152, + "step": 268 + }, + { + "epoch": 0.18561324823184405, + "grad_norm": 0.41404610872268677, + "learning_rate": 5.155231889612879e-07, + "logits/chosen": 3.8907322883605957, + "logits/rejected": 3.9602227210998535, + "logps/chosen": -181.87002563476562, + "logps/rejected": -187.13162231445312, + "loss": 0.609, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.385902404785156, + "rewards/margins": 0.4876824617385864, + "rewards/rejected": -13.873584747314453, + "step": 269 + }, + { + "epoch": 0.18630326030705538, + "grad_norm": 3.6042075157165527, + "learning_rate": 5.174396320429284e-07, + "logits/chosen": 3.685335636138916, + "logits/rejected": 3.8327231407165527, + "logps/chosen": -161.10733032226562, + "logps/rejected": -172.36280822753906, + "loss": 0.5335, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.125992774963379, + "rewards/margins": 1.1826655864715576, + "rewards/rejected": -12.3086576461792, + "step": 270 + }, + { + "epoch": 0.18699327238226668, + "grad_norm": 0.3321443796157837, + "learning_rate": 5.193560751245689e-07, + "logits/chosen": 3.8967809677124023, + "logits/rejected": 3.9885988235473633, + "logps/chosen": -164.2710723876953, + "logps/rejected": -172.91819763183594, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.751737594604492, + "rewards/margins": 0.8779622316360474, + "rewards/rejected": -12.62969970703125, + "step": 271 + }, + { + "epoch": 0.187683284457478, + "grad_norm": 0.3635883331298828, + "learning_rate": 5.212725182062093e-07, + "logits/chosen": 3.6777806282043457, + "logits/rejected": 3.8141708374023438, + "logps/chosen": -161.39889526367188, + "logps/rejected": -175.1636505126953, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.437173843383789, + "rewards/margins": 1.3330180644989014, + "rewards/rejected": -12.770191192626953, + "step": 272 + }, + { + "epoch": 0.18837329653268933, + "grad_norm": 0.41407614946365356, + "learning_rate": 5.231889612878498e-07, + "logits/chosen": 4.156615257263184, + "logits/rejected": 4.156615257263184, + "logps/chosen": -183.9001922607422, + "logps/rejected": -183.9001922607422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.700567245483398, + "rewards/margins": 0.0, + "rewards/rejected": -13.700567245483398, + "step": 273 + }, + { + "epoch": 0.18906330860790063, + "grad_norm": 0.3247155547142029, + "learning_rate": 5.251054043694902e-07, + "logits/chosen": 3.8985238075256348, + "logits/rejected": 3.904400587081909, + "logps/chosen": -172.67860412597656, + "logps/rejected": -180.83462524414062, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.576957702636719, + "rewards/margins": 0.8021709322929382, + "rewards/rejected": -13.379128456115723, + "step": 274 + }, + { + "epoch": 0.18975332068311196, + "grad_norm": 0.4520856738090515, + "learning_rate": 5.270218474511307e-07, + "logits/chosen": 3.7160441875457764, + "logits/rejected": 3.797269582748413, + "logps/chosen": -168.02047729492188, + "logps/rejected": -177.8892059326172, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.921066284179688, + "rewards/margins": 0.945277750492096, + "rewards/rejected": -12.866344451904297, + "step": 275 + }, + { + "epoch": 0.19044333275832326, + "grad_norm": 0.40574151277542114, + "learning_rate": 5.289382905327712e-07, + "logits/chosen": 3.9997236728668213, + "logits/rejected": 4.063089370727539, + "logps/chosen": -180.70745849609375, + "logps/rejected": -185.8660888671875, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.307626724243164, + "rewards/margins": 0.576331615447998, + "rewards/rejected": -13.883957862854004, + "step": 276 + }, + { + "epoch": 0.19113334483353459, + "grad_norm": 0.37334924936294556, + "learning_rate": 5.308547336144116e-07, + "logits/chosen": 3.6727280616760254, + "logits/rejected": 3.6727280616760254, + "logps/chosen": -190.1754913330078, + "logps/rejected": -190.17547607421875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.177544593811035, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.177543640136719, + "step": 277 + }, + { + "epoch": 0.1918233569087459, + "grad_norm": 0.30233827233314514, + "learning_rate": 5.327711766960521e-07, + "logits/chosen": 3.6642003059387207, + "logits/rejected": 3.733978748321533, + "logps/chosen": -165.42807006835938, + "logps/rejected": -173.59640502929688, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.786008834838867, + "rewards/margins": 0.8139928579330444, + "rewards/rejected": -12.60000228881836, + "step": 278 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 0.3957267701625824, + "learning_rate": 5.346876197776926e-07, + "logits/chosen": 3.9106764793395996, + "logits/rejected": 3.9106764793395996, + "logps/chosen": -188.36212158203125, + "logps/rejected": -188.36212158203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.922258377075195, + "rewards/margins": 0.0, + "rewards/rejected": -13.922258377075195, + "step": 279 + }, + { + "epoch": 0.19320338105916854, + "grad_norm": 0.43891316652297974, + "learning_rate": 5.366040628593331e-07, + "logits/chosen": 3.6451492309570312, + "logits/rejected": 3.6451492309570312, + "logps/chosen": -175.10147094726562, + "logps/rejected": -175.10147094726562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.826537132263184, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.826537132263184, + "step": 280 + }, + { + "epoch": 0.19389339313437984, + "grad_norm": 0.34560513496398926, + "learning_rate": 5.385205059409736e-07, + "logits/chosen": 3.995604991912842, + "logits/rejected": 4.106060028076172, + "logps/chosen": -171.56434631347656, + "logps/rejected": -186.82916259765625, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.43875503540039, + "rewards/margins": 1.5579934120178223, + "rewards/rejected": -13.996748924255371, + "step": 281 + }, + { + "epoch": 0.19458340520959116, + "grad_norm": 0.4528743326663971, + "learning_rate": 5.40436949022614e-07, + "logits/chosen": 3.5892717838287354, + "logits/rejected": 3.5892717838287354, + "logps/chosen": -160.22100830078125, + "logps/rejected": -160.22100830078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.343427658081055, + "rewards/margins": 0.0, + "rewards/rejected": -11.343427658081055, + "step": 282 + }, + { + "epoch": 0.1952734172848025, + "grad_norm": 1.9094328880310059, + "learning_rate": 5.423533921042545e-07, + "logits/chosen": 3.5520918369293213, + "logits/rejected": 3.637251138687134, + "logps/chosen": -169.07359313964844, + "logps/rejected": -172.3491668701172, + "loss": 0.6141, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.363908767700195, + "rewards/margins": 0.3466068506240845, + "rewards/rejected": -12.710515975952148, + "step": 283 + }, + { + "epoch": 0.1959634293600138, + "grad_norm": 27.645618438720703, + "learning_rate": 5.44269835185895e-07, + "logits/chosen": 3.917985200881958, + "logits/rejected": 3.8942928314208984, + "logps/chosen": -167.46258544921875, + "logps/rejected": -169.62623596191406, + "loss": 1.2981, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.945377349853516, + "rewards/margins": 0.13975900411605835, + "rewards/rejected": -12.085136413574219, + "step": 284 + }, + { + "epoch": 0.19665344143522512, + "grad_norm": 1.3660407066345215, + "learning_rate": 5.461862782675355e-07, + "logits/chosen": 3.7061209678649902, + "logits/rejected": 3.7313005924224854, + "logps/chosen": -167.57986450195312, + "logps/rejected": -182.922607421875, + "loss": 0.5381, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.98803424835205, + "rewards/margins": 1.5598400831222534, + "rewards/rejected": -13.547874450683594, + "step": 285 + }, + { + "epoch": 0.19734345351043645, + "grad_norm": 0.3649376332759857, + "learning_rate": 5.48102721349176e-07, + "logits/chosen": 4.18515682220459, + "logits/rejected": 4.18515682220459, + "logps/chosen": -192.62403869628906, + "logps/rejected": -192.62403869628906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.495500564575195, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.495500564575195, + "step": 286 + }, + { + "epoch": 0.19803346558564774, + "grad_norm": 0.38664883375167847, + "learning_rate": 5.500191644308164e-07, + "logits/chosen": 4.03163480758667, + "logits/rejected": 4.03163480758667, + "logps/chosen": -177.3277587890625, + "logps/rejected": -177.32777404785156, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.108163833618164, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.108163833618164, + "step": 287 + }, + { + "epoch": 0.19872347766085907, + "grad_norm": 10.482906341552734, + "learning_rate": 5.519356075124569e-07, + "logits/chosen": 3.677243709564209, + "logits/rejected": 3.6477246284484863, + "logps/chosen": -168.4564208984375, + "logps/rejected": -176.33868408203125, + "loss": 0.585, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.851827621459961, + "rewards/margins": 0.8249413967132568, + "rewards/rejected": -12.676769256591797, + "step": 288 + }, + { + "epoch": 0.19941348973607037, + "grad_norm": 0.40034806728363037, + "learning_rate": 5.538520505940974e-07, + "logits/chosen": 4.0080437660217285, + "logits/rejected": 4.0080437660217285, + "logps/chosen": -187.56704711914062, + "logps/rejected": -187.56704711914062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.147050857543945, + "rewards/margins": 0.0, + "rewards/rejected": -14.147050857543945, + "step": 289 + }, + { + "epoch": 0.2001035018112817, + "grad_norm": 3.5045742988586426, + "learning_rate": 5.557684936757379e-07, + "logits/chosen": 3.6343271732330322, + "logits/rejected": 3.8474385738372803, + "logps/chosen": -161.79696655273438, + "logps/rejected": -180.08702087402344, + "loss": 0.4582, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.534552574157715, + "rewards/margins": 1.7478399276733398, + "rewards/rejected": -13.282392501831055, + "step": 290 + }, + { + "epoch": 0.20079351388649302, + "grad_norm": 0.3845117390155792, + "learning_rate": 5.576849367573784e-07, + "logits/chosen": 3.936706304550171, + "logits/rejected": 4.001040935516357, + "logps/chosen": -172.45993041992188, + "logps/rejected": -183.85711669921875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.450828552246094, + "rewards/margins": 1.1662997007369995, + "rewards/rejected": -13.617128372192383, + "step": 291 + }, + { + "epoch": 0.20148352596170432, + "grad_norm": 0.38455697894096375, + "learning_rate": 5.596013798390188e-07, + "logits/chosen": 3.854731559753418, + "logits/rejected": 3.854731559753418, + "logps/chosen": -157.2584991455078, + "logps/rejected": -157.25848388671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.792776107788086, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -10.792776107788086, + "step": 292 + }, + { + "epoch": 0.20217353803691565, + "grad_norm": 0.40976306796073914, + "learning_rate": 5.615178229206593e-07, + "logits/chosen": 4.137320518493652, + "logits/rejected": 4.137320518493652, + "logps/chosen": -176.4429473876953, + "logps/rejected": -176.4429473876953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.743352890014648, + "rewards/margins": 0.0, + "rewards/rejected": -12.743352890014648, + "step": 293 + }, + { + "epoch": 0.20286355011212695, + "grad_norm": 0.3452722132205963, + "learning_rate": 5.634342660022998e-07, + "logits/chosen": 3.7772607803344727, + "logits/rejected": 3.853832721710205, + "logps/chosen": -169.21461486816406, + "logps/rejected": -175.42320251464844, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.024299621582031, + "rewards/margins": 0.613949179649353, + "rewards/rejected": -12.638248443603516, + "step": 294 + }, + { + "epoch": 0.20355356218733828, + "grad_norm": 0.3772299885749817, + "learning_rate": 5.653507090839403e-07, + "logits/chosen": 3.8336939811706543, + "logits/rejected": 4.1338629722595215, + "logps/chosen": -164.91055297851562, + "logps/rejected": -193.88992309570312, + "loss": 0.4339, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.65970516204834, + "rewards/margins": 2.851956367492676, + "rewards/rejected": -14.511661529541016, + "step": 295 + }, + { + "epoch": 0.2042435742625496, + "grad_norm": 0.31218722462654114, + "learning_rate": 5.672671521655808e-07, + "logits/chosen": 3.9714818000793457, + "logits/rejected": 4.076793193817139, + "logps/chosen": -190.70941162109375, + "logps/rejected": -201.182373046875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.190484046936035, + "rewards/margins": 1.0753681659698486, + "rewards/rejected": -15.265851974487305, + "step": 296 + }, + { + "epoch": 0.2049335863377609, + "grad_norm": 0.4184395968914032, + "learning_rate": 5.691835952472212e-07, + "logits/chosen": 3.8450117111206055, + "logits/rejected": 4.048823356628418, + "logps/chosen": -157.46775817871094, + "logps/rejected": -169.89834594726562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.193784713745117, + "rewards/margins": 1.270035743713379, + "rewards/rejected": -12.46381950378418, + "step": 297 + }, + { + "epoch": 0.20562359841297223, + "grad_norm": 0.31908518075942993, + "learning_rate": 5.711000383288617e-07, + "logits/chosen": 4.04409646987915, + "logits/rejected": 4.04409646987915, + "logps/chosen": -179.33883666992188, + "logps/rejected": -179.33883666992188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.18505859375, + "rewards/margins": 0.0, + "rewards/rejected": -13.18505859375, + "step": 298 + }, + { + "epoch": 0.20631361048818353, + "grad_norm": 0.4374209940433502, + "learning_rate": 5.730164814105022e-07, + "logits/chosen": 4.0145769119262695, + "logits/rejected": 4.0145769119262695, + "logps/chosen": -187.67181396484375, + "logps/rejected": -187.67181396484375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.8307466506958, + "rewards/margins": 0.0, + "rewards/rejected": -13.830747604370117, + "step": 299 + }, + { + "epoch": 0.20700362256339486, + "grad_norm": 0.3518063426017761, + "learning_rate": 5.749329244921427e-07, + "logits/chosen": 4.185058116912842, + "logits/rejected": 4.185058116912842, + "logps/chosen": -185.033447265625, + "logps/rejected": -185.033447265625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.726630210876465, + "rewards/margins": 0.0, + "rewards/rejected": -13.726630210876465, + "step": 300 + }, + { + "epoch": 0.20769363463860618, + "grad_norm": 0.3285667896270752, + "learning_rate": 5.768493675737831e-07, + "logits/chosen": 3.7554898262023926, + "logits/rejected": 3.7478015422821045, + "logps/chosen": -167.88980102539062, + "logps/rejected": -176.74356079101562, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.06946086883545, + "rewards/margins": 0.6708373427391052, + "rewards/rejected": -12.7402982711792, + "step": 301 + }, + { + "epoch": 0.20838364671381748, + "grad_norm": 0.6674970388412476, + "learning_rate": 5.787658106554235e-07, + "logits/chosen": 3.8035645484924316, + "logits/rejected": 3.8785157203674316, + "logps/chosen": -170.008056640625, + "logps/rejected": -173.817138671875, + "loss": 0.6115, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.373239517211914, + "rewards/margins": 0.39931392669677734, + "rewards/rejected": -12.772553443908691, + "step": 302 + }, + { + "epoch": 0.2090736587890288, + "grad_norm": 0.4890058636665344, + "learning_rate": 5.80682253737064e-07, + "logits/chosen": 3.5446367263793945, + "logits/rejected": 3.5446367263793945, + "logps/chosen": -161.56236267089844, + "logps/rejected": -161.56236267089844, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.474405288696289, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.474405288696289, + "step": 303 + }, + { + "epoch": 0.20976367086424014, + "grad_norm": 0.4014892280101776, + "learning_rate": 5.825986968187045e-07, + "logits/chosen": 3.8830127716064453, + "logits/rejected": 3.8830127716064453, + "logps/chosen": -183.16409301757812, + "logps/rejected": -183.16409301757812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.618077278137207, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.618078231811523, + "step": 304 + }, + { + "epoch": 0.21045368293945144, + "grad_norm": 0.27961257100105286, + "learning_rate": 5.84515139900345e-07, + "logits/chosen": 3.5614356994628906, + "logits/rejected": 3.955261707305908, + "logps/chosen": -155.83016967773438, + "logps/rejected": -183.98892211914062, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.690906524658203, + "rewards/margins": 2.8077499866485596, + "rewards/rejected": -13.498655319213867, + "step": 305 + }, + { + "epoch": 0.21114369501466276, + "grad_norm": 0.37410035729408264, + "learning_rate": 5.864315829819855e-07, + "logits/chosen": 3.8365745544433594, + "logits/rejected": 3.9702281951904297, + "logps/chosen": -173.03273010253906, + "logps/rejected": -192.74215698242188, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.594291687011719, + "rewards/margins": 2.0249338150024414, + "rewards/rejected": -14.619226455688477, + "step": 306 + }, + { + "epoch": 0.21183370708987406, + "grad_norm": 0.335147887468338, + "learning_rate": 5.883480260636259e-07, + "logits/chosen": 3.6277966499328613, + "logits/rejected": 3.848820924758911, + "logps/chosen": -164.63827514648438, + "logps/rejected": -184.63958740234375, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.78128433227539, + "rewards/margins": 2.016307830810547, + "rewards/rejected": -13.797592163085938, + "step": 307 + }, + { + "epoch": 0.2125237191650854, + "grad_norm": 1.0022261142730713, + "learning_rate": 5.902644691452664e-07, + "logits/chosen": 4.108640193939209, + "logits/rejected": 4.175785064697266, + "logps/chosen": -177.75927734375, + "logps/rejected": -182.02642822265625, + "loss": 0.6113, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.967005729675293, + "rewards/margins": 0.4060485363006592, + "rewards/rejected": -13.373053550720215, + "step": 308 + }, + { + "epoch": 0.21321373124029672, + "grad_norm": 12.578304290771484, + "learning_rate": 5.921809122269069e-07, + "logits/chosen": 4.105098724365234, + "logits/rejected": 4.117053031921387, + "logps/chosen": -176.76625061035156, + "logps/rejected": -187.6431884765625, + "loss": 0.5744, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.944759368896484, + "rewards/margins": 1.0865418910980225, + "rewards/rejected": -14.031301498413086, + "step": 309 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 0.33864426612854004, + "learning_rate": 5.940973553085474e-07, + "logits/chosen": 3.740701913833618, + "logits/rejected": 3.7313296794891357, + "logps/chosen": -169.637451171875, + "logps/rejected": -178.83872985839844, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.067876815795898, + "rewards/margins": 0.9306700229644775, + "rewards/rejected": -12.998546600341797, + "step": 310 + }, + { + "epoch": 0.21459375539071934, + "grad_norm": 17.715723037719727, + "learning_rate": 5.960137983901878e-07, + "logits/chosen": 3.8840558528900146, + "logits/rejected": 3.986166477203369, + "logps/chosen": -152.758056640625, + "logps/rejected": -184.08135986328125, + "loss": 0.5263, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.537091255187988, + "rewards/margins": 3.0095438957214355, + "rewards/rejected": -13.546634674072266, + "step": 311 + }, + { + "epoch": 0.21528376746593064, + "grad_norm": 0.4776829183101654, + "learning_rate": 5.979302414718283e-07, + "logits/chosen": 4.22246789932251, + "logits/rejected": 4.22246789932251, + "logps/chosen": -169.46449279785156, + "logps/rejected": -169.46449279785156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.430862426757812, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.430862426757812, + "step": 312 + }, + { + "epoch": 0.21597377954114197, + "grad_norm": 11.025181770324707, + "learning_rate": 5.998466845534688e-07, + "logits/chosen": 3.950209379196167, + "logits/rejected": 4.031666278839111, + "logps/chosen": -162.83419799804688, + "logps/rejected": -174.6807403564453, + "loss": 0.5728, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.537152290344238, + "rewards/margins": 1.2339880466461182, + "rewards/rejected": -12.771141052246094, + "step": 313 + }, + { + "epoch": 0.2166637916163533, + "grad_norm": 0.3923760652542114, + "learning_rate": 6.017631276351093e-07, + "logits/chosen": 3.7095088958740234, + "logits/rejected": 3.7095088958740234, + "logps/chosen": -170.14813232421875, + "logps/rejected": -170.14813232421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.16505241394043, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.16505241394043, + "step": 314 + }, + { + "epoch": 0.2173538036915646, + "grad_norm": 0.2963109314441681, + "learning_rate": 6.036795707167498e-07, + "logits/chosen": 3.95036244392395, + "logits/rejected": 4.295942783355713, + "logps/chosen": -176.79934692382812, + "logps/rejected": -196.13088989257812, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.766063690185547, + "rewards/margins": 1.9984055757522583, + "rewards/rejected": -14.764469146728516, + "step": 315 + }, + { + "epoch": 0.21804381576677592, + "grad_norm": 3.242414712905884, + "learning_rate": 6.055960137983902e-07, + "logits/chosen": 3.58290433883667, + "logits/rejected": 3.816953659057617, + "logps/chosen": -156.72976684570312, + "logps/rejected": -177.55499267578125, + "loss": 0.4552, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.128005027770996, + "rewards/margins": 1.9916439056396484, + "rewards/rejected": -13.119649887084961, + "step": 316 + }, + { + "epoch": 0.21873382784198722, + "grad_norm": 8.682942390441895, + "learning_rate": 6.075124568800307e-07, + "logits/chosen": 3.911593198776245, + "logits/rejected": 4.228106498718262, + "logps/chosen": -166.423095703125, + "logps/rejected": -178.5785369873047, + "loss": 0.6021, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.838878631591797, + "rewards/margins": 1.2508646249771118, + "rewards/rejected": -13.089742660522461, + "step": 317 + }, + { + "epoch": 0.21942383991719855, + "grad_norm": 0.3748859763145447, + "learning_rate": 6.094288999616712e-07, + "logits/chosen": 3.649750232696533, + "logits/rejected": 3.6927967071533203, + "logps/chosen": -150.63218688964844, + "logps/rejected": -157.39218139648438, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.225605010986328, + "rewards/margins": 0.6936588287353516, + "rewards/rejected": -10.91926383972168, + "step": 318 + }, + { + "epoch": 0.22011385199240988, + "grad_norm": 0.3577752113342285, + "learning_rate": 6.113453430433117e-07, + "logits/chosen": 3.965508460998535, + "logits/rejected": 3.965508460998535, + "logps/chosen": -178.56497192382812, + "logps/rejected": -178.56497192382812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.025411605834961, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.025411605834961, + "step": 319 + }, + { + "epoch": 0.22080386406762118, + "grad_norm": 0.3051081597805023, + "learning_rate": 6.132617861249522e-07, + "logits/chosen": 3.7442665100097656, + "logits/rejected": 3.7442665100097656, + "logps/chosen": -188.2230224609375, + "logps/rejected": -188.2230224609375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.012553215026855, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.012554168701172, + "step": 320 + }, + { + "epoch": 0.2214938761428325, + "grad_norm": 0.3727867007255554, + "learning_rate": 6.151782292065926e-07, + "logits/chosen": 3.8867897987365723, + "logits/rejected": 3.8867897987365723, + "logps/chosen": -183.9745635986328, + "logps/rejected": -183.9745635986328, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.654333114624023, + "rewards/margins": 0.0, + "rewards/rejected": -13.654333114624023, + "step": 321 + }, + { + "epoch": 0.22218388821804383, + "grad_norm": 0.5386320948600769, + "learning_rate": 6.170946722882331e-07, + "logits/chosen": 3.9737818241119385, + "logits/rejected": 4.121364593505859, + "logps/chosen": -170.43736267089844, + "logps/rejected": -186.39608764648438, + "loss": 0.5215, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.301531791687012, + "rewards/margins": 1.5283567905426025, + "rewards/rejected": -13.829889297485352, + "step": 322 + }, + { + "epoch": 0.22287390029325513, + "grad_norm": 7.567324161529541, + "learning_rate": 6.190111153698736e-07, + "logits/chosen": 3.193368434906006, + "logits/rejected": 3.3104281425476074, + "logps/chosen": -157.27784729003906, + "logps/rejected": -169.09530639648438, + "loss": 0.5975, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.01411247253418, + "rewards/margins": 1.1355394124984741, + "rewards/rejected": -12.149652481079102, + "step": 323 + }, + { + "epoch": 0.22356391236846646, + "grad_norm": 0.514423131942749, + "learning_rate": 6.209275584515141e-07, + "logits/chosen": 3.9275588989257812, + "logits/rejected": 3.9275588989257812, + "logps/chosen": -184.88357543945312, + "logps/rejected": -184.88357543945312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.716337203979492, + "rewards/margins": 0.0, + "rewards/rejected": -13.716337203979492, + "step": 324 + }, + { + "epoch": 0.22425392444367775, + "grad_norm": 0.34927746653556824, + "learning_rate": 6.228440015331546e-07, + "logits/chosen": 3.9684126377105713, + "logits/rejected": 3.9684126377105713, + "logps/chosen": -179.54356384277344, + "logps/rejected": -179.54356384277344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.162925720214844, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.162925720214844, + "step": 325 + }, + { + "epoch": 0.22494393651888908, + "grad_norm": 0.314962774515152, + "learning_rate": 6.24760444614795e-07, + "logits/chosen": 3.85044527053833, + "logits/rejected": 4.102197170257568, + "logps/chosen": -160.66378784179688, + "logps/rejected": -183.38027954101562, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.34341812133789, + "rewards/margins": 2.2665486335754395, + "rewards/rejected": -13.609966278076172, + "step": 326 + }, + { + "epoch": 0.2256339485941004, + "grad_norm": 0.38244950771331787, + "learning_rate": 6.266768876964354e-07, + "logits/chosen": 3.9303948879241943, + "logits/rejected": 4.063828468322754, + "logps/chosen": -182.0391082763672, + "logps/rejected": -190.48011779785156, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.441814422607422, + "rewards/margins": 0.864255964756012, + "rewards/rejected": -14.306070327758789, + "step": 327 + }, + { + "epoch": 0.2263239606693117, + "grad_norm": 0.3418750464916229, + "learning_rate": 6.285933307780758e-07, + "logits/chosen": 3.8387207984924316, + "logits/rejected": 3.9363958835601807, + "logps/chosen": -168.458984375, + "logps/rejected": -178.99853515625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.143319129943848, + "rewards/margins": 1.0216920375823975, + "rewards/rejected": -13.165011405944824, + "step": 328 + }, + { + "epoch": 0.22701397274452303, + "grad_norm": 3.6829795837402344, + "learning_rate": 6.305097738597164e-07, + "logits/chosen": 4.03378963470459, + "logits/rejected": 4.151945114135742, + "logps/chosen": -172.940185546875, + "logps/rejected": -187.64451599121094, + "loss": 0.5322, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.423812866210938, + "rewards/margins": 1.4786423444747925, + "rewards/rejected": -13.902456283569336, + "step": 329 + }, + { + "epoch": 0.22770398481973433, + "grad_norm": 0.3194449543952942, + "learning_rate": 6.324262169413568e-07, + "logits/chosen": 3.867746591567993, + "logits/rejected": 3.865895986557007, + "logps/chosen": -173.56976318359375, + "logps/rejected": -180.61724853515625, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.476978302001953, + "rewards/margins": 0.6589460372924805, + "rewards/rejected": -13.135923385620117, + "step": 330 + }, + { + "epoch": 0.22839399689494566, + "grad_norm": 1.261232852935791, + "learning_rate": 6.343426600229973e-07, + "logits/chosen": 3.8772706985473633, + "logits/rejected": 3.999915599822998, + "logps/chosen": -174.1441650390625, + "logps/rejected": -177.77442932128906, + "loss": 0.6143, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.626049041748047, + "rewards/margins": 0.34255707263946533, + "rewards/rejected": -12.968606948852539, + "step": 331 + }, + { + "epoch": 0.229084008970157, + "grad_norm": 0.6394985318183899, + "learning_rate": 6.362591031046378e-07, + "logits/chosen": 3.95478892326355, + "logits/rejected": 4.113531112670898, + "logps/chosen": -172.23724365234375, + "logps/rejected": -189.3135223388672, + "loss": 0.5215, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.410733222961426, + "rewards/margins": 1.728469967842102, + "rewards/rejected": -14.139203071594238, + "step": 332 + }, + { + "epoch": 0.2297740210453683, + "grad_norm": 0.36794307827949524, + "learning_rate": 6.381755461862783e-07, + "logits/chosen": 3.3349454402923584, + "logits/rejected": 3.5410192012786865, + "logps/chosen": -162.19876098632812, + "logps/rejected": -171.53713989257812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.305624008178711, + "rewards/margins": 0.9519404768943787, + "rewards/rejected": -12.257563591003418, + "step": 333 + }, + { + "epoch": 0.23046403312057961, + "grad_norm": 0.9436823725700378, + "learning_rate": 6.400919892679187e-07, + "logits/chosen": 3.616490364074707, + "logits/rejected": 3.6227240562438965, + "logps/chosen": -169.02011108398438, + "logps/rejected": -172.27159118652344, + "loss": 0.6145, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.149396896362305, + "rewards/margins": 0.339092493057251, + "rewards/rejected": -12.488490104675293, + "step": 334 + }, + { + "epoch": 0.2311540451957909, + "grad_norm": 0.34583020210266113, + "learning_rate": 6.420084323495593e-07, + "logits/chosen": 3.581448793411255, + "logits/rejected": 3.6550707817077637, + "logps/chosen": -158.699951171875, + "logps/rejected": -168.1131134033203, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.050045013427734, + "rewards/margins": 0.9378799796104431, + "rewards/rejected": -11.98792552947998, + "step": 335 + }, + { + "epoch": 0.23184405727100224, + "grad_norm": 28.1679744720459, + "learning_rate": 6.439248754311997e-07, + "logits/chosen": 3.8285133838653564, + "logits/rejected": 3.780512809753418, + "logps/chosen": -163.49203491210938, + "logps/rejected": -166.89614868164062, + "loss": 1.168, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.265625, + "rewards/margins": 0.34574460983276367, + "rewards/rejected": -11.611370086669922, + "step": 336 + }, + { + "epoch": 0.23253406934621357, + "grad_norm": 8.843696594238281, + "learning_rate": 6.458413185128402e-07, + "logits/chosen": 3.626315116882324, + "logits/rejected": 3.5952892303466797, + "logps/chosen": -168.0386962890625, + "logps/rejected": -167.7559356689453, + "loss": 0.7272, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.021635055541992, + "rewards/margins": -0.06082630157470703, + "rewards/rejected": -11.960807800292969, + "step": 337 + }, + { + "epoch": 0.23322408142142487, + "grad_norm": 12.673538208007812, + "learning_rate": 6.477577615944806e-07, + "logits/chosen": 3.884216547012329, + "logits/rejected": 4.003091335296631, + "logps/chosen": -167.5729217529297, + "logps/rejected": -172.3642120361328, + "loss": 0.6354, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.940276145935059, + "rewards/margins": 0.4727237820625305, + "rewards/rejected": -12.41300106048584, + "step": 338 + }, + { + "epoch": 0.2339140934966362, + "grad_norm": 0.3041689395904541, + "learning_rate": 6.496742046761212e-07, + "logits/chosen": 3.208211898803711, + "logits/rejected": 3.4158966541290283, + "logps/chosen": -152.35916137695312, + "logps/rejected": -165.35293579101562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.48569107055664, + "rewards/margins": 1.2694404125213623, + "rewards/rejected": -11.755131721496582, + "step": 339 + }, + { + "epoch": 0.23460410557184752, + "grad_norm": 0.3086562156677246, + "learning_rate": 6.515906477577616e-07, + "logits/chosen": 3.954740524291992, + "logits/rejected": 4.040408611297607, + "logps/chosen": -177.16387939453125, + "logps/rejected": -188.12600708007812, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.91125202178955, + "rewards/margins": 1.1215769052505493, + "rewards/rejected": -14.032829284667969, + "step": 340 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.29951390624046326, + "learning_rate": 6.535070908394021e-07, + "logits/chosen": 3.696061134338379, + "logits/rejected": 3.6675424575805664, + "logps/chosen": -167.0355224609375, + "logps/rejected": -178.1529541015625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.059246063232422, + "rewards/margins": 0.8835303783416748, + "rewards/rejected": -12.942776679992676, + "step": 341 + }, + { + "epoch": 0.23598412972227015, + "grad_norm": 0.271997332572937, + "learning_rate": 6.554235339210426e-07, + "logits/chosen": 3.5336050987243652, + "logits/rejected": 3.9798223972320557, + "logps/chosen": -146.06187438964844, + "logps/rejected": -183.690673828125, + "loss": 0.3471, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.930647850036621, + "rewards/margins": 3.779104232788086, + "rewards/rejected": -13.709752082824707, + "step": 342 + }, + { + "epoch": 0.23667414179748145, + "grad_norm": 1.4897981882095337, + "learning_rate": 6.573399770026831e-07, + "logits/chosen": 3.2100226879119873, + "logits/rejected": 3.4713828563690186, + "logps/chosen": -163.07168579101562, + "logps/rejected": -174.2734375, + "loss": 0.5279, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.571918487548828, + "rewards/margins": 1.0790927410125732, + "rewards/rejected": -12.651010513305664, + "step": 343 + }, + { + "epoch": 0.23736415387269277, + "grad_norm": 15.743667602539062, + "learning_rate": 6.592564200843235e-07, + "logits/chosen": 3.7958357334136963, + "logits/rejected": 3.754237651824951, + "logps/chosen": -161.80503845214844, + "logps/rejected": -161.6640167236328, + "loss": 1.058, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.342493057250977, + "rewards/margins": 0.027907729148864746, + "rewards/rejected": -11.370401382446289, + "step": 344 + }, + { + "epoch": 0.2380541659479041, + "grad_norm": 0.43024134635925293, + "learning_rate": 6.611728631659641e-07, + "logits/chosen": 3.9179906845092773, + "logits/rejected": 3.9179906845092773, + "logps/chosen": -184.95233154296875, + "logps/rejected": -184.9523162841797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.506410598754883, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.506410598754883, + "step": 345 + }, + { + "epoch": 0.2387441780231154, + "grad_norm": 0.4360017776489258, + "learning_rate": 6.630893062476045e-07, + "logits/chosen": 3.9971861839294434, + "logits/rejected": 3.9971861839294434, + "logps/chosen": -176.9279022216797, + "logps/rejected": -176.9279022216797, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.99767017364502, + "rewards/margins": 5.364418029785156e-07, + "rewards/rejected": -12.997671127319336, + "step": 346 + }, + { + "epoch": 0.23943419009832673, + "grad_norm": 0.31363898515701294, + "learning_rate": 6.65005749329245e-07, + "logits/chosen": 3.6195311546325684, + "logits/rejected": 3.74595308303833, + "logps/chosen": -164.05068969726562, + "logps/rejected": -173.71649169921875, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.698200225830078, + "rewards/margins": 0.942807674407959, + "rewards/rejected": -12.641008377075195, + "step": 347 + }, + { + "epoch": 0.24012420217353803, + "grad_norm": 5.055685997009277, + "learning_rate": 6.669221924108854e-07, + "logits/chosen": 3.5585663318634033, + "logits/rejected": 3.6913206577301025, + "logps/chosen": -169.931396484375, + "logps/rejected": -171.004150390625, + "loss": 0.6453, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.180148124694824, + "rewards/margins": 0.12622344493865967, + "rewards/rejected": -12.306371688842773, + "step": 348 + }, + { + "epoch": 0.24081421424874935, + "grad_norm": 0.32905542850494385, + "learning_rate": 6.68838635492526e-07, + "logits/chosen": 3.552699089050293, + "logits/rejected": 3.8372254371643066, + "logps/chosen": -170.8791961669922, + "logps/rejected": -179.31369018554688, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.225080490112305, + "rewards/margins": 0.8428665399551392, + "rewards/rejected": -13.067947387695312, + "step": 349 + }, + { + "epoch": 0.24150422632396068, + "grad_norm": 0.33587974309921265, + "learning_rate": 6.707550785741664e-07, + "logits/chosen": 3.6289334297180176, + "logits/rejected": 3.842782974243164, + "logps/chosen": -169.98150634765625, + "logps/rejected": -190.185546875, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.292346954345703, + "rewards/margins": 2.032764196395874, + "rewards/rejected": -14.32511043548584, + "step": 350 + }, + { + "epoch": 0.24219423839917198, + "grad_norm": 2.425081729888916, + "learning_rate": 6.726715216558069e-07, + "logits/chosen": 3.6020047664642334, + "logits/rejected": 3.9278411865234375, + "logps/chosen": -154.36683654785156, + "logps/rejected": -175.4390106201172, + "loss": 0.45, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.517654418945312, + "rewards/margins": 2.2361016273498535, + "rewards/rejected": -12.753756523132324, + "step": 351 + }, + { + "epoch": 0.2428842504743833, + "grad_norm": 15.507882118225098, + "learning_rate": 6.745879647374474e-07, + "logits/chosen": 4.1294074058532715, + "logits/rejected": 3.9712090492248535, + "logps/chosen": -177.55162048339844, + "logps/rejected": -167.689208984375, + "loss": 1.6307, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.862704277038574, + "rewards/margins": -1.0241519212722778, + "rewards/rejected": -11.838552474975586, + "step": 352 + }, + { + "epoch": 0.2435742625495946, + "grad_norm": 0.5495116114616394, + "learning_rate": 6.765044078190878e-07, + "logits/chosen": 3.4473462104797363, + "logits/rejected": 3.6408042907714844, + "logps/chosen": -167.5970001220703, + "logps/rejected": -172.03468322753906, + "loss": 0.6091, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.935808181762695, + "rewards/margins": 0.4846920967102051, + "rewards/rejected": -12.420499801635742, + "step": 353 + }, + { + "epoch": 0.24426427462480593, + "grad_norm": 0.3177599012851715, + "learning_rate": 6.784208509007282e-07, + "logits/chosen": 3.832275390625, + "logits/rejected": 3.910236358642578, + "logps/chosen": -178.9664764404297, + "logps/rejected": -189.10328674316406, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.11114501953125, + "rewards/margins": 1.0472159385681152, + "rewards/rejected": -14.158360481262207, + "step": 354 + }, + { + "epoch": 0.24495428670001726, + "grad_norm": 0.30151259899139404, + "learning_rate": 6.803372939823688e-07, + "logits/chosen": 3.6533303260803223, + "logits/rejected": 3.8484530448913574, + "logps/chosen": -163.64675903320312, + "logps/rejected": -175.46827697753906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.800179481506348, + "rewards/margins": 1.1653026342391968, + "rewards/rejected": -12.965482711791992, + "step": 355 + }, + { + "epoch": 0.24564429877522856, + "grad_norm": 0.37805160880088806, + "learning_rate": 6.822537370640092e-07, + "logits/chosen": 4.161439895629883, + "logits/rejected": 4.367018699645996, + "logps/chosen": -166.0662078857422, + "logps/rejected": -191.13392639160156, + "loss": 0.4348, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.716655731201172, + "rewards/margins": 2.449207305908203, + "rewards/rejected": -14.165863037109375, + "step": 356 + }, + { + "epoch": 0.24633431085043989, + "grad_norm": 0.5316848158836365, + "learning_rate": 6.841701801456497e-07, + "logits/chosen": 3.6890974044799805, + "logits/rejected": 3.755533218383789, + "logps/chosen": -155.45802307128906, + "logps/rejected": -173.82688903808594, + "loss": 0.5221, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.666080474853516, + "rewards/margins": 1.905434489250183, + "rewards/rejected": -12.571515083312988, + "step": 357 + }, + { + "epoch": 0.24702432292565119, + "grad_norm": 23.569677352905273, + "learning_rate": 6.860866232272901e-07, + "logits/chosen": 3.8825583457946777, + "logits/rejected": 3.860060691833496, + "logps/chosen": -184.5976104736328, + "logps/rejected": -183.6268310546875, + "loss": 0.7889, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.689044952392578, + "rewards/margins": -0.14937162399291992, + "rewards/rejected": -13.5396728515625, + "step": 358 + }, + { + "epoch": 0.2477143350008625, + "grad_norm": 0.3636704981327057, + "learning_rate": 6.880030663089307e-07, + "logits/chosen": 3.731797456741333, + "logits/rejected": 3.731797456741333, + "logps/chosen": -177.14422607421875, + "logps/rejected": -177.14422607421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.853382110595703, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.853382110595703, + "step": 359 + }, + { + "epoch": 0.24840434707607384, + "grad_norm": 0.319723904132843, + "learning_rate": 6.899195093905711e-07, + "logits/chosen": 3.93721342086792, + "logits/rejected": 3.93721342086792, + "logps/chosen": -184.80833435058594, + "logps/rejected": -184.80833435058594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.784177780151367, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.784177780151367, + "step": 360 + }, + { + "epoch": 0.24909435915128514, + "grad_norm": 0.2834835946559906, + "learning_rate": 6.918359524722116e-07, + "logits/chosen": 4.100399017333984, + "logits/rejected": 4.149493217468262, + "logps/chosen": -185.34506225585938, + "logps/rejected": -195.55154418945312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.845294952392578, + "rewards/margins": 1.027786374092102, + "rewards/rejected": -14.87308120727539, + "step": 361 + }, + { + "epoch": 0.24978437122649647, + "grad_norm": 0.33655425906181335, + "learning_rate": 6.937523955538521e-07, + "logits/chosen": 3.7658567428588867, + "logits/rejected": 3.7658567428588867, + "logps/chosen": -169.91897583007812, + "logps/rejected": -169.91897583007812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.370767593383789, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.370768547058105, + "step": 362 + }, + { + "epoch": 0.2504743833017078, + "grad_norm": 0.5081724524497986, + "learning_rate": 6.956688386354926e-07, + "logits/chosen": 3.356038808822632, + "logits/rejected": 3.356038808822632, + "logps/chosen": -153.76988220214844, + "logps/rejected": -153.76988220214844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.692829132080078, + "rewards/margins": 0.0, + "rewards/rejected": -10.692829132080078, + "step": 363 + }, + { + "epoch": 0.2511643953769191, + "grad_norm": 0.3471983075141907, + "learning_rate": 6.97585281717133e-07, + "logits/chosen": 3.9749937057495117, + "logits/rejected": 3.9749937057495117, + "logps/chosen": -185.02215576171875, + "logps/rejected": -185.02215576171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.83815860748291, + "rewards/margins": 0.0, + "rewards/rejected": -13.83815860748291, + "step": 364 + }, + { + "epoch": 0.2518544074521304, + "grad_norm": 0.4549452066421509, + "learning_rate": 6.995017247987736e-07, + "logits/chosen": 3.981689691543579, + "logits/rejected": 3.981689691543579, + "logps/chosen": -183.00233459472656, + "logps/rejected": -183.00233459472656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.436598777770996, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.436598777770996, + "step": 365 + }, + { + "epoch": 0.25254441952734175, + "grad_norm": 0.4773138761520386, + "learning_rate": 7.01418167880414e-07, + "logits/chosen": 3.793970823287964, + "logits/rejected": 3.793970823287964, + "logps/chosen": -169.60440063476562, + "logps/rejected": -169.60440063476562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.061440467834473, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.061440467834473, + "step": 366 + }, + { + "epoch": 0.25323443160255305, + "grad_norm": 0.40346240997314453, + "learning_rate": 7.033346109620545e-07, + "logits/chosen": 3.3893020153045654, + "logits/rejected": 3.5259060859680176, + "logps/chosen": -159.25924682617188, + "logps/rejected": -168.5220184326172, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.0490140914917, + "rewards/margins": 0.9468036890029907, + "rewards/rejected": -11.995818138122559, + "step": 367 + }, + { + "epoch": 0.25392444367776434, + "grad_norm": 1.425918698310852, + "learning_rate": 7.052510540436949e-07, + "logits/chosen": 3.466989278793335, + "logits/rejected": 3.6873059272766113, + "logps/chosen": -141.10015869140625, + "logps/rejected": -178.50230407714844, + "loss": 0.2749, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.267684936523438, + "rewards/margins": 3.8074522018432617, + "rewards/rejected": -13.0751371383667, + "step": 368 + }, + { + "epoch": 0.2546144557529757, + "grad_norm": 1.6037224531173706, + "learning_rate": 7.071674971253355e-07, + "logits/chosen": 3.6715612411499023, + "logits/rejected": 3.8490583896636963, + "logps/chosen": -161.36062622070312, + "logps/rejected": -183.12677001953125, + "loss": 0.4448, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.377710342407227, + "rewards/margins": 2.1929168701171875, + "rewards/rejected": -13.570627212524414, + "step": 369 + }, + { + "epoch": 0.255304467828187, + "grad_norm": 13.14455795288086, + "learning_rate": 7.090839402069759e-07, + "logits/chosen": 3.871094226837158, + "logits/rejected": 3.8181991577148438, + "logps/chosen": -171.58554077148438, + "logps/rejected": -174.20904541015625, + "loss": 1.1621, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.319543838500977, + "rewards/margins": 0.3729262948036194, + "rewards/rejected": -12.69247055053711, + "step": 370 + }, + { + "epoch": 0.2559944799033983, + "grad_norm": 0.37150633335113525, + "learning_rate": 7.110003832886164e-07, + "logits/chosen": 3.5332858562469482, + "logits/rejected": 3.741149425506592, + "logps/chosen": -175.7003936767578, + "logps/rejected": -184.9236297607422, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.854693412780762, + "rewards/margins": 0.9120203256607056, + "rewards/rejected": -13.766714096069336, + "step": 371 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 1.6254076957702637, + "learning_rate": 7.129168263702569e-07, + "logits/chosen": 3.8486576080322266, + "logits/rejected": 3.96063232421875, + "logps/chosen": -169.91517639160156, + "logps/rejected": -180.26849365234375, + "loss": 0.5345, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.218050956726074, + "rewards/margins": 1.1014100313186646, + "rewards/rejected": -13.31946086883545, + "step": 372 + }, + { + "epoch": 0.25737450405382095, + "grad_norm": 5.526604652404785, + "learning_rate": 7.148332694518974e-07, + "logits/chosen": 3.668699264526367, + "logits/rejected": 3.7804203033447266, + "logps/chosen": -171.12503051757812, + "logps/rejected": -184.30604553222656, + "loss": 0.5051, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.413501739501953, + "rewards/margins": 1.326596975326538, + "rewards/rejected": -13.74009895324707, + "step": 373 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.3404039144515991, + "learning_rate": 7.167497125335378e-07, + "logits/chosen": 3.6107823848724365, + "logits/rejected": 3.900146007537842, + "logps/chosen": -165.63882446289062, + "logps/rejected": -185.63836669921875, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.72960090637207, + "rewards/margins": 2.0461816787719727, + "rewards/rejected": -13.775782585144043, + "step": 374 + }, + { + "epoch": 0.25875452820424355, + "grad_norm": 0.3975186347961426, + "learning_rate": 7.186661556151784e-07, + "logits/chosen": 4.001906394958496, + "logits/rejected": 4.001906394958496, + "logps/chosen": -178.29031372070312, + "logps/rejected": -178.29031372070312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.986113548278809, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.986113548278809, + "step": 375 + }, + { + "epoch": 0.2594445402794549, + "grad_norm": 0.5077434182167053, + "learning_rate": 7.205825986968188e-07, + "logits/chosen": 3.4682955741882324, + "logits/rejected": 3.4956793785095215, + "logps/chosen": -157.31631469726562, + "logps/rejected": -165.87725830078125, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.920244216918945, + "rewards/margins": 0.8715323805809021, + "rewards/rejected": -11.791775703430176, + "step": 376 + }, + { + "epoch": 0.2601345523546662, + "grad_norm": 0.39325928688049316, + "learning_rate": 7.224990417784593e-07, + "logits/chosen": 3.848768472671509, + "logits/rejected": 3.983815908432007, + "logps/chosen": -174.36221313476562, + "logps/rejected": -185.77127075195312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.661521911621094, + "rewards/margins": 1.090530514717102, + "rewards/rejected": -13.752052307128906, + "step": 377 + }, + { + "epoch": 0.2608245644298775, + "grad_norm": 1.0291380882263184, + "learning_rate": 7.244154848600996e-07, + "logits/chosen": 3.690707206726074, + "logits/rejected": 3.961327075958252, + "logps/chosen": -166.3662872314453, + "logps/rejected": -182.4188690185547, + "loss": 0.4411, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.766460418701172, + "rewards/margins": 1.529972791671753, + "rewards/rejected": -13.296432495117188, + "step": 378 + }, + { + "epoch": 0.26151457650508886, + "grad_norm": 0.3301616609096527, + "learning_rate": 7.263319279417403e-07, + "logits/chosen": 3.9998741149902344, + "logits/rejected": 4.0737104415893555, + "logps/chosen": -164.58291625976562, + "logps/rejected": -175.6943359375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.650016784667969, + "rewards/margins": 1.1074467897415161, + "rewards/rejected": -12.757463455200195, + "step": 379 + }, + { + "epoch": 0.26220458858030016, + "grad_norm": 0.24702619016170502, + "learning_rate": 7.282483710233806e-07, + "logits/chosen": 3.0827648639678955, + "logits/rejected": 3.36506724357605, + "logps/chosen": -140.19610595703125, + "logps/rejected": -180.18338012695312, + "loss": 0.3467, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.267322540283203, + "rewards/margins": 4.064394950866699, + "rewards/rejected": -13.331717491149902, + "step": 380 + }, + { + "epoch": 0.26289460065551146, + "grad_norm": 0.3614504337310791, + "learning_rate": 7.301648141050211e-07, + "logits/chosen": 4.052490234375, + "logits/rejected": 4.052490234375, + "logps/chosen": -176.0684814453125, + "logps/rejected": -176.06846618652344, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.837906837463379, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -12.837906837463379, + "step": 381 + }, + { + "epoch": 0.2635846127307228, + "grad_norm": 0.3465465009212494, + "learning_rate": 7.320812571866616e-07, + "logits/chosen": 3.7709784507751465, + "logits/rejected": 3.854923725128174, + "logps/chosen": -171.01263427734375, + "logps/rejected": -180.98855590820312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.261027336120605, + "rewards/margins": 1.0049893856048584, + "rewards/rejected": -13.266016006469727, + "step": 382 + }, + { + "epoch": 0.2642746248059341, + "grad_norm": 4.340196132659912, + "learning_rate": 7.339977002683021e-07, + "logits/chosen": 3.450239419937134, + "logits/rejected": 3.712261915206909, + "logps/chosen": -171.17694091796875, + "logps/rejected": -201.64920043945312, + "loss": 0.3742, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.389425277709961, + "rewards/margins": 3.095803737640381, + "rewards/rejected": -15.4852294921875, + "step": 383 + }, + { + "epoch": 0.2649646368811454, + "grad_norm": 0.5413082242012024, + "learning_rate": 7.359141433499425e-07, + "logits/chosen": 3.89460825920105, + "logits/rejected": 3.908565044403076, + "logps/chosen": -183.81878662109375, + "logps/rejected": -192.99383544921875, + "loss": 0.5261, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.631290435791016, + "rewards/margins": 0.9207490682601929, + "rewards/rejected": -14.552040100097656, + "step": 384 + }, + { + "epoch": 0.2656546489563567, + "grad_norm": 0.36070576310157776, + "learning_rate": 7.378305864315831e-07, + "logits/chosen": 3.826233148574829, + "logits/rejected": 3.826233148574829, + "logps/chosen": -162.61041259765625, + "logps/rejected": -162.61041259765625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.480682373046875, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.480682373046875, + "step": 385 + }, + { + "epoch": 0.26634466103156806, + "grad_norm": 0.3697658181190491, + "learning_rate": 7.397470295132235e-07, + "logits/chosen": 3.4638662338256836, + "logits/rejected": 3.639225482940674, + "logps/chosen": -172.20152282714844, + "logps/rejected": -178.9622344970703, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.386240005493164, + "rewards/margins": 0.717230498790741, + "rewards/rejected": -13.103471755981445, + "step": 386 + }, + { + "epoch": 0.26703467310677936, + "grad_norm": 0.34870368242263794, + "learning_rate": 7.41663472594864e-07, + "logits/chosen": 3.4920852184295654, + "logits/rejected": 3.5644891262054443, + "logps/chosen": -162.14138793945312, + "logps/rejected": -169.71676635742188, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.534648895263672, + "rewards/margins": 0.776444137096405, + "rewards/rejected": -12.311092376708984, + "step": 387 + }, + { + "epoch": 0.26772468518199066, + "grad_norm": 0.28214791417121887, + "learning_rate": 7.435799156765044e-07, + "logits/chosen": 4.158751010894775, + "logits/rejected": 4.180999755859375, + "logps/chosen": -182.5084686279297, + "logps/rejected": -195.66793823242188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.570938110351562, + "rewards/margins": 1.2691490650177002, + "rewards/rejected": -14.840085983276367, + "step": 388 + }, + { + "epoch": 0.268414697257202, + "grad_norm": 0.37754830718040466, + "learning_rate": 7.45496358758145e-07, + "logits/chosen": 4.003456115722656, + "logits/rejected": 4.003456115722656, + "logps/chosen": -184.75262451171875, + "logps/rejected": -184.75262451171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.54676628112793, + "rewards/margins": 0.0, + "rewards/rejected": -13.54676628112793, + "step": 389 + }, + { + "epoch": 0.2691047093324133, + "grad_norm": 15.014817237854004, + "learning_rate": 7.474128018397854e-07, + "logits/chosen": 3.4630823135375977, + "logits/rejected": 3.767540454864502, + "logps/chosen": -160.93450927734375, + "logps/rejected": -177.956787109375, + "loss": 0.5254, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.298091888427734, + "rewards/margins": 1.6455090045928955, + "rewards/rejected": -12.943599700927734, + "step": 390 + }, + { + "epoch": 0.2697947214076246, + "grad_norm": 0.34817102551460266, + "learning_rate": 7.493292449214259e-07, + "logits/chosen": 3.7979912757873535, + "logits/rejected": 4.033342361450195, + "logps/chosen": -172.62493896484375, + "logps/rejected": -184.17538452148438, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.623376846313477, + "rewards/margins": 1.1713480949401855, + "rewards/rejected": -13.79472541809082, + "step": 391 + }, + { + "epoch": 0.27048473348283597, + "grad_norm": 0.6123242378234863, + "learning_rate": 7.512456880030664e-07, + "logits/chosen": 3.8226356506347656, + "logits/rejected": 3.8226356506347656, + "logps/chosen": -167.8995819091797, + "logps/rejected": -167.89959716796875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.007216453552246, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.007216453552246, + "step": 392 + }, + { + "epoch": 0.27117474555804727, + "grad_norm": 0.3328576385974884, + "learning_rate": 7.531621310847069e-07, + "logits/chosen": 3.3329977989196777, + "logits/rejected": 3.3329977989196777, + "logps/chosen": -175.34674072265625, + "logps/rejected": -175.34674072265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.592607498168945, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.592607498168945, + "step": 393 + }, + { + "epoch": 0.27186475763325857, + "grad_norm": 0.3654704689979553, + "learning_rate": 7.550785741663473e-07, + "logits/chosen": 3.862172842025757, + "logits/rejected": 3.9497711658477783, + "logps/chosen": -180.20523071289062, + "logps/rejected": -188.46975708007812, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.128347396850586, + "rewards/margins": 0.7767962217330933, + "rewards/rejected": -13.905143737792969, + "step": 394 + }, + { + "epoch": 0.2725547697084699, + "grad_norm": 3.893416404724121, + "learning_rate": 7.569950172479879e-07, + "logits/chosen": 3.5665619373321533, + "logits/rejected": 3.775024175643921, + "logps/chosen": -169.215576171875, + "logps/rejected": -183.22708129882812, + "loss": 0.5341, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.021819114685059, + "rewards/margins": 1.4210344552993774, + "rewards/rejected": -13.442852973937988, + "step": 395 + }, + { + "epoch": 0.2732447817836812, + "grad_norm": 0.334616094827652, + "learning_rate": 7.589114603296283e-07, + "logits/chosen": 4.1046953201293945, + "logits/rejected": 4.1046953201293945, + "logps/chosen": -172.56460571289062, + "logps/rejected": -172.56460571289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.464946746826172, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.464946746826172, + "step": 396 + }, + { + "epoch": 0.2739347938588925, + "grad_norm": 0.30833491683006287, + "learning_rate": 7.608279034112688e-07, + "logits/chosen": 3.370845317840576, + "logits/rejected": 3.4977645874023438, + "logps/chosen": -155.735595703125, + "logps/rejected": -168.195556640625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.590903282165527, + "rewards/margins": 1.2660421133041382, + "rewards/rejected": -11.856945037841797, + "step": 397 + }, + { + "epoch": 0.2746248059341038, + "grad_norm": 0.3137180209159851, + "learning_rate": 7.627443464929092e-07, + "logits/chosen": 3.6115903854370117, + "logits/rejected": 3.8256585597991943, + "logps/chosen": -168.266845703125, + "logps/rejected": -191.41693115234375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.018692016601562, + "rewards/margins": 2.3863401412963867, + "rewards/rejected": -14.405031204223633, + "step": 398 + }, + { + "epoch": 0.2753148180093152, + "grad_norm": 0.3493081331253052, + "learning_rate": 7.646607895745498e-07, + "logits/chosen": 3.560271978378296, + "logits/rejected": 3.560271978378296, + "logps/chosen": -167.83712768554688, + "logps/rejected": -167.83712768554688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.071463584899902, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.071463584899902, + "step": 399 + }, + { + "epoch": 0.2760048300845265, + "grad_norm": 25.052215576171875, + "learning_rate": 7.665772326561902e-07, + "logits/chosen": 4.043935775756836, + "logits/rejected": 4.052703857421875, + "logps/chosen": -171.7134552001953, + "logps/rejected": -174.95550537109375, + "loss": 1.0825, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.453071594238281, + "rewards/margins": 0.2652280926704407, + "rewards/rejected": -12.718299865722656, + "step": 400 + }, + { + "epoch": 0.2766948421597378, + "grad_norm": 0.31975147128105164, + "learning_rate": 7.684936757378307e-07, + "logits/chosen": 3.913759231567383, + "logits/rejected": 3.996163845062256, + "logps/chosen": -173.54598999023438, + "logps/rejected": -186.71469116210938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.790035247802734, + "rewards/margins": 1.2548233270645142, + "rewards/rejected": -14.044858932495117, + "step": 401 + }, + { + "epoch": 0.27738485423494913, + "grad_norm": 0.40145373344421387, + "learning_rate": 7.704101188194712e-07, + "logits/chosen": 3.481679916381836, + "logits/rejected": 3.7746775150299072, + "logps/chosen": -149.52471923828125, + "logps/rejected": -178.13546752929688, + "loss": 0.4342, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.190374374389648, + "rewards/margins": 2.9258201122283936, + "rewards/rejected": -13.116194725036621, + "step": 402 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 5.876901626586914, + "learning_rate": 7.723265619011117e-07, + "logits/chosen": 3.5036940574645996, + "logits/rejected": 3.6960251331329346, + "logps/chosen": -147.3011474609375, + "logps/rejected": -152.74169921875, + "loss": 0.5842, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.925323486328125, + "rewards/margins": 0.5512721538543701, + "rewards/rejected": -10.47659683227539, + "step": 403 + }, + { + "epoch": 0.27876487838537173, + "grad_norm": 0.33862757682800293, + "learning_rate": 7.74243004982752e-07, + "logits/chosen": 3.9915928840637207, + "logits/rejected": 4.15606164932251, + "logps/chosen": -167.51902770996094, + "logps/rejected": -177.5215301513672, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.939374923706055, + "rewards/margins": 0.9752672910690308, + "rewards/rejected": -12.914642333984375, + "step": 404 + }, + { + "epoch": 0.2794548904605831, + "grad_norm": 1.8417508602142334, + "learning_rate": 7.761594480643927e-07, + "logits/chosen": 3.6251280307769775, + "logits/rejected": 3.8785650730133057, + "logps/chosen": -172.31735229492188, + "logps/rejected": -189.62423706054688, + "loss": 0.5362, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.480026245117188, + "rewards/margins": 1.626145839691162, + "rewards/rejected": -14.106171607971191, + "step": 405 + }, + { + "epoch": 0.2801449025357944, + "grad_norm": 0.4554441273212433, + "learning_rate": 7.78075891146033e-07, + "logits/chosen": 3.1447689533233643, + "logits/rejected": 3.194065809249878, + "logps/chosen": -141.99278259277344, + "logps/rejected": -156.1993865966797, + "loss": 0.5227, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.390871047973633, + "rewards/margins": 1.4588910341262817, + "rewards/rejected": -10.849762916564941, + "step": 406 + }, + { + "epoch": 0.2808349146110057, + "grad_norm": 0.4621082544326782, + "learning_rate": 7.799923342276735e-07, + "logits/chosen": 3.9194297790527344, + "logits/rejected": 4.0870161056518555, + "logps/chosen": -173.38558959960938, + "logps/rejected": -183.46873474121094, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.502991676330566, + "rewards/margins": 0.9859420657157898, + "rewards/rejected": -13.488933563232422, + "step": 407 + }, + { + "epoch": 0.28152492668621704, + "grad_norm": 1.1453479528427124, + "learning_rate": 7.819087773093139e-07, + "logits/chosen": 3.4293439388275146, + "logits/rejected": 3.5688982009887695, + "logps/chosen": -152.05984497070312, + "logps/rejected": -177.01412963867188, + "loss": 0.4401, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.51427173614502, + "rewards/margins": 2.56581974029541, + "rewards/rejected": -13.080092430114746, + "step": 408 + }, + { + "epoch": 0.28221493876142834, + "grad_norm": 0.40608707070350647, + "learning_rate": 7.838252203909545e-07, + "logits/chosen": 3.6454594135284424, + "logits/rejected": 3.7638607025146484, + "logps/chosen": -170.1895294189453, + "logps/rejected": -180.97329711914062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.212310791015625, + "rewards/margins": 1.1006046533584595, + "rewards/rejected": -13.312915802001953, + "step": 409 + }, + { + "epoch": 0.28290495083663963, + "grad_norm": 0.3967089354991913, + "learning_rate": 7.857416634725949e-07, + "logits/chosen": 3.4365997314453125, + "logits/rejected": 3.4365997314453125, + "logps/chosen": -165.90451049804688, + "logps/rejected": -165.90451049804688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.746734619140625, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -11.746734619140625, + "step": 410 + }, + { + "epoch": 0.28359496291185093, + "grad_norm": 0.865945041179657, + "learning_rate": 7.876581065542354e-07, + "logits/chosen": 3.938699722290039, + "logits/rejected": 4.064796447753906, + "logps/chosen": -167.89205932617188, + "logps/rejected": -172.07945251464844, + "loss": 0.6106, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.042464256286621, + "rewards/margins": 0.424197256565094, + "rewards/rejected": -12.466662406921387, + "step": 411 + }, + { + "epoch": 0.2842849749870623, + "grad_norm": 4.441404342651367, + "learning_rate": 7.895745496358759e-07, + "logits/chosen": 3.6361083984375, + "logits/rejected": 3.627289295196533, + "logps/chosen": -179.94012451171875, + "logps/rejected": -182.1240234375, + "loss": 0.6318, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.326696395874023, + "rewards/margins": 0.1866164207458496, + "rewards/rejected": -13.513313293457031, + "step": 412 + }, + { + "epoch": 0.2849749870622736, + "grad_norm": 0.3000492453575134, + "learning_rate": 7.914909927175164e-07, + "logits/chosen": 4.083874225616455, + "logits/rejected": 4.554045677185059, + "logps/chosen": -171.4708709716797, + "logps/rejected": -194.5548858642578, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.270051002502441, + "rewards/margins": 2.2896034717559814, + "rewards/rejected": -14.55965518951416, + "step": 413 + }, + { + "epoch": 0.2856649991374849, + "grad_norm": 0.35031992197036743, + "learning_rate": 7.934074357991568e-07, + "logits/chosen": 4.040624618530273, + "logits/rejected": 4.108983993530273, + "logps/chosen": -176.6506805419922, + "logps/rejected": -191.1684112548828, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.983453750610352, + "rewards/margins": 1.394963264465332, + "rewards/rejected": -14.37841796875, + "step": 414 + }, + { + "epoch": 0.28635501121269624, + "grad_norm": 0.34213608503341675, + "learning_rate": 7.953238788807974e-07, + "logits/chosen": 3.813673496246338, + "logits/rejected": 4.057733058929443, + "logps/chosen": -154.9088134765625, + "logps/rejected": -168.74122619628906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.919805526733398, + "rewards/margins": 1.4256341457366943, + "rewards/rejected": -12.345439910888672, + "step": 415 + }, + { + "epoch": 0.28704502328790754, + "grad_norm": 0.35047027468681335, + "learning_rate": 7.972403219624378e-07, + "logits/chosen": 3.721407651901245, + "logits/rejected": 4.067727088928223, + "logps/chosen": -157.5092315673828, + "logps/rejected": -175.8169708251953, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.966296195983887, + "rewards/margins": 1.8431568145751953, + "rewards/rejected": -12.809453964233398, + "step": 416 + }, + { + "epoch": 0.28773503536311884, + "grad_norm": 0.418536514043808, + "learning_rate": 7.991567650440783e-07, + "logits/chosen": 3.8082587718963623, + "logits/rejected": 3.8082587718963623, + "logps/chosen": -184.20608520507812, + "logps/rejected": -184.20608520507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.600580215454102, + "rewards/margins": 0.0, + "rewards/rejected": -13.600580215454102, + "step": 417 + }, + { + "epoch": 0.2884250474383302, + "grad_norm": 0.33454230427742004, + "learning_rate": 8.010732081257187e-07, + "logits/chosen": 4.102786064147949, + "logits/rejected": 4.214009761810303, + "logps/chosen": -182.92474365234375, + "logps/rejected": -194.23062133789062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.6318359375, + "rewards/margins": 1.1461716890335083, + "rewards/rejected": -14.778007507324219, + "step": 418 + }, + { + "epoch": 0.2891150595135415, + "grad_norm": 0.3455432653427124, + "learning_rate": 8.029896512073591e-07, + "logits/chosen": 3.9977009296417236, + "logits/rejected": 4.151537895202637, + "logps/chosen": -183.7301025390625, + "logps/rejected": -191.27757263183594, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.47378158569336, + "rewards/margins": 0.7719489932060242, + "rewards/rejected": -14.245731353759766, + "step": 419 + }, + { + "epoch": 0.2898050715887528, + "grad_norm": 0.5241991877555847, + "learning_rate": 8.049060942889997e-07, + "logits/chosen": 3.9327239990234375, + "logits/rejected": 4.008800506591797, + "logps/chosen": -175.43788146972656, + "logps/rejected": -180.38705444335938, + "loss": 0.6092, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.797417640686035, + "rewards/margins": 0.47972726821899414, + "rewards/rejected": -13.277145385742188, + "step": 420 + }, + { + "epoch": 0.2904950836639641, + "grad_norm": 0.3597410023212433, + "learning_rate": 8.068225373706401e-07, + "logits/chosen": 3.5360536575317383, + "logits/rejected": 3.5360536575317383, + "logps/chosen": -160.01324462890625, + "logps/rejected": -160.01324462890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.371909141540527, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.371909141540527, + "step": 421 + }, + { + "epoch": 0.29118509573917545, + "grad_norm": 9.158596992492676, + "learning_rate": 8.087389804522807e-07, + "logits/chosen": 3.5437164306640625, + "logits/rejected": 3.722358465194702, + "logps/chosen": -166.5471649169922, + "logps/rejected": -184.05133056640625, + "loss": 0.6363, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.8745698928833, + "rewards/margins": 1.663257122039795, + "rewards/rejected": -13.53782844543457, + "step": 422 + }, + { + "epoch": 0.29187510781438675, + "grad_norm": 0.3181624710559845, + "learning_rate": 8.106554235339211e-07, + "logits/chosen": 3.641280174255371, + "logits/rejected": 3.9232583045959473, + "logps/chosen": -163.8832244873047, + "logps/rejected": -186.84230041503906, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.62736701965332, + "rewards/margins": 2.2309775352478027, + "rewards/rejected": -13.858343124389648, + "step": 423 + }, + { + "epoch": 0.29256511988959805, + "grad_norm": 0.3305222690105438, + "learning_rate": 8.125718666155616e-07, + "logits/chosen": 3.5896522998809814, + "logits/rejected": 3.887694835662842, + "logps/chosen": -168.9004669189453, + "logps/rejected": -188.85427856445312, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.068543434143066, + "rewards/margins": 2.0072622299194336, + "rewards/rejected": -14.0758056640625, + "step": 424 + }, + { + "epoch": 0.2932551319648094, + "grad_norm": 0.44474735856056213, + "learning_rate": 8.14488309697202e-07, + "logits/chosen": 4.018592834472656, + "logits/rejected": 4.018592834472656, + "logps/chosen": -180.38589477539062, + "logps/rejected": -180.38589477539062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.183731079101562, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.183731079101562, + "step": 425 + }, + { + "epoch": 0.2939451440400207, + "grad_norm": 0.49944692850112915, + "learning_rate": 8.164047527788426e-07, + "logits/chosen": 4.0956268310546875, + "logits/rejected": 4.0956268310546875, + "logps/chosen": -179.44473266601562, + "logps/rejected": -179.44473266601562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.263749122619629, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.263749122619629, + "step": 426 + }, + { + "epoch": 0.294635156115232, + "grad_norm": 0.4259551465511322, + "learning_rate": 8.18321195860483e-07, + "logits/chosen": 4.26863956451416, + "logits/rejected": 4.238530158996582, + "logps/chosen": -188.85458374023438, + "logps/rejected": -194.39234924316406, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.050151824951172, + "rewards/margins": 0.6032569408416748, + "rewards/rejected": -14.65340805053711, + "step": 427 + }, + { + "epoch": 0.29532516819044335, + "grad_norm": 0.4685962498188019, + "learning_rate": 8.202376389421235e-07, + "logits/chosen": 3.8612122535705566, + "logits/rejected": 4.1115641593933105, + "logps/chosen": -165.7115936279297, + "logps/rejected": -183.24850463867188, + "loss": 0.5228, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.108572959899902, + "rewards/margins": 1.6384296417236328, + "rewards/rejected": -13.747001647949219, + "step": 428 + }, + { + "epoch": 0.29601518026565465, + "grad_norm": 4.244957447052002, + "learning_rate": 8.221540820237639e-07, + "logits/chosen": 3.4439072608947754, + "logits/rejected": 3.8514065742492676, + "logps/chosen": -157.97105407714844, + "logps/rejected": -187.9083251953125, + "loss": 0.3707, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.081328392028809, + "rewards/margins": 3.025670289993286, + "rewards/rejected": -14.106998443603516, + "step": 429 + }, + { + "epoch": 0.29670519234086595, + "grad_norm": 5.443413734436035, + "learning_rate": 8.240705251054045e-07, + "logits/chosen": 3.5883913040161133, + "logits/rejected": 3.697219133377075, + "logps/chosen": -155.95391845703125, + "logps/rejected": -170.40061950683594, + "loss": 0.4672, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.871879577636719, + "rewards/margins": 1.390368938446045, + "rewards/rejected": -12.262248039245605, + "step": 430 + }, + { + "epoch": 0.2973952044160773, + "grad_norm": 2.041261911392212, + "learning_rate": 8.259869681870448e-07, + "logits/chosen": 3.7933382987976074, + "logits/rejected": 3.728692054748535, + "logps/chosen": -166.6956329345703, + "logps/rejected": -177.8508758544922, + "loss": 0.5388, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.958745002746582, + "rewards/margins": 1.1221373081207275, + "rewards/rejected": -13.080883026123047, + "step": 431 + }, + { + "epoch": 0.2980852164912886, + "grad_norm": 0.3159251809120178, + "learning_rate": 8.279034112686854e-07, + "logits/chosen": 3.5007150173187256, + "logits/rejected": 3.6329092979431152, + "logps/chosen": -156.1820831298828, + "logps/rejected": -168.917724609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.02535629272461, + "rewards/margins": 1.2694885730743408, + "rewards/rejected": -12.294844627380371, + "step": 432 + }, + { + "epoch": 0.2987752285664999, + "grad_norm": 0.31752029061317444, + "learning_rate": 8.298198543503258e-07, + "logits/chosen": 3.9216086864471436, + "logits/rejected": 3.9216086864471436, + "logps/chosen": -184.3501434326172, + "logps/rejected": -184.3501434326172, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.622980117797852, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.622980117797852, + "step": 433 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 10.617057800292969, + "learning_rate": 8.317362974319663e-07, + "logits/chosen": 3.951622724533081, + "logits/rejected": 3.9540224075317383, + "logps/chosen": -182.30857849121094, + "logps/rejected": -182.32894897460938, + "loss": 0.6989, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.569533348083496, + "rewards/margins": -0.011306285858154297, + "rewards/rejected": -13.558226585388184, + "step": 434 + }, + { + "epoch": 0.30015525271692256, + "grad_norm": 9.46429443359375, + "learning_rate": 8.336527405136067e-07, + "logits/chosen": 3.6538548469543457, + "logits/rejected": 3.9639668464660645, + "logps/chosen": -170.8252716064453, + "logps/rejected": -184.811767578125, + "loss": 0.5254, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.33367919921875, + "rewards/margins": 1.404916524887085, + "rewards/rejected": -13.738597869873047, + "step": 435 + }, + { + "epoch": 0.30084526479213386, + "grad_norm": 0.48688289523124695, + "learning_rate": 8.355691835952473e-07, + "logits/chosen": 3.527163028717041, + "logits/rejected": 3.6086184978485107, + "logps/chosen": -152.93386840820312, + "logps/rejected": -169.39596557617188, + "loss": 0.524, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.580015182495117, + "rewards/margins": 1.5648045539855957, + "rewards/rejected": -12.144819259643555, + "step": 436 + }, + { + "epoch": 0.30153527686734516, + "grad_norm": 0.351779967546463, + "learning_rate": 8.374856266768877e-07, + "logits/chosen": 3.8634605407714844, + "logits/rejected": 4.027810096740723, + "logps/chosen": -171.96987915039062, + "logps/rejected": -186.8408203125, + "loss": 0.5213, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.319756507873535, + "rewards/margins": 1.461796522140503, + "rewards/rejected": -13.781554222106934, + "step": 437 + }, + { + "epoch": 0.3022252889425565, + "grad_norm": 0.4355643391609192, + "learning_rate": 8.394020697585282e-07, + "logits/chosen": 3.625242233276367, + "logits/rejected": 3.625242233276367, + "logps/chosen": -166.7467041015625, + "logps/rejected": -166.7467041015625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.05560302734375, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.05560302734375, + "step": 438 + }, + { + "epoch": 0.3029153010177678, + "grad_norm": 0.46183186769485474, + "learning_rate": 8.413185128401686e-07, + "logits/chosen": 4.029069423675537, + "logits/rejected": 4.029069423675537, + "logps/chosen": -191.1929473876953, + "logps/rejected": -191.1929473876953, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.345914840698242, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -14.345916748046875, + "step": 439 + }, + { + "epoch": 0.3036053130929791, + "grad_norm": 0.3686051368713379, + "learning_rate": 8.432349559218092e-07, + "logits/chosen": 3.6245343685150146, + "logits/rejected": 3.7211403846740723, + "logps/chosen": -158.63653564453125, + "logps/rejected": -168.17596435546875, + "loss": 0.6065, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.218716621398926, + "rewards/margins": 1.0470235347747803, + "rewards/rejected": -12.265739440917969, + "step": 440 + }, + { + "epoch": 0.30429532516819047, + "grad_norm": 0.39967629313468933, + "learning_rate": 8.451513990034496e-07, + "logits/chosen": 3.920659303665161, + "logits/rejected": 3.920659303665161, + "logps/chosen": -179.16419982910156, + "logps/rejected": -179.16419982910156, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.19225025177002, + "rewards/margins": 0.0, + "rewards/rejected": -13.19225025177002, + "step": 441 + }, + { + "epoch": 0.30498533724340177, + "grad_norm": 0.37778836488723755, + "learning_rate": 8.470678420850902e-07, + "logits/chosen": 3.594984531402588, + "logits/rejected": 3.7607994079589844, + "logps/chosen": -150.0142822265625, + "logps/rejected": -161.5752716064453, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.276800155639648, + "rewards/margins": 1.2056212425231934, + "rewards/rejected": -11.482421875, + "step": 442 + }, + { + "epoch": 0.30567534931861307, + "grad_norm": 0.7186985611915588, + "learning_rate": 8.489842851667306e-07, + "logits/chosen": 3.783421277999878, + "logits/rejected": 3.783421277999878, + "logps/chosen": -160.66464233398438, + "logps/rejected": -160.66464233398438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.48109245300293, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.481094360351562, + "step": 443 + }, + { + "epoch": 0.30636536139382436, + "grad_norm": 0.3635159730911255, + "learning_rate": 8.509007282483711e-07, + "logits/chosen": 3.987380266189575, + "logits/rejected": 3.987380266189575, + "logps/chosen": -184.00157165527344, + "logps/rejected": -184.00157165527344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.683601379394531, + "rewards/margins": 0.0, + "rewards/rejected": -13.683601379394531, + "step": 444 + }, + { + "epoch": 0.3070553734690357, + "grad_norm": 0.6535800099372864, + "learning_rate": 8.528171713300115e-07, + "logits/chosen": 3.77323579788208, + "logits/rejected": 3.763216018676758, + "logps/chosen": -160.34207153320312, + "logps/rejected": -164.86367797851562, + "loss": 0.6099, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.362573623657227, + "rewards/margins": 0.4500228762626648, + "rewards/rejected": -11.812597274780273, + "step": 445 + }, + { + "epoch": 0.307745385544247, + "grad_norm": 4.838842868804932, + "learning_rate": 8.547336144116521e-07, + "logits/chosen": 3.8310344219207764, + "logits/rejected": 3.8315207958221436, + "logps/chosen": -167.01150512695312, + "logps/rejected": -174.2558135986328, + "loss": 0.5886, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.763284683227539, + "rewards/margins": 0.7681523561477661, + "rewards/rejected": -12.531436920166016, + "step": 446 + }, + { + "epoch": 0.3084353976194583, + "grad_norm": 8.466703414916992, + "learning_rate": 8.566500574932925e-07, + "logits/chosen": 3.466364860534668, + "logits/rejected": 3.637242317199707, + "logps/chosen": -150.062744140625, + "logps/rejected": -154.00025939941406, + "loss": 0.6091, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.22714900970459, + "rewards/margins": 0.44315433502197266, + "rewards/rejected": -10.670303344726562, + "step": 447 + }, + { + "epoch": 0.3091254096946697, + "grad_norm": 0.3093945384025574, + "learning_rate": 8.58566500574933e-07, + "logits/chosen": 3.8418707847595215, + "logits/rejected": 3.8418707847595215, + "logps/chosen": -180.0433349609375, + "logps/rejected": -180.0433349609375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.129069328308105, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.129068374633789, + "step": 448 + }, + { + "epoch": 0.30981542176988097, + "grad_norm": 0.37579238414764404, + "learning_rate": 8.604829436565734e-07, + "logits/chosen": 4.174118995666504, + "logits/rejected": 4.174118995666504, + "logps/chosen": -199.8577880859375, + "logps/rejected": -199.8577880859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -15.142780303955078, + "rewards/margins": 0.0, + "rewards/rejected": -15.142780303955078, + "step": 449 + }, + { + "epoch": 0.31050543384509227, + "grad_norm": 0.3580400049686432, + "learning_rate": 8.62399386738214e-07, + "logits/chosen": 3.847107172012329, + "logits/rejected": 3.9632184505462646, + "logps/chosen": -148.798095703125, + "logps/rejected": -161.31576538085938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.205623626708984, + "rewards/margins": 1.1408195495605469, + "rewards/rejected": -11.346443176269531, + "step": 450 + }, + { + "epoch": 0.3111954459203036, + "grad_norm": 0.5398547053337097, + "learning_rate": 8.643158298198544e-07, + "logits/chosen": 3.778838634490967, + "logits/rejected": 3.824795722961426, + "logps/chosen": -162.79434204101562, + "logps/rejected": -174.4092254638672, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.40732192993164, + "rewards/margins": 1.1303176879882812, + "rewards/rejected": -12.537639617919922, + "step": 451 + }, + { + "epoch": 0.3118854579955149, + "grad_norm": 0.37363800406455994, + "learning_rate": 8.662322729014949e-07, + "logits/chosen": 4.221747398376465, + "logits/rejected": 4.221747398376465, + "logps/chosen": -195.60589599609375, + "logps/rejected": -195.60589599609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.707477569580078, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -14.707476615905762, + "step": 452 + }, + { + "epoch": 0.3125754700707262, + "grad_norm": 3.2768213748931885, + "learning_rate": 8.681487159831354e-07, + "logits/chosen": 3.958493232727051, + "logits/rejected": 3.9392690658569336, + "logps/chosen": -178.23268127441406, + "logps/rejected": -180.46022033691406, + "loss": 0.6271, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.004180908203125, + "rewards/margins": 0.21525323390960693, + "rewards/rejected": -13.219432830810547, + "step": 453 + }, + { + "epoch": 0.3132654821459376, + "grad_norm": 0.30087023973464966, + "learning_rate": 8.700651590647759e-07, + "logits/chosen": 3.854853391647339, + "logits/rejected": 3.983055591583252, + "logps/chosen": -171.3144073486328, + "logps/rejected": -191.50753784179688, + "loss": 0.5205, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.328794479370117, + "rewards/margins": 2.0144217014312744, + "rewards/rejected": -14.343215942382812, + "step": 454 + }, + { + "epoch": 0.3139554942211489, + "grad_norm": 0.4331170320510864, + "learning_rate": 8.719816021464163e-07, + "logits/chosen": 3.6762168407440186, + "logits/rejected": 3.6762168407440186, + "logps/chosen": -166.2198944091797, + "logps/rejected": -166.2198944091797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.881983757019043, + "rewards/margins": 0.0, + "rewards/rejected": -11.881983757019043, + "step": 455 + }, + { + "epoch": 0.3146455062963602, + "grad_norm": 0.3192439079284668, + "learning_rate": 8.738980452280569e-07, + "logits/chosen": 3.4511590003967285, + "logits/rejected": 3.6132380962371826, + "logps/chosen": -164.76617431640625, + "logps/rejected": -186.89425659179688, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.663979530334473, + "rewards/margins": 2.2244889736175537, + "rewards/rejected": -13.888467788696289, + "step": 456 + }, + { + "epoch": 0.3153355183715715, + "grad_norm": 5.078715801239014, + "learning_rate": 8.758144883096972e-07, + "logits/chosen": 3.3718366622924805, + "logits/rejected": 3.813337802886963, + "logps/chosen": -145.41738891601562, + "logps/rejected": -183.7446746826172, + "loss": 0.5344, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.699451446533203, + "rewards/margins": 3.9991390705108643, + "rewards/rejected": -13.698589324951172, + "step": 457 + }, + { + "epoch": 0.31602553044678283, + "grad_norm": 0.3901301920413971, + "learning_rate": 8.777309313913377e-07, + "logits/chosen": 3.715066432952881, + "logits/rejected": 3.715066432952881, + "logps/chosen": -162.81378173828125, + "logps/rejected": -162.81378173828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.408440589904785, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.408440589904785, + "step": 458 + }, + { + "epoch": 0.31671554252199413, + "grad_norm": 0.2930220663547516, + "learning_rate": 8.796473744729781e-07, + "logits/chosen": 3.7427940368652344, + "logits/rejected": 3.9809353351593018, + "logps/chosen": -173.77943420410156, + "logps/rejected": -193.26809692382812, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.570722579956055, + "rewards/margins": 1.9901998043060303, + "rewards/rejected": -14.560922622680664, + "step": 459 + }, + { + "epoch": 0.31740555459720543, + "grad_norm": 0.7104183435440063, + "learning_rate": 8.815638175546187e-07, + "logits/chosen": 4.2465362548828125, + "logits/rejected": 4.201155662536621, + "logps/chosen": -170.56900024414062, + "logps/rejected": -174.9755401611328, + "loss": 0.6109, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.332071304321289, + "rewards/margins": 0.41481781005859375, + "rewards/rejected": -12.746889114379883, + "step": 460 + }, + { + "epoch": 0.3180955666724168, + "grad_norm": 0.361198365688324, + "learning_rate": 8.834802606362591e-07, + "logits/chosen": 3.76938533782959, + "logits/rejected": 3.76938533782959, + "logps/chosen": -193.20675659179688, + "logps/rejected": -193.20675659179688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.570075035095215, + "rewards/margins": 0.0, + "rewards/rejected": -14.570075035095215, + "step": 461 + }, + { + "epoch": 0.3187855787476281, + "grad_norm": 0.38201162219047546, + "learning_rate": 8.853967037178996e-07, + "logits/chosen": 3.7443883419036865, + "logits/rejected": 4.116636276245117, + "logps/chosen": -167.1071319580078, + "logps/rejected": -192.78863525390625, + "loss": 0.4349, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.942486763000488, + "rewards/margins": 2.5596487522125244, + "rewards/rejected": -14.50213623046875, + "step": 462 + }, + { + "epoch": 0.3194755908228394, + "grad_norm": 0.36138200759887695, + "learning_rate": 8.873131467995401e-07, + "logits/chosen": 3.940702438354492, + "logits/rejected": 3.940702438354492, + "logps/chosen": -183.64224243164062, + "logps/rejected": -183.64224243164062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.366179466247559, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.366178512573242, + "step": 463 + }, + { + "epoch": 0.32016560289805074, + "grad_norm": 0.37595903873443604, + "learning_rate": 8.892295898811806e-07, + "logits/chosen": 3.825613260269165, + "logits/rejected": 3.85601806640625, + "logps/chosen": -142.83433532714844, + "logps/rejected": -157.27157592773438, + "loss": 0.522, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.624157905578613, + "rewards/margins": 1.4185354709625244, + "rewards/rejected": -11.042694091796875, + "step": 464 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 0.4700366258621216, + "learning_rate": 8.91146032962821e-07, + "logits/chosen": 4.06103515625, + "logits/rejected": 4.132356643676758, + "logps/chosen": -182.41748046875, + "logps/rejected": -186.9449462890625, + "loss": 0.6085, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.35806655883789, + "rewards/margins": 0.5169470310211182, + "rewards/rejected": -13.875014305114746, + "step": 465 + }, + { + "epoch": 0.32154562704847334, + "grad_norm": 1.768505573272705, + "learning_rate": 8.930624760444616e-07, + "logits/chosen": 4.062967777252197, + "logits/rejected": 4.039809703826904, + "logps/chosen": -178.21160888671875, + "logps/rejected": -180.23428344726562, + "loss": 0.6246, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.061600685119629, + "rewards/margins": 0.2320576310157776, + "rewards/rejected": -13.293658256530762, + "step": 466 + }, + { + "epoch": 0.3222356391236847, + "grad_norm": 0.4022159278392792, + "learning_rate": 8.94978919126102e-07, + "logits/chosen": 3.709223985671997, + "logits/rejected": 3.709223985671997, + "logps/chosen": -168.7550811767578, + "logps/rejected": -168.7550811767578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.033642768859863, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.033642768859863, + "step": 467 + }, + { + "epoch": 0.322925651198896, + "grad_norm": 0.34295007586479187, + "learning_rate": 8.968953622077425e-07, + "logits/chosen": 3.769246816635132, + "logits/rejected": 3.8792035579681396, + "logps/chosen": -141.3653106689453, + "logps/rejected": -155.45616149902344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.429818153381348, + "rewards/margins": 1.3768081665039062, + "rewards/rejected": -10.806625366210938, + "step": 468 + }, + { + "epoch": 0.3236156632741073, + "grad_norm": 0.30045053362846375, + "learning_rate": 8.988118052893829e-07, + "logits/chosen": 4.134428024291992, + "logits/rejected": 4.134428024291992, + "logps/chosen": -171.51910400390625, + "logps/rejected": -171.51910400390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.371219635009766, + "rewards/margins": 0.0, + "rewards/rejected": -12.371219635009766, + "step": 469 + }, + { + "epoch": 0.3243056753493186, + "grad_norm": 19.066287994384766, + "learning_rate": 9.007282483710235e-07, + "logits/chosen": 3.6895358562469482, + "logits/rejected": 3.662863254547119, + "logps/chosen": -172.12008666992188, + "logps/rejected": -170.67752075195312, + "loss": 0.7625, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.317506790161133, + "rewards/margins": -0.11365008354187012, + "rewards/rejected": -12.203857421875, + "step": 470 + }, + { + "epoch": 0.32499568742452994, + "grad_norm": 0.39620038866996765, + "learning_rate": 9.026446914526639e-07, + "logits/chosen": 3.81211256980896, + "logits/rejected": 4.064940452575684, + "logps/chosen": -169.09298706054688, + "logps/rejected": -189.66250610351562, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.136176109313965, + "rewards/margins": 2.0972023010253906, + "rewards/rejected": -14.233378410339355, + "step": 471 + }, + { + "epoch": 0.32568569949974124, + "grad_norm": 4.591740608215332, + "learning_rate": 9.045611345343044e-07, + "logits/chosen": 3.9535956382751465, + "logits/rejected": 4.01717472076416, + "logps/chosen": -182.94598388671875, + "logps/rejected": -184.36587524414062, + "loss": 0.6371, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.463356018066406, + "rewards/margins": 0.16041189432144165, + "rewards/rejected": -13.623767852783203, + "step": 472 + }, + { + "epoch": 0.32637571157495254, + "grad_norm": 14.565522193908691, + "learning_rate": 9.064775776159449e-07, + "logits/chosen": 3.9299135208129883, + "logits/rejected": 3.966947555541992, + "logps/chosen": -173.96939086914062, + "logps/rejected": -173.9849853515625, + "loss": 1.3559, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.665522575378418, + "rewards/margins": -0.0015709400177001953, + "rewards/rejected": -12.663952827453613, + "step": 473 + }, + { + "epoch": 0.3270657236501639, + "grad_norm": 0.4049626588821411, + "learning_rate": 9.083940206975854e-07, + "logits/chosen": 3.7228503227233887, + "logits/rejected": 3.7228503227233887, + "logps/chosen": -178.87351989746094, + "logps/rejected": -178.87351989746094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.157669067382812, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.157669067382812, + "step": 474 + }, + { + "epoch": 0.3277557357253752, + "grad_norm": 0.4928308427333832, + "learning_rate": 9.103104637792258e-07, + "logits/chosen": 3.8173818588256836, + "logits/rejected": 3.970593214035034, + "logps/chosen": -165.3936767578125, + "logps/rejected": -188.496337890625, + "loss": 0.4383, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.528192520141602, + "rewards/margins": 2.3084678649902344, + "rewards/rejected": -13.836660385131836, + "step": 475 + }, + { + "epoch": 0.3284457478005865, + "grad_norm": 0.39523354172706604, + "learning_rate": 9.122269068608664e-07, + "logits/chosen": 4.014082908630371, + "logits/rejected": 4.014082908630371, + "logps/chosen": -192.44442749023438, + "logps/rejected": -192.44442749023438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.361835479736328, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.361836433410645, + "step": 476 + }, + { + "epoch": 0.32913575987579785, + "grad_norm": 2.2991597652435303, + "learning_rate": 9.141433499425068e-07, + "logits/chosen": 3.583216428756714, + "logits/rejected": 3.7064104080200195, + "logps/chosen": -151.12779235839844, + "logps/rejected": -170.82847595214844, + "loss": 0.5574, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.297471046447754, + "rewards/margins": 1.9760534763336182, + "rewards/rejected": -12.27352523803711, + "step": 477 + }, + { + "epoch": 0.32982577195100915, + "grad_norm": 0.29616889357566833, + "learning_rate": 9.160597930241473e-07, + "logits/chosen": 4.062515735626221, + "logits/rejected": 4.062515735626221, + "logps/chosen": -189.22271728515625, + "logps/rejected": -189.22268676757812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.068138122558594, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.068138122558594, + "step": 478 + }, + { + "epoch": 0.33051578402622045, + "grad_norm": 0.43108272552490234, + "learning_rate": 9.179762361057877e-07, + "logits/chosen": 4.167949199676514, + "logits/rejected": 4.168141841888428, + "logps/chosen": -175.910400390625, + "logps/rejected": -183.91477966308594, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.78024673461914, + "rewards/margins": 0.8246902227401733, + "rewards/rejected": -13.604936599731445, + "step": 479 + }, + { + "epoch": 0.33120579610143175, + "grad_norm": 0.46282488107681274, + "learning_rate": 9.198926791874283e-07, + "logits/chosen": 3.9844064712524414, + "logits/rejected": 3.991516590118408, + "logps/chosen": -189.94204711914062, + "logps/rejected": -194.15115356445312, + "loss": 0.6095, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.105359077453613, + "rewards/margins": 0.463703453540802, + "rewards/rejected": -14.569062232971191, + "step": 480 + }, + { + "epoch": 0.3318958081766431, + "grad_norm": 10.81605339050293, + "learning_rate": 9.218091222690687e-07, + "logits/chosen": 3.677651882171631, + "logits/rejected": 3.714024066925049, + "logps/chosen": -153.4302978515625, + "logps/rejected": -153.59661865234375, + "loss": 0.6804, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.661284446716309, + "rewards/margins": 0.02699732780456543, + "rewards/rejected": -10.688282012939453, + "step": 481 + }, + { + "epoch": 0.3325858202518544, + "grad_norm": 0.32217687368392944, + "learning_rate": 9.237255653507092e-07, + "logits/chosen": 4.042904376983643, + "logits/rejected": 4.073591709136963, + "logps/chosen": -174.11473083496094, + "logps/rejected": -194.1980438232422, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.663265228271484, + "rewards/margins": 2.0741372108459473, + "rewards/rejected": -14.737403869628906, + "step": 482 + }, + { + "epoch": 0.3332758323270657, + "grad_norm": 0.32139715552330017, + "learning_rate": 9.256420084323497e-07, + "logits/chosen": 3.9048361778259277, + "logits/rejected": 3.9694533348083496, + "logps/chosen": -178.4253692626953, + "logps/rejected": -185.56024169921875, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.957588195800781, + "rewards/margins": 0.7598334550857544, + "rewards/rejected": -13.717421531677246, + "step": 483 + }, + { + "epoch": 0.33396584440227706, + "grad_norm": 0.5299174189567566, + "learning_rate": 9.275584515139901e-07, + "logits/chosen": 3.535892963409424, + "logits/rejected": 3.736757278442383, + "logps/chosen": -161.53622436523438, + "logps/rejected": -179.13980102539062, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.286771774291992, + "rewards/margins": 1.7732058763504028, + "rewards/rejected": -13.059976577758789, + "step": 484 + }, + { + "epoch": 0.33465585647748836, + "grad_norm": 0.3882652223110199, + "learning_rate": 9.294748945956305e-07, + "logits/chosen": 3.899277448654175, + "logits/rejected": 3.899277448654175, + "logps/chosen": -184.2145233154297, + "logps/rejected": -184.2145233154297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.57868766784668, + "rewards/margins": 0.0, + "rewards/rejected": -13.57868766784668, + "step": 485 + }, + { + "epoch": 0.33534586855269966, + "grad_norm": 0.2933729290962219, + "learning_rate": 9.313913376772711e-07, + "logits/chosen": 3.5622334480285645, + "logits/rejected": 3.808568239212036, + "logps/chosen": -160.96005249023438, + "logps/rejected": -180.98065185546875, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.519625663757324, + "rewards/margins": 1.8058326244354248, + "rewards/rejected": -13.325458526611328, + "step": 486 + }, + { + "epoch": 0.336035880627911, + "grad_norm": 4.590398788452148, + "learning_rate": 9.333077807589115e-07, + "logits/chosen": 3.9329752922058105, + "logits/rejected": 3.922152519226074, + "logps/chosen": -169.89942932128906, + "logps/rejected": -178.2649688720703, + "loss": 0.578, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.149036407470703, + "rewards/margins": 0.7359265089035034, + "rewards/rejected": -12.884963989257812, + "step": 487 + }, + { + "epoch": 0.3367258927031223, + "grad_norm": 0.3317889869213104, + "learning_rate": 9.35224223840552e-07, + "logits/chosen": 3.7686426639556885, + "logits/rejected": 3.9454636573791504, + "logps/chosen": -172.30001831054688, + "logps/rejected": -185.76104736328125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.404935836791992, + "rewards/margins": 1.3787682056427002, + "rewards/rejected": -13.78370475769043, + "step": 488 + }, + { + "epoch": 0.3374159047783336, + "grad_norm": 9.603429794311523, + "learning_rate": 9.371406669221924e-07, + "logits/chosen": 3.803217887878418, + "logits/rejected": 3.713850975036621, + "logps/chosen": -165.066650390625, + "logps/rejected": -175.32569885253906, + "loss": 0.6736, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.67042350769043, + "rewards/margins": 1.0507664680480957, + "rewards/rejected": -12.721189498901367, + "step": 489 + }, + { + "epoch": 0.33810591685354496, + "grad_norm": 25.183170318603516, + "learning_rate": 9.39057110003833e-07, + "logits/chosen": 3.779548406600952, + "logits/rejected": 3.9471983909606934, + "logps/chosen": -166.29339599609375, + "logps/rejected": -189.6195068359375, + "loss": 0.8551, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.891275405883789, + "rewards/margins": 2.3411483764648438, + "rewards/rejected": -14.232423782348633, + "step": 490 + }, + { + "epoch": 0.33879592892875626, + "grad_norm": 0.4128996431827545, + "learning_rate": 9.409735530854734e-07, + "logits/chosen": 3.470383644104004, + "logits/rejected": 3.64772367477417, + "logps/chosen": -152.86070251464844, + "logps/rejected": -161.72494506835938, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.64033317565918, + "rewards/margins": 0.8836573362350464, + "rewards/rejected": -11.523990631103516, + "step": 491 + }, + { + "epoch": 0.33948594100396756, + "grad_norm": 0.3886447548866272, + "learning_rate": 9.428899961671139e-07, + "logits/chosen": 3.8571677207946777, + "logits/rejected": 3.8571677207946777, + "logps/chosen": -171.0725860595703, + "logps/rejected": -171.07257080078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.393086433410645, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.393085479736328, + "step": 492 + }, + { + "epoch": 0.34017595307917886, + "grad_norm": 0.36775344610214233, + "learning_rate": 9.448064392487544e-07, + "logits/chosen": 4.015544891357422, + "logits/rejected": 4.109115123748779, + "logps/chosen": -182.18585205078125, + "logps/rejected": -193.72933959960938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.371732711791992, + "rewards/margins": 1.179662823677063, + "rewards/rejected": -14.551396369934082, + "step": 493 + }, + { + "epoch": 0.3408659651543902, + "grad_norm": 18.011638641357422, + "learning_rate": 9.467228823303949e-07, + "logits/chosen": 3.8956260681152344, + "logits/rejected": 3.9441847801208496, + "logps/chosen": -180.79449462890625, + "logps/rejected": -178.58010864257812, + "loss": 0.7867, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.203025817871094, + "rewards/margins": -0.1463937759399414, + "rewards/rejected": -13.056631088256836, + "step": 494 + }, + { + "epoch": 0.3415559772296015, + "grad_norm": 0.3689093589782715, + "learning_rate": 9.486393254120353e-07, + "logits/chosen": 3.6665399074554443, + "logits/rejected": 3.8508615493774414, + "logps/chosen": -166.64041137695312, + "logps/rejected": -183.98007202148438, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.893462181091309, + "rewards/margins": 1.7398490905761719, + "rewards/rejected": -13.63331127166748, + "step": 495 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 0.3522513210773468, + "learning_rate": 9.505557684936759e-07, + "logits/chosen": 3.8284573554992676, + "logits/rejected": 3.967654228210449, + "logps/chosen": -171.23406982421875, + "logps/rejected": -183.26766967773438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.135517120361328, + "rewards/margins": 1.2152936458587646, + "rewards/rejected": -13.350811004638672, + "step": 496 + }, + { + "epoch": 0.34293600138002417, + "grad_norm": 0.29449185729026794, + "learning_rate": 9.524722115753163e-07, + "logits/chosen": 3.640644073486328, + "logits/rejected": 3.7010746002197266, + "logps/chosen": -169.60623168945312, + "logps/rejected": -177.67681884765625, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.218088150024414, + "rewards/margins": 0.8231476545333862, + "rewards/rejected": -13.041236877441406, + "step": 497 + }, + { + "epoch": 0.34362601345523547, + "grad_norm": 0.2971118986606598, + "learning_rate": 9.54388654656957e-07, + "logits/chosen": 4.265342712402344, + "logits/rejected": 4.3829498291015625, + "logps/chosen": -180.8892822265625, + "logps/rejected": -190.76397705078125, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.245424270629883, + "rewards/margins": 0.9775316119194031, + "rewards/rejected": -14.222956657409668, + "step": 498 + }, + { + "epoch": 0.34431602553044677, + "grad_norm": 23.460046768188477, + "learning_rate": 9.563050977385973e-07, + "logits/chosen": 3.791421413421631, + "logits/rejected": 3.9038333892822266, + "logps/chosen": -180.57595825195312, + "logps/rejected": -186.4946746826172, + "loss": 1.0062, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.26163387298584, + "rewards/margins": 0.6369820833206177, + "rewards/rejected": -13.898616790771484, + "step": 499 + }, + { + "epoch": 0.3450060376056581, + "grad_norm": 0.5368048548698425, + "learning_rate": 9.582215408202377e-07, + "logits/chosen": 3.699446678161621, + "logits/rejected": 3.91813588142395, + "logps/chosen": -171.64694213867188, + "logps/rejected": -182.29010009765625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.259066581726074, + "rewards/margins": 1.011112928390503, + "rewards/rejected": -13.27017879486084, + "step": 500 + }, + { + "epoch": 0.3456960496808694, + "grad_norm": 0.2963772118091583, + "learning_rate": 9.60137983901878e-07, + "logits/chosen": 3.525294303894043, + "logits/rejected": 3.775752067565918, + "logps/chosen": -158.59384155273438, + "logps/rejected": -182.40257263183594, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.190407752990723, + "rewards/margins": 2.4006190299987793, + "rewards/rejected": -13.591028213500977, + "step": 501 + }, + { + "epoch": 0.3463860617560807, + "grad_norm": 1.9669289588928223, + "learning_rate": 9.620544269835187e-07, + "logits/chosen": 4.00399112701416, + "logits/rejected": 3.945983648300171, + "logps/chosen": -179.79417419433594, + "logps/rejected": -181.0795440673828, + "loss": 0.6379, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.106317520141602, + "rewards/margins": 0.15666329860687256, + "rewards/rejected": -13.262981414794922, + "step": 502 + }, + { + "epoch": 0.347076073831292, + "grad_norm": 10.381688117980957, + "learning_rate": 9.63970870065159e-07, + "logits/chosen": 3.803118944168091, + "logits/rejected": 3.859407901763916, + "logps/chosen": -168.68695068359375, + "logps/rejected": -170.6278076171875, + "loss": 1.3479, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.112411499023438, + "rewards/margins": 0.18369358777999878, + "rewards/rejected": -12.296106338500977, + "step": 503 + }, + { + "epoch": 0.3477660859065034, + "grad_norm": 0.32299378514289856, + "learning_rate": 9.658873131467997e-07, + "logits/chosen": 4.0915141105651855, + "logits/rejected": 4.0915141105651855, + "logps/chosen": -197.647705078125, + "logps/rejected": -197.647705078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.718647956848145, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.718648910522461, + "step": 504 + }, + { + "epoch": 0.3484560979817147, + "grad_norm": 0.3194892406463623, + "learning_rate": 9.6780375622844e-07, + "logits/chosen": 4.344378471374512, + "logits/rejected": 4.388795375823975, + "logps/chosen": -176.09393310546875, + "logps/rejected": -183.92953491210938, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.695178031921387, + "rewards/margins": 0.8025984764099121, + "rewards/rejected": -13.49777603149414, + "step": 505 + }, + { + "epoch": 0.349146110056926, + "grad_norm": 0.35955917835235596, + "learning_rate": 9.697201993100807e-07, + "logits/chosen": 3.9297947883605957, + "logits/rejected": 4.014141082763672, + "logps/chosen": -177.90658569335938, + "logps/rejected": -187.64666748046875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.9365234375, + "rewards/margins": 0.9982486367225647, + "rewards/rejected": -13.934772491455078, + "step": 506 + }, + { + "epoch": 0.34983612213213733, + "grad_norm": 0.2670174539089203, + "learning_rate": 9.71636642391721e-07, + "logits/chosen": 3.5579960346221924, + "logits/rejected": 3.690669536590576, + "logps/chosen": -155.0440673828125, + "logps/rejected": -173.83071899414062, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.58563232421875, + "rewards/margins": 2.0478854179382324, + "rewards/rejected": -12.63351821899414, + "step": 507 + }, + { + "epoch": 0.3505261342073486, + "grad_norm": 0.3451307415962219, + "learning_rate": 9.735530854733617e-07, + "logits/chosen": 3.864536762237549, + "logits/rejected": 3.864536762237549, + "logps/chosen": -186.89273071289062, + "logps/rejected": -186.89273071289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.83812141418457, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.838120460510254, + "step": 508 + }, + { + "epoch": 0.3512161462825599, + "grad_norm": 0.30288082361221313, + "learning_rate": 9.75469528555002e-07, + "logits/chosen": 3.63962984085083, + "logits/rejected": 3.769246816635132, + "logps/chosen": -150.04617309570312, + "logps/rejected": -162.031005859375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.255294799804688, + "rewards/margins": 1.1402347087860107, + "rewards/rejected": -11.395529747009277, + "step": 509 + }, + { + "epoch": 0.3519061583577713, + "grad_norm": 8.61858081817627, + "learning_rate": 9.773859716366424e-07, + "logits/chosen": 3.2978551387786865, + "logits/rejected": 3.704958200454712, + "logps/chosen": -127.64198303222656, + "logps/rejected": -173.70245361328125, + "loss": 0.39, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.170066833496094, + "rewards/margins": 4.53595495223999, + "rewards/rejected": -12.706022262573242, + "step": 510 + }, + { + "epoch": 0.3525961704329826, + "grad_norm": 0.22369763255119324, + "learning_rate": 9.793024147182828e-07, + "logits/chosen": 3.65206241607666, + "logits/rejected": 3.982726573944092, + "logps/chosen": -163.06112670898438, + "logps/rejected": -194.41250610351562, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.633932113647461, + "rewards/margins": 3.0553903579711914, + "rewards/rejected": -14.689321517944336, + "step": 511 + }, + { + "epoch": 0.3532861825081939, + "grad_norm": 0.34986788034439087, + "learning_rate": 9.812188577999234e-07, + "logits/chosen": 3.8524627685546875, + "logits/rejected": 3.908168315887451, + "logps/chosen": -163.57046508789062, + "logps/rejected": -181.6210174560547, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.670331954956055, + "rewards/margins": 1.7367216348648071, + "rewards/rejected": -13.407054901123047, + "step": 512 + }, + { + "epoch": 0.35397619458340523, + "grad_norm": 14.405946731567383, + "learning_rate": 9.831353008815638e-07, + "logits/chosen": 3.6767067909240723, + "logits/rejected": 3.924351215362549, + "logps/chosen": -154.01548767089844, + "logps/rejected": -189.1078643798828, + "loss": 0.375, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.558300018310547, + "rewards/margins": 3.527092695236206, + "rewards/rejected": -14.085392951965332, + "step": 513 + }, + { + "epoch": 0.35466620665861653, + "grad_norm": 0.36283108592033386, + "learning_rate": 9.850517439632044e-07, + "logits/chosen": 3.8984222412109375, + "logits/rejected": 4.055450916290283, + "logps/chosen": -175.0800018310547, + "logps/rejected": -195.03038024902344, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.672811508178711, + "rewards/margins": 2.0734634399414062, + "rewards/rejected": -14.7462739944458, + "step": 514 + }, + { + "epoch": 0.35535621873382783, + "grad_norm": 19.712505340576172, + "learning_rate": 9.869681870448448e-07, + "logits/chosen": 3.365933418273926, + "logits/rejected": 3.4906201362609863, + "logps/chosen": -140.92190551757812, + "logps/rejected": -156.92782592773438, + "loss": 0.7091, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.420259475708008, + "rewards/margins": 1.6010918617248535, + "rewards/rejected": -11.021350860595703, + "step": 515 + }, + { + "epoch": 0.35604623080903913, + "grad_norm": 0.29154834151268005, + "learning_rate": 9.888846301264854e-07, + "logits/chosen": 3.9588828086853027, + "logits/rejected": 3.9744668006896973, + "logps/chosen": -158.70458984375, + "logps/rejected": -170.4754638671875, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.066540718078613, + "rewards/margins": 1.0997328758239746, + "rewards/rejected": -12.166274070739746, + "step": 516 + }, + { + "epoch": 0.3567362428842505, + "grad_norm": 0.4246944487094879, + "learning_rate": 9.908010732081258e-07, + "logits/chosen": 3.826807737350464, + "logits/rejected": 3.826807737350464, + "logps/chosen": -173.2305908203125, + "logps/rejected": -173.2305908203125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.531132698059082, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.531132698059082, + "step": 517 + }, + { + "epoch": 0.3574262549594618, + "grad_norm": 0.4565313756465912, + "learning_rate": 9.927175162897664e-07, + "logits/chosen": 3.833878993988037, + "logits/rejected": 3.833878993988037, + "logps/chosen": -166.4932098388672, + "logps/rejected": -166.4932098388672, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.794776916503906, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.794776916503906, + "step": 518 + }, + { + "epoch": 0.3581162670346731, + "grad_norm": 0.4216315448284149, + "learning_rate": 9.946339593714068e-07, + "logits/chosen": 3.8093769550323486, + "logits/rejected": 3.9441258907318115, + "logps/chosen": -165.84242248535156, + "logps/rejected": -177.3271484375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.685318946838379, + "rewards/margins": 1.1902716159820557, + "rewards/rejected": -12.875591278076172, + "step": 519 + }, + { + "epoch": 0.35880627910988444, + "grad_norm": 0.38380324840545654, + "learning_rate": 9.965504024530472e-07, + "logits/chosen": 3.736802101135254, + "logits/rejected": 3.736802101135254, + "logps/chosen": -185.72149658203125, + "logps/rejected": -185.7215118408203, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.907430648803711, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.907431602478027, + "step": 520 + }, + { + "epoch": 0.35949629118509574, + "grad_norm": 0.34109625220298767, + "learning_rate": 9.984668455346876e-07, + "logits/chosen": 3.9732134342193604, + "logits/rejected": 3.9732134342193604, + "logps/chosen": -173.89735412597656, + "logps/rejected": -173.89735412597656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.665092468261719, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.665092468261719, + "step": 521 + }, + { + "epoch": 0.36018630326030704, + "grad_norm": 0.36614617705345154, + "learning_rate": 1.0003832886163282e-06, + "logits/chosen": 3.923654556274414, + "logits/rejected": 4.072017669677734, + "logps/chosen": -169.57138061523438, + "logps/rejected": -177.98291015625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.071500778198242, + "rewards/margins": 0.8558517694473267, + "rewards/rejected": -12.927352905273438, + "step": 522 + }, + { + "epoch": 0.3608763153355184, + "grad_norm": 0.33055105805397034, + "learning_rate": 1.0022997316979686e-06, + "logits/chosen": 3.816541910171509, + "logits/rejected": 3.816541910171509, + "logps/chosen": -171.20346069335938, + "logps/rejected": -171.20346069335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.368751525878906, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -12.368751525878906, + "step": 523 + }, + { + "epoch": 0.3615663274107297, + "grad_norm": 1.488866925239563, + "learning_rate": 1.004216174779609e-06, + "logits/chosen": 3.9637508392333984, + "logits/rejected": 4.074607849121094, + "logps/chosen": -163.1968994140625, + "logps/rejected": -166.34742736816406, + "loss": 0.6207, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.540847778320312, + "rewards/margins": 0.26520633697509766, + "rewards/rejected": -11.80605411529541, + "step": 524 + }, + { + "epoch": 0.362256339485941, + "grad_norm": 35.50017547607422, + "learning_rate": 1.0061326178612496e-06, + "logits/chosen": 3.560123920440674, + "logits/rejected": 3.5289011001586914, + "logps/chosen": -153.08714294433594, + "logps/rejected": -147.1171875, + "loss": 1.2219, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.383415222167969, + "rewards/margins": -0.614478349685669, + "rewards/rejected": -9.768937110900879, + "step": 525 + }, + { + "epoch": 0.36294635156115235, + "grad_norm": 0.6076259016990662, + "learning_rate": 1.00804906094289e-06, + "logits/chosen": 3.4392480850219727, + "logits/rejected": 3.396191120147705, + "logps/chosen": -142.3316650390625, + "logps/rejected": -145.91482543945312, + "loss": 0.6171, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.421976089477539, + "rewards/margins": 0.30262672901153564, + "rewards/rejected": -9.724602699279785, + "step": 526 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 24.687231063842773, + "learning_rate": 1.0099655040245306e-06, + "logits/chosen": 3.859957218170166, + "logits/rejected": 3.8731255531311035, + "logps/chosen": -190.67825317382812, + "logps/rejected": -187.71160888671875, + "loss": 0.8483, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.236648559570312, + "rewards/margins": -0.22231853008270264, + "rewards/rejected": -14.01432991027832, + "step": 527 + }, + { + "epoch": 0.36432637571157495, + "grad_norm": 0.33405864238739014, + "learning_rate": 1.011881947106171e-06, + "logits/chosen": 4.207187175750732, + "logits/rejected": 4.233942985534668, + "logps/chosen": -178.93064880371094, + "logps/rejected": -184.4053192138672, + "loss": 0.607, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.870349884033203, + "rewards/margins": 0.6877707242965698, + "rewards/rejected": -13.558119773864746, + "step": 528 + }, + { + "epoch": 0.36501638778678624, + "grad_norm": 0.38546979427337646, + "learning_rate": 1.0137983901878116e-06, + "logits/chosen": 3.7282066345214844, + "logits/rejected": 4.058753490447998, + "logps/chosen": -163.7222900390625, + "logps/rejected": -187.99972534179688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.363566398620605, + "rewards/margins": 2.4479942321777344, + "rewards/rejected": -13.811559677124023, + "step": 529 + }, + { + "epoch": 0.3657063998619976, + "grad_norm": 1.1348178386688232, + "learning_rate": 1.015714833269452e-06, + "logits/chosen": 3.549098491668701, + "logits/rejected": 3.6145200729370117, + "logps/chosen": -149.05386352539062, + "logps/rejected": -168.856201171875, + "loss": 0.4415, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.119403839111328, + "rewards/margins": 2.032715320587158, + "rewards/rejected": -12.152119636535645, + "step": 530 + }, + { + "epoch": 0.3663964119372089, + "grad_norm": 0.33842024207115173, + "learning_rate": 1.0176312763510924e-06, + "logits/chosen": 3.8166074752807617, + "logits/rejected": 3.821068286895752, + "logps/chosen": -167.9680938720703, + "logps/rejected": -178.34423828125, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.208735466003418, + "rewards/margins": 0.9654284715652466, + "rewards/rejected": -13.174163818359375, + "step": 531 + }, + { + "epoch": 0.3670864240124202, + "grad_norm": 0.3357568085193634, + "learning_rate": 1.019547719432733e-06, + "logits/chosen": 3.405268907546997, + "logits/rejected": 3.5600059032440186, + "logps/chosen": -154.52816772460938, + "logps/rejected": -161.19436645507812, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.450039863586426, + "rewards/margins": 0.6675609350204468, + "rewards/rejected": -11.117600440979004, + "step": 532 + }, + { + "epoch": 0.36777643608763155, + "grad_norm": 0.40930548310279846, + "learning_rate": 1.0214641625143734e-06, + "logits/chosen": 3.8809139728546143, + "logits/rejected": 3.8809139728546143, + "logps/chosen": -175.69891357421875, + "logps/rejected": -175.69891357421875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.661086082458496, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.661086082458496, + "step": 533 + }, + { + "epoch": 0.36846644816284285, + "grad_norm": 1.6867979764938354, + "learning_rate": 1.0233806055960137e-06, + "logits/chosen": 3.807765245437622, + "logits/rejected": 3.82234787940979, + "logps/chosen": -171.9510498046875, + "logps/rejected": -174.19479370117188, + "loss": 0.6238, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.360549926757812, + "rewards/margins": 0.23834657669067383, + "rewards/rejected": -12.598897933959961, + "step": 534 + }, + { + "epoch": 0.36915646023805415, + "grad_norm": 14.438880920410156, + "learning_rate": 1.0252970486776544e-06, + "logits/chosen": 3.662862777709961, + "logits/rejected": 3.7306089401245117, + "logps/chosen": -145.74215698242188, + "logps/rejected": -169.79910278320312, + "loss": 0.539, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.626153945922852, + "rewards/margins": 2.452378273010254, + "rewards/rejected": -12.078531265258789, + "step": 535 + }, + { + "epoch": 0.3698464723132655, + "grad_norm": 0.41126927733421326, + "learning_rate": 1.0272134917592947e-06, + "logits/chosen": 3.4807424545288086, + "logits/rejected": 3.6813230514526367, + "logps/chosen": -153.6663360595703, + "logps/rejected": -163.47572326660156, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.662064552307129, + "rewards/margins": 0.9580678343772888, + "rewards/rejected": -11.620132446289062, + "step": 536 + }, + { + "epoch": 0.3705364843884768, + "grad_norm": 0.3000882863998413, + "learning_rate": 1.0291299348409353e-06, + "logits/chosen": 3.8262593746185303, + "logits/rejected": 3.943427562713623, + "logps/chosen": -175.86593627929688, + "logps/rejected": -200.506591796875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.748699188232422, + "rewards/margins": 2.4371228218078613, + "rewards/rejected": -15.185823440551758, + "step": 537 + }, + { + "epoch": 0.3712264964636881, + "grad_norm": 0.32225748896598816, + "learning_rate": 1.0310463779225757e-06, + "logits/chosen": 4.078212261199951, + "logits/rejected": 4.286050796508789, + "logps/chosen": -160.3450164794922, + "logps/rejected": -180.44351196289062, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.187810897827148, + "rewards/margins": 2.016058921813965, + "rewards/rejected": -13.203869819641113, + "step": 538 + }, + { + "epoch": 0.3719165085388994, + "grad_norm": 0.3604412078857422, + "learning_rate": 1.0329628210042163e-06, + "logits/chosen": 3.755535840988159, + "logits/rejected": 3.9429399967193604, + "logps/chosen": -151.03884887695312, + "logps/rejected": -177.91351318359375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.39561939239502, + "rewards/margins": 2.605227470397949, + "rewards/rejected": -13.000846862792969, + "step": 539 + }, + { + "epoch": 0.37260652061411076, + "grad_norm": 0.3176572918891907, + "learning_rate": 1.0348792640858567e-06, + "logits/chosen": 3.782712697982788, + "logits/rejected": 3.940943956375122, + "logps/chosen": -193.57791137695312, + "logps/rejected": -205.1104278564453, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.669235229492188, + "rewards/margins": 1.125398874282837, + "rewards/rejected": -15.794633865356445, + "step": 540 + }, + { + "epoch": 0.37329653268932206, + "grad_norm": 0.2498069405555725, + "learning_rate": 1.0367957071674971e-06, + "logits/chosen": 3.596841812133789, + "logits/rejected": 3.953249454498291, + "logps/chosen": -148.77981567382812, + "logps/rejected": -179.56756591796875, + "loss": 0.4341, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.207281112670898, + "rewards/margins": 2.907961368560791, + "rewards/rejected": -13.115242004394531, + "step": 541 + }, + { + "epoch": 0.37398654476453336, + "grad_norm": 0.3007005751132965, + "learning_rate": 1.0387121502491377e-06, + "logits/chosen": 3.642974615097046, + "logits/rejected": 3.7824325561523438, + "logps/chosen": -176.0046844482422, + "logps/rejected": -193.4716033935547, + "loss": 0.5209, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.67984390258789, + "rewards/margins": 1.7633264064788818, + "rewards/rejected": -14.443170547485352, + "step": 542 + }, + { + "epoch": 0.3746765568397447, + "grad_norm": 0.4473317563533783, + "learning_rate": 1.0406285933307781e-06, + "logits/chosen": 3.492483139038086, + "logits/rejected": 3.5608386993408203, + "logps/chosen": -171.3813018798828, + "logps/rejected": -183.23690795898438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.174488067626953, + "rewards/margins": 1.23478364944458, + "rewards/rejected": -13.409271240234375, + "step": 543 + }, + { + "epoch": 0.375366568914956, + "grad_norm": 2.1027891635894775, + "learning_rate": 1.0425450364124185e-06, + "logits/chosen": 3.6898186206817627, + "logits/rejected": 4.009800910949707, + "logps/chosen": -148.0858154296875, + "logps/rejected": -176.48255920410156, + "loss": 0.4434, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.012069702148438, + "rewards/margins": 2.753716468811035, + "rewards/rejected": -12.765787124633789, + "step": 544 + }, + { + "epoch": 0.3760565809901673, + "grad_norm": 0.27299627661705017, + "learning_rate": 1.0444614794940591e-06, + "logits/chosen": 4.228782653808594, + "logits/rejected": 4.320986747741699, + "logps/chosen": -171.54586791992188, + "logps/rejected": -191.657470703125, + "loss": 0.5204, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.480571746826172, + "rewards/margins": 2.0592737197875977, + "rewards/rejected": -14.539846420288086, + "step": 545 + }, + { + "epoch": 0.37674659306537867, + "grad_norm": 0.3531125485897064, + "learning_rate": 1.0463779225756995e-06, + "logits/chosen": 3.9576520919799805, + "logits/rejected": 3.9576520919799805, + "logps/chosen": -182.2401123046875, + "logps/rejected": -182.2401123046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.610167503356934, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.610167503356934, + "step": 546 + }, + { + "epoch": 0.37743660514058996, + "grad_norm": 0.5110857486724854, + "learning_rate": 1.0482943656573401e-06, + "logits/chosen": 4.013522148132324, + "logits/rejected": 4.013522148132324, + "logps/chosen": -187.09585571289062, + "logps/rejected": -187.09585571289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.81558609008789, + "rewards/margins": 0.0, + "rewards/rejected": -13.81558609008789, + "step": 547 + }, + { + "epoch": 0.37812661721580126, + "grad_norm": 20.489133834838867, + "learning_rate": 1.0502108087389805e-06, + "logits/chosen": 3.6746087074279785, + "logits/rejected": 3.680011749267578, + "logps/chosen": -168.24891662597656, + "logps/rejected": -174.31834411621094, + "loss": 1.1534, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.941776275634766, + "rewards/margins": 0.5773854851722717, + "rewards/rejected": -12.51916217803955, + "step": 548 + }, + { + "epoch": 0.3788166292910126, + "grad_norm": 0.42629796266555786, + "learning_rate": 1.052127251820621e-06, + "logits/chosen": 3.9079747200012207, + "logits/rejected": 3.9945297241210938, + "logps/chosen": -176.3614501953125, + "logps/rejected": -184.8439483642578, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.915366172790527, + "rewards/margins": 0.8655543923377991, + "rewards/rejected": -13.780920028686523, + "step": 549 + }, + { + "epoch": 0.3795066413662239, + "grad_norm": 1.0820469856262207, + "learning_rate": 1.0540436949022615e-06, + "logits/chosen": 3.438237190246582, + "logits/rejected": 3.6699438095092773, + "logps/chosen": -164.24270629882812, + "logps/rejected": -176.0426025390625, + "loss": 0.5259, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.63383674621582, + "rewards/margins": 1.1921818256378174, + "rewards/rejected": -12.826019287109375, + "step": 550 + }, + { + "epoch": 0.3801966534414352, + "grad_norm": 26.502534866333008, + "learning_rate": 1.0559601379839019e-06, + "logits/chosen": 3.5551583766937256, + "logits/rejected": 3.6587305068969727, + "logps/chosen": -149.977294921875, + "logps/rejected": -168.33944702148438, + "loss": 0.6491, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.051451683044434, + "rewards/margins": 1.843313455581665, + "rewards/rejected": -11.89476490020752, + "step": 551 + }, + { + "epoch": 0.3808866655166465, + "grad_norm": 1.5875178575515747, + "learning_rate": 1.0578765810655425e-06, + "logits/chosen": 3.8157949447631836, + "logits/rejected": 3.876645565032959, + "logps/chosen": -160.63711547851562, + "logps/rejected": -164.3904266357422, + "loss": 0.6175, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.302309036254883, + "rewards/margins": 0.2984992265701294, + "rewards/rejected": -11.600809097290039, + "step": 552 + }, + { + "epoch": 0.38157667759185787, + "grad_norm": 0.2931637763977051, + "learning_rate": 1.0597930241471829e-06, + "logits/chosen": 3.7096340656280518, + "logits/rejected": 3.740901231765747, + "logps/chosen": -167.10528564453125, + "logps/rejected": -189.7132568359375, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.980141639709473, + "rewards/margins": 2.204928398132324, + "rewards/rejected": -14.185070037841797, + "step": 553 + }, + { + "epoch": 0.38226668966706917, + "grad_norm": 0.34214258193969727, + "learning_rate": 1.0617094672288233e-06, + "logits/chosen": 4.110810279846191, + "logits/rejected": 4.110810279846191, + "logps/chosen": -188.6288299560547, + "logps/rejected": -188.6288299560547, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.03919792175293, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.039196014404297, + "step": 554 + }, + { + "epoch": 0.38295670174228047, + "grad_norm": 0.41538193821907043, + "learning_rate": 1.0636259103104639e-06, + "logits/chosen": 3.4457361698150635, + "logits/rejected": 3.4523494243621826, + "logps/chosen": -152.49554443359375, + "logps/rejected": -156.97531127929688, + "loss": 0.6088, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.421954154968262, + "rewards/margins": 0.5005165338516235, + "rewards/rejected": -10.922470092773438, + "step": 555 + }, + { + "epoch": 0.3836467138174918, + "grad_norm": 0.3498171269893646, + "learning_rate": 1.0655423533921043e-06, + "logits/chosen": 3.5238659381866455, + "logits/rejected": 3.6218597888946533, + "logps/chosen": -153.17816162109375, + "logps/rejected": -167.16905212402344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.612295150756836, + "rewards/margins": 1.3777894973754883, + "rewards/rejected": -11.990083694458008, + "step": 556 + }, + { + "epoch": 0.3843367258927031, + "grad_norm": 3.5892748832702637, + "learning_rate": 1.0674587964737449e-06, + "logits/chosen": 3.750021457672119, + "logits/rejected": 3.78701114654541, + "logps/chosen": -155.09738159179688, + "logps/rejected": -163.73202514648438, + "loss": 0.4875, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.717521667480469, + "rewards/margins": 0.821083664894104, + "rewards/rejected": -11.538604736328125, + "step": 557 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 14.182966232299805, + "learning_rate": 1.0693752395553853e-06, + "logits/chosen": 4.169857025146484, + "logits/rejected": 4.207025527954102, + "logps/chosen": -187.45594787597656, + "logps/rejected": -186.7241668701172, + "loss": 0.73, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.975442886352539, + "rewards/margins": -0.06521415710449219, + "rewards/rejected": -13.910228729248047, + "step": 558 + }, + { + "epoch": 0.3857167500431258, + "grad_norm": 0.31474077701568604, + "learning_rate": 1.0712916826370259e-06, + "logits/chosen": 3.676689386367798, + "logits/rejected": 3.7396483421325684, + "logps/chosen": -162.92169189453125, + "logps/rejected": -170.13812255859375, + "loss": 0.6068, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.431461334228516, + "rewards/margins": 0.7738292217254639, + "rewards/rejected": -12.205289840698242, + "step": 559 + }, + { + "epoch": 0.3864067621183371, + "grad_norm": 10.20930290222168, + "learning_rate": 1.0732081257186663e-06, + "logits/chosen": 3.6881635189056396, + "logits/rejected": 3.8954522609710693, + "logps/chosen": -162.38760375976562, + "logps/rejected": -173.77134704589844, + "loss": 0.5593, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.36262321472168, + "rewards/margins": 1.175971269607544, + "rewards/rejected": -12.538595199584961, + "step": 560 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 5.333576679229736, + "learning_rate": 1.0751245688003066e-06, + "logits/chosen": 3.807147979736328, + "logits/rejected": 4.014913558959961, + "logps/chosen": -149.19076538085938, + "logps/rejected": -161.8128204345703, + "loss": 0.5497, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.219876289367676, + "rewards/margins": 1.1772985458374023, + "rewards/rejected": -11.397174835205078, + "step": 561 + }, + { + "epoch": 0.3877867862687597, + "grad_norm": 0.3413603901863098, + "learning_rate": 1.0770410118819473e-06, + "logits/chosen": 3.535456657409668, + "logits/rejected": 3.638427257537842, + "logps/chosen": -166.8926544189453, + "logps/rejected": -178.00608825683594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.000031471252441, + "rewards/margins": 1.1063857078552246, + "rewards/rejected": -13.106416702270508, + "step": 562 + }, + { + "epoch": 0.38847679834397103, + "grad_norm": 0.33229967951774597, + "learning_rate": 1.0789574549635876e-06, + "logits/chosen": 3.3659403324127197, + "logits/rejected": 3.7748448848724365, + "logps/chosen": -148.38519287109375, + "logps/rejected": -177.92625427246094, + "loss": 0.434, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.052091598510742, + "rewards/margins": 3.0159525871276855, + "rewards/rejected": -13.068044662475586, + "step": 563 + }, + { + "epoch": 0.38916681041918233, + "grad_norm": 0.43514353036880493, + "learning_rate": 1.080873898045228e-06, + "logits/chosen": 3.7390592098236084, + "logits/rejected": 3.7390592098236084, + "logps/chosen": -175.59921264648438, + "logps/rejected": -175.59921264648438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.542790412902832, + "rewards/margins": 0.0, + "rewards/rejected": -12.542790412902832, + "step": 564 + }, + { + "epoch": 0.38985682249439363, + "grad_norm": 0.32037898898124695, + "learning_rate": 1.0827903411268686e-06, + "logits/chosen": 3.7396395206451416, + "logits/rejected": 3.9072561264038086, + "logps/chosen": -170.73439025878906, + "logps/rejected": -188.95184326171875, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.22712516784668, + "rewards/margins": 1.8723423480987549, + "rewards/rejected": -14.099468231201172, + "step": 565 + }, + { + "epoch": 0.390546834569605, + "grad_norm": 0.4134833514690399, + "learning_rate": 1.084706784208509e-06, + "logits/chosen": 4.132999420166016, + "logits/rejected": 4.132999420166016, + "logps/chosen": -183.95748901367188, + "logps/rejected": -183.95748901367188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.621368408203125, + "rewards/margins": 0.0, + "rewards/rejected": -13.621368408203125, + "step": 566 + }, + { + "epoch": 0.3912368466448163, + "grad_norm": 0.682804524898529, + "learning_rate": 1.0866232272901496e-06, + "logits/chosen": 3.7654380798339844, + "logits/rejected": 3.7654380798339844, + "logps/chosen": -162.71112060546875, + "logps/rejected": -162.71112060546875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.584288597106934, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.584288597106934, + "step": 567 + }, + { + "epoch": 0.3919268587200276, + "grad_norm": 14.276021957397461, + "learning_rate": 1.08853967037179e-06, + "logits/chosen": 3.872774600982666, + "logits/rejected": 3.8949508666992188, + "logps/chosen": -183.05564880371094, + "logps/rejected": -177.2886505126953, + "loss": 1.0935, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.428085327148438, + "rewards/margins": -0.48445528745651245, + "rewards/rejected": -12.94363021850586, + "step": 568 + }, + { + "epoch": 0.39261687079523894, + "grad_norm": 0.3145519495010376, + "learning_rate": 1.0904561134534306e-06, + "logits/chosen": 3.5463783740997314, + "logits/rejected": 3.5463783740997314, + "logps/chosen": -178.74853515625, + "logps/rejected": -178.74853515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.045161247253418, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.045160293579102, + "step": 569 + }, + { + "epoch": 0.39330688287045024, + "grad_norm": 0.40849769115448, + "learning_rate": 1.092372556535071e-06, + "logits/chosen": 3.8079442977905273, + "logits/rejected": 3.8079442977905273, + "logps/chosen": -180.20144653320312, + "logps/rejected": -180.20144653320312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.080673217773438, + "rewards/margins": 0.0, + "rewards/rejected": -13.080673217773438, + "step": 570 + }, + { + "epoch": 0.39399689494566154, + "grad_norm": 1.2462069988250732, + "learning_rate": 1.0942889996167114e-06, + "logits/chosen": 3.994788646697998, + "logits/rejected": 4.158929824829102, + "logps/chosen": -157.47390747070312, + "logps/rejected": -170.3066864013672, + "loss": 0.5324, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.047348022460938, + "rewards/margins": 1.2049014568328857, + "rewards/rejected": -12.252250671386719, + "step": 571 + }, + { + "epoch": 0.3946869070208729, + "grad_norm": 0.42356300354003906, + "learning_rate": 1.096205442698352e-06, + "logits/chosen": 3.60445499420166, + "logits/rejected": 3.696147918701172, + "logps/chosen": -169.586181640625, + "logps/rejected": -182.15225219726562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.025179862976074, + "rewards/margins": 1.269141674041748, + "rewards/rejected": -13.29432201385498, + "step": 572 + }, + { + "epoch": 0.3953769190960842, + "grad_norm": 18.363245010375977, + "learning_rate": 1.0981218857799924e-06, + "logits/chosen": 4.142690658569336, + "logits/rejected": 4.1729736328125, + "logps/chosen": -181.1586456298828, + "logps/rejected": -186.31895446777344, + "loss": 1.3775, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.401515007019043, + "rewards/margins": 0.38892149925231934, + "rewards/rejected": -13.790437698364258, + "step": 573 + }, + { + "epoch": 0.3960669311712955, + "grad_norm": 1.093165636062622, + "learning_rate": 1.1000383288616328e-06, + "logits/chosen": 3.9400582313537598, + "logits/rejected": 3.9978227615356445, + "logps/chosen": -184.9495849609375, + "logps/rejected": -189.30978393554688, + "loss": 0.6117, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.815951347351074, + "rewards/margins": 0.3940678834915161, + "rewards/rejected": -14.2100191116333, + "step": 574 + }, + { + "epoch": 0.3967569432465068, + "grad_norm": 0.4186551868915558, + "learning_rate": 1.1019547719432734e-06, + "logits/chosen": 3.8794665336608887, + "logits/rejected": 3.9920358657836914, + "logps/chosen": -165.64170837402344, + "logps/rejected": -171.39639282226562, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.075262069702148, + "rewards/margins": 0.568041205406189, + "rewards/rejected": -12.643302917480469, + "step": 575 + }, + { + "epoch": 0.39744695532171814, + "grad_norm": 0.49422022700309753, + "learning_rate": 1.1038712150249138e-06, + "logits/chosen": 3.545525550842285, + "logits/rejected": 3.545525550842285, + "logps/chosen": -169.2747802734375, + "logps/rejected": -169.2747802734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.350828170776367, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.350828170776367, + "step": 576 + }, + { + "epoch": 0.39813696739692944, + "grad_norm": 0.3237955868244171, + "learning_rate": 1.1057876581065544e-06, + "logits/chosen": 3.684525728225708, + "logits/rejected": 3.8532721996307373, + "logps/chosen": -175.2520751953125, + "logps/rejected": -193.9803009033203, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.60185432434082, + "rewards/margins": 1.8895854949951172, + "rewards/rejected": -14.491438865661621, + "step": 577 + }, + { + "epoch": 0.39882697947214074, + "grad_norm": 0.4566444754600525, + "learning_rate": 1.1077041011881948e-06, + "logits/chosen": 3.955336093902588, + "logits/rejected": 3.955336093902588, + "logps/chosen": -169.21868896484375, + "logps/rejected": -169.21868896484375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.330411911010742, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.330412864685059, + "step": 578 + }, + { + "epoch": 0.3995169915473521, + "grad_norm": 0.3541266620159149, + "learning_rate": 1.1096205442698354e-06, + "logits/chosen": 3.548414707183838, + "logits/rejected": 3.5977795124053955, + "logps/chosen": -170.675537109375, + "logps/rejected": -180.17190551757812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.362013816833496, + "rewards/margins": 1.007013201713562, + "rewards/rejected": -13.369027137756348, + "step": 579 + }, + { + "epoch": 0.4002070036225634, + "grad_norm": 0.2897239327430725, + "learning_rate": 1.1115369873514758e-06, + "logits/chosen": 3.3889288902282715, + "logits/rejected": 3.5336945056915283, + "logps/chosen": -157.341552734375, + "logps/rejected": -179.44906616210938, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.003384590148926, + "rewards/margins": 2.307469129562378, + "rewards/rejected": -13.31085205078125, + "step": 580 + }, + { + "epoch": 0.4008970156977747, + "grad_norm": 0.3906192481517792, + "learning_rate": 1.1134534304331162e-06, + "logits/chosen": 3.762766122817993, + "logits/rejected": 3.762766122817993, + "logps/chosen": -184.75741577148438, + "logps/rejected": -184.75741577148438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.827898025512695, + "rewards/margins": 0.0, + "rewards/rejected": -13.827898025512695, + "step": 581 + }, + { + "epoch": 0.40158702777298605, + "grad_norm": 10.86971378326416, + "learning_rate": 1.1153698735147568e-06, + "logits/chosen": 3.9834439754486084, + "logits/rejected": 4.021835803985596, + "logps/chosen": -167.29046630859375, + "logps/rejected": -172.40150451660156, + "loss": 0.5851, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.01340389251709, + "rewards/margins": 0.48297083377838135, + "rewards/rejected": -12.49637508392334, + "step": 582 + }, + { + "epoch": 0.40227703984819735, + "grad_norm": 0.3839982748031616, + "learning_rate": 1.1172863165963972e-06, + "logits/chosen": 3.7562975883483887, + "logits/rejected": 3.7708382606506348, + "logps/chosen": -171.97824096679688, + "logps/rejected": -181.23684692382812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.272682189941406, + "rewards/margins": 0.9010476469993591, + "rewards/rejected": -13.173730850219727, + "step": 583 + }, + { + "epoch": 0.40296705192340865, + "grad_norm": 0.9257275462150574, + "learning_rate": 1.1192027596780376e-06, + "logits/chosen": 3.933657169342041, + "logits/rejected": 4.124566555023193, + "logps/chosen": -181.94955444335938, + "logps/rejected": -195.78338623046875, + "loss": 0.5225, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.505504608154297, + "rewards/margins": 1.4105887413024902, + "rewards/rejected": -14.916093826293945, + "step": 584 + }, + { + "epoch": 0.40365706399862, + "grad_norm": 0.28408047556877136, + "learning_rate": 1.1211192027596782e-06, + "logits/chosen": 3.901585817337036, + "logits/rejected": 3.901585817337036, + "logps/chosen": -187.76187133789062, + "logps/rejected": -187.76187133789062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.02042293548584, + "rewards/margins": 0.0, + "rewards/rejected": -14.02042293548584, + "step": 585 + }, + { + "epoch": 0.4043470760738313, + "grad_norm": 27.394298553466797, + "learning_rate": 1.1230356458413186e-06, + "logits/chosen": 3.6938319206237793, + "logits/rejected": 3.8251953125, + "logps/chosen": -150.60800170898438, + "logps/rejected": -171.2427978515625, + "loss": 0.6343, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.45319938659668, + "rewards/margins": 2.06681752204895, + "rewards/rejected": -12.52001667022705, + "step": 586 + }, + { + "epoch": 0.4050370881490426, + "grad_norm": 1.021306037902832, + "learning_rate": 1.1249520889229592e-06, + "logits/chosen": 3.949608325958252, + "logits/rejected": 3.9412851333618164, + "logps/chosen": -169.13523864746094, + "logps/rejected": -173.00927734375, + "loss": 0.6137, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.184457778930664, + "rewards/margins": 0.3534224033355713, + "rewards/rejected": -12.537879943847656, + "step": 587 + }, + { + "epoch": 0.4057271002242539, + "grad_norm": 0.5184230208396912, + "learning_rate": 1.1268685320045995e-06, + "logits/chosen": 3.774449110031128, + "logits/rejected": 3.774449110031128, + "logps/chosen": -188.91964721679688, + "logps/rejected": -188.91964721679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.050339698791504, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.050339698791504, + "step": 588 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 2.315542221069336, + "learning_rate": 1.1287849750862402e-06, + "logits/chosen": 3.720440149307251, + "logits/rejected": 3.933297872543335, + "logps/chosen": -152.33717346191406, + "logps/rejected": -168.7546844482422, + "loss": 0.5287, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.713136672973633, + "rewards/margins": 1.6111353635787964, + "rewards/rejected": -12.324271202087402, + "step": 589 + }, + { + "epoch": 0.40710712437467655, + "grad_norm": 0.4408206343650818, + "learning_rate": 1.1307014181678805e-06, + "logits/chosen": 4.187489986419678, + "logits/rejected": 4.187489986419678, + "logps/chosen": -186.4786834716797, + "logps/rejected": -186.4786834716797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.826399803161621, + "rewards/margins": 0.0, + "rewards/rejected": -13.826399803161621, + "step": 590 + }, + { + "epoch": 0.40779713644988785, + "grad_norm": 3.4284064769744873, + "learning_rate": 1.132617861249521e-06, + "logits/chosen": 3.960689067840576, + "logits/rejected": 3.9935286045074463, + "logps/chosen": -170.6560516357422, + "logps/rejected": -171.15505981445312, + "loss": 0.6679, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.243753433227539, + "rewards/margins": 0.05700027942657471, + "rewards/rejected": -12.300752639770508, + "step": 591 + }, + { + "epoch": 0.4084871485250992, + "grad_norm": 0.9122096300125122, + "learning_rate": 1.1345343043311615e-06, + "logits/chosen": 4.033603191375732, + "logits/rejected": 4.042888164520264, + "logps/chosen": -190.4272918701172, + "logps/rejected": -194.8939971923828, + "loss": 0.6093, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.163167953491211, + "rewards/margins": 0.4748185873031616, + "rewards/rejected": -14.63798713684082, + "step": 592 + }, + { + "epoch": 0.4091771606003105, + "grad_norm": 0.3252766728401184, + "learning_rate": 1.136450747412802e-06, + "logits/chosen": 3.5797832012176514, + "logits/rejected": 3.5201377868652344, + "logps/chosen": -161.83111572265625, + "logps/rejected": -167.6048583984375, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.187323570251465, + "rewards/margins": 0.6709891557693481, + "rewards/rejected": -11.85831356048584, + "step": 593 + }, + { + "epoch": 0.4098671726755218, + "grad_norm": 23.23531723022461, + "learning_rate": 1.1383671904944423e-06, + "logits/chosen": 4.04249382019043, + "logits/rejected": 3.970674753189087, + "logps/chosen": -179.29234313964844, + "logps/rejected": -185.70468139648438, + "loss": 1.2372, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.076519966125488, + "rewards/margins": 0.6475467681884766, + "rewards/rejected": -13.724066734313965, + "step": 594 + }, + { + "epoch": 0.41055718475073316, + "grad_norm": 0.3094814717769623, + "learning_rate": 1.140283633576083e-06, + "logits/chosen": 3.844250202178955, + "logits/rejected": 3.9872851371765137, + "logps/chosen": -167.7127227783203, + "logps/rejected": -185.85043334960938, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.958198547363281, + "rewards/margins": 1.7979828119277954, + "rewards/rejected": -13.756181716918945, + "step": 595 + }, + { + "epoch": 0.41124719682594446, + "grad_norm": 1.0954355001449585, + "learning_rate": 1.1422000766577233e-06, + "logits/chosen": 3.7023532390594482, + "logits/rejected": 3.933832883834839, + "logps/chosen": -174.9654541015625, + "logps/rejected": -183.78598022460938, + "loss": 0.5263, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.772103309631348, + "rewards/margins": 0.93653804063797, + "rewards/rejected": -13.708641052246094, + "step": 596 + }, + { + "epoch": 0.41193720890115576, + "grad_norm": 0.6140850186347961, + "learning_rate": 1.144116519739364e-06, + "logits/chosen": 3.4211320877075195, + "logits/rejected": 3.6975975036621094, + "logps/chosen": -152.58876037597656, + "logps/rejected": -167.21652221679688, + "loss": 0.523, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.774091720581055, + "rewards/margins": 1.4699524641036987, + "rewards/rejected": -12.244043350219727, + "step": 597 + }, + { + "epoch": 0.41262722097636706, + "grad_norm": 0.32193759083747864, + "learning_rate": 1.1460329628210043e-06, + "logits/chosen": 3.4379982948303223, + "logits/rejected": 3.4911742210388184, + "logps/chosen": -153.09693908691406, + "logps/rejected": -163.38198852539062, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.805538177490234, + "rewards/margins": 0.8843708634376526, + "rewards/rejected": -11.689908981323242, + "step": 598 + }, + { + "epoch": 0.4133172330515784, + "grad_norm": 0.35543394088745117, + "learning_rate": 1.147949405902645e-06, + "logits/chosen": 3.6967737674713135, + "logits/rejected": 3.6967737674713135, + "logps/chosen": -175.87738037109375, + "logps/rejected": -175.87738037109375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.74675464630127, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.74675464630127, + "step": 599 + }, + { + "epoch": 0.4140072451267897, + "grad_norm": 1.8328595161437988, + "learning_rate": 1.1498658489842853e-06, + "logits/chosen": 3.5673413276672363, + "logits/rejected": 3.5981297492980957, + "logps/chosen": -149.2419891357422, + "logps/rejected": -153.21009826660156, + "loss": 0.6128, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.206270217895508, + "rewards/margins": 0.3706228733062744, + "rewards/rejected": -10.576892852783203, + "step": 600 + }, + { + "epoch": 0.414697257202001, + "grad_norm": 0.34080445766448975, + "learning_rate": 1.1517822920659257e-06, + "logits/chosen": 3.5482168197631836, + "logits/rejected": 3.8843696117401123, + "logps/chosen": -150.11813354492188, + "logps/rejected": -190.18060302734375, + "loss": 0.4332, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.211857795715332, + "rewards/margins": 3.9844093322753906, + "rewards/rejected": -14.196267127990723, + "step": 601 + }, + { + "epoch": 0.41538726927721237, + "grad_norm": 0.38231438398361206, + "learning_rate": 1.1536987351475663e-06, + "logits/chosen": 3.8553919792175293, + "logits/rejected": 3.983372211456299, + "logps/chosen": -176.24085998535156, + "logps/rejected": -186.94363403320312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.787931442260742, + "rewards/margins": 1.0566271543502808, + "rewards/rejected": -13.844558715820312, + "step": 602 + }, + { + "epoch": 0.41607728135242367, + "grad_norm": 12.167851448059082, + "learning_rate": 1.1556151782292067e-06, + "logits/chosen": 3.800863265991211, + "logits/rejected": 3.7283754348754883, + "logps/chosen": -159.76368713378906, + "logps/rejected": -159.1166229248047, + "loss": 0.6975, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.285125732421875, + "rewards/margins": -0.00849902629852295, + "rewards/rejected": -11.276627540588379, + "step": 603 + }, + { + "epoch": 0.41676729342763497, + "grad_norm": 0.38413205742836, + "learning_rate": 1.157531621310847e-06, + "logits/chosen": 3.6664986610412598, + "logits/rejected": 3.626634120941162, + "logps/chosen": -168.34304809570312, + "logps/rejected": -176.42140197753906, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.103148460388184, + "rewards/margins": 0.8362510800361633, + "rewards/rejected": -12.939399719238281, + "step": 604 + }, + { + "epoch": 0.4174573055028463, + "grad_norm": 0.5531256794929504, + "learning_rate": 1.1594480643924877e-06, + "logits/chosen": 3.2159903049468994, + "logits/rejected": 3.6060872077941895, + "logps/chosen": -147.802490234375, + "logps/rejected": -174.54754638671875, + "loss": 0.351, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.884191513061523, + "rewards/margins": 2.7342638969421387, + "rewards/rejected": -12.61845588684082, + "step": 605 + }, + { + "epoch": 0.4181473175780576, + "grad_norm": 0.40483012795448303, + "learning_rate": 1.161364507474128e-06, + "logits/chosen": 3.528507709503174, + "logits/rejected": 3.528507709503174, + "logps/chosen": -165.9417724609375, + "logps/rejected": -165.9417724609375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.926422119140625, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.926422119140625, + "step": 606 + }, + { + "epoch": 0.4188373296532689, + "grad_norm": 31.4741153717041, + "learning_rate": 1.1632809505557687e-06, + "logits/chosen": 3.5739285945892334, + "logits/rejected": 3.8829832077026367, + "logps/chosen": -151.54302978515625, + "logps/rejected": -166.0693359375, + "loss": 1.5855, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.501354217529297, + "rewards/margins": 1.329791784286499, + "rewards/rejected": -11.831144332885742, + "step": 607 + }, + { + "epoch": 0.4195273417284803, + "grad_norm": 0.3870941698551178, + "learning_rate": 1.165197393637409e-06, + "logits/chosen": 3.73759388923645, + "logits/rejected": 3.8521621227264404, + "logps/chosen": -169.2719268798828, + "logps/rejected": -182.14649963378906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.180728912353516, + "rewards/margins": 1.1964044570922852, + "rewards/rejected": -13.377134323120117, + "step": 608 + }, + { + "epoch": 0.4202173538036916, + "grad_norm": 0.321781188249588, + "learning_rate": 1.1671138367190497e-06, + "logits/chosen": 3.530693531036377, + "logits/rejected": 3.530693531036377, + "logps/chosen": -191.6028594970703, + "logps/rejected": -191.6028594970703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.360068321228027, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.360068321228027, + "step": 609 + }, + { + "epoch": 0.4209073658789029, + "grad_norm": 0.592197835445404, + "learning_rate": 1.16903027980069e-06, + "logits/chosen": 3.8594164848327637, + "logits/rejected": 3.800293445587158, + "logps/chosen": -170.7066650390625, + "logps/rejected": -176.5223846435547, + "loss": 0.6082, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.46471881866455, + "rewards/margins": 0.5356531739234924, + "rewards/rejected": -13.000370979309082, + "step": 610 + }, + { + "epoch": 0.42159737795411417, + "grad_norm": 0.3435996174812317, + "learning_rate": 1.1709467228823305e-06, + "logits/chosen": 3.606395721435547, + "logits/rejected": 3.606395721435547, + "logps/chosen": -174.13392639160156, + "logps/rejected": -174.13392639160156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.724743843078613, + "rewards/margins": 0.0, + "rewards/rejected": -12.724743843078613, + "step": 611 + }, + { + "epoch": 0.4222873900293255, + "grad_norm": 0.4023621082305908, + "learning_rate": 1.172863165963971e-06, + "logits/chosen": 3.9632534980773926, + "logits/rejected": 4.147086143493652, + "logps/chosen": -183.62002563476562, + "logps/rejected": -190.2987060546875, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.639497756958008, + "rewards/margins": 0.6419932842254639, + "rewards/rejected": -14.28149127960205, + "step": 612 + }, + { + "epoch": 0.4229774021045368, + "grad_norm": 0.3903692066669464, + "learning_rate": 1.1747796090456115e-06, + "logits/chosen": 3.542623519897461, + "logits/rejected": 3.7553510665893555, + "logps/chosen": -159.90969848632812, + "logps/rejected": -169.6529998779297, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.199453353881836, + "rewards/margins": 0.9504759311676025, + "rewards/rejected": -12.14992904663086, + "step": 613 + }, + { + "epoch": 0.4236674141797481, + "grad_norm": 0.356521874666214, + "learning_rate": 1.1766960521272518e-06, + "logits/chosen": 3.4906527996063232, + "logits/rejected": 3.765676736831665, + "logps/chosen": -148.35806274414062, + "logps/rejected": -165.24900817871094, + "loss": 0.521, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.074697494506836, + "rewards/margins": 1.6226087808609009, + "rewards/rejected": -11.697305679321289, + "step": 614 + }, + { + "epoch": 0.4243574262549595, + "grad_norm": 0.36373963952064514, + "learning_rate": 1.1786124952088924e-06, + "logits/chosen": 3.6461896896362305, + "logits/rejected": 3.6886978149414062, + "logps/chosen": -182.26461791992188, + "logps/rejected": -188.3594970703125, + "loss": 0.6078, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.474037170410156, + "rewards/margins": 0.5751298666000366, + "rewards/rejected": -14.04916763305664, + "step": 615 + }, + { + "epoch": 0.4250474383301708, + "grad_norm": 10.420160293579102, + "learning_rate": 1.1805289382905328e-06, + "logits/chosen": 3.6374542713165283, + "logits/rejected": 3.674079179763794, + "logps/chosen": -177.85809326171875, + "logps/rejected": -180.73355102539062, + "loss": 0.63, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.928526878356934, + "rewards/margins": 0.3240572214126587, + "rewards/rejected": -13.252584457397461, + "step": 616 + }, + { + "epoch": 0.4257374504053821, + "grad_norm": 0.7277801632881165, + "learning_rate": 1.1824453813721734e-06, + "logits/chosen": 3.817105293273926, + "logits/rejected": 3.846451997756958, + "logps/chosen": -175.41806030273438, + "logps/rejected": -180.22698974609375, + "loss": 0.6084, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.788936614990234, + "rewards/margins": 0.5201075077056885, + "rewards/rejected": -13.309043884277344, + "step": 617 + }, + { + "epoch": 0.42642746248059343, + "grad_norm": 0.45482662320137024, + "learning_rate": 1.1843618244538138e-06, + "logits/chosen": 3.538820743560791, + "logits/rejected": 3.6089556217193604, + "logps/chosen": -163.73703002929688, + "logps/rejected": -169.1230926513672, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.500652313232422, + "rewards/margins": 0.5444774627685547, + "rewards/rejected": -12.045129776000977, + "step": 618 + }, + { + "epoch": 0.42711747455580473, + "grad_norm": 0.2875225841999054, + "learning_rate": 1.1862782675354544e-06, + "logits/chosen": 3.6851918697357178, + "logits/rejected": 3.7987372875213623, + "logps/chosen": -166.12313842773438, + "logps/rejected": -175.41270446777344, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.866023063659668, + "rewards/margins": 0.9570314288139343, + "rewards/rejected": -12.823054313659668, + "step": 619 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 23.97669792175293, + "learning_rate": 1.1881947106170948e-06, + "logits/chosen": 3.6967530250549316, + "logits/rejected": 3.603581666946411, + "logps/chosen": -187.20755004882812, + "logps/rejected": -181.2786407470703, + "loss": 1.2115, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.047212600708008, + "rewards/margins": -0.6040033102035522, + "rewards/rejected": -13.443208694458008, + "step": 620 + }, + { + "epoch": 0.4284974987062274, + "grad_norm": 0.37968385219573975, + "learning_rate": 1.1901111536987352e-06, + "logits/chosen": 3.4512181282043457, + "logits/rejected": 3.669401168823242, + "logps/chosen": -161.99951171875, + "logps/rejected": -174.06405639648438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.40412712097168, + "rewards/margins": 1.216916561126709, + "rewards/rejected": -12.621044158935547, + "step": 621 + }, + { + "epoch": 0.4291875107814387, + "grad_norm": 0.3924183249473572, + "learning_rate": 1.1920275967803756e-06, + "logits/chosen": 3.8808774948120117, + "logits/rejected": 3.912632465362549, + "logps/chosen": -175.61180114746094, + "logps/rejected": -181.9381561279297, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.026748657226562, + "rewards/margins": 0.6419168710708618, + "rewards/rejected": -13.668664932250977, + "step": 622 + }, + { + "epoch": 0.42987752285665, + "grad_norm": 0.4144175052642822, + "learning_rate": 1.1939440398620162e-06, + "logits/chosen": 3.6897828578948975, + "logits/rejected": 3.6897828578948975, + "logps/chosen": -176.74087524414062, + "logps/rejected": -176.74087524414062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.98508358001709, + "rewards/margins": 0.0, + "rewards/rejected": -12.98508358001709, + "step": 623 + }, + { + "epoch": 0.4305675349318613, + "grad_norm": 0.398338258266449, + "learning_rate": 1.1958604829436566e-06, + "logits/chosen": 3.920485496520996, + "logits/rejected": 3.920485496520996, + "logps/chosen": -171.1122283935547, + "logps/rejected": -171.1122283935547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.481245040893555, + "rewards/margins": 0.0, + "rewards/rejected": -12.481245040893555, + "step": 624 + }, + { + "epoch": 0.43125754700707264, + "grad_norm": 0.3095017373561859, + "learning_rate": 1.1977769260252972e-06, + "logits/chosen": 3.7193350791931152, + "logits/rejected": 3.7625272274017334, + "logps/chosen": -175.39102172851562, + "logps/rejected": -184.31362915039062, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.665611267089844, + "rewards/margins": 0.9521069526672363, + "rewards/rejected": -13.617717742919922, + "step": 625 + }, + { + "epoch": 0.43194755908228394, + "grad_norm": 0.3895118236541748, + "learning_rate": 1.1996933691069376e-06, + "logits/chosen": 3.459022283554077, + "logits/rejected": 3.588644504547119, + "logps/chosen": -156.3362274169922, + "logps/rejected": -171.20635986328125, + "loss": 0.5225, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.90511703491211, + "rewards/margins": 1.511392593383789, + "rewards/rejected": -12.416510581970215, + "step": 626 + }, + { + "epoch": 0.43263757115749524, + "grad_norm": 0.3488408923149109, + "learning_rate": 1.201609812188578e-06, + "logits/chosen": 3.854031801223755, + "logits/rejected": 3.854031801223755, + "logps/chosen": -171.787109375, + "logps/rejected": -171.787109375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.585031509399414, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.58503246307373, + "step": 627 + }, + { + "epoch": 0.4333275832327066, + "grad_norm": 0.31441530585289, + "learning_rate": 1.2035262552702186e-06, + "logits/chosen": 3.7275350093841553, + "logits/rejected": 3.7105865478515625, + "logps/chosen": -184.16012573242188, + "logps/rejected": -190.87826538085938, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.476763725280762, + "rewards/margins": 0.6225342750549316, + "rewards/rejected": -14.099297523498535, + "step": 628 + }, + { + "epoch": 0.4340175953079179, + "grad_norm": 0.33614784479141235, + "learning_rate": 1.205442698351859e-06, + "logits/chosen": 3.440836191177368, + "logits/rejected": 3.641829252243042, + "logps/chosen": -137.51254272460938, + "logps/rejected": -156.75473022460938, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.992197036743164, + "rewards/margins": 1.9415616989135742, + "rewards/rejected": -10.933758735656738, + "step": 629 + }, + { + "epoch": 0.4347076073831292, + "grad_norm": 1.347123622894287, + "learning_rate": 1.2073591414334996e-06, + "logits/chosen": 3.6003801822662354, + "logits/rejected": 3.6123552322387695, + "logps/chosen": -150.60751342773438, + "logps/rejected": -166.45811462402344, + "loss": 0.4456, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.369316101074219, + "rewards/margins": 1.5228745937347412, + "rewards/rejected": -11.892190933227539, + "step": 630 + }, + { + "epoch": 0.43539761945834055, + "grad_norm": 1.5311187505722046, + "learning_rate": 1.20927558451514e-06, + "logits/chosen": 3.9317586421966553, + "logits/rejected": 3.9340996742248535, + "logps/chosen": -167.08277893066406, + "logps/rejected": -178.09970092773438, + "loss": 0.5364, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.835758209228516, + "rewards/margins": 1.119020700454712, + "rewards/rejected": -12.954778671264648, + "step": 631 + }, + { + "epoch": 0.43608763153355184, + "grad_norm": 0.3322153687477112, + "learning_rate": 1.2111920275967804e-06, + "logits/chosen": 4.0092902183532715, + "logits/rejected": 4.0092902183532715, + "logps/chosen": -197.68968200683594, + "logps/rejected": -197.68968200683594, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.90081787109375, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.90081787109375, + "step": 632 + }, + { + "epoch": 0.43677764360876314, + "grad_norm": 0.41310879588127136, + "learning_rate": 1.213108470678421e-06, + "logits/chosen": 4.302692413330078, + "logits/rejected": 4.302692413330078, + "logps/chosen": -187.8953857421875, + "logps/rejected": -187.8953857421875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.806638717651367, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.806638717651367, + "step": 633 + }, + { + "epoch": 0.43746765568397444, + "grad_norm": 0.35648828744888306, + "learning_rate": 1.2150249137600614e-06, + "logits/chosen": 3.757603168487549, + "logits/rejected": 3.8789706230163574, + "logps/chosen": -160.13748168945312, + "logps/rejected": -183.32528686523438, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.338051795959473, + "rewards/margins": 2.224363327026367, + "rewards/rejected": -13.562414169311523, + "step": 634 + }, + { + "epoch": 0.4381576677591858, + "grad_norm": 0.6557313799858093, + "learning_rate": 1.2169413568417018e-06, + "logits/chosen": 3.683711290359497, + "logits/rejected": 3.6596314907073975, + "logps/chosen": -171.17750549316406, + "logps/rejected": -175.45408630371094, + "loss": 0.6102, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.480663299560547, + "rewards/margins": 0.4378316402435303, + "rewards/rejected": -12.918495178222656, + "step": 635 + }, + { + "epoch": 0.4388476798343971, + "grad_norm": 0.29764312505722046, + "learning_rate": 1.2188577999233424e-06, + "logits/chosen": 3.776890754699707, + "logits/rejected": 3.776890754699707, + "logps/chosen": -184.6112060546875, + "logps/rejected": -184.6112060546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.564552307128906, + "rewards/margins": 0.0, + "rewards/rejected": -13.564552307128906, + "step": 636 + }, + { + "epoch": 0.4395376919096084, + "grad_norm": 0.34426364302635193, + "learning_rate": 1.2207742430049828e-06, + "logits/chosen": 4.167384624481201, + "logits/rejected": 4.293441295623779, + "logps/chosen": -186.23880004882812, + "logps/rejected": -200.99566650390625, + "loss": 0.5218, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.830350875854492, + "rewards/margins": 1.5105633735656738, + "rewards/rejected": -15.340913772583008, + "step": 637 + }, + { + "epoch": 0.44022770398481975, + "grad_norm": 0.3628818690776825, + "learning_rate": 1.2226906860866234e-06, + "logits/chosen": 4.129947185516357, + "logits/rejected": 4.129947185516357, + "logps/chosen": -182.46221923828125, + "logps/rejected": -182.4622344970703, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.475013732910156, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.475013732910156, + "step": 638 + }, + { + "epoch": 0.44091771606003105, + "grad_norm": 0.36878204345703125, + "learning_rate": 1.2246071291682638e-06, + "logits/chosen": 3.9991025924682617, + "logits/rejected": 4.207705497741699, + "logps/chosen": -164.4375, + "logps/rejected": -175.65896606445312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.853236198425293, + "rewards/margins": 1.139345407485962, + "rewards/rejected": -12.992581367492676, + "step": 639 + }, + { + "epoch": 0.44160772813524235, + "grad_norm": 0.38270044326782227, + "learning_rate": 1.2265235722499044e-06, + "logits/chosen": 4.045794486999512, + "logits/rejected": 4.045794486999512, + "logps/chosen": -181.36529541015625, + "logps/rejected": -181.36529541015625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.177495956420898, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.177495956420898, + "step": 640 + }, + { + "epoch": 0.4422977402104537, + "grad_norm": 0.4438311457633972, + "learning_rate": 1.2284400153315447e-06, + "logits/chosen": 3.428281784057617, + "logits/rejected": 3.6715054512023926, + "logps/chosen": -159.7056427001953, + "logps/rejected": -179.05230712890625, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.153287887573242, + "rewards/margins": 1.899707317352295, + "rewards/rejected": -13.052995681762695, + "step": 641 + }, + { + "epoch": 0.442987752285665, + "grad_norm": 0.43473225831985474, + "learning_rate": 1.2303564584131851e-06, + "logits/chosen": 3.979384183883667, + "logits/rejected": 4.138825416564941, + "logps/chosen": -171.2876739501953, + "logps/rejected": -179.66604614257812, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.2515230178833, + "rewards/margins": 0.8430153727531433, + "rewards/rejected": -13.094536781311035, + "step": 642 + }, + { + "epoch": 0.4436777643608763, + "grad_norm": 0.39876124262809753, + "learning_rate": 1.2322729014948257e-06, + "logits/chosen": 3.625917434692383, + "logits/rejected": 3.7327702045440674, + "logps/chosen": -169.16571044921875, + "logps/rejected": -179.69737243652344, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.1254301071167, + "rewards/margins": 1.0821762084960938, + "rewards/rejected": -13.207606315612793, + "step": 643 + }, + { + "epoch": 0.44436777643608766, + "grad_norm": 0.40702009201049805, + "learning_rate": 1.2341893445764661e-06, + "logits/chosen": 3.5037944316864014, + "logits/rejected": 3.5094408988952637, + "logps/chosen": -150.30303955078125, + "logps/rejected": -169.78916931152344, + "loss": 0.5221, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.42232608795166, + "rewards/margins": 1.85722017288208, + "rewards/rejected": -12.279546737670898, + "step": 644 + }, + { + "epoch": 0.44505778851129896, + "grad_norm": 1.4511992931365967, + "learning_rate": 1.2361057876581065e-06, + "logits/chosen": 3.4815139770507812, + "logits/rejected": 3.6234607696533203, + "logps/chosen": -169.57510375976562, + "logps/rejected": -187.22030639648438, + "loss": 0.5271, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.32200813293457, + "rewards/margins": 1.7667303085327148, + "rewards/rejected": -14.088737487792969, + "step": 645 + }, + { + "epoch": 0.44574780058651026, + "grad_norm": 0.4131982624530792, + "learning_rate": 1.2380222307397471e-06, + "logits/chosen": 3.497375011444092, + "logits/rejected": 3.6193737983703613, + "logps/chosen": -161.87249755859375, + "logps/rejected": -178.5144500732422, + "loss": 0.522, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.36776351928711, + "rewards/margins": 1.7305200099945068, + "rewards/rejected": -13.098284721374512, + "step": 646 + }, + { + "epoch": 0.44643781266172156, + "grad_norm": 0.41794872283935547, + "learning_rate": 1.2399386738213875e-06, + "logits/chosen": 3.7919974327087402, + "logits/rejected": 3.7919974327087402, + "logps/chosen": -181.36376953125, + "logps/rejected": -181.36376953125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.341649055480957, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.341649055480957, + "step": 647 + }, + { + "epoch": 0.4471278247369329, + "grad_norm": 0.32984858751296997, + "learning_rate": 1.2418551169030281e-06, + "logits/chosen": 3.658310651779175, + "logits/rejected": 3.8544204235076904, + "logps/chosen": -162.26495361328125, + "logps/rejected": -183.7568359375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.529094696044922, + "rewards/margins": 2.2415919303894043, + "rewards/rejected": -13.770686149597168, + "step": 648 + }, + { + "epoch": 0.4478178368121442, + "grad_norm": 0.3633961081504822, + "learning_rate": 1.2437715599846685e-06, + "logits/chosen": 4.087891578674316, + "logits/rejected": 4.087891578674316, + "logps/chosen": -192.59632873535156, + "logps/rejected": -192.59634399414062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.502333641052246, + "rewards/margins": 0.0, + "rewards/rejected": -14.502333641052246, + "step": 649 + }, + { + "epoch": 0.4485078488873555, + "grad_norm": 0.3080894351005554, + "learning_rate": 1.2456880030663091e-06, + "logits/chosen": 3.9884629249572754, + "logits/rejected": 3.9884629249572754, + "logps/chosen": -189.671142578125, + "logps/rejected": -189.671142578125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.291792869567871, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.291792869567871, + "step": 650 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 0.34312543272972107, + "learning_rate": 1.2476044461479495e-06, + "logits/chosen": 3.5348308086395264, + "logits/rejected": 3.755094528198242, + "logps/chosen": -152.9459686279297, + "logps/rejected": -177.8912811279297, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.565020561218262, + "rewards/margins": 2.4767489433288574, + "rewards/rejected": -13.041769981384277, + "step": 651 + }, + { + "epoch": 0.44988787303777816, + "grad_norm": 0.3173482120037079, + "learning_rate": 1.24952088922959e-06, + "logits/chosen": 3.796661853790283, + "logits/rejected": 3.8448376655578613, + "logps/chosen": -177.75338745117188, + "logps/rejected": -186.53958129882812, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.794300079345703, + "rewards/margins": 0.9221422672271729, + "rewards/rejected": -13.716442108154297, + "step": 652 + }, + { + "epoch": 0.45057788511298946, + "grad_norm": 0.3429587781429291, + "learning_rate": 1.2514373323112305e-06, + "logits/chosen": 3.8006327152252197, + "logits/rejected": 3.7493398189544678, + "logps/chosen": -173.67535400390625, + "logps/rejected": -180.96322631835938, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.613740921020508, + "rewards/margins": 0.6523127555847168, + "rewards/rejected": -13.266053199768066, + "step": 653 + }, + { + "epoch": 0.4512678971882008, + "grad_norm": 0.3722083568572998, + "learning_rate": 1.2533537753928709e-06, + "logits/chosen": 4.149941921234131, + "logits/rejected": 4.149941921234131, + "logps/chosen": -180.29910278320312, + "logps/rejected": -180.2991180419922, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.19809341430664, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.198094367980957, + "step": 654 + }, + { + "epoch": 0.4519579092634121, + "grad_norm": 0.27257615327835083, + "learning_rate": 1.2552702184745113e-06, + "logits/chosen": 3.8257508277893066, + "logits/rejected": 3.9732518196105957, + "logps/chosen": -156.57919311523438, + "logps/rejected": -170.03863525390625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.934234619140625, + "rewards/margins": 1.2746987342834473, + "rewards/rejected": -12.20893383026123, + "step": 655 + }, + { + "epoch": 0.4526479213386234, + "grad_norm": 0.4407385587692261, + "learning_rate": 1.2571866615561517e-06, + "logits/chosen": 4.0761799812316895, + "logits/rejected": 4.0761799812316895, + "logps/chosen": -179.76333618164062, + "logps/rejected": -179.76333618164062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.341509819030762, + "rewards/margins": 0.0, + "rewards/rejected": -13.341509819030762, + "step": 656 + }, + { + "epoch": 0.4533379334138347, + "grad_norm": 1.533103108406067, + "learning_rate": 1.2591031046377925e-06, + "logits/chosen": 3.8250656127929688, + "logits/rejected": 3.9191789627075195, + "logps/chosen": -167.89646911621094, + "logps/rejected": -176.42135620117188, + "loss": 0.5291, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.989707946777344, + "rewards/margins": 0.8406450748443604, + "rewards/rejected": -12.830352783203125, + "step": 657 + }, + { + "epoch": 0.45402794548904607, + "grad_norm": 0.32564887404441833, + "learning_rate": 1.2610195477194329e-06, + "logits/chosen": 3.6799750328063965, + "logits/rejected": 3.7547073364257812, + "logps/chosen": -167.44346618652344, + "logps/rejected": -182.323974609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.164918899536133, + "rewards/margins": 1.5088633298873901, + "rewards/rejected": -13.673782348632812, + "step": 658 + }, + { + "epoch": 0.45471795756425737, + "grad_norm": 0.38663774728775024, + "learning_rate": 1.2629359908010733e-06, + "logits/chosen": 3.3290014266967773, + "logits/rejected": 3.613839626312256, + "logps/chosen": -147.1576385498047, + "logps/rejected": -175.15838623046875, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.120086669921875, + "rewards/margins": 2.816232204437256, + "rewards/rejected": -12.936319351196289, + "step": 659 + }, + { + "epoch": 0.45540796963946867, + "grad_norm": 3.237283706665039, + "learning_rate": 1.2648524338827137e-06, + "logits/chosen": 3.567556381225586, + "logits/rejected": 3.7197728157043457, + "logps/chosen": -168.51071166992188, + "logps/rejected": -171.96774291992188, + "loss": 0.6168, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.891561508178711, + "rewards/margins": 0.3072751760482788, + "rewards/rejected": -12.198836326599121, + "step": 660 + }, + { + "epoch": 0.45609798171468, + "grad_norm": 0.3829398453235626, + "learning_rate": 1.2667688769643543e-06, + "logits/chosen": 3.6572818756103516, + "logits/rejected": 3.7722396850585938, + "logps/chosen": -183.4185028076172, + "logps/rejected": -189.9166259765625, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.593087196350098, + "rewards/margins": 0.6132357716560364, + "rewards/rejected": -14.206323623657227, + "step": 661 + }, + { + "epoch": 0.4567879937898913, + "grad_norm": 0.3690991997718811, + "learning_rate": 1.2686853200459947e-06, + "logits/chosen": 3.968289852142334, + "logits/rejected": 4.025967597961426, + "logps/chosen": -177.94635009765625, + "logps/rejected": -192.2700958251953, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.987692832946777, + "rewards/margins": 1.3774415254592896, + "rewards/rejected": -14.365135192871094, + "step": 662 + }, + { + "epoch": 0.4574780058651026, + "grad_norm": 0.36971521377563477, + "learning_rate": 1.2706017631276353e-06, + "logits/chosen": 4.0114006996154785, + "logits/rejected": 3.9520249366760254, + "logps/chosen": -180.1359100341797, + "logps/rejected": -185.8302764892578, + "loss": 0.6076, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.263729095458984, + "rewards/margins": 0.5895313024520874, + "rewards/rejected": -13.853260040283203, + "step": 663 + }, + { + "epoch": 0.458168017940314, + "grad_norm": 0.3752007484436035, + "learning_rate": 1.2725182062092757e-06, + "logits/chosen": 4.223807334899902, + "logits/rejected": 4.223807334899902, + "logps/chosen": -174.04490661621094, + "logps/rejected": -174.04489135742188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.633090019226074, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.633090019226074, + "step": 664 + }, + { + "epoch": 0.4588580300155253, + "grad_norm": 15.35667610168457, + "learning_rate": 1.2744346492909163e-06, + "logits/chosen": 3.610842704772949, + "logits/rejected": 3.625474452972412, + "logps/chosen": -183.9488525390625, + "logps/rejected": -193.73037719726562, + "loss": 1.2491, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.49129867553711, + "rewards/margins": 1.1066689491271973, + "rewards/rejected": -14.597967147827148, + "step": 665 + }, + { + "epoch": 0.4595480420907366, + "grad_norm": 18.889982223510742, + "learning_rate": 1.2763510923725567e-06, + "logits/chosen": 3.5259501934051514, + "logits/rejected": 3.7158875465393066, + "logps/chosen": -178.70042419433594, + "logps/rejected": -183.0911102294922, + "loss": 0.8957, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.09997844696045, + "rewards/margins": 0.46123576164245605, + "rewards/rejected": -13.561213493347168, + "step": 666 + }, + { + "epoch": 0.46023805416594793, + "grad_norm": 16.767292022705078, + "learning_rate": 1.278267535454197e-06, + "logits/chosen": 3.513803482055664, + "logits/rejected": 3.515530586242676, + "logps/chosen": -157.30352783203125, + "logps/rejected": -156.09808349609375, + "loss": 0.7972, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.051258087158203, + "rewards/margins": -0.16007781028747559, + "rewards/rejected": -10.891180038452148, + "step": 667 + }, + { + "epoch": 0.46092806624115923, + "grad_norm": 0.32049721479415894, + "learning_rate": 1.2801839785358374e-06, + "logits/chosen": 3.4847919940948486, + "logits/rejected": 3.5805304050445557, + "logps/chosen": -163.1932373046875, + "logps/rejected": -171.8250274658203, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.493209838867188, + "rewards/margins": 0.8735494613647461, + "rewards/rejected": -12.36676025390625, + "step": 668 + }, + { + "epoch": 0.46161807831637053, + "grad_norm": 28.078144073486328, + "learning_rate": 1.2821004216174782e-06, + "logits/chosen": 4.170052528381348, + "logits/rejected": 3.995523452758789, + "logps/chosen": -185.7801055908203, + "logps/rejected": -182.6055145263672, + "loss": 0.9384, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.821170806884766, + "rewards/margins": -0.30583715438842773, + "rewards/rejected": -13.515334129333496, + "step": 669 + }, + { + "epoch": 0.4623080903915818, + "grad_norm": 1.6925585269927979, + "learning_rate": 1.2840168646991186e-06, + "logits/chosen": 3.3660106658935547, + "logits/rejected": 3.5140085220336914, + "logps/chosen": -153.44129943847656, + "logps/rejected": -173.4120635986328, + "loss": 0.45, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.59740161895752, + "rewards/margins": 2.0181474685668945, + "rewards/rejected": -12.615548133850098, + "step": 670 + }, + { + "epoch": 0.4629981024667932, + "grad_norm": 0.3515417277812958, + "learning_rate": 1.285933307780759e-06, + "logits/chosen": 3.9898595809936523, + "logits/rejected": 3.9898595809936523, + "logps/chosen": -179.51608276367188, + "logps/rejected": -179.5160675048828, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.235513687133789, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.235513687133789, + "step": 671 + }, + { + "epoch": 0.4636881145420045, + "grad_norm": 0.6549307703971863, + "learning_rate": 1.2878497508623994e-06, + "logits/chosen": 3.7432503700256348, + "logits/rejected": 3.7808451652526855, + "logps/chosen": -178.3995361328125, + "logps/rejected": -189.10031127929688, + "loss": 0.524, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.246891021728516, + "rewards/margins": 1.0254765748977661, + "rewards/rejected": -14.272367477416992, + "step": 672 + }, + { + "epoch": 0.4643781266172158, + "grad_norm": 0.2872120440006256, + "learning_rate": 1.28976619394404e-06, + "logits/chosen": 4.028013706207275, + "logits/rejected": 4.212807655334473, + "logps/chosen": -165.52206420898438, + "logps/rejected": -172.140380859375, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.726041793823242, + "rewards/margins": 0.7360172271728516, + "rewards/rejected": -12.462059020996094, + "step": 673 + }, + { + "epoch": 0.46506813869242714, + "grad_norm": 0.42465195059776306, + "learning_rate": 1.2916826370256804e-06, + "logits/chosen": 4.250767707824707, + "logits/rejected": 4.250767707824707, + "logps/chosen": -177.45767211914062, + "logps/rejected": -177.4576873779297, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.995170593261719, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.995170593261719, + "step": 674 + }, + { + "epoch": 0.46575815076763843, + "grad_norm": 0.3314519226551056, + "learning_rate": 1.2935990801073208e-06, + "logits/chosen": 4.1301164627075195, + "logits/rejected": 4.0872955322265625, + "logps/chosen": -176.96771240234375, + "logps/rejected": -183.13560485839844, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.786417007446289, + "rewards/margins": 0.5777984261512756, + "rewards/rejected": -13.364215850830078, + "step": 675 + }, + { + "epoch": 0.46644816284284973, + "grad_norm": 0.4210211932659149, + "learning_rate": 1.2955155231889612e-06, + "logits/chosen": 3.423588514328003, + "logits/rejected": 3.698258638381958, + "logps/chosen": -154.06207275390625, + "logps/rejected": -173.95913696289062, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.598657608032227, + "rewards/margins": 1.9682960510253906, + "rewards/rejected": -12.566953659057617, + "step": 676 + }, + { + "epoch": 0.4671381749180611, + "grad_norm": 0.31950220465660095, + "learning_rate": 1.297431966270602e-06, + "logits/chosen": 3.7741847038269043, + "logits/rejected": 3.7741847038269043, + "logps/chosen": -175.16030883789062, + "logps/rejected": -175.16030883789062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.750378608703613, + "rewards/margins": 0.0, + "rewards/rejected": -12.750378608703613, + "step": 677 + }, + { + "epoch": 0.4678281869932724, + "grad_norm": 0.32387620210647583, + "learning_rate": 1.2993484093522424e-06, + "logits/chosen": 3.697284698486328, + "logits/rejected": 4.128859043121338, + "logps/chosen": -155.74961853027344, + "logps/rejected": -177.87405395507812, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.708169937133789, + "rewards/margins": 2.238081932067871, + "rewards/rejected": -12.946250915527344, + "step": 678 + }, + { + "epoch": 0.4685181990684837, + "grad_norm": 22.65545082092285, + "learning_rate": 1.3012648524338828e-06, + "logits/chosen": 3.7640228271484375, + "logits/rejected": 4.04075813293457, + "logps/chosen": -176.3395538330078, + "logps/rejected": -187.53271484375, + "loss": 0.7481, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.868186950683594, + "rewards/margins": 1.1249268054962158, + "rewards/rejected": -13.99311351776123, + "step": 679 + }, + { + "epoch": 0.46920821114369504, + "grad_norm": 1.6168464422225952, + "learning_rate": 1.3031812955155232e-06, + "logits/chosen": 3.942800998687744, + "logits/rejected": 3.959878444671631, + "logps/chosen": -189.82931518554688, + "logps/rejected": -193.24966430664062, + "loss": 0.6144, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.126389503479004, + "rewards/margins": 0.34174656867980957, + "rewards/rejected": -14.468135833740234, + "step": 680 + }, + { + "epoch": 0.46989822321890634, + "grad_norm": 0.42335009574890137, + "learning_rate": 1.3050977385971638e-06, + "logits/chosen": 3.732922077178955, + "logits/rejected": 3.732922077178955, + "logps/chosen": -168.43997192382812, + "logps/rejected": -168.43997192382812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.151325225830078, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.151325225830078, + "step": 681 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.33812594413757324, + "learning_rate": 1.3070141816788042e-06, + "logits/chosen": 3.6028220653533936, + "logits/rejected": 3.6028220653533936, + "logps/chosen": -190.873291015625, + "logps/rejected": -190.873291015625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.27383041381836, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.27383041381836, + "step": 682 + }, + { + "epoch": 0.47127824736932894, + "grad_norm": 2.5532822608947754, + "learning_rate": 1.3089306247604448e-06, + "logits/chosen": 3.8426480293273926, + "logits/rejected": 3.9913463592529297, + "logps/chosen": -180.39622497558594, + "logps/rejected": -188.06146240234375, + "loss": 0.5337, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.44825267791748, + "rewards/margins": 0.7973055243492126, + "rewards/rejected": -14.24555778503418, + "step": 683 + }, + { + "epoch": 0.4719682594445403, + "grad_norm": 0.2847817838191986, + "learning_rate": 1.3108470678420852e-06, + "logits/chosen": 3.6124868392944336, + "logits/rejected": 3.8266897201538086, + "logps/chosen": -172.58914184570312, + "logps/rejected": -188.7519073486328, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.339662551879883, + "rewards/margins": 1.6241238117218018, + "rewards/rejected": -13.963787078857422, + "step": 684 + }, + { + "epoch": 0.4726582715197516, + "grad_norm": 10.890519142150879, + "learning_rate": 1.3127635109237258e-06, + "logits/chosen": 3.4687747955322266, + "logits/rejected": 3.630319833755493, + "logps/chosen": -159.40248107910156, + "logps/rejected": -174.62347412109375, + "loss": 0.5825, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.23250961303711, + "rewards/margins": 1.6081159114837646, + "rewards/rejected": -12.840624809265137, + "step": 685 + }, + { + "epoch": 0.4733482835949629, + "grad_norm": 0.39962175488471985, + "learning_rate": 1.3146799540053662e-06, + "logits/chosen": 3.7936830520629883, + "logits/rejected": 3.8526034355163574, + "logps/chosen": -173.7403106689453, + "logps/rejected": -181.6659393310547, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.385026931762695, + "rewards/margins": 0.7912938594818115, + "rewards/rejected": -13.17632007598877, + "step": 686 + }, + { + "epoch": 0.47403829567017425, + "grad_norm": 0.3712419271469116, + "learning_rate": 1.3165963970870066e-06, + "logits/chosen": 3.7622828483581543, + "logits/rejected": 3.7622828483581543, + "logps/chosen": -179.76535034179688, + "logps/rejected": -179.76535034179688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.251520156860352, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.251520156860352, + "step": 687 + }, + { + "epoch": 0.47472830774538555, + "grad_norm": 0.34605130553245544, + "learning_rate": 1.318512840168647e-06, + "logits/chosen": 3.5642776489257812, + "logits/rejected": 3.6344237327575684, + "logps/chosen": -173.43252563476562, + "logps/rejected": -185.58181762695312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.620512962341309, + "rewards/margins": 1.1395350694656372, + "rewards/rejected": -13.760048866271973, + "step": 688 + }, + { + "epoch": 0.47541831982059685, + "grad_norm": 0.4943521022796631, + "learning_rate": 1.3204292832502878e-06, + "logits/chosen": 3.742058753967285, + "logits/rejected": 3.8147847652435303, + "logps/chosen": -170.88623046875, + "logps/rejected": -176.5107421875, + "loss": 0.6077, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.349898338317871, + "rewards/margins": 0.5781456828117371, + "rewards/rejected": -12.928043365478516, + "step": 689 + }, + { + "epoch": 0.4761083318958082, + "grad_norm": 12.347955703735352, + "learning_rate": 1.3223457263319282e-06, + "logits/chosen": 3.6548266410827637, + "logits/rejected": 3.758394956588745, + "logps/chosen": -157.89114379882812, + "logps/rejected": -164.47543334960938, + "loss": 0.7448, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.904594421386719, + "rewards/margins": 0.6502510905265808, + "rewards/rejected": -11.554845809936523, + "step": 690 + }, + { + "epoch": 0.4767983439710195, + "grad_norm": 13.632719993591309, + "learning_rate": 1.3242621694135686e-06, + "logits/chosen": 4.060298442840576, + "logits/rejected": 4.079113960266113, + "logps/chosen": -176.93112182617188, + "logps/rejected": -173.5338592529297, + "loss": 1.6246, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.085441589355469, + "rewards/margins": -0.35448145866394043, + "rewards/rejected": -12.730960845947266, + "step": 691 + }, + { + "epoch": 0.4774883560462308, + "grad_norm": 22.98591423034668, + "learning_rate": 1.326178612495209e-06, + "logits/chosen": 3.5469183921813965, + "logits/rejected": 3.496324300765991, + "logps/chosen": -160.48904418945312, + "logps/rejected": -159.2470703125, + "loss": 1.3804, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.28866195678711, + "rewards/margins": -0.10845708847045898, + "rewards/rejected": -11.180204391479492, + "step": 692 + }, + { + "epoch": 0.4781783681214421, + "grad_norm": 0.36303627490997314, + "learning_rate": 1.3280950555768496e-06, + "logits/chosen": 3.829662322998047, + "logits/rejected": 3.924558162689209, + "logps/chosen": -174.26541137695312, + "logps/rejected": -180.09097290039062, + "loss": 0.6078, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.575204849243164, + "rewards/margins": 0.5670833587646484, + "rewards/rejected": -13.142288208007812, + "step": 693 + }, + { + "epoch": 0.47886838019665345, + "grad_norm": 0.3313372731208801, + "learning_rate": 1.33001149865849e-06, + "logits/chosen": 3.597599506378174, + "logits/rejected": 3.7618050575256348, + "logps/chosen": -184.40277099609375, + "logps/rejected": -197.07015991210938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.645432472229004, + "rewards/margins": 1.2677700519561768, + "rewards/rejected": -14.913201332092285, + "step": 694 + }, + { + "epoch": 0.47955839227186475, + "grad_norm": 0.3625301122665405, + "learning_rate": 1.3319279417401303e-06, + "logits/chosen": 3.4206340312957764, + "logits/rejected": 3.5030715465545654, + "logps/chosen": -160.70535278320312, + "logps/rejected": -172.41336059570312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.172558784484863, + "rewards/margins": 1.1917896270751953, + "rewards/rejected": -12.364349365234375, + "step": 695 + }, + { + "epoch": 0.48024840434707605, + "grad_norm": 0.3429049849510193, + "learning_rate": 1.3338443848217707e-06, + "logits/chosen": 3.946998119354248, + "logits/rejected": 4.088502407073975, + "logps/chosen": -179.89016723632812, + "logps/rejected": -187.1940155029297, + "loss": 0.607, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.254227638244629, + "rewards/margins": 0.680852472782135, + "rewards/rejected": -13.935080528259277, + "step": 696 + }, + { + "epoch": 0.4809384164222874, + "grad_norm": 0.2523461878299713, + "learning_rate": 1.3357608279034115e-06, + "logits/chosen": 3.707263946533203, + "logits/rejected": 3.8666374683380127, + "logps/chosen": -172.79263305664062, + "logps/rejected": -195.25045776367188, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.477505683898926, + "rewards/margins": 2.253542184829712, + "rewards/rejected": -14.731048583984375, + "step": 697 + }, + { + "epoch": 0.4816284284974987, + "grad_norm": 0.3946748375892639, + "learning_rate": 1.337677270985052e-06, + "logits/chosen": 3.6464552879333496, + "logits/rejected": 3.6464552879333496, + "logps/chosen": -168.31124877929688, + "logps/rejected": -168.31124877929688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.16688346862793, + "rewards/margins": 0.0, + "rewards/rejected": -12.16688346862793, + "step": 698 + }, + { + "epoch": 0.48231844057271, + "grad_norm": 0.4694591164588928, + "learning_rate": 1.3395937140666923e-06, + "logits/chosen": 3.9708008766174316, + "logits/rejected": 3.9708008766174316, + "logps/chosen": -183.95364379882812, + "logps/rejected": -183.95364379882812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.56404972076416, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.56404972076416, + "step": 699 + }, + { + "epoch": 0.48300845264792136, + "grad_norm": 0.4597722291946411, + "learning_rate": 1.3415101571483327e-06, + "logits/chosen": 3.497576951980591, + "logits/rejected": 3.72171950340271, + "logps/chosen": -160.67892456054688, + "logps/rejected": -170.8963165283203, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.348360061645508, + "rewards/margins": 1.0246846675872803, + "rewards/rejected": -12.37304401397705, + "step": 700 + }, + { + "epoch": 0.48369846472313266, + "grad_norm": 1.3198636770248413, + "learning_rate": 1.3434266002299733e-06, + "logits/chosen": 4.042470455169678, + "logits/rejected": 4.0850629806518555, + "logps/chosen": -172.9967041015625, + "logps/rejected": -181.05328369140625, + "loss": 0.5313, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.596854209899902, + "rewards/margins": 0.8119797110557556, + "rewards/rejected": -13.408833503723145, + "step": 701 + }, + { + "epoch": 0.48438847679834396, + "grad_norm": 13.577027320861816, + "learning_rate": 1.3453430433116137e-06, + "logits/chosen": 3.8246963024139404, + "logits/rejected": 3.7886691093444824, + "logps/chosen": -162.90939331054688, + "logps/rejected": -159.8555908203125, + "loss": 0.9233, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.540214538574219, + "rewards/margins": -0.3064562678337097, + "rewards/rejected": -11.233757972717285, + "step": 702 + }, + { + "epoch": 0.4850784888735553, + "grad_norm": 0.36040812730789185, + "learning_rate": 1.3472594863932543e-06, + "logits/chosen": 3.7806811332702637, + "logits/rejected": 3.784421443939209, + "logps/chosen": -167.92726135253906, + "logps/rejected": -175.91201782226562, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.26440143585205, + "rewards/margins": 0.7351160049438477, + "rewards/rejected": -12.999517440795898, + "step": 703 + }, + { + "epoch": 0.4857685009487666, + "grad_norm": 0.32034748792648315, + "learning_rate": 1.3491759294748947e-06, + "logits/chosen": 3.741468906402588, + "logits/rejected": 3.781303882598877, + "logps/chosen": -169.42227172851562, + "logps/rejected": -178.8160400390625, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.366232872009277, + "rewards/margins": 0.9173658490180969, + "rewards/rejected": -13.283597946166992, + "step": 704 + }, + { + "epoch": 0.4864585130239779, + "grad_norm": 0.3251931965351105, + "learning_rate": 1.3510923725565353e-06, + "logits/chosen": 3.6799871921539307, + "logits/rejected": 3.859769105911255, + "logps/chosen": -177.75694274902344, + "logps/rejected": -192.93255615234375, + "loss": 0.5203, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.801085472106934, + "rewards/margins": 1.5761818885803223, + "rewards/rejected": -14.377266883850098, + "step": 705 + }, + { + "epoch": 0.4871485250991892, + "grad_norm": 0.38507360219955444, + "learning_rate": 1.3530088156381757e-06, + "logits/chosen": 3.9079365730285645, + "logits/rejected": 3.9079365730285645, + "logps/chosen": -188.35284423828125, + "logps/rejected": -188.35284423828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.0780029296875, + "rewards/margins": 0.0, + "rewards/rejected": -14.0780029296875, + "step": 706 + }, + { + "epoch": 0.48783853717440057, + "grad_norm": 0.3261966109275818, + "learning_rate": 1.354925258719816e-06, + "logits/chosen": 4.081155776977539, + "logits/rejected": 4.081155776977539, + "logps/chosen": -179.2122344970703, + "logps/rejected": -179.2122344970703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.013221740722656, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.013221740722656, + "step": 707 + }, + { + "epoch": 0.48852854924961187, + "grad_norm": 0.34499308466911316, + "learning_rate": 1.3568417018014565e-06, + "logits/chosen": 4.244012355804443, + "logits/rejected": 4.244012355804443, + "logps/chosen": -187.45166015625, + "logps/rejected": -187.45166015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.04142951965332, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -14.041428565979004, + "step": 708 + }, + { + "epoch": 0.48921856132482316, + "grad_norm": 0.3953697383403778, + "learning_rate": 1.3587581448830973e-06, + "logits/chosen": 3.958301067352295, + "logits/rejected": 3.973719596862793, + "logps/chosen": -170.4558868408203, + "logps/rejected": -176.10638427734375, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.398042678833008, + "rewards/margins": 0.5770121812820435, + "rewards/rejected": -12.975054740905762, + "step": 709 + }, + { + "epoch": 0.4899085734000345, + "grad_norm": 0.32160481810569763, + "learning_rate": 1.3606745879647377e-06, + "logits/chosen": 3.755748748779297, + "logits/rejected": 3.793633460998535, + "logps/chosen": -169.35549926757812, + "logps/rejected": -179.69281005859375, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.368250846862793, + "rewards/margins": 0.9233831763267517, + "rewards/rejected": -13.291634559631348, + "step": 710 + }, + { + "epoch": 0.4905985854752458, + "grad_norm": 13.333036422729492, + "learning_rate": 1.362591031046378e-06, + "logits/chosen": 3.5100598335266113, + "logits/rejected": 3.4802088737487793, + "logps/chosen": -173.98867797851562, + "logps/rejected": -179.6471710205078, + "loss": 1.3159, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.71766185760498, + "rewards/margins": 0.6039968729019165, + "rewards/rejected": -13.32165813446045, + "step": 711 + }, + { + "epoch": 0.4912885975504571, + "grad_norm": 0.37820035219192505, + "learning_rate": 1.3645074741280185e-06, + "logits/chosen": 3.940932273864746, + "logits/rejected": 3.940932273864746, + "logps/chosen": -168.47787475585938, + "logps/rejected": -168.47787475585938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.119640350341797, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -12.119640350341797, + "step": 712 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 0.29521578550338745, + "learning_rate": 1.366423917209659e-06, + "logits/chosen": 3.3679850101470947, + "logits/rejected": 3.67972993850708, + "logps/chosen": -128.85989379882812, + "logps/rejected": -172.77980041503906, + "loss": 0.2607, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.094249725341797, + "rewards/margins": 4.407922267913818, + "rewards/rejected": -12.502172470092773, + "step": 713 + }, + { + "epoch": 0.49266862170087977, + "grad_norm": 14.929055213928223, + "learning_rate": 1.3683403602912995e-06, + "logits/chosen": 3.8185646533966064, + "logits/rejected": 3.9647576808929443, + "logps/chosen": -162.65972900390625, + "logps/rejected": -174.41213989257812, + "loss": 0.7681, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.469632148742676, + "rewards/margins": 1.1516757011413574, + "rewards/rejected": -12.621308326721191, + "step": 714 + }, + { + "epoch": 0.49335863377609107, + "grad_norm": 0.5172722339630127, + "learning_rate": 1.3702568033729399e-06, + "logits/chosen": 3.7818026542663574, + "logits/rejected": 3.7818026542663574, + "logps/chosen": -166.32925415039062, + "logps/rejected": -166.32925415039062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.894624710083008, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.894624710083008, + "step": 715 + }, + { + "epoch": 0.49404864585130237, + "grad_norm": 0.33480650186538696, + "learning_rate": 1.3721732464545802e-06, + "logits/chosen": 4.008050441741943, + "logits/rejected": 4.008050441741943, + "logps/chosen": -167.92922973632812, + "logps/rejected": -167.92922973632812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.958368301391602, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.958368301391602, + "step": 716 + }, + { + "epoch": 0.4947386579265137, + "grad_norm": 11.490944862365723, + "learning_rate": 1.374089689536221e-06, + "logits/chosen": 3.666707754135132, + "logits/rejected": 3.714067220687866, + "logps/chosen": -175.79153442382812, + "logps/rejected": -176.10260009765625, + "loss": 0.6641, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.02923583984375, + "rewards/margins": 0.06684350967407227, + "rewards/rejected": -13.096080780029297, + "step": 717 + }, + { + "epoch": 0.495428670001725, + "grad_norm": 0.3358023166656494, + "learning_rate": 1.3760061326178615e-06, + "logits/chosen": 3.851417064666748, + "logits/rejected": 3.851417064666748, + "logps/chosen": -163.32684326171875, + "logps/rejected": -163.32684326171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.564213752746582, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.564213752746582, + "step": 718 + }, + { + "epoch": 0.4961186820769363, + "grad_norm": 0.39582720398902893, + "learning_rate": 1.3779225756995018e-06, + "logits/chosen": 3.825927257537842, + "logits/rejected": 4.042500019073486, + "logps/chosen": -167.46487426757812, + "logps/rejected": -185.74362182617188, + "loss": 0.5217, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.807317733764648, + "rewards/margins": 1.8878626823425293, + "rewards/rejected": -13.695178985595703, + "step": 719 + }, + { + "epoch": 0.4968086941521477, + "grad_norm": 0.3661356270313263, + "learning_rate": 1.3798390187811422e-06, + "logits/chosen": 3.889253616333008, + "logits/rejected": 3.889253616333008, + "logps/chosen": -172.75808715820312, + "logps/rejected": -172.758056640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.31871223449707, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -12.31871223449707, + "step": 720 + }, + { + "epoch": 0.497498706227359, + "grad_norm": 0.348426878452301, + "learning_rate": 1.3817554618627828e-06, + "logits/chosen": 3.868913173675537, + "logits/rejected": 3.868913173675537, + "logps/chosen": -165.5628204345703, + "logps/rejected": -165.5628204345703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.803567886352539, + "rewards/margins": 0.0, + "rewards/rejected": -11.803567886352539, + "step": 721 + }, + { + "epoch": 0.4981887183025703, + "grad_norm": 0.3688643276691437, + "learning_rate": 1.3836719049444232e-06, + "logits/chosen": 4.075584411621094, + "logits/rejected": 4.075584411621094, + "logps/chosen": -191.07115173339844, + "logps/rejected": -191.07115173339844, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.184324264526367, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.184325218200684, + "step": 722 + }, + { + "epoch": 0.49887873037778163, + "grad_norm": 0.31305235624313354, + "learning_rate": 1.3855883480260638e-06, + "logits/chosen": 3.97153902053833, + "logits/rejected": 3.97153902053833, + "logps/chosen": -175.2344512939453, + "logps/rejected": -175.2344512939453, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.646450996398926, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.646450996398926, + "step": 723 + }, + { + "epoch": 0.49956874245299293, + "grad_norm": 0.3667137920856476, + "learning_rate": 1.3875047911077042e-06, + "logits/chosen": 3.437385320663452, + "logits/rejected": 3.633197784423828, + "logps/chosen": -144.455078125, + "logps/rejected": -160.2965087890625, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.533102035522461, + "rewards/margins": 1.6085422039031982, + "rewards/rejected": -11.141643524169922, + "step": 724 + }, + { + "epoch": 0.5002587545282042, + "grad_norm": 0.34818512201309204, + "learning_rate": 1.3894212341893448e-06, + "logits/chosen": 3.9603588581085205, + "logits/rejected": 4.121387481689453, + "logps/chosen": -178.45687866210938, + "logps/rejected": -185.5862579345703, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.964471817016602, + "rewards/margins": 0.6765092611312866, + "rewards/rejected": -13.64098072052002, + "step": 725 + }, + { + "epoch": 0.5009487666034156, + "grad_norm": 0.5060935616493225, + "learning_rate": 1.3913376772709852e-06, + "logits/chosen": 3.8012728691101074, + "logits/rejected": 3.921189785003662, + "logps/chosen": -168.48394775390625, + "logps/rejected": -173.2005615234375, + "loss": 0.6088, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.312309265136719, + "rewards/margins": 0.49666064977645874, + "rewards/rejected": -12.80897045135498, + "step": 726 + }, + { + "epoch": 0.5016387786786268, + "grad_norm": 1.141942024230957, + "learning_rate": 1.3932541203526256e-06, + "logits/chosen": 3.8298277854919434, + "logits/rejected": 3.8722739219665527, + "logps/chosen": -161.40972900390625, + "logps/rejected": -164.72389221191406, + "loss": 0.6128, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.457035064697266, + "rewards/margins": 0.37008392810821533, + "rewards/rejected": -11.827118873596191, + "step": 727 + }, + { + "epoch": 0.5023287907538382, + "grad_norm": 0.317813903093338, + "learning_rate": 1.395170563434266e-06, + "logits/chosen": 3.4983370304107666, + "logits/rejected": 3.7554118633270264, + "logps/chosen": -158.92808532714844, + "logps/rejected": -178.44631958007812, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.959617614746094, + "rewards/margins": 1.946998119354248, + "rewards/rejected": -12.906615257263184, + "step": 728 + }, + { + "epoch": 0.5030188028290495, + "grad_norm": 0.39063236117362976, + "learning_rate": 1.3970870065159068e-06, + "logits/chosen": 3.6383495330810547, + "logits/rejected": 3.6383495330810547, + "logps/chosen": -179.37008666992188, + "logps/rejected": -179.37008666992188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.24100112915039, + "rewards/margins": 0.0, + "rewards/rejected": -13.24100112915039, + "step": 729 + }, + { + "epoch": 0.5037088149042608, + "grad_norm": 0.4111309349536896, + "learning_rate": 1.3990034495975472e-06, + "logits/chosen": 3.883455753326416, + "logits/rejected": 3.883455753326416, + "logps/chosen": -180.35264587402344, + "logps/rejected": -180.35264587402344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.342559814453125, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.342559814453125, + "step": 730 + }, + { + "epoch": 0.5043988269794721, + "grad_norm": 0.4857373535633087, + "learning_rate": 1.4009198926791876e-06, + "logits/chosen": 3.720102310180664, + "logits/rejected": 3.8037991523742676, + "logps/chosen": -156.8436279296875, + "logps/rejected": -161.41940307617188, + "loss": 0.6106, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.842644691467285, + "rewards/margins": 0.4244048595428467, + "rewards/rejected": -11.267048835754395, + "step": 731 + }, + { + "epoch": 0.5050888390546835, + "grad_norm": 1.6561881303787231, + "learning_rate": 1.402836335760828e-06, + "logits/chosen": 3.881197214126587, + "logits/rejected": 4.004304885864258, + "logps/chosen": -169.26239013671875, + "logps/rejected": -184.47898864746094, + "loss": 0.5259, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.279216766357422, + "rewards/margins": 1.5698082447052002, + "rewards/rejected": -13.84902572631836, + "step": 732 + }, + { + "epoch": 0.5057788511298947, + "grad_norm": 8.263772010803223, + "learning_rate": 1.4047527788424684e-06, + "logits/chosen": 3.770672082901001, + "logits/rejected": 3.905191421508789, + "logps/chosen": -149.95135498046875, + "logps/rejected": -170.8852996826172, + "loss": 0.4647, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.26793098449707, + "rewards/margins": 2.094882011413574, + "rewards/rejected": -12.362812995910645, + "step": 733 + }, + { + "epoch": 0.5064688632051061, + "grad_norm": 0.31773802638053894, + "learning_rate": 1.406669221924109e-06, + "logits/chosen": 4.141330718994141, + "logits/rejected": 4.141330718994141, + "logps/chosen": -174.68585205078125, + "logps/rejected": -174.68585205078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.637079238891602, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.637079238891602, + "step": 734 + }, + { + "epoch": 0.5071588752803174, + "grad_norm": 14.650660514831543, + "learning_rate": 1.4085856650057494e-06, + "logits/chosen": 3.7311112880706787, + "logits/rejected": 3.9448795318603516, + "logps/chosen": -143.33197021484375, + "logps/rejected": -174.52296447753906, + "loss": 0.7113, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.755477905273438, + "rewards/margins": 2.8647966384887695, + "rewards/rejected": -12.62027359008789, + "step": 735 + }, + { + "epoch": 0.5078488873555287, + "grad_norm": 13.453968048095703, + "learning_rate": 1.4105021080873898e-06, + "logits/chosen": 3.4913649559020996, + "logits/rejected": 3.4599156379699707, + "logps/chosen": -163.87657165527344, + "logps/rejected": -161.02908325195312, + "loss": 0.9317, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.46143913269043, + "rewards/margins": -0.31553083658218384, + "rewards/rejected": -11.145907402038574, + "step": 736 + }, + { + "epoch": 0.50853889943074, + "grad_norm": 0.28898438811302185, + "learning_rate": 1.4124185511690302e-06, + "logits/chosen": 3.745771884918213, + "logits/rejected": 3.7821812629699707, + "logps/chosen": -161.72238159179688, + "logps/rejected": -170.01646423339844, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.304262161254883, + "rewards/margins": 0.8547341823577881, + "rewards/rejected": -12.15899658203125, + "step": 737 + }, + { + "epoch": 0.5092289115059514, + "grad_norm": 0.41893553733825684, + "learning_rate": 1.414334994250671e-06, + "logits/chosen": 3.7780628204345703, + "logits/rejected": 3.7780628204345703, + "logps/chosen": -164.1251220703125, + "logps/rejected": -164.1251220703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.612001419067383, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.612000465393066, + "step": 738 + }, + { + "epoch": 0.5099189235811626, + "grad_norm": 0.48385047912597656, + "learning_rate": 1.4162514373323114e-06, + "logits/chosen": 3.8863487243652344, + "logits/rejected": 3.8863487243652344, + "logps/chosen": -177.41226196289062, + "logps/rejected": -177.41226196289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.822219848632812, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.822219848632812, + "step": 739 + }, + { + "epoch": 0.510608935656374, + "grad_norm": 0.3150101602077484, + "learning_rate": 1.4181678804139518e-06, + "logits/chosen": 3.415722131729126, + "logits/rejected": 3.415722131729126, + "logps/chosen": -168.8501739501953, + "logps/rejected": -168.8501739501953, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.11827278137207, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.118273735046387, + "step": 740 + }, + { + "epoch": 0.5112989477315854, + "grad_norm": 0.3309752643108368, + "learning_rate": 1.4200843234955922e-06, + "logits/chosen": 3.5572586059570312, + "logits/rejected": 3.8571577072143555, + "logps/chosen": -160.90621948242188, + "logps/rejected": -186.40274047851562, + "loss": 0.4339, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.435466766357422, + "rewards/margins": 2.5688390731811523, + "rewards/rejected": -14.004304885864258, + "step": 741 + }, + { + "epoch": 0.5119889598067966, + "grad_norm": 15.34310531616211, + "learning_rate": 1.4220007665772328e-06, + "logits/chosen": 3.9561328887939453, + "logits/rejected": 3.905879020690918, + "logps/chosen": -173.1607208251953, + "logps/rejected": -171.66970825195312, + "loss": 0.7909, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.478052139282227, + "rewards/margins": -0.1519147753715515, + "rewards/rejected": -12.32613754272461, + "step": 742 + }, + { + "epoch": 0.512678971882008, + "grad_norm": 0.3465685546398163, + "learning_rate": 1.4239172096588734e-06, + "logits/chosen": 4.045645713806152, + "logits/rejected": 4.045645713806152, + "logps/chosen": -179.84637451171875, + "logps/rejected": -179.84637451171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.247291564941406, + "rewards/margins": 0.0, + "rewards/rejected": -13.247291564941406, + "step": 743 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 0.26067718863487244, + "learning_rate": 1.4258336527405138e-06, + "logits/chosen": 3.8027374744415283, + "logits/rejected": 3.8058271408081055, + "logps/chosen": -157.39598083496094, + "logps/rejected": -179.1982879638672, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.188365936279297, + "rewards/margins": 2.12691593170166, + "rewards/rejected": -13.315282821655273, + "step": 744 + }, + { + "epoch": 0.5140589960324305, + "grad_norm": 0.2609906494617462, + "learning_rate": 1.4277500958221541e-06, + "logits/chosen": 3.6055312156677246, + "logits/rejected": 3.9241604804992676, + "logps/chosen": -153.3544921875, + "logps/rejected": -173.7902374267578, + "loss": 0.52, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.54489517211914, + "rewards/margins": 2.0135293006896973, + "rewards/rejected": -12.55842399597168, + "step": 745 + }, + { + "epoch": 0.5147490081076419, + "grad_norm": 0.3558259904384613, + "learning_rate": 1.4296665389037947e-06, + "logits/chosen": 3.6468987464904785, + "logits/rejected": 3.724008083343506, + "logps/chosen": -165.45556640625, + "logps/rejected": -175.32749938964844, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.852630615234375, + "rewards/margins": 0.9914137125015259, + "rewards/rejected": -12.844043731689453, + "step": 746 + }, + { + "epoch": 0.5154390201828531, + "grad_norm": 0.39894235134124756, + "learning_rate": 1.4315829819854351e-06, + "logits/chosen": 4.011781215667725, + "logits/rejected": 4.011781215667725, + "logps/chosen": -187.97601318359375, + "logps/rejected": -187.97601318359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.029946327209473, + "rewards/margins": 0.0, + "rewards/rejected": -14.029946327209473, + "step": 747 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.28657838702201843, + "learning_rate": 1.4334994250670755e-06, + "logits/chosen": 3.7877964973449707, + "logits/rejected": 3.7877964973449707, + "logps/chosen": -174.70965576171875, + "logps/rejected": -174.70965576171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.726078033447266, + "rewards/margins": 0.0, + "rewards/rejected": -12.726078033447266, + "step": 748 + }, + { + "epoch": 0.5168190443332759, + "grad_norm": 0.3482988476753235, + "learning_rate": 1.435415868148716e-06, + "logits/chosen": 3.769577980041504, + "logits/rejected": 3.904984712600708, + "logps/chosen": -179.52423095703125, + "logps/rejected": -187.67288208007812, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.24339485168457, + "rewards/margins": 0.779859185218811, + "rewards/rejected": -14.02325439453125, + "step": 749 + }, + { + "epoch": 0.5175090564084871, + "grad_norm": 1.6626960039138794, + "learning_rate": 1.4373323112303567e-06, + "logits/chosen": 3.840528726577759, + "logits/rejected": 3.7810497283935547, + "logps/chosen": -178.60610961914062, + "logps/rejected": -181.57406616210938, + "loss": 0.6199, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.263434410095215, + "rewards/margins": 0.27269911766052246, + "rewards/rejected": -13.536133766174316, + "step": 750 + }, + { + "epoch": 0.5181990684836985, + "grad_norm": 0.34187254309654236, + "learning_rate": 1.4392487543119971e-06, + "logits/chosen": 3.656566619873047, + "logits/rejected": 3.751889228820801, + "logps/chosen": -162.46600341796875, + "logps/rejected": -175.00302124023438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.506555557250977, + "rewards/margins": 1.2925968170166016, + "rewards/rejected": -12.799152374267578, + "step": 751 + }, + { + "epoch": 0.5188890805589098, + "grad_norm": 0.4204702079296112, + "learning_rate": 1.4411651973936375e-06, + "logits/chosen": 3.5662012100219727, + "logits/rejected": 3.5662012100219727, + "logps/chosen": -166.96522521972656, + "logps/rejected": -166.96524047851562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.002939224243164, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.002939224243164, + "step": 752 + }, + { + "epoch": 0.519579092634121, + "grad_norm": 0.32262131571769714, + "learning_rate": 1.443081640475278e-06, + "logits/chosen": 3.4657485485076904, + "logits/rejected": 3.661184787750244, + "logps/chosen": -156.58819580078125, + "logps/rejected": -174.20492553710938, + "loss": 0.5211, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.994159698486328, + "rewards/margins": 1.7417341470718384, + "rewards/rejected": -12.735895156860352, + "step": 753 + }, + { + "epoch": 0.5202691047093324, + "grad_norm": 0.32235872745513916, + "learning_rate": 1.4449980835569185e-06, + "logits/chosen": 3.4690256118774414, + "logits/rejected": 3.4690256118774414, + "logps/chosen": -170.72723388671875, + "logps/rejected": -170.7272491455078, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.29670524597168, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.29670524597168, + "step": 754 + }, + { + "epoch": 0.5209591167845438, + "grad_norm": 0.298021525144577, + "learning_rate": 1.446914526638559e-06, + "logits/chosen": 3.8133838176727295, + "logits/rejected": 4.094695091247559, + "logps/chosen": -174.2286376953125, + "logps/rejected": -187.51315307617188, + "loss": 0.5216, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.591527938842773, + "rewards/margins": 1.3635485172271729, + "rewards/rejected": -13.955077171325684, + "step": 755 + }, + { + "epoch": 0.521649128859755, + "grad_norm": 0.45794814825057983, + "learning_rate": 1.4488309697201993e-06, + "logits/chosen": 4.088172912597656, + "logits/rejected": 4.088172912597656, + "logps/chosen": -170.00930786132812, + "logps/rejected": -170.00930786132812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.207828521728516, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.207828521728516, + "step": 756 + }, + { + "epoch": 0.5223391409349664, + "grad_norm": 0.3287133276462555, + "learning_rate": 1.4507474128018397e-06, + "logits/chosen": 3.65586256980896, + "logits/rejected": 3.65586256980896, + "logps/chosen": -162.19366455078125, + "logps/rejected": -162.19366455078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.737311363220215, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.737311363220215, + "step": 757 + }, + { + "epoch": 0.5230291530101777, + "grad_norm": 0.32381516695022583, + "learning_rate": 1.4526638558834805e-06, + "logits/chosen": 3.750032424926758, + "logits/rejected": 3.793807029724121, + "logps/chosen": -142.46726989746094, + "logps/rejected": -167.54257202148438, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.851468086242676, + "rewards/margins": 2.2787652015686035, + "rewards/rejected": -12.130233764648438, + "step": 758 + }, + { + "epoch": 0.523719165085389, + "grad_norm": 6.381619453430176, + "learning_rate": 1.454580298965121e-06, + "logits/chosen": 3.544233798980713, + "logits/rejected": 3.6135950088500977, + "logps/chosen": -162.03746032714844, + "logps/rejected": -163.7737274169922, + "loss": 0.6167, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.363243103027344, + "rewards/margins": 0.22232955694198608, + "rewards/rejected": -11.58557415008545, + "step": 759 + }, + { + "epoch": 0.5244091771606003, + "grad_norm": 0.2779380977153778, + "learning_rate": 1.4564967420467613e-06, + "logits/chosen": 3.699721336364746, + "logits/rejected": 3.9227676391601562, + "logps/chosen": -165.07650756835938, + "logps/rejected": -175.52133178710938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.57627010345459, + "rewards/margins": 1.069595456123352, + "rewards/rejected": -12.645865440368652, + "step": 760 + }, + { + "epoch": 0.5250991892358117, + "grad_norm": 16.195098876953125, + "learning_rate": 1.4584131851284017e-06, + "logits/chosen": 4.043641090393066, + "logits/rejected": 3.86960506439209, + "logps/chosen": -176.32669067382812, + "logps/rejected": -169.16519165039062, + "loss": 1.2838, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.857799530029297, + "rewards/margins": -0.6767065525054932, + "rewards/rejected": -12.181093215942383, + "step": 761 + }, + { + "epoch": 0.5257892013110229, + "grad_norm": 0.7004468441009521, + "learning_rate": 1.4603296282100423e-06, + "logits/chosen": 3.6229071617126465, + "logits/rejected": 3.6620917320251465, + "logps/chosen": -175.53680419921875, + "logps/rejected": -179.797119140625, + "loss": 0.6117, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.783224105834961, + "rewards/margins": 0.3942629098892212, + "rewards/rejected": -13.177488327026367, + "step": 762 + }, + { + "epoch": 0.5264792133862343, + "grad_norm": 0.3348390460014343, + "learning_rate": 1.4622460712916827e-06, + "logits/chosen": 3.759289503097534, + "logits/rejected": 3.8398873805999756, + "logps/chosen": -162.43765258789062, + "logps/rejected": -169.27169799804688, + "loss": 0.6072, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.417350769042969, + "rewards/margins": 0.6565669775009155, + "rewards/rejected": -12.073917388916016, + "step": 763 + }, + { + "epoch": 0.5271692254614456, + "grad_norm": 0.38329097628593445, + "learning_rate": 1.4641625143733233e-06, + "logits/chosen": 3.8204126358032227, + "logits/rejected": 3.8204126358032227, + "logps/chosen": -177.65341186523438, + "logps/rejected": -177.65341186523438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.25033950805664, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.25033950805664, + "step": 764 + }, + { + "epoch": 0.5278592375366569, + "grad_norm": 0.3917408585548401, + "learning_rate": 1.4660789574549637e-06, + "logits/chosen": 3.799760341644287, + "logits/rejected": 3.8373348712921143, + "logps/chosen": -175.503173828125, + "logps/rejected": -185.54080200195312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.724578857421875, + "rewards/margins": 1.0645339488983154, + "rewards/rejected": -13.789112091064453, + "step": 765 + }, + { + "epoch": 0.5285492496118682, + "grad_norm": 0.31271079182624817, + "learning_rate": 1.4679954005366043e-06, + "logits/chosen": 4.016763687133789, + "logits/rejected": 4.016763687133789, + "logps/chosen": -178.60171508789062, + "logps/rejected": -178.6016845703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.134382247924805, + "rewards/margins": -1.1920928955078125e-06, + "rewards/rejected": -13.134380340576172, + "step": 766 + }, + { + "epoch": 0.5292392616870796, + "grad_norm": 0.3249942362308502, + "learning_rate": 1.4699118436182447e-06, + "logits/chosen": 3.514704704284668, + "logits/rejected": 3.514704704284668, + "logps/chosen": -171.51641845703125, + "logps/rejected": -171.51641845703125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.306020736694336, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.306020736694336, + "step": 767 + }, + { + "epoch": 0.5299292737622908, + "grad_norm": 0.6362578272819519, + "learning_rate": 1.471828286699885e-06, + "logits/chosen": 3.553982734680176, + "logits/rejected": 3.636798143386841, + "logps/chosen": -144.0835418701172, + "logps/rejected": -158.68948364257812, + "loss": 0.5221, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.580206871032715, + "rewards/margins": 1.49678373336792, + "rewards/rejected": -11.076990127563477, + "step": 768 + }, + { + "epoch": 0.5306192858375022, + "grad_norm": 0.29977935552597046, + "learning_rate": 1.4737447297815254e-06, + "logits/chosen": 3.455662250518799, + "logits/rejected": 3.7499027252197266, + "logps/chosen": -154.74142456054688, + "logps/rejected": -170.8880615234375, + "loss": 0.5208, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.875677108764648, + "rewards/margins": 1.5733227729797363, + "rewards/rejected": -12.448999404907227, + "step": 769 + }, + { + "epoch": 0.5313092979127134, + "grad_norm": 0.3016223609447479, + "learning_rate": 1.4756611728631663e-06, + "logits/chosen": 3.5571417808532715, + "logits/rejected": 3.637894868850708, + "logps/chosen": -155.70980834960938, + "logps/rejected": -181.69374084472656, + "loss": 0.4337, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.677024841308594, + "rewards/margins": 2.6843996047973633, + "rewards/rejected": -13.361424446105957, + "step": 770 + }, + { + "epoch": 0.5319993099879248, + "grad_norm": 5.06437873840332, + "learning_rate": 1.4775776159448067e-06, + "logits/chosen": 3.614668846130371, + "logits/rejected": 3.5893282890319824, + "logps/chosen": -163.0862274169922, + "logps/rejected": -167.47665405273438, + "loss": 0.5974, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.630359649658203, + "rewards/margins": 0.37797361612319946, + "rewards/rejected": -12.008333206176758, + "step": 771 + }, + { + "epoch": 0.5326893220631361, + "grad_norm": 0.2840545177459717, + "learning_rate": 1.479494059026447e-06, + "logits/chosen": 3.7340612411499023, + "logits/rejected": 3.7340612411499023, + "logps/chosen": -170.45428466796875, + "logps/rejected": -170.45428466796875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.395454406738281, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.395454406738281, + "step": 772 + }, + { + "epoch": 0.5333793341383474, + "grad_norm": 18.417524337768555, + "learning_rate": 1.4814105021080874e-06, + "logits/chosen": 3.606902599334717, + "logits/rejected": 3.6195249557495117, + "logps/chosen": -161.40463256835938, + "logps/rejected": -160.19189453125, + "loss": 0.7841, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.48469352722168, + "rewards/margins": -0.1429818868637085, + "rewards/rejected": -11.341711044311523, + "step": 773 + }, + { + "epoch": 0.5340693462135587, + "grad_norm": 0.29988735914230347, + "learning_rate": 1.483326945189728e-06, + "logits/chosen": 3.856353759765625, + "logits/rejected": 3.856353759765625, + "logps/chosen": -175.77157592773438, + "logps/rejected": -175.77159118652344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.805420875549316, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.805420875549316, + "step": 774 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 0.2850341498851776, + "learning_rate": 1.4852433882713684e-06, + "logits/chosen": 3.7001094818115234, + "logits/rejected": 3.7497458457946777, + "logps/chosen": -173.76528930664062, + "logps/rejected": -182.8101348876953, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.608866691589355, + "rewards/margins": 0.9365438222885132, + "rewards/rejected": -13.545411109924316, + "step": 775 + }, + { + "epoch": 0.5354493703639813, + "grad_norm": 0.37644365429878235, + "learning_rate": 1.4871598313530088e-06, + "logits/chosen": 3.709704875946045, + "logits/rejected": 3.709704875946045, + "logps/chosen": -172.81521606445312, + "logps/rejected": -172.81521606445312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.490873336791992, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.490873336791992, + "step": 776 + }, + { + "epoch": 0.5361393824391927, + "grad_norm": 5.198019027709961, + "learning_rate": 1.4890762744346492e-06, + "logits/chosen": 3.723391532897949, + "logits/rejected": 3.662748336791992, + "logps/chosen": -168.4478759765625, + "logps/rejected": -169.13397216796875, + "loss": 0.6525, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.307822227478027, + "rewards/margins": 0.10132443904876709, + "rewards/rejected": -12.409147262573242, + "step": 777 + }, + { + "epoch": 0.536829394514404, + "grad_norm": 0.3018762469291687, + "learning_rate": 1.49099271751629e-06, + "logits/chosen": 3.616363048553467, + "logits/rejected": 3.716888666152954, + "logps/chosen": -156.55303955078125, + "logps/rejected": -168.8314971923828, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.854761123657227, + "rewards/margins": 1.2205535173416138, + "rewards/rejected": -12.07531452178955, + "step": 778 + }, + { + "epoch": 0.5375194065896153, + "grad_norm": 0.32363271713256836, + "learning_rate": 1.4929091605979304e-06, + "logits/chosen": 3.5435914993286133, + "logits/rejected": 3.7133278846740723, + "logps/chosen": -149.00253295898438, + "logps/rejected": -170.773193359375, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.218513488769531, + "rewards/margins": 2.1238861083984375, + "rewards/rejected": -12.342399597167969, + "step": 779 + }, + { + "epoch": 0.5382094186648266, + "grad_norm": 0.2993007302284241, + "learning_rate": 1.4948256036795708e-06, + "logits/chosen": 3.4947257041931152, + "logits/rejected": 3.669356346130371, + "logps/chosen": -154.97796630859375, + "logps/rejected": -171.53981018066406, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.6702880859375, + "rewards/margins": 1.7253284454345703, + "rewards/rejected": -12.395615577697754, + "step": 780 + }, + { + "epoch": 0.538899430740038, + "grad_norm": 0.3127504885196686, + "learning_rate": 1.4967420467612112e-06, + "logits/chosen": 3.9154281616210938, + "logits/rejected": 4.223833084106445, + "logps/chosen": -158.04623413085938, + "logps/rejected": -177.33828735351562, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.015474319458008, + "rewards/margins": 1.883944034576416, + "rewards/rejected": -12.899417877197266, + "step": 781 + }, + { + "epoch": 0.5395894428152492, + "grad_norm": 10.915230751037598, + "learning_rate": 1.4986584898428518e-06, + "logits/chosen": 4.071234226226807, + "logits/rejected": 4.236954689025879, + "logps/chosen": -166.70156860351562, + "logps/rejected": -177.2548370361328, + "loss": 0.8781, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.975778579711914, + "rewards/margins": 0.93089759349823, + "rewards/rejected": -12.906676292419434, + "step": 782 + }, + { + "epoch": 0.5402794548904606, + "grad_norm": 0.3759441375732422, + "learning_rate": 1.5005749329244922e-06, + "logits/chosen": 3.7219769954681396, + "logits/rejected": 3.7637953758239746, + "logps/chosen": -153.61463928222656, + "logps/rejected": -168.0113525390625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.69127368927002, + "rewards/margins": 1.390749216079712, + "rewards/rejected": -12.082022666931152, + "step": 783 + }, + { + "epoch": 0.5409694669656719, + "grad_norm": 0.3805663585662842, + "learning_rate": 1.5024913760061328e-06, + "logits/chosen": 3.628878593444824, + "logits/rejected": 3.8472812175750732, + "logps/chosen": -172.14675903320312, + "logps/rejected": -192.7626495361328, + "loss": 0.5218, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.459663391113281, + "rewards/margins": 1.9765454530715942, + "rewards/rejected": -14.436210632324219, + "step": 784 + }, + { + "epoch": 0.5416594790408832, + "grad_norm": 5.172417640686035, + "learning_rate": 1.5044078190877732e-06, + "logits/chosen": 3.9331908226013184, + "logits/rejected": 3.91908597946167, + "logps/chosen": -185.8341064453125, + "logps/rejected": -185.552978515625, + "loss": 0.6877, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.951086044311523, + "rewards/margins": 0.01109391450881958, + "rewards/rejected": -13.962179183959961, + "step": 785 + }, + { + "epoch": 0.5423494911160945, + "grad_norm": 0.37984248995780945, + "learning_rate": 1.5063242621694138e-06, + "logits/chosen": 4.109624862670898, + "logits/rejected": 4.194700241088867, + "logps/chosen": -184.5858917236328, + "logps/rejected": -190.48626708984375, + "loss": 0.6077, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.520069122314453, + "rewards/margins": 0.5803154706954956, + "rewards/rejected": -14.100384712219238, + "step": 786 + }, + { + "epoch": 0.5430395031913059, + "grad_norm": 0.3607383370399475, + "learning_rate": 1.5082407052510542e-06, + "logits/chosen": 3.869666814804077, + "logits/rejected": 3.869666814804077, + "logps/chosen": -174.60971069335938, + "logps/rejected": -174.60971069335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.949874877929688, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.949874877929688, + "step": 787 + }, + { + "epoch": 0.5437295152665171, + "grad_norm": 0.3529718518257141, + "learning_rate": 1.5101571483326946e-06, + "logits/chosen": 3.8641042709350586, + "logits/rejected": 3.989391803741455, + "logps/chosen": -161.98928833007812, + "logps/rejected": -179.34237670898438, + "loss": 0.5202, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.407186508178711, + "rewards/margins": 1.7655198574066162, + "rewards/rejected": -13.17270565032959, + "step": 788 + }, + { + "epoch": 0.5444195273417285, + "grad_norm": 0.3513103723526001, + "learning_rate": 1.512073591414335e-06, + "logits/chosen": 3.790961980819702, + "logits/rejected": 3.790961980819702, + "logps/chosen": -181.29449462890625, + "logps/rejected": -181.29449462890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.406095504760742, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.406095504760742, + "step": 789 + }, + { + "epoch": 0.5451095394169398, + "grad_norm": 21.447978973388672, + "learning_rate": 1.5139900344959758e-06, + "logits/chosen": 3.737748384475708, + "logits/rejected": 3.8959717750549316, + "logps/chosen": -129.95465087890625, + "logps/rejected": -140.85934448242188, + "loss": 0.9036, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.463370323181152, + "rewards/margins": 1.0069023370742798, + "rewards/rejected": -9.470272064208984, + "step": 790 + }, + { + "epoch": 0.5457995514921511, + "grad_norm": 0.37820810079574585, + "learning_rate": 1.5159064775776162e-06, + "logits/chosen": 3.649085283279419, + "logits/rejected": 3.649085283279419, + "logps/chosen": -169.3460693359375, + "logps/rejected": -169.3460693359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.155731201171875, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.155731201171875, + "step": 791 + }, + { + "epoch": 0.5464895635673624, + "grad_norm": 8.989850997924805, + "learning_rate": 1.5178229206592566e-06, + "logits/chosen": 4.061089992523193, + "logits/rejected": 4.010042667388916, + "logps/chosen": -171.56744384765625, + "logps/rejected": -170.7142333984375, + "loss": 0.7319, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.514449119567871, + "rewards/margins": -0.06822454929351807, + "rewards/rejected": -12.446224212646484, + "step": 792 + }, + { + "epoch": 0.5471795756425737, + "grad_norm": 0.3300066888332367, + "learning_rate": 1.519739363740897e-06, + "logits/chosen": 3.593050479888916, + "logits/rejected": 3.7805557250976562, + "logps/chosen": -173.17372131347656, + "logps/rejected": -180.3359375, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.45000171661377, + "rewards/margins": 0.6638736724853516, + "rewards/rejected": -13.113874435424805, + "step": 793 + }, + { + "epoch": 0.547869587717785, + "grad_norm": 0.3067081570625305, + "learning_rate": 1.5216558068225376e-06, + "logits/chosen": 3.7588553428649902, + "logits/rejected": 3.7803239822387695, + "logps/chosen": -170.10977172851562, + "logps/rejected": -177.5066375732422, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.129241943359375, + "rewards/margins": 0.7557680606842041, + "rewards/rejected": -12.885009765625, + "step": 794 + }, + { + "epoch": 0.5485595997929964, + "grad_norm": 0.4166925251483917, + "learning_rate": 1.523572249904178e-06, + "logits/chosen": 3.889976978302002, + "logits/rejected": 3.889976978302002, + "logps/chosen": -174.15472412109375, + "logps/rejected": -174.15472412109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.686927795410156, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.68692684173584, + "step": 795 + }, + { + "epoch": 0.5492496118682076, + "grad_norm": 3.9147582054138184, + "learning_rate": 1.5254886929858183e-06, + "logits/chosen": 3.450702667236328, + "logits/rejected": 3.480567455291748, + "logps/chosen": -149.47119140625, + "logps/rejected": -157.9297637939453, + "loss": 0.558, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.081533432006836, + "rewards/margins": 0.8898399472236633, + "rewards/rejected": -10.971373558044434, + "step": 796 + }, + { + "epoch": 0.549939623943419, + "grad_norm": 0.7824663519859314, + "learning_rate": 1.5274051360674587e-06, + "logits/chosen": 3.9792733192443848, + "logits/rejected": 3.999441623687744, + "logps/chosen": -171.15602111816406, + "logps/rejected": -174.58197021484375, + "loss": 0.6145, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.161966323852539, + "rewards/margins": 0.3403317332267761, + "rewards/rejected": -12.502299308776855, + "step": 797 + }, + { + "epoch": 0.5506296360186304, + "grad_norm": 0.3646790683269501, + "learning_rate": 1.5293215791490996e-06, + "logits/chosen": 4.050804615020752, + "logits/rejected": 4.050804615020752, + "logps/chosen": -180.16539001464844, + "logps/rejected": -180.16539001464844, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.289138793945312, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.289138793945312, + "step": 798 + }, + { + "epoch": 0.5513196480938416, + "grad_norm": 0.30400264263153076, + "learning_rate": 1.53123802223074e-06, + "logits/chosen": 3.348416328430176, + "logits/rejected": 3.4154765605926514, + "logps/chosen": -166.3404083251953, + "logps/rejected": -176.47039794921875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.634020805358887, + "rewards/margins": 1.0519120693206787, + "rewards/rejected": -12.685933113098145, + "step": 799 + }, + { + "epoch": 0.552009660169053, + "grad_norm": 0.37918156385421753, + "learning_rate": 1.5331544653123803e-06, + "logits/chosen": 4.0896453857421875, + "logits/rejected": 4.0896453857421875, + "logps/chosen": -180.1869354248047, + "logps/rejected": -180.1869354248047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.005788803100586, + "rewards/margins": 0.0, + "rewards/rejected": -13.005788803100586, + "step": 800 + }, + { + "epoch": 0.5526996722442643, + "grad_norm": 0.4193623661994934, + "learning_rate": 1.5350709083940207e-06, + "logits/chosen": 3.4906256198883057, + "logits/rejected": 3.5669894218444824, + "logps/chosen": -154.86422729492188, + "logps/rejected": -165.44757080078125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.699015617370605, + "rewards/margins": 1.086832880973816, + "rewards/rejected": -11.785848617553711, + "step": 801 + }, + { + "epoch": 0.5533896843194755, + "grad_norm": 5.740319728851318, + "learning_rate": 1.5369873514756613e-06, + "logits/chosen": 3.481168270111084, + "logits/rejected": 3.5517537593841553, + "logps/chosen": -160.0954132080078, + "logps/rejected": -161.47802734375, + "loss": 0.6552, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.241615295410156, + "rewards/margins": 0.09276485443115234, + "rewards/rejected": -11.334380149841309, + "step": 802 + }, + { + "epoch": 0.5540796963946869, + "grad_norm": 0.2813841700553894, + "learning_rate": 1.5389037945573017e-06, + "logits/chosen": 3.790658950805664, + "logits/rejected": 3.926929473876953, + "logps/chosen": -145.50677490234375, + "logps/rejected": -168.49012756347656, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.91506576538086, + "rewards/margins": 2.0725746154785156, + "rewards/rejected": -11.987640380859375, + "step": 803 + }, + { + "epoch": 0.5547697084698983, + "grad_norm": 12.457432746887207, + "learning_rate": 1.5408202376389423e-06, + "logits/chosen": 3.5373034477233887, + "logits/rejected": 3.5492022037506104, + "logps/chosen": -160.4827423095703, + "logps/rejected": -170.71234130859375, + "loss": 0.751, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.22744369506836, + "rewards/margins": 0.9974583387374878, + "rewards/rejected": -12.224902153015137, + "step": 804 + }, + { + "epoch": 0.5554597205451095, + "grad_norm": 0.3344154953956604, + "learning_rate": 1.5427366807205827e-06, + "logits/chosen": 3.965179443359375, + "logits/rejected": 3.965179443359375, + "logps/chosen": -180.90245056152344, + "logps/rejected": -180.90243530273438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.314325332641602, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.314325332641602, + "step": 805 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 0.37211158871650696, + "learning_rate": 1.5446531238022233e-06, + "logits/chosen": 3.6026499271392822, + "logits/rejected": 3.609318494796753, + "logps/chosen": -154.48272705078125, + "logps/rejected": -161.95126342773438, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.489461898803711, + "rewards/margins": 0.8408089876174927, + "rewards/rejected": -11.330270767211914, + "step": 806 + }, + { + "epoch": 0.5568397446955322, + "grad_norm": 0.35358357429504395, + "learning_rate": 1.5465695668838637e-06, + "logits/chosen": 3.6990325450897217, + "logits/rejected": 3.6990325450897217, + "logps/chosen": -168.25051879882812, + "logps/rejected": -168.25051879882812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.042455673217773, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.042455673217773, + "step": 807 + }, + { + "epoch": 0.5575297567707435, + "grad_norm": 0.32138609886169434, + "learning_rate": 1.548486009965504e-06, + "logits/chosen": 4.223138332366943, + "logits/rejected": 4.261716365814209, + "logps/chosen": -171.26791381835938, + "logps/rejected": -181.61679077148438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.247191429138184, + "rewards/margins": 1.0371092557907104, + "rewards/rejected": -13.284299850463867, + "step": 808 + }, + { + "epoch": 0.5582197688459548, + "grad_norm": 0.3449404239654541, + "learning_rate": 1.5504024530471445e-06, + "logits/chosen": 3.5165648460388184, + "logits/rejected": 3.578057050704956, + "logps/chosen": -137.10179138183594, + "logps/rejected": -153.12039184570312, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.88978099822998, + "rewards/margins": 1.6004542112350464, + "rewards/rejected": -10.490235328674316, + "step": 809 + }, + { + "epoch": 0.5589097809211662, + "grad_norm": 0.3916918635368347, + "learning_rate": 1.5523188961287853e-06, + "logits/chosen": 3.783308506011963, + "logits/rejected": 3.783308506011963, + "logps/chosen": -169.88218688964844, + "logps/rejected": -169.88218688964844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.345173835754395, + "rewards/margins": 0.0, + "rewards/rejected": -12.345173835754395, + "step": 810 + }, + { + "epoch": 0.5595997929963774, + "grad_norm": 1.4737036228179932, + "learning_rate": 1.5542353392104257e-06, + "logits/chosen": 4.073970794677734, + "logits/rejected": 4.202978610992432, + "logps/chosen": -172.74017333984375, + "logps/rejected": -186.6790771484375, + "loss": 0.5249, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.469623565673828, + "rewards/margins": 1.4107582569122314, + "rewards/rejected": -13.880382537841797, + "step": 811 + }, + { + "epoch": 0.5602898050715888, + "grad_norm": 27.411008834838867, + "learning_rate": 1.556151782292066e-06, + "logits/chosen": 3.8460142612457275, + "logits/rejected": 3.9947304725646973, + "logps/chosen": -173.7490234375, + "logps/rejected": -182.54827880859375, + "loss": 0.757, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.600069046020508, + "rewards/margins": 0.954723596572876, + "rewards/rejected": -13.554792404174805, + "step": 812 + }, + { + "epoch": 0.5609798171468001, + "grad_norm": 0.33637818694114685, + "learning_rate": 1.5580682253737065e-06, + "logits/chosen": 4.108181476593018, + "logits/rejected": 4.108181476593018, + "logps/chosen": -186.94712829589844, + "logps/rejected": -186.94712829589844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.925978660583496, + "rewards/margins": 0.0, + "rewards/rejected": -13.925978660583496, + "step": 813 + }, + { + "epoch": 0.5616698292220114, + "grad_norm": 25.552785873413086, + "learning_rate": 1.559984668455347e-06, + "logits/chosen": 3.655441999435425, + "logits/rejected": 3.9123454093933105, + "logps/chosen": -157.11865234375, + "logps/rejected": -177.82949829101562, + "loss": 0.4419, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.799112319946289, + "rewards/margins": 2.0587034225463867, + "rewards/rejected": -12.857816696166992, + "step": 814 + }, + { + "epoch": 0.5623598412972227, + "grad_norm": 0.2771674692630768, + "learning_rate": 1.5619011115369875e-06, + "logits/chosen": 4.0956854820251465, + "logits/rejected": 4.19603967666626, + "logps/chosen": -178.20928955078125, + "logps/rejected": -192.70260620117188, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.063741683959961, + "rewards/margins": 1.3919315338134766, + "rewards/rejected": -14.455673217773438, + "step": 815 + }, + { + "epoch": 0.5630498533724341, + "grad_norm": 0.33862704038619995, + "learning_rate": 1.5638175546186279e-06, + "logits/chosen": 3.2817435264587402, + "logits/rejected": 3.331984519958496, + "logps/chosen": -132.21945190429688, + "logps/rejected": -145.34152221679688, + "loss": 0.5213, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.667654991149902, + "rewards/margins": 1.2882728576660156, + "rewards/rejected": -9.955928802490234, + "step": 816 + }, + { + "epoch": 0.5637398654476453, + "grad_norm": 0.4836674630641937, + "learning_rate": 1.5657339977002683e-06, + "logits/chosen": 3.5764315128326416, + "logits/rejected": 3.8313567638397217, + "logps/chosen": -143.19879150390625, + "logps/rejected": -167.04173278808594, + "loss": 0.4368, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.539109230041504, + "rewards/margins": 2.5141215324401855, + "rewards/rejected": -12.053230285644531, + "step": 817 + }, + { + "epoch": 0.5644298775228567, + "grad_norm": 0.2609269917011261, + "learning_rate": 1.567650440781909e-06, + "logits/chosen": 3.6884360313415527, + "logits/rejected": 3.8142943382263184, + "logps/chosen": -154.55068969726562, + "logps/rejected": -180.736572265625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.822535514831543, + "rewards/margins": 2.6080827713012695, + "rewards/rejected": -13.430619239807129, + "step": 818 + }, + { + "epoch": 0.5651198895980679, + "grad_norm": 9.184243202209473, + "learning_rate": 1.5695668838635495e-06, + "logits/chosen": 3.8098533153533936, + "logits/rejected": 3.778738021850586, + "logps/chosen": -177.45236206054688, + "logps/rejected": -175.04185485839844, + "loss": 0.8229, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.0542631149292, + "rewards/margins": -0.19203829765319824, + "rewards/rejected": -12.862225532531738, + "step": 819 + }, + { + "epoch": 0.5658099016732793, + "grad_norm": 0.3801496624946594, + "learning_rate": 1.5714833269451899e-06, + "logits/chosen": 4.042424201965332, + "logits/rejected": 4.042424201965332, + "logps/chosen": -174.13394165039062, + "logps/rejected": -174.13394165039062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.605913162231445, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.605913162231445, + "step": 820 + }, + { + "epoch": 0.5664999137484906, + "grad_norm": 14.51308536529541, + "learning_rate": 1.5733997700268303e-06, + "logits/chosen": 4.063736438751221, + "logits/rejected": 3.9907567501068115, + "logps/chosen": -182.9357452392578, + "logps/rejected": -184.63400268554688, + "loss": 1.1642, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.602087020874023, + "rewards/margins": 0.22842085361480713, + "rewards/rejected": -13.830507278442383, + "step": 821 + }, + { + "epoch": 0.5671899258237019, + "grad_norm": 0.2982933521270752, + "learning_rate": 1.5753162131084709e-06, + "logits/chosen": 3.9500784873962402, + "logits/rejected": 4.2750630378723145, + "logps/chosen": -178.0072479248047, + "logps/rejected": -187.38401794433594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.052107810974121, + "rewards/margins": 1.0061830282211304, + "rewards/rejected": -14.0582914352417, + "step": 822 + }, + { + "epoch": 0.5678799378989132, + "grad_norm": 0.3739455044269562, + "learning_rate": 1.5772326561901112e-06, + "logits/chosen": 4.0573883056640625, + "logits/rejected": 4.162537097930908, + "logps/chosen": -178.66506958007812, + "logps/rejected": -185.70098876953125, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.199542045593262, + "rewards/margins": 0.7134767770767212, + "rewards/rejected": -13.913019180297852, + "step": 823 + }, + { + "epoch": 0.5685699499741246, + "grad_norm": 0.3469533622264862, + "learning_rate": 1.5791490992717518e-06, + "logits/chosen": 3.473142147064209, + "logits/rejected": 3.630237102508545, + "logps/chosen": -151.18392944335938, + "logps/rejected": -171.1653594970703, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.448631286621094, + "rewards/margins": 1.9368481636047363, + "rewards/rejected": -12.385478973388672, + "step": 824 + }, + { + "epoch": 0.5692599620493358, + "grad_norm": 6.588101387023926, + "learning_rate": 1.5810655423533922e-06, + "logits/chosen": 3.6286866664886475, + "logits/rejected": 3.636035919189453, + "logps/chosen": -143.11813354492188, + "logps/rejected": -159.81398010253906, + "loss": 0.4762, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.62869930267334, + "rewards/margins": 1.539919376373291, + "rewards/rejected": -11.168619155883789, + "step": 825 + }, + { + "epoch": 0.5699499741245472, + "grad_norm": 0.2815130650997162, + "learning_rate": 1.5829819854350328e-06, + "logits/chosen": 3.8726823329925537, + "logits/rejected": 3.9530062675476074, + "logps/chosen": -178.65621948242188, + "logps/rejected": -188.44520568847656, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.122233390808105, + "rewards/margins": 0.9797601103782654, + "rewards/rejected": -14.101993560791016, + "step": 826 + }, + { + "epoch": 0.5706399861997585, + "grad_norm": 4.882701873779297, + "learning_rate": 1.5848984285166732e-06, + "logits/chosen": 3.5835909843444824, + "logits/rejected": 3.7324774265289307, + "logps/chosen": -167.6373291015625, + "logps/rejected": -192.46774291992188, + "loss": 0.4574, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.001384735107422, + "rewards/margins": 2.508810520172119, + "rewards/rejected": -14.5101957321167, + "step": 827 + }, + { + "epoch": 0.5713299982749698, + "grad_norm": 0.3963868021965027, + "learning_rate": 1.5868148715983136e-06, + "logits/chosen": 3.6052541732788086, + "logits/rejected": 3.7358412742614746, + "logps/chosen": -162.45458984375, + "logps/rejected": -169.9471435546875, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.549823760986328, + "rewards/margins": 0.7669708728790283, + "rewards/rejected": -12.316793441772461, + "step": 828 + }, + { + "epoch": 0.5720200103501811, + "grad_norm": 14.9633207321167, + "learning_rate": 1.588731314679954e-06, + "logits/chosen": 3.5632662773132324, + "logits/rejected": 3.6439499855041504, + "logps/chosen": -145.9838104248047, + "logps/rejected": -163.08151245117188, + "loss": 0.5423, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.871663093566895, + "rewards/margins": 1.7199170589447021, + "rewards/rejected": -11.59157943725586, + "step": 829 + }, + { + "epoch": 0.5727100224253925, + "grad_norm": 0.35096311569213867, + "learning_rate": 1.5906477577615948e-06, + "logits/chosen": 3.797621726989746, + "logits/rejected": 3.9680285453796387, + "logps/chosen": -181.1566619873047, + "logps/rejected": -188.8123779296875, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.443222045898438, + "rewards/margins": 0.7300385236740112, + "rewards/rejected": -14.173260688781738, + "step": 830 + }, + { + "epoch": 0.5734000345006037, + "grad_norm": 0.3193075656890869, + "learning_rate": 1.5925642008432352e-06, + "logits/chosen": 3.9520344734191895, + "logits/rejected": 4.069605350494385, + "logps/chosen": -165.7130126953125, + "logps/rejected": -175.63604736328125, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.82142448425293, + "rewards/margins": 0.9758703708648682, + "rewards/rejected": -12.797295570373535, + "step": 831 + }, + { + "epoch": 0.5740900465758151, + "grad_norm": 0.4241744577884674, + "learning_rate": 1.5944806439248756e-06, + "logits/chosen": 3.9448275566101074, + "logits/rejected": 3.9448275566101074, + "logps/chosen": -183.47540283203125, + "logps/rejected": -183.47540283203125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.610187530517578, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -13.610187530517578, + "step": 832 + }, + { + "epoch": 0.5747800586510264, + "grad_norm": 0.40249523520469666, + "learning_rate": 1.596397087006516e-06, + "logits/chosen": 4.015637397766113, + "logits/rejected": 4.086861610412598, + "logps/chosen": -161.56736755371094, + "logps/rejected": -173.95864868164062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.177366256713867, + "rewards/margins": 1.1387052536010742, + "rewards/rejected": -12.316070556640625, + "step": 833 + }, + { + "epoch": 0.5754700707262377, + "grad_norm": 0.4040238559246063, + "learning_rate": 1.5983135300881566e-06, + "logits/chosen": 3.7205119132995605, + "logits/rejected": 3.7205119132995605, + "logps/chosen": -175.07083129882812, + "logps/rejected": -175.07083129882812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.677783966064453, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.677783966064453, + "step": 834 + }, + { + "epoch": 0.576160082801449, + "grad_norm": 0.28647831082344055, + "learning_rate": 1.600229973169797e-06, + "logits/chosen": 3.842952013015747, + "logits/rejected": 3.9083058834075928, + "logps/chosen": -184.26296997070312, + "logps/rejected": -191.9800262451172, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.670477867126465, + "rewards/margins": 0.7698670625686646, + "rewards/rejected": -14.44034481048584, + "step": 835 + }, + { + "epoch": 0.5768500948766604, + "grad_norm": 10.919473648071289, + "learning_rate": 1.6021464162514374e-06, + "logits/chosen": 4.0959672927856445, + "logits/rejected": 3.951792001724243, + "logps/chosen": -177.8203582763672, + "logps/rejected": -171.2348175048828, + "loss": 1.2202, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.963996887207031, + "rewards/margins": -0.6128091812133789, + "rewards/rejected": -12.351187705993652, + "step": 836 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 0.38315248489379883, + "learning_rate": 1.6040628593330778e-06, + "logits/chosen": 4.069588661193848, + "logits/rejected": 4.069588661193848, + "logps/chosen": -181.41714477539062, + "logps/rejected": -181.41714477539062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.367963790893555, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.367963790893555, + "step": 837 + }, + { + "epoch": 0.578230119027083, + "grad_norm": 0.40492546558380127, + "learning_rate": 1.6059793024147182e-06, + "logits/chosen": 3.994589328765869, + "logits/rejected": 3.994589328765869, + "logps/chosen": -176.5445556640625, + "logps/rejected": -176.5445556640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.826553344726562, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.826553344726562, + "step": 838 + }, + { + "epoch": 0.5789201311022943, + "grad_norm": 13.42226791381836, + "learning_rate": 1.607895745496359e-06, + "logits/chosen": 3.9188976287841797, + "logits/rejected": 3.949920177459717, + "logps/chosen": -171.5319061279297, + "logps/rejected": -167.1969451904297, + "loss": 1.5079, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.495505332946777, + "rewards/margins": -0.4579182267189026, + "rewards/rejected": -12.03758716583252, + "step": 839 + }, + { + "epoch": 0.5796101431775056, + "grad_norm": 0.33478856086730957, + "learning_rate": 1.6098121885779994e-06, + "logits/chosen": 4.001485824584961, + "logits/rejected": 4.179047107696533, + "logps/chosen": -170.3568115234375, + "logps/rejected": -186.08755493164062, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.360114097595215, + "rewards/margins": 1.5697572231292725, + "rewards/rejected": -13.92987060546875, + "step": 840 + }, + { + "epoch": 0.5803001552527169, + "grad_norm": 0.3667425215244293, + "learning_rate": 1.6117286316596398e-06, + "logits/chosen": 3.611908435821533, + "logits/rejected": 3.837294816970825, + "logps/chosen": -151.31210327148438, + "logps/rejected": -169.49713134765625, + "loss": 0.521, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.46337890625, + "rewards/margins": 1.7917057275772095, + "rewards/rejected": -12.255084037780762, + "step": 841 + }, + { + "epoch": 0.5809901673279282, + "grad_norm": 0.23025187849998474, + "learning_rate": 1.6136450747412802e-06, + "logits/chosen": 3.55165433883667, + "logits/rejected": 3.909837245941162, + "logps/chosen": -148.5197296142578, + "logps/rejected": -175.580810546875, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.028060913085938, + "rewards/margins": 2.797694683074951, + "rewards/rejected": -12.82575511932373, + "step": 842 + }, + { + "epoch": 0.5816801794031395, + "grad_norm": 20.85237693786621, + "learning_rate": 1.6155615178229208e-06, + "logits/chosen": 3.8398778438568115, + "logits/rejected": 3.797090530395508, + "logps/chosen": -173.39715576171875, + "logps/rejected": -170.54727172851562, + "loss": 0.9535, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.576786994934082, + "rewards/margins": -0.36376523971557617, + "rewards/rejected": -12.213022232055664, + "step": 843 + }, + { + "epoch": 0.5823701914783509, + "grad_norm": 0.7223400473594666, + "learning_rate": 1.6174779609045614e-06, + "logits/chosen": 3.8936166763305664, + "logits/rejected": 4.02693510055542, + "logps/chosen": -174.0446319580078, + "logps/rejected": -184.5683135986328, + "loss": 0.5262, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.626239776611328, + "rewards/margins": 1.080328345298767, + "rewards/rejected": -13.706568717956543, + "step": 844 + }, + { + "epoch": 0.5830602035535621, + "grad_norm": 0.4030788838863373, + "learning_rate": 1.6193944039862018e-06, + "logits/chosen": 3.7404942512512207, + "logits/rejected": 3.7620320320129395, + "logps/chosen": -178.08489990234375, + "logps/rejected": -186.47561645507812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.103553771972656, + "rewards/margins": 0.9031965732574463, + "rewards/rejected": -14.006750106811523, + "step": 845 + }, + { + "epoch": 0.5837502156287735, + "grad_norm": 0.35722023248672485, + "learning_rate": 1.6213108470678422e-06, + "logits/chosen": 4.128819942474365, + "logits/rejected": 4.128819942474365, + "logps/chosen": -188.08660888671875, + "logps/rejected": -188.08660888671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.89117431640625, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.89117431640625, + "step": 846 + }, + { + "epoch": 0.5844402277039848, + "grad_norm": 0.3440701961517334, + "learning_rate": 1.6232272901494828e-06, + "logits/chosen": 4.0329742431640625, + "logits/rejected": 4.0329742431640625, + "logps/chosen": -183.07901000976562, + "logps/rejected": -183.07901000976562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.394203186035156, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.394203186035156, + "step": 847 + }, + { + "epoch": 0.5851302397791961, + "grad_norm": 0.3023083508014679, + "learning_rate": 1.6251437332311232e-06, + "logits/chosen": 4.3936262130737305, + "logits/rejected": 4.3936262130737305, + "logps/chosen": -192.77761840820312, + "logps/rejected": -192.77761840820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.510114669799805, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.510114669799805, + "step": 848 + }, + { + "epoch": 0.5858202518544074, + "grad_norm": 15.772600173950195, + "learning_rate": 1.6270601763127635e-06, + "logits/chosen": 4.180096626281738, + "logits/rejected": 4.250634670257568, + "logps/chosen": -169.4361572265625, + "logps/rejected": -181.98919677734375, + "loss": 0.7304, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.338839530944824, + "rewards/margins": 1.1789854764938354, + "rewards/rejected": -13.51782512664795, + "step": 849 + }, + { + "epoch": 0.5865102639296188, + "grad_norm": 0.3202323019504547, + "learning_rate": 1.628976619394404e-06, + "logits/chosen": 3.7363531589508057, + "logits/rejected": 3.7363531589508057, + "logps/chosen": -178.50941467285156, + "logps/rejected": -178.50941467285156, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.107780456542969, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -13.107781410217285, + "step": 850 + }, + { + "epoch": 0.58720027600483, + "grad_norm": 0.46738189458847046, + "learning_rate": 1.6308930624760447e-06, + "logits/chosen": 4.009243965148926, + "logits/rejected": 4.009243965148926, + "logps/chosen": -176.3843994140625, + "logps/rejected": -176.3843994140625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.843805313110352, + "rewards/margins": 0.0, + "rewards/rejected": -12.843805313110352, + "step": 851 + }, + { + "epoch": 0.5878902880800414, + "grad_norm": 0.32605838775634766, + "learning_rate": 1.6328095055576851e-06, + "logits/chosen": 3.9699952602386475, + "logits/rejected": 3.9699952602386475, + "logps/chosen": -185.27731323242188, + "logps/rejected": -185.27731323242188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.846490859985352, + "rewards/margins": 0.0, + "rewards/rejected": -13.846490859985352, + "step": 852 + }, + { + "epoch": 0.5885803001552528, + "grad_norm": 5.037729740142822, + "learning_rate": 1.6347259486393255e-06, + "logits/chosen": 4.062795639038086, + "logits/rejected": 4.000646591186523, + "logps/chosen": -169.23048400878906, + "logps/rejected": -170.62362670898438, + "loss": 0.6388, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.280153274536133, + "rewards/margins": 0.15279066562652588, + "rewards/rejected": -12.432944297790527, + "step": 853 + }, + { + "epoch": 0.589270312230464, + "grad_norm": 0.3106141686439514, + "learning_rate": 1.636642391720966e-06, + "logits/chosen": 4.273151874542236, + "logits/rejected": 4.353687286376953, + "logps/chosen": -181.97811889648438, + "logps/rejected": -192.85031127929688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.299236297607422, + "rewards/margins": 1.090427041053772, + "rewards/rejected": -14.389662742614746, + "step": 854 + }, + { + "epoch": 0.5899603243056754, + "grad_norm": 0.29792457818984985, + "learning_rate": 1.6385588348026065e-06, + "logits/chosen": 3.8464064598083496, + "logits/rejected": 4.0863356590271, + "logps/chosen": -156.5473175048828, + "logps/rejected": -180.53457641601562, + "loss": 0.5208, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.060107231140137, + "rewards/margins": 2.170738458633423, + "rewards/rejected": -13.230844497680664, + "step": 855 + }, + { + "epoch": 0.5906503363808867, + "grad_norm": 0.36655673384666443, + "learning_rate": 1.640475277884247e-06, + "logits/chosen": 4.098918914794922, + "logits/rejected": 4.235387325286865, + "logps/chosen": -188.28216552734375, + "logps/rejected": -194.12603759765625, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.677457809448242, + "rewards/margins": 0.6081845760345459, + "rewards/rejected": -14.285642623901367, + "step": 856 + }, + { + "epoch": 0.591340348456098, + "grad_norm": 1.744097113609314, + "learning_rate": 1.6423917209658873e-06, + "logits/chosen": 4.147280693054199, + "logits/rejected": 4.299409866333008, + "logps/chosen": -169.0706787109375, + "logps/rejected": -180.34872436523438, + "loss": 0.5345, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.074650764465332, + "rewards/margins": 1.1434885263442993, + "rewards/rejected": -13.2181396484375, + "step": 857 + }, + { + "epoch": 0.5920303605313093, + "grad_norm": 0.40099385380744934, + "learning_rate": 1.6443081640475277e-06, + "logits/chosen": 3.5422778129577637, + "logits/rejected": 3.5422778129577637, + "logps/chosen": -179.52423095703125, + "logps/rejected": -179.52423095703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.189325332641602, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.189325332641602, + "step": 858 + }, + { + "epoch": 0.5927203726065207, + "grad_norm": 0.3071611821651459, + "learning_rate": 1.6462246071291685e-06, + "logits/chosen": 3.939462423324585, + "logits/rejected": 3.9874258041381836, + "logps/chosen": -180.8251953125, + "logps/rejected": -196.27561950683594, + "loss": 0.5205, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.273458480834961, + "rewards/margins": 1.5153793096542358, + "rewards/rejected": -14.788836479187012, + "step": 859 + }, + { + "epoch": 0.5934103846817319, + "grad_norm": 11.388246536254883, + "learning_rate": 1.648141050210809e-06, + "logits/chosen": 4.023636817932129, + "logits/rejected": 4.146344184875488, + "logps/chosen": -169.34771728515625, + "logps/rejected": -182.55618286132812, + "loss": 0.693, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.200105667114258, + "rewards/margins": 1.3839458227157593, + "rewards/rejected": -13.584053039550781, + "step": 860 + }, + { + "epoch": 0.5941003967569433, + "grad_norm": 0.4007302224636078, + "learning_rate": 1.6500574932924493e-06, + "logits/chosen": 3.9107916355133057, + "logits/rejected": 3.909285545349121, + "logps/chosen": -165.52867126464844, + "logps/rejected": -179.9104461669922, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.820672035217285, + "rewards/margins": 1.4442238807678223, + "rewards/rejected": -13.264896392822266, + "step": 861 + }, + { + "epoch": 0.5947904088321546, + "grad_norm": 2.2305920124053955, + "learning_rate": 1.6519739363740897e-06, + "logits/chosen": 3.9759485721588135, + "logits/rejected": 3.993281126022339, + "logps/chosen": -171.72503662109375, + "logps/rejected": -173.70541381835938, + "loss": 0.623, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.298919677734375, + "rewards/margins": 0.2444911003112793, + "rewards/rejected": -12.54340934753418, + "step": 862 + }, + { + "epoch": 0.5954804209073659, + "grad_norm": 0.3723548650741577, + "learning_rate": 1.6538903794557303e-06, + "logits/chosen": 4.040742874145508, + "logits/rejected": 4.040742874145508, + "logps/chosen": -185.8033447265625, + "logps/rejected": -185.8033447265625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.869956016540527, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -13.869956016540527, + "step": 863 + }, + { + "epoch": 0.5961704329825772, + "grad_norm": 1.1193187236785889, + "learning_rate": 1.655806822537371e-06, + "logits/chosen": 3.9984652996063232, + "logits/rejected": 3.9741528034210205, + "logps/chosen": -163.078125, + "logps/rejected": -166.47119140625, + "loss": 0.6155, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.696178436279297, + "rewards/margins": 0.32403892278671265, + "rewards/rejected": -12.020217895507812, + "step": 864 + }, + { + "epoch": 0.5968604450577885, + "grad_norm": 22.344324111938477, + "learning_rate": 1.6577232656190113e-06, + "logits/chosen": 3.8582639694213867, + "logits/rejected": 4.001012325286865, + "logps/chosen": -147.2725830078125, + "logps/rejected": -164.9493408203125, + "loss": 0.8132, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.165847778320312, + "rewards/margins": 1.4965054988861084, + "rewards/rejected": -11.662353515625, + "step": 865 + }, + { + "epoch": 0.5975504571329998, + "grad_norm": 0.35292136669158936, + "learning_rate": 1.6596397087006517e-06, + "logits/chosen": 3.978811740875244, + "logits/rejected": 3.978811740875244, + "logps/chosen": -189.8008270263672, + "logps/rejected": -189.80084228515625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.231337547302246, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.231338500976562, + "step": 866 + }, + { + "epoch": 0.5982404692082112, + "grad_norm": 2.0652852058410645, + "learning_rate": 1.6615561517822923e-06, + "logits/chosen": 4.152830600738525, + "logits/rejected": 4.266855239868164, + "logps/chosen": -176.24295043945312, + "logps/rejected": -189.70697021484375, + "loss": 0.5329, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.779739379882812, + "rewards/margins": 1.3860660791397095, + "rewards/rejected": -14.165804862976074, + "step": 867 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 0.5702913403511047, + "learning_rate": 1.6634725948639327e-06, + "logits/chosen": 3.9344592094421387, + "logits/rejected": 4.072368621826172, + "logps/chosen": -160.67811584472656, + "logps/rejected": -165.92774963378906, + "loss": 0.6087, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.397416114807129, + "rewards/margins": 0.5029290318489075, + "rewards/rejected": -11.900344848632812, + "step": 868 + }, + { + "epoch": 0.5996204933586338, + "grad_norm": 11.061190605163574, + "learning_rate": 1.665389037945573e-06, + "logits/chosen": 3.922861337661743, + "logits/rejected": 3.990114688873291, + "logps/chosen": -162.00123596191406, + "logps/rejected": -165.61671447753906, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.441465377807617, + "rewards/margins": 0.37009763717651367, + "rewards/rejected": -11.811563491821289, + "step": 869 + }, + { + "epoch": 0.6003105054338451, + "grad_norm": 0.37354180216789246, + "learning_rate": 1.6673054810272135e-06, + "logits/chosen": 4.2898759841918945, + "logits/rejected": 4.2898759841918945, + "logps/chosen": -181.85458374023438, + "logps/rejected": -181.85458374023438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.469803810119629, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.469803810119629, + "step": 870 + }, + { + "epoch": 0.6010005175090564, + "grad_norm": 0.3258558213710785, + "learning_rate": 1.6692219241088543e-06, + "logits/chosen": 3.582853317260742, + "logits/rejected": 3.747342109680176, + "logps/chosen": -165.1012725830078, + "logps/rejected": -181.70773315429688, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.664369583129883, + "rewards/margins": 1.6446714401245117, + "rewards/rejected": -13.309040069580078, + "step": 871 + }, + { + "epoch": 0.6016905295842677, + "grad_norm": 0.38220423460006714, + "learning_rate": 1.6711383671904947e-06, + "logits/chosen": 3.9585089683532715, + "logits/rejected": 3.9886035919189453, + "logps/chosen": -186.80747985839844, + "logps/rejected": -193.6204376220703, + "loss": 0.607, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.884486198425293, + "rewards/margins": 0.6991637945175171, + "rewards/rejected": -14.583649635314941, + "step": 872 + }, + { + "epoch": 0.6023805416594791, + "grad_norm": 0.319236695766449, + "learning_rate": 1.673054810272135e-06, + "logits/chosen": 4.458718776702881, + "logits/rejected": 4.500579357147217, + "logps/chosen": -173.84323120117188, + "logps/rejected": -186.39639282226562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.617810249328613, + "rewards/margins": 1.1739861965179443, + "rewards/rejected": -13.791796684265137, + "step": 873 + }, + { + "epoch": 0.6030705537346903, + "grad_norm": 1.3094465732574463, + "learning_rate": 1.6749712533537754e-06, + "logits/chosen": 3.850377082824707, + "logits/rejected": 4.094783782958984, + "logps/chosen": -165.66184997558594, + "logps/rejected": -177.42430114746094, + "loss": 0.5265, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.982730865478516, + "rewards/margins": 1.2041776180267334, + "rewards/rejected": -13.186908721923828, + "step": 874 + }, + { + "epoch": 0.6037605658099017, + "grad_norm": 0.4979393184185028, + "learning_rate": 1.676887696435416e-06, + "logits/chosen": 4.204693794250488, + "logits/rejected": 4.443334579467773, + "logps/chosen": -162.076171875, + "logps/rejected": -193.542236328125, + "loss": 0.3513, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.290155410766602, + "rewards/margins": 3.0598888397216797, + "rewards/rejected": -14.350044250488281, + "step": 875 + }, + { + "epoch": 0.604450577885113, + "grad_norm": 0.2399548888206482, + "learning_rate": 1.6788041395170564e-06, + "logits/chosen": 4.247170925140381, + "logits/rejected": 4.297142028808594, + "logps/chosen": -157.4270477294922, + "logps/rejected": -179.06698608398438, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.03455638885498, + "rewards/margins": 2.130317211151123, + "rewards/rejected": -13.164874076843262, + "step": 876 + }, + { + "epoch": 0.6051405899603243, + "grad_norm": 2.6147713661193848, + "learning_rate": 1.6807205825986968e-06, + "logits/chosen": 3.982870101928711, + "logits/rejected": 4.13387393951416, + "logps/chosen": -169.11944580078125, + "logps/rejected": -179.21432495117188, + "loss": 0.534, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.165118217468262, + "rewards/margins": 1.021604061126709, + "rewards/rejected": -13.186722755432129, + "step": 877 + }, + { + "epoch": 0.6058306020355356, + "grad_norm": 0.29974129796028137, + "learning_rate": 1.6826370256803372e-06, + "logits/chosen": 4.177474021911621, + "logits/rejected": 4.193854331970215, + "logps/chosen": -175.64637756347656, + "logps/rejected": -185.45413208007812, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.723402976989746, + "rewards/margins": 1.032272219657898, + "rewards/rejected": -13.755674362182617, + "step": 878 + }, + { + "epoch": 0.606520614110747, + "grad_norm": 3.592836856842041, + "learning_rate": 1.684553468761978e-06, + "logits/chosen": 4.0629987716674805, + "logits/rejected": 4.143023490905762, + "logps/chosen": -178.70101928710938, + "logps/rejected": -187.92578125, + "loss": 0.6016, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.113269805908203, + "rewards/margins": 0.8110252618789673, + "rewards/rejected": -13.924295425415039, + "step": 879 + }, + { + "epoch": 0.6072106261859582, + "grad_norm": 0.35141339898109436, + "learning_rate": 1.6864699118436184e-06, + "logits/chosen": 3.9694511890411377, + "logits/rejected": 4.0844316482543945, + "logps/chosen": -179.7350616455078, + "logps/rejected": -190.75450134277344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.078506469726562, + "rewards/margins": 1.1353074312210083, + "rewards/rejected": -14.213813781738281, + "step": 880 + }, + { + "epoch": 0.6079006382611696, + "grad_norm": 0.5701307058334351, + "learning_rate": 1.6883863549252588e-06, + "logits/chosen": 4.232191562652588, + "logits/rejected": 4.226534843444824, + "logps/chosen": -171.59954833984375, + "logps/rejected": -176.7159423828125, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.421364784240723, + "rewards/margins": 0.5410637259483337, + "rewards/rejected": -12.962428092956543, + "step": 881 + }, + { + "epoch": 0.6085906503363809, + "grad_norm": 0.5429003834724426, + "learning_rate": 1.6903027980068992e-06, + "logits/chosen": 4.007916450500488, + "logits/rejected": 4.1642303466796875, + "logps/chosen": -174.5428466796875, + "logps/rejected": -188.21597290039062, + "loss": 0.5224, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.674221992492676, + "rewards/margins": 1.4883761405944824, + "rewards/rejected": -14.16259765625, + "step": 882 + }, + { + "epoch": 0.6092806624115922, + "grad_norm": 0.33215048909187317, + "learning_rate": 1.6922192410885398e-06, + "logits/chosen": 3.9215128421783447, + "logits/rejected": 3.9215128421783447, + "logps/chosen": -200.03778076171875, + "logps/rejected": -200.03778076171875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -15.230304718017578, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -15.230304718017578, + "step": 883 + }, + { + "epoch": 0.6099706744868035, + "grad_norm": 0.38123059272766113, + "learning_rate": 1.6941356841701804e-06, + "logits/chosen": 3.914914131164551, + "logits/rejected": 4.03162956237793, + "logps/chosen": -164.94046020507812, + "logps/rejected": -177.7524871826172, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.926085472106934, + "rewards/margins": 1.2148648500442505, + "rewards/rejected": -13.140950202941895, + "step": 884 + }, + { + "epoch": 0.6106606865620149, + "grad_norm": 0.31167539954185486, + "learning_rate": 1.6960521272518208e-06, + "logits/chosen": 4.495752334594727, + "logits/rejected": 4.495752334594727, + "logps/chosen": -191.6983642578125, + "logps/rejected": -191.6983642578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.326837539672852, + "rewards/margins": 0.0, + "rewards/rejected": -14.326837539672852, + "step": 885 + }, + { + "epoch": 0.6113506986372261, + "grad_norm": 3.51664662361145, + "learning_rate": 1.6979685703334612e-06, + "logits/chosen": 3.9704627990722656, + "logits/rejected": 3.9797658920288086, + "logps/chosen": -183.3338623046875, + "logps/rejected": -192.18930053710938, + "loss": 0.47, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.555401802062988, + "rewards/margins": 0.8629275560379028, + "rewards/rejected": -14.418328285217285, + "step": 886 + }, + { + "epoch": 0.6120407107124375, + "grad_norm": 0.38558611273765564, + "learning_rate": 1.6998850134151018e-06, + "logits/chosen": 4.110500335693359, + "logits/rejected": 4.323752403259277, + "logps/chosen": -186.44448852539062, + "logps/rejected": -201.80563354492188, + "loss": 0.5223, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.683120727539062, + "rewards/margins": 1.5879963636398315, + "rewards/rejected": -15.271116256713867, + "step": 887 + }, + { + "epoch": 0.6127307227876487, + "grad_norm": 10.511465072631836, + "learning_rate": 1.7018014564967422e-06, + "logits/chosen": 3.830449342727661, + "logits/rejected": 3.8928256034851074, + "logps/chosen": -177.11233520507812, + "logps/rejected": -182.97357177734375, + "loss": 0.8309, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.94150161743164, + "rewards/margins": 0.4872208833694458, + "rewards/rejected": -13.428723335266113, + "step": 888 + }, + { + "epoch": 0.6134207348628601, + "grad_norm": 0.40656372904777527, + "learning_rate": 1.7037178995783826e-06, + "logits/chosen": 4.03815221786499, + "logits/rejected": 4.03815221786499, + "logps/chosen": -181.59368896484375, + "logps/rejected": -181.59368896484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.318881034851074, + "rewards/margins": 0.0, + "rewards/rejected": -13.318881034851074, + "step": 889 + }, + { + "epoch": 0.6141107469380714, + "grad_norm": 4.849832534790039, + "learning_rate": 1.705634342660023e-06, + "logits/chosen": 3.3646187782287598, + "logits/rejected": 3.5893301963806152, + "logps/chosen": -145.44960021972656, + "logps/rejected": -166.32232666015625, + "loss": 0.4036, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.706417083740234, + "rewards/margins": 2.0890679359436035, + "rewards/rejected": -11.79548454284668, + "step": 890 + }, + { + "epoch": 0.6148007590132827, + "grad_norm": 0.32828405499458313, + "learning_rate": 1.7075507857416638e-06, + "logits/chosen": 4.311677932739258, + "logits/rejected": 4.311677932739258, + "logps/chosen": -196.17691040039062, + "logps/rejected": -196.17691040039062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.825072288513184, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.825072288513184, + "step": 891 + }, + { + "epoch": 0.615490771088494, + "grad_norm": 15.414594650268555, + "learning_rate": 1.7094672288233042e-06, + "logits/chosen": 4.406328201293945, + "logits/rejected": 4.275228023529053, + "logps/chosen": -190.34812927246094, + "logps/rejected": -190.99072265625, + "loss": 0.8048, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.21101188659668, + "rewards/margins": 0.028758645057678223, + "rewards/rejected": -14.23976993560791, + "step": 892 + }, + { + "epoch": 0.6161807831637054, + "grad_norm": 0.2873317003250122, + "learning_rate": 1.7113836719049446e-06, + "logits/chosen": 4.274540424346924, + "logits/rejected": 4.274540424346924, + "logps/chosen": -183.39845275878906, + "logps/rejected": -183.39846801757812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.611330032348633, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -13.61133098602295, + "step": 893 + }, + { + "epoch": 0.6168707952389166, + "grad_norm": 20.545822143554688, + "learning_rate": 1.713300114986585e-06, + "logits/chosen": 4.183937072753906, + "logits/rejected": 4.172464370727539, + "logps/chosen": -178.38595581054688, + "logps/rejected": -176.59170532226562, + "loss": 0.7965, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.388774871826172, + "rewards/margins": -0.159193754196167, + "rewards/rejected": -13.229581832885742, + "step": 894 + }, + { + "epoch": 0.617560807314128, + "grad_norm": 15.566410064697266, + "learning_rate": 1.7152165580682256e-06, + "logits/chosen": 4.085574150085449, + "logits/rejected": 4.0220112800598145, + "logps/chosen": -167.32249450683594, + "logps/rejected": -169.45004272460938, + "loss": 0.9285, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.927360534667969, + "rewards/margins": 0.2395768165588379, + "rewards/rejected": -12.166937828063965, + "step": 895 + }, + { + "epoch": 0.6182508193893393, + "grad_norm": 0.4215223491191864, + "learning_rate": 1.717133001149866e-06, + "logits/chosen": 3.983119010925293, + "logits/rejected": 4.054329872131348, + "logps/chosen": -172.3914794921875, + "logps/rejected": -178.85440063476562, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.479687690734863, + "rewards/margins": 0.6529895067214966, + "rewards/rejected": -13.13267707824707, + "step": 896 + }, + { + "epoch": 0.6189408314645506, + "grad_norm": 0.2778431177139282, + "learning_rate": 1.7190494442315064e-06, + "logits/chosen": 3.6602866649627686, + "logits/rejected": 4.221484661102295, + "logps/chosen": -162.13885498046875, + "logps/rejected": -188.74160766601562, + "loss": 0.4344, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.450982093811035, + "rewards/margins": 2.6554343700408936, + "rewards/rejected": -14.106416702270508, + "step": 897 + }, + { + "epoch": 0.6196308435397619, + "grad_norm": 0.291157603263855, + "learning_rate": 1.7209658873131468e-06, + "logits/chosen": 4.038686752319336, + "logits/rejected": 4.038686752319336, + "logps/chosen": -185.77752685546875, + "logps/rejected": -185.77752685546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.886468887329102, + "rewards/margins": 0.0, + "rewards/rejected": -13.886468887329102, + "step": 898 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 0.3012746274471283, + "learning_rate": 1.7228823303947876e-06, + "logits/chosen": 3.8921046257019043, + "logits/rejected": 3.92437744140625, + "logps/chosen": -158.52963256835938, + "logps/rejected": -166.86630249023438, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.048880577087402, + "rewards/margins": 0.8785239458084106, + "rewards/rejected": -11.927404403686523, + "step": 899 + }, + { + "epoch": 0.6210108676901845, + "grad_norm": 20.89299201965332, + "learning_rate": 1.724798773476428e-06, + "logits/chosen": 3.9801676273345947, + "logits/rejected": 3.959707260131836, + "logps/chosen": -159.07891845703125, + "logps/rejected": -157.19789123535156, + "loss": 0.9426, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.19760513305664, + "rewards/margins": -0.1715335249900818, + "rewards/rejected": -11.026071548461914, + "step": 900 + }, + { + "epoch": 0.6217008797653959, + "grad_norm": 0.6512901186943054, + "learning_rate": 1.7267152165580683e-06, + "logits/chosen": 3.8512260913848877, + "logits/rejected": 3.8628342151641846, + "logps/chosen": -176.31436157226562, + "logps/rejected": -186.19064331054688, + "loss": 0.5245, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.031015396118164, + "rewards/margins": 1.0138843059539795, + "rewards/rejected": -14.044899940490723, + "step": 901 + }, + { + "epoch": 0.6223908918406073, + "grad_norm": 0.39852452278137207, + "learning_rate": 1.7286316596397087e-06, + "logits/chosen": 3.586054801940918, + "logits/rejected": 3.6311769485473633, + "logps/chosen": -168.379150390625, + "logps/rejected": -178.7252197265625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.076120376586914, + "rewards/margins": 1.0310707092285156, + "rewards/rejected": -13.107192039489746, + "step": 902 + }, + { + "epoch": 0.6230809039158185, + "grad_norm": 24.03654670715332, + "learning_rate": 1.7305481027213493e-06, + "logits/chosen": 3.6422042846679688, + "logits/rejected": 3.6537578105926514, + "logps/chosen": -176.93084716796875, + "logps/rejected": -172.66983032226562, + "loss": 1.0308, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.83593463897705, + "rewards/margins": -0.4200468063354492, + "rewards/rejected": -12.415888786315918, + "step": 903 + }, + { + "epoch": 0.6237709159910299, + "grad_norm": 0.3060596287250519, + "learning_rate": 1.7324645458029897e-06, + "logits/chosen": 3.727684259414673, + "logits/rejected": 3.9671053886413574, + "logps/chosen": -159.4463348388672, + "logps/rejected": -180.62538146972656, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.116127967834473, + "rewards/margins": 2.136336326599121, + "rewards/rejected": -13.252464294433594, + "step": 904 + }, + { + "epoch": 0.6244609280662412, + "grad_norm": 0.8642547130584717, + "learning_rate": 1.7343809888846303e-06, + "logits/chosen": 4.645674705505371, + "logits/rejected": 4.636739730834961, + "logps/chosen": -183.89678955078125, + "logps/rejected": -188.071044921875, + "loss": 0.6097, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.636117935180664, + "rewards/margins": 0.4559924602508545, + "rewards/rejected": -14.092109680175781, + "step": 905 + }, + { + "epoch": 0.6251509401414524, + "grad_norm": 30.403486251831055, + "learning_rate": 1.7362974319662707e-06, + "logits/chosen": 4.251239776611328, + "logits/rejected": 4.1796159744262695, + "logps/chosen": -177.345703125, + "logps/rejected": -182.59323120117188, + "loss": 0.7571, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.934926986694336, + "rewards/margins": 0.507201075553894, + "rewards/rejected": -13.442127227783203, + "step": 906 + }, + { + "epoch": 0.6258409522166638, + "grad_norm": 0.38824477791786194, + "learning_rate": 1.7382138750479113e-06, + "logits/chosen": 4.351802825927734, + "logits/rejected": 4.351802825927734, + "logps/chosen": -183.8062744140625, + "logps/rejected": -183.8062744140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.757097244262695, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.757097244262695, + "step": 907 + }, + { + "epoch": 0.6265309642918752, + "grad_norm": 0.3258521258831024, + "learning_rate": 1.7401303181295517e-06, + "logits/chosen": 4.068499565124512, + "logits/rejected": 4.156876564025879, + "logps/chosen": -171.90431213378906, + "logps/rejected": -185.63424682617188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.342570304870605, + "rewards/margins": 1.3336018323898315, + "rewards/rejected": -13.676172256469727, + "step": 908 + }, + { + "epoch": 0.6272209763670864, + "grad_norm": 2.469243288040161, + "learning_rate": 1.7420467612111921e-06, + "logits/chosen": 3.8364081382751465, + "logits/rejected": 3.845656394958496, + "logps/chosen": -184.80213928222656, + "logps/rejected": -186.83416748046875, + "loss": 0.6211, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.659916877746582, + "rewards/margins": 0.2612558603286743, + "rewards/rejected": -13.921174049377441, + "step": 909 + }, + { + "epoch": 0.6279109884422978, + "grad_norm": 1.1918412446975708, + "learning_rate": 1.7439632042928325e-06, + "logits/chosen": 4.077528953552246, + "logits/rejected": 4.180663585662842, + "logps/chosen": -173.51441955566406, + "logps/rejected": -178.02542114257812, + "loss": 0.6114, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.699079513549805, + "rewards/margins": 0.4024319648742676, + "rewards/rejected": -13.101512908935547, + "step": 910 + }, + { + "epoch": 0.628601000517509, + "grad_norm": 1.7960578203201294, + "learning_rate": 1.7458796473744733e-06, + "logits/chosen": 4.08845329284668, + "logits/rejected": 4.103764533996582, + "logps/chosen": -177.3712158203125, + "logps/rejected": -180.97364807128906, + "loss": 0.6122, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.965690612792969, + "rewards/margins": 0.382331907749176, + "rewards/rejected": -13.3480224609375, + "step": 911 + }, + { + "epoch": 0.6292910125927204, + "grad_norm": 0.2885149419307709, + "learning_rate": 1.7477960904561137e-06, + "logits/chosen": 4.026134967803955, + "logits/rejected": 4.026134967803955, + "logps/chosen": -195.23275756835938, + "logps/rejected": -195.23275756835938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.691606521606445, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -14.691604614257812, + "step": 912 + }, + { + "epoch": 0.6299810246679317, + "grad_norm": 0.38338035345077515, + "learning_rate": 1.749712533537754e-06, + "logits/chosen": 3.7896947860717773, + "logits/rejected": 3.9205307960510254, + "logps/chosen": -174.42965698242188, + "logps/rejected": -184.73355102539062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.533830642700195, + "rewards/margins": 1.0284910202026367, + "rewards/rejected": -13.562320709228516, + "step": 913 + }, + { + "epoch": 0.630671036743143, + "grad_norm": 0.8222209811210632, + "learning_rate": 1.7516289766193945e-06, + "logits/chosen": 3.8449788093566895, + "logits/rejected": 3.9657015800476074, + "logps/chosen": -172.57745361328125, + "logps/rejected": -188.7217559814453, + "loss": 0.5253, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.611125946044922, + "rewards/margins": 1.7100591659545898, + "rewards/rejected": -14.321184158325195, + "step": 914 + }, + { + "epoch": 0.6313610488183543, + "grad_norm": 0.3076549470424652, + "learning_rate": 1.753545419701035e-06, + "logits/chosen": 3.9344611167907715, + "logits/rejected": 4.000519752502441, + "logps/chosen": -174.9913330078125, + "logps/rejected": -183.0855255126953, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.732741355895996, + "rewards/margins": 0.8357653021812439, + "rewards/rejected": -13.568506240844727, + "step": 915 + }, + { + "epoch": 0.6320510608935657, + "grad_norm": 0.4991316497325897, + "learning_rate": 1.7554618627826755e-06, + "logits/chosen": 4.040602684020996, + "logits/rejected": 4.031506538391113, + "logps/chosen": -172.05999755859375, + "logps/rejected": -177.20852661132812, + "loss": 0.6094, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.724056243896484, + "rewards/margins": 0.46918803453445435, + "rewards/rejected": -13.193243980407715, + "step": 916 + }, + { + "epoch": 0.6327410729687769, + "grad_norm": 0.808789074420929, + "learning_rate": 1.7573783058643159e-06, + "logits/chosen": 3.9070582389831543, + "logits/rejected": 3.9775540828704834, + "logps/chosen": -169.32611083984375, + "logps/rejected": -185.8018798828125, + "loss": 0.5238, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.245460510253906, + "rewards/margins": 1.544517993927002, + "rewards/rejected": -13.789978981018066, + "step": 917 + }, + { + "epoch": 0.6334310850439883, + "grad_norm": 1.3056186437606812, + "learning_rate": 1.7592947489459563e-06, + "logits/chosen": 3.605966806411743, + "logits/rejected": 3.771860361099243, + "logps/chosen": -176.45584106445312, + "logps/rejected": -180.311279296875, + "loss": 0.6109, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.792734146118164, + "rewards/margins": 0.4150702953338623, + "rewards/rejected": -13.207804679870605, + "step": 918 + }, + { + "epoch": 0.6341210971191996, + "grad_norm": 0.2995583415031433, + "learning_rate": 1.761211192027597e-06, + "logits/chosen": 3.883047103881836, + "logits/rejected": 3.883047103881836, + "logps/chosen": -176.37232971191406, + "logps/rejected": -176.37232971191406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.931103706359863, + "rewards/margins": 0.0, + "rewards/rejected": -12.931103706359863, + "step": 919 + }, + { + "epoch": 0.6348111091944109, + "grad_norm": 0.33899110555648804, + "learning_rate": 1.7631276351092375e-06, + "logits/chosen": 3.7846593856811523, + "logits/rejected": 3.936012029647827, + "logps/chosen": -177.09091186523438, + "logps/rejected": -187.61233520507812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.877504348754883, + "rewards/margins": 1.0422247648239136, + "rewards/rejected": -13.91972827911377, + "step": 920 + }, + { + "epoch": 0.6355011212696222, + "grad_norm": 0.4338635206222534, + "learning_rate": 1.7650440781908779e-06, + "logits/chosen": 3.5926578044891357, + "logits/rejected": 3.6463587284088135, + "logps/chosen": -158.093017578125, + "logps/rejected": -167.6980743408203, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.0748872756958, + "rewards/margins": 0.961405873298645, + "rewards/rejected": -12.036293029785156, + "step": 921 + }, + { + "epoch": 0.6361911333448336, + "grad_norm": 0.3194431960582733, + "learning_rate": 1.7669605212725183e-06, + "logits/chosen": 3.330886125564575, + "logits/rejected": 3.6602468490600586, + "logps/chosen": -148.9486541748047, + "logps/rejected": -177.53314208984375, + "loss": 0.4345, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.05427360534668, + "rewards/margins": 2.905548572540283, + "rewards/rejected": -12.959821701049805, + "step": 922 + }, + { + "epoch": 0.6368811454200448, + "grad_norm": 0.32986629009246826, + "learning_rate": 1.7688769643541589e-06, + "logits/chosen": 3.8262438774108887, + "logits/rejected": 3.956817388534546, + "logps/chosen": -159.66726684570312, + "logps/rejected": -172.10394287109375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.353875160217285, + "rewards/margins": 1.180096983909607, + "rewards/rejected": -12.53397274017334, + "step": 923 + }, + { + "epoch": 0.6375711574952562, + "grad_norm": 0.282458633184433, + "learning_rate": 1.7707934074357993e-06, + "logits/chosen": 4.03464412689209, + "logits/rejected": 4.072615146636963, + "logps/chosen": -177.68296813964844, + "logps/rejected": -187.68943786621094, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.938472747802734, + "rewards/margins": 1.0322426557540894, + "rewards/rejected": -13.970715522766113, + "step": 924 + }, + { + "epoch": 0.6382611695704675, + "grad_norm": 16.848398208618164, + "learning_rate": 1.7727098505174399e-06, + "logits/chosen": 3.7749857902526855, + "logits/rejected": 3.8096365928649902, + "logps/chosen": -179.02149963378906, + "logps/rejected": -176.6743621826172, + "loss": 0.879, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.255088806152344, + "rewards/margins": -0.2575559616088867, + "rewards/rejected": -12.99753189086914, + "step": 925 + }, + { + "epoch": 0.6389511816456788, + "grad_norm": 0.30570513010025024, + "learning_rate": 1.7746262935990803e-06, + "logits/chosen": 3.5718162059783936, + "logits/rejected": 3.7253620624542236, + "logps/chosen": -160.2969207763672, + "logps/rejected": -176.10919189453125, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.325265884399414, + "rewards/margins": 1.5449371337890625, + "rewards/rejected": -12.870203971862793, + "step": 926 + }, + { + "epoch": 0.6396411937208901, + "grad_norm": 0.37842050194740295, + "learning_rate": 1.7765427366807209e-06, + "logits/chosen": 4.099750518798828, + "logits/rejected": 4.099750518798828, + "logps/chosen": -187.91427612304688, + "logps/rejected": -187.91427612304688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.669906616210938, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.669906616210938, + "step": 927 + }, + { + "epoch": 0.6403312057961015, + "grad_norm": 0.32655972242355347, + "learning_rate": 1.7784591797623612e-06, + "logits/chosen": 3.6378581523895264, + "logits/rejected": 3.6378581523895264, + "logps/chosen": -165.04771423339844, + "logps/rejected": -165.0477294921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.772172927856445, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.772173881530762, + "step": 928 + }, + { + "epoch": 0.6410212178713127, + "grad_norm": 0.31496283411979675, + "learning_rate": 1.7803756228440016e-06, + "logits/chosen": 3.697488307952881, + "logits/rejected": 3.7861738204956055, + "logps/chosen": -174.2545166015625, + "logps/rejected": -183.6796112060547, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.689933776855469, + "rewards/margins": 0.9539287090301514, + "rewards/rejected": -13.643861770629883, + "step": 929 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 4.065457344055176, + "learning_rate": 1.782292065925642e-06, + "logits/chosen": 3.957967758178711, + "logits/rejected": 4.05443000793457, + "logps/chosen": -164.78956604003906, + "logps/rejected": -183.26133728027344, + "loss": 0.4648, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.657658576965332, + "rewards/margins": 1.775346040725708, + "rewards/rejected": -13.433005332946777, + "step": 930 + }, + { + "epoch": 0.6424012420217354, + "grad_norm": 28.72833251953125, + "learning_rate": 1.7842085090072828e-06, + "logits/chosen": 3.7102103233337402, + "logits/rejected": 3.6890554428100586, + "logps/chosen": -162.8118438720703, + "logps/rejected": -158.72866821289062, + "loss": 1.0112, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.534357070922852, + "rewards/margins": -0.39971959590911865, + "rewards/rejected": -11.134637832641602, + "step": 931 + }, + { + "epoch": 0.6430912540969467, + "grad_norm": 5.248081684112549, + "learning_rate": 1.7861249520889232e-06, + "logits/chosen": 3.6906991004943848, + "logits/rejected": 4.008680820465088, + "logps/chosen": -168.871826171875, + "logps/rejected": -192.2307586669922, + "loss": 0.372, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.167165756225586, + "rewards/margins": 2.318990707397461, + "rewards/rejected": -14.486156463623047, + "step": 932 + }, + { + "epoch": 0.643781266172158, + "grad_norm": 0.3372159004211426, + "learning_rate": 1.7880413951705636e-06, + "logits/chosen": 3.5906636714935303, + "logits/rejected": 3.6731009483337402, + "logps/chosen": -171.4319305419922, + "logps/rejected": -179.7093048095703, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.350018501281738, + "rewards/margins": 0.8301348686218262, + "rewards/rejected": -13.180153846740723, + "step": 933 + }, + { + "epoch": 0.6444712782473694, + "grad_norm": 0.36664825677871704, + "learning_rate": 1.789957838252204e-06, + "logits/chosen": 4.031540870666504, + "logits/rejected": 4.174680709838867, + "logps/chosen": -172.90463256835938, + "logps/rejected": -185.0111846923828, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.700886726379395, + "rewards/margins": 1.1576889753341675, + "rewards/rejected": -13.858575820922852, + "step": 934 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 8.159806251525879, + "learning_rate": 1.7918742813338446e-06, + "logits/chosen": 3.4980289936065674, + "logits/rejected": 3.6107232570648193, + "logps/chosen": -145.62628173828125, + "logps/rejected": -165.57424926757812, + "loss": 0.4717, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.879650115966797, + "rewards/margins": 2.0304818153381348, + "rewards/rejected": -11.910131454467773, + "step": 935 + }, + { + "epoch": 0.645851302397792, + "grad_norm": 7.712725639343262, + "learning_rate": 1.793790724415485e-06, + "logits/chosen": 3.9175703525543213, + "logits/rejected": 3.887936592102051, + "logps/chosen": -178.96205139160156, + "logps/rejected": -185.45989990234375, + "loss": 0.5849, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.127227783203125, + "rewards/margins": 0.6732158660888672, + "rewards/rejected": -13.800443649291992, + "step": 936 + }, + { + "epoch": 0.6465413144730032, + "grad_norm": 1.6404708623886108, + "learning_rate": 1.7957071674971254e-06, + "logits/chosen": 3.8536956310272217, + "logits/rejected": 3.864260673522949, + "logps/chosen": -178.68772888183594, + "logps/rejected": -181.9010467529297, + "loss": 0.6137, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.050037384033203, + "rewards/margins": 0.3540687561035156, + "rewards/rejected": -13.404107093811035, + "step": 937 + }, + { + "epoch": 0.6472313265482146, + "grad_norm": 0.3464309871196747, + "learning_rate": 1.7976236105787658e-06, + "logits/chosen": 3.8629047870635986, + "logits/rejected": 3.9342939853668213, + "logps/chosen": -167.16481018066406, + "logps/rejected": -174.3743133544922, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.932276725769043, + "rewards/margins": 0.7489471435546875, + "rewards/rejected": -12.68122386932373, + "step": 938 + }, + { + "epoch": 0.6479213386234259, + "grad_norm": 0.6281587481498718, + "learning_rate": 1.7995400536604066e-06, + "logits/chosen": 3.6889243125915527, + "logits/rejected": 3.9359817504882812, + "logps/chosen": -163.84664916992188, + "logps/rejected": -187.88027954101562, + "loss": 0.4376, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.54926586151123, + "rewards/margins": 2.42164945602417, + "rewards/rejected": -13.970914840698242, + "step": 939 + }, + { + "epoch": 0.6486113506986372, + "grad_norm": 0.3186596930027008, + "learning_rate": 1.801456496742047e-06, + "logits/chosen": 3.880819320678711, + "logits/rejected": 3.880819320678711, + "logps/chosen": -185.66830444335938, + "logps/rejected": -185.66830444335938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.764585494995117, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.764585494995117, + "step": 940 + }, + { + "epoch": 0.6493013627738485, + "grad_norm": 0.3466106057167053, + "learning_rate": 1.8033729398236874e-06, + "logits/chosen": 3.9067893028259277, + "logits/rejected": 3.9067893028259277, + "logps/chosen": -181.16351318359375, + "logps/rejected": -181.16351318359375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.381340026855469, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.381341934204102, + "step": 941 + }, + { + "epoch": 0.6499913748490599, + "grad_norm": 4.0907487869262695, + "learning_rate": 1.8052893829053278e-06, + "logits/chosen": 3.871941566467285, + "logits/rejected": 3.87320613861084, + "logps/chosen": -177.62918090820312, + "logps/rejected": -184.51661682128906, + "loss": 0.5418, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.076433181762695, + "rewards/margins": 0.7834942936897278, + "rewards/rejected": -13.859928131103516, + "step": 942 + }, + { + "epoch": 0.6506813869242711, + "grad_norm": 0.3473730981349945, + "learning_rate": 1.8072058259869682e-06, + "logits/chosen": 3.6767075061798096, + "logits/rejected": 3.6767075061798096, + "logps/chosen": -173.32293701171875, + "logps/rejected": -173.32293701171875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.534683227539062, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.534683227539062, + "step": 943 + }, + { + "epoch": 0.6513713989994825, + "grad_norm": 0.6310209035873413, + "learning_rate": 1.8091222690686088e-06, + "logits/chosen": 3.48974347114563, + "logits/rejected": 3.655993938446045, + "logps/chosen": -155.98614501953125, + "logps/rejected": -174.061279296875, + "loss": 0.4374, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.893308639526367, + "rewards/margins": 1.7944713830947876, + "rewards/rejected": -12.687780380249023, + "step": 944 + }, + { + "epoch": 0.6520614110746938, + "grad_norm": 0.3859781324863434, + "learning_rate": 1.8110387121502494e-06, + "logits/chosen": 3.4756388664245605, + "logits/rejected": 3.703458786010742, + "logps/chosen": -168.37167358398438, + "logps/rejected": -185.74732971191406, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.075540542602539, + "rewards/margins": 1.8081884384155273, + "rewards/rejected": -13.883729934692383, + "step": 945 + }, + { + "epoch": 0.6527514231499051, + "grad_norm": 0.3695357143878937, + "learning_rate": 1.8129551552318898e-06, + "logits/chosen": 3.6961965560913086, + "logits/rejected": 3.786806583404541, + "logps/chosen": -171.29591369628906, + "logps/rejected": -180.50216674804688, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.325455665588379, + "rewards/margins": 0.9167970418930054, + "rewards/rejected": -13.242253303527832, + "step": 946 + }, + { + "epoch": 0.6534414352251164, + "grad_norm": 3.604282855987549, + "learning_rate": 1.8148715983135302e-06, + "logits/chosen": 4.014003753662109, + "logits/rejected": 3.8774354457855225, + "logps/chosen": -170.63931274414062, + "logps/rejected": -182.02291870117188, + "loss": 0.4541, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.350360870361328, + "rewards/margins": 1.1169583797454834, + "rewards/rejected": -13.46731948852539, + "step": 947 + }, + { + "epoch": 0.6541314473003278, + "grad_norm": 0.6903977990150452, + "learning_rate": 1.8167880413951708e-06, + "logits/chosen": 3.8081037998199463, + "logits/rejected": 3.9968349933624268, + "logps/chosen": -172.05914306640625, + "logps/rejected": -181.6092529296875, + "loss": 0.5276, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.533594131469727, + "rewards/margins": 0.8666660785675049, + "rewards/rejected": -13.400260925292969, + "step": 948 + }, + { + "epoch": 0.654821459375539, + "grad_norm": 0.40912410616874695, + "learning_rate": 1.8187044844768112e-06, + "logits/chosen": 3.8923113346099854, + "logits/rejected": 3.8898463249206543, + "logps/chosen": -181.29852294921875, + "logps/rejected": -188.22987365722656, + "loss": 0.607, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.390275001525879, + "rewards/margins": 0.7001914978027344, + "rewards/rejected": -14.090465545654297, + "step": 949 + }, + { + "epoch": 0.6555114714507504, + "grad_norm": 0.3562586009502411, + "learning_rate": 1.8206209275584516e-06, + "logits/chosen": 3.788813352584839, + "logits/rejected": 3.788813352584839, + "logps/chosen": -183.4070281982422, + "logps/rejected": -183.4070281982422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.50377368927002, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.50377368927002, + "step": 950 + }, + { + "epoch": 0.6562014835259617, + "grad_norm": 0.38397666811943054, + "learning_rate": 1.822537370640092e-06, + "logits/chosen": 4.128597259521484, + "logits/rejected": 4.128597259521484, + "logps/chosen": -177.13169860839844, + "logps/rejected": -177.13169860839844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.787187576293945, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.787187576293945, + "step": 951 + }, + { + "epoch": 0.656891495601173, + "grad_norm": 0.2818267047405243, + "learning_rate": 1.8244538137217328e-06, + "logits/chosen": 3.7421772480010986, + "logits/rejected": 3.7881321907043457, + "logps/chosen": -163.76528930664062, + "logps/rejected": -172.4823760986328, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.6522798538208, + "rewards/margins": 0.937078595161438, + "rewards/rejected": -12.58935832977295, + "step": 952 + }, + { + "epoch": 0.6575815076763843, + "grad_norm": 0.33899781107902527, + "learning_rate": 1.8263702568033732e-06, + "logits/chosen": 4.224969387054443, + "logits/rejected": 4.373454570770264, + "logps/chosen": -178.02780151367188, + "logps/rejected": -187.40481567382812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.895076751708984, + "rewards/margins": 0.9112739562988281, + "rewards/rejected": -13.806350708007812, + "step": 953 + }, + { + "epoch": 0.6582715197515957, + "grad_norm": 0.33699533343315125, + "learning_rate": 1.8282866998850135e-06, + "logits/chosen": 3.8875608444213867, + "logits/rejected": 3.936969041824341, + "logps/chosen": -167.45901489257812, + "logps/rejected": -178.25013732910156, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.207590103149414, + "rewards/margins": 1.075716495513916, + "rewards/rejected": -13.283307075500488, + "step": 954 + }, + { + "epoch": 0.658961531826807, + "grad_norm": 0.31704476475715637, + "learning_rate": 1.830203142966654e-06, + "logits/chosen": 3.8664140701293945, + "logits/rejected": 3.937774181365967, + "logps/chosen": -164.58377075195312, + "logps/rejected": -174.79547119140625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.677396774291992, + "rewards/margins": 1.0335495471954346, + "rewards/rejected": -12.710947036743164, + "step": 955 + }, + { + "epoch": 0.6596515439020183, + "grad_norm": 0.31397560238838196, + "learning_rate": 1.8321195860482945e-06, + "logits/chosen": 3.8650784492492676, + "logits/rejected": 3.961860179901123, + "logps/chosen": -165.31939697265625, + "logps/rejected": -180.86639404296875, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.726594924926758, + "rewards/margins": 1.6198960542678833, + "rewards/rejected": -13.346490859985352, + "step": 956 + }, + { + "epoch": 0.6603415559772297, + "grad_norm": 0.3585034906864166, + "learning_rate": 1.834036029129935e-06, + "logits/chosen": 3.629801034927368, + "logits/rejected": 3.698542833328247, + "logps/chosen": -154.53106689453125, + "logps/rejected": -165.68246459960938, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.702045440673828, + "rewards/margins": 1.1088588237762451, + "rewards/rejected": -11.810904502868652, + "step": 957 + }, + { + "epoch": 0.6610315680524409, + "grad_norm": 0.3911627233028412, + "learning_rate": 1.8359524722115753e-06, + "logits/chosen": 3.995349884033203, + "logits/rejected": 3.995349884033203, + "logps/chosen": -180.2135009765625, + "logps/rejected": -180.2135009765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.312324523925781, + "rewards/margins": 0.0, + "rewards/rejected": -13.312324523925781, + "step": 958 + }, + { + "epoch": 0.6617215801276523, + "grad_norm": 0.31925487518310547, + "learning_rate": 1.8378689152932157e-06, + "logits/chosen": 3.772298812866211, + "logits/rejected": 3.772298812866211, + "logps/chosen": -177.2867431640625, + "logps/rejected": -177.2867431640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.056153297424316, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.05615234375, + "step": 959 + }, + { + "epoch": 0.6624115922028635, + "grad_norm": 6.663559436798096, + "learning_rate": 1.8397853583748565e-06, + "logits/chosen": 3.565207004547119, + "logits/rejected": 3.9892990589141846, + "logps/chosen": -159.35675048828125, + "logps/rejected": -185.91299438476562, + "loss": 0.3913, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.233566284179688, + "rewards/margins": 2.5759143829345703, + "rewards/rejected": -13.809480667114258, + "step": 960 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 28.251121520996094, + "learning_rate": 1.841701801456497e-06, + "logits/chosen": 3.5585553646087646, + "logits/rejected": 3.5734989643096924, + "logps/chosen": -160.1917266845703, + "logps/rejected": -153.07608032226562, + "loss": 1.3318, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.336447715759277, + "rewards/margins": -0.7249088287353516, + "rewards/rejected": -10.61153793334961, + "step": 961 + }, + { + "epoch": 0.6637916163532862, + "grad_norm": 11.028215408325195, + "learning_rate": 1.8436182445381373e-06, + "logits/chosen": 3.7028841972351074, + "logits/rejected": 3.696282148361206, + "logps/chosen": -166.68896484375, + "logps/rejected": -170.40757751464844, + "loss": 0.6343, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.041570663452148, + "rewards/margins": 0.4234120845794678, + "rewards/rejected": -12.464982986450195, + "step": 962 + }, + { + "epoch": 0.6644816284284975, + "grad_norm": 0.3555391728878021, + "learning_rate": 1.8455346876197777e-06, + "logits/chosen": 3.5479671955108643, + "logits/rejected": 3.6786091327667236, + "logps/chosen": -153.24765014648438, + "logps/rejected": -163.0224609375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.546403884887695, + "rewards/margins": 0.982257604598999, + "rewards/rejected": -11.528661727905273, + "step": 963 + }, + { + "epoch": 0.6651716405037088, + "grad_norm": 0.3303990066051483, + "learning_rate": 1.8474511307014183e-06, + "logits/chosen": 3.6634044647216797, + "logits/rejected": 3.6813712120056152, + "logps/chosen": -157.48504638671875, + "logps/rejected": -167.07635498046875, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.166605949401855, + "rewards/margins": 0.9642347097396851, + "rewards/rejected": -12.130840301513672, + "step": 964 + }, + { + "epoch": 0.6658616525789202, + "grad_norm": 0.35479894280433655, + "learning_rate": 1.849367573783059e-06, + "logits/chosen": 3.549785614013672, + "logits/rejected": 3.561232089996338, + "logps/chosen": -158.82196044921875, + "logps/rejected": -166.67703247070312, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.281835556030273, + "rewards/margins": 0.8057094812393188, + "rewards/rejected": -12.087545394897461, + "step": 965 + }, + { + "epoch": 0.6665516646541314, + "grad_norm": 1.0986064672470093, + "learning_rate": 1.8512840168646993e-06, + "logits/chosen": 3.6767873764038086, + "logits/rejected": 3.628051996231079, + "logps/chosen": -167.76748657226562, + "logps/rejected": -170.41192626953125, + "loss": 0.619, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.771727561950684, + "rewards/margins": 0.2811663746833801, + "rewards/rejected": -12.05289363861084, + "step": 966 + }, + { + "epoch": 0.6672416767293428, + "grad_norm": 0.9236999750137329, + "learning_rate": 1.8532004599463397e-06, + "logits/chosen": 3.693950653076172, + "logits/rejected": 4.024295330047607, + "logps/chosen": -163.92617797851562, + "logps/rejected": -178.90733337402344, + "loss": 0.5264, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.70530891418457, + "rewards/margins": 1.4529023170471191, + "rewards/rejected": -13.158211708068848, + "step": 967 + }, + { + "epoch": 0.6679316888045541, + "grad_norm": 0.36401399970054626, + "learning_rate": 1.8551169030279803e-06, + "logits/chosen": 3.8707950115203857, + "logits/rejected": 3.8707950115203857, + "logps/chosen": -170.4921112060547, + "logps/rejected": -170.4921112060547, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.424558639526367, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.424558639526367, + "step": 968 + }, + { + "epoch": 0.6686217008797654, + "grad_norm": 0.3553539216518402, + "learning_rate": 1.8570333461096207e-06, + "logits/chosen": 3.8777270317077637, + "logits/rejected": 3.8777270317077637, + "logps/chosen": -169.1523895263672, + "logps/rejected": -169.15240478515625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.167953491210938, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.167954444885254, + "step": 969 + }, + { + "epoch": 0.6693117129549767, + "grad_norm": 0.3696236312389374, + "learning_rate": 1.858949789191261e-06, + "logits/chosen": 3.6002824306488037, + "logits/rejected": 3.6600821018218994, + "logps/chosen": -172.54458618164062, + "logps/rejected": -184.62991333007812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.187702178955078, + "rewards/margins": 1.2051221132278442, + "rewards/rejected": -13.392824172973633, + "step": 970 + }, + { + "epoch": 0.6700017250301881, + "grad_norm": 0.34761765599250793, + "learning_rate": 1.8608662322729015e-06, + "logits/chosen": 3.464306592941284, + "logits/rejected": 3.464306592941284, + "logps/chosen": -146.33900451660156, + "logps/rejected": -146.33900451660156, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.041913986206055, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -10.041913986206055, + "step": 971 + }, + { + "epoch": 0.6706917371053993, + "grad_norm": 0.3196980357170105, + "learning_rate": 1.8627826753545423e-06, + "logits/chosen": 4.025696754455566, + "logits/rejected": 4.147774696350098, + "logps/chosen": -176.021240234375, + "logps/rejected": -189.46348571777344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.960342407226562, + "rewards/margins": 1.2455843687057495, + "rewards/rejected": -14.205928802490234, + "step": 972 + }, + { + "epoch": 0.6713817491806107, + "grad_norm": 0.4568239152431488, + "learning_rate": 1.8646991184361827e-06, + "logits/chosen": 3.7936794757843018, + "logits/rejected": 3.7936794757843018, + "logps/chosen": -173.25112915039062, + "logps/rejected": -173.25112915039062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.315713882446289, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.315712928771973, + "step": 973 + }, + { + "epoch": 0.672071761255822, + "grad_norm": 0.39013931155204773, + "learning_rate": 1.866615561517823e-06, + "logits/chosen": 3.5678627490997314, + "logits/rejected": 3.5678627490997314, + "logps/chosen": -187.11361694335938, + "logps/rejected": -187.11361694335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.926209449768066, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.926209449768066, + "step": 974 + }, + { + "epoch": 0.6727617733310333, + "grad_norm": 6.204115390777588, + "learning_rate": 1.8685320045994635e-06, + "logits/chosen": 3.4682977199554443, + "logits/rejected": 3.565779685974121, + "logps/chosen": -175.06048583984375, + "logps/rejected": -189.12301635742188, + "loss": 0.4612, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.660213470458984, + "rewards/margins": 1.6045852899551392, + "rewards/rejected": -14.264799118041992, + "step": 975 + }, + { + "epoch": 0.6734517854062446, + "grad_norm": 3.3564751148223877, + "learning_rate": 1.870448447681104e-06, + "logits/chosen": 3.970337390899658, + "logits/rejected": 3.8863015174865723, + "logps/chosen": -182.60809326171875, + "logps/rejected": -184.99472045898438, + "loss": 0.6247, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.512027740478516, + "rewards/margins": 0.23179233074188232, + "rewards/rejected": -13.743819236755371, + "step": 976 + }, + { + "epoch": 0.674141797481456, + "grad_norm": 0.38408154249191284, + "learning_rate": 1.8723648907627445e-06, + "logits/chosen": 3.4517581462860107, + "logits/rejected": 3.596641778945923, + "logps/chosen": -167.98446655273438, + "logps/rejected": -184.45761108398438, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.953510284423828, + "rewards/margins": 1.6997482776641846, + "rewards/rejected": -13.653258323669434, + "step": 977 + }, + { + "epoch": 0.6748318095566672, + "grad_norm": 0.399826318025589, + "learning_rate": 1.8742813338443848e-06, + "logits/chosen": 3.7985854148864746, + "logits/rejected": 3.9082531929016113, + "logps/chosen": -176.15550231933594, + "logps/rejected": -181.5605926513672, + "loss": 0.6087, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.759069442749023, + "rewards/margins": 0.5017222166061401, + "rewards/rejected": -13.260791778564453, + "step": 978 + }, + { + "epoch": 0.6755218216318786, + "grad_norm": 0.34649765491485596, + "learning_rate": 1.8761977769260252e-06, + "logits/chosen": 3.715787172317505, + "logits/rejected": 3.7435317039489746, + "logps/chosen": -172.01345825195312, + "logps/rejected": -178.96974182128906, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.3330078125, + "rewards/margins": 0.7260391712188721, + "rewards/rejected": -13.05904769897461, + "step": 979 + }, + { + "epoch": 0.6762118337070899, + "grad_norm": 0.34765490889549255, + "learning_rate": 1.878114220007666e-06, + "logits/chosen": 3.5665993690490723, + "logits/rejected": 3.6961910724639893, + "logps/chosen": -161.30197143554688, + "logps/rejected": -171.38519287109375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.353029251098633, + "rewards/margins": 1.0134464502334595, + "rewards/rejected": -12.366476058959961, + "step": 980 + }, + { + "epoch": 0.6769018457823012, + "grad_norm": 0.3053417503833771, + "learning_rate": 1.8800306630893064e-06, + "logits/chosen": 4.230391502380371, + "logits/rejected": 4.251419544219971, + "logps/chosen": -180.57354736328125, + "logps/rejected": -190.05389404296875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.146570205688477, + "rewards/margins": 1.028342366218567, + "rewards/rejected": -14.17491340637207, + "step": 981 + }, + { + "epoch": 0.6775918578575125, + "grad_norm": 0.4831150770187378, + "learning_rate": 1.8819471061709468e-06, + "logits/chosen": 3.8590660095214844, + "logits/rejected": 3.853502035140991, + "logps/chosen": -177.20809936523438, + "logps/rejected": -185.87646484375, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.996672630310059, + "rewards/margins": 0.8923544883728027, + "rewards/rejected": -13.889026641845703, + "step": 982 + }, + { + "epoch": 0.6782818699327238, + "grad_norm": 0.3256362974643707, + "learning_rate": 1.8838635492525872e-06, + "logits/chosen": 3.8668084144592285, + "logits/rejected": 3.8668084144592285, + "logps/chosen": -187.24473571777344, + "logps/rejected": -187.24473571777344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.795266151428223, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -13.795266151428223, + "step": 983 + }, + { + "epoch": 0.6789718820079351, + "grad_norm": 0.8200797438621521, + "learning_rate": 1.8857799923342278e-06, + "logits/chosen": 3.5542654991149902, + "logits/rejected": 3.599987506866455, + "logps/chosen": -181.78224182128906, + "logps/rejected": -185.8772735595703, + "loss": 0.6128, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.335676193237305, + "rewards/margins": 0.36991405487060547, + "rewards/rejected": -13.705589294433594, + "step": 984 + }, + { + "epoch": 0.6796618940831465, + "grad_norm": 0.28970426321029663, + "learning_rate": 1.8876964354158684e-06, + "logits/chosen": 3.8410003185272217, + "logits/rejected": 3.8410003185272217, + "logps/chosen": -190.5145263671875, + "logps/rejected": -190.5145263671875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.153532981872559, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.153533935546875, + "step": 985 + }, + { + "epoch": 0.6803519061583577, + "grad_norm": 0.5038319230079651, + "learning_rate": 1.8896128784975088e-06, + "logits/chosen": 3.8325576782226562, + "logits/rejected": 3.8325576782226562, + "logps/chosen": -164.6519012451172, + "logps/rejected": -164.6519012451172, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.837480545043945, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.837479591369629, + "step": 986 + }, + { + "epoch": 0.6810419182335691, + "grad_norm": 0.39254409074783325, + "learning_rate": 1.8915293215791492e-06, + "logits/chosen": 3.4240384101867676, + "logits/rejected": 3.4240384101867676, + "logps/chosen": -167.3977813720703, + "logps/rejected": -167.3977813720703, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.083547592163086, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.083547592163086, + "step": 987 + }, + { + "epoch": 0.6817319303087804, + "grad_norm": 0.3899356722831726, + "learning_rate": 1.8934457646607898e-06, + "logits/chosen": 3.6889281272888184, + "logits/rejected": 3.6889281272888184, + "logps/chosen": -171.88986206054688, + "logps/rejected": -171.88986206054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.564088821411133, + "rewards/margins": 0.0, + "rewards/rejected": -12.564088821411133, + "step": 988 + }, + { + "epoch": 0.6824219423839917, + "grad_norm": 0.3539946377277374, + "learning_rate": 1.8953622077424302e-06, + "logits/chosen": 3.7386913299560547, + "logits/rejected": 3.899840831756592, + "logps/chosen": -157.01974487304688, + "logps/rejected": -177.22879028320312, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.1144380569458, + "rewards/margins": 2.017019748687744, + "rewards/rejected": -13.131458282470703, + "step": 989 + }, + { + "epoch": 0.683111954459203, + "grad_norm": 0.3767626881599426, + "learning_rate": 1.8972786508240706e-06, + "logits/chosen": 3.4899253845214844, + "logits/rejected": 3.6514599323272705, + "logps/chosen": -163.9143829345703, + "logps/rejected": -179.17233276367188, + "loss": 0.5216, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.459817886352539, + "rewards/margins": 1.533532738685608, + "rewards/rejected": -12.9933500289917, + "step": 990 + }, + { + "epoch": 0.6838019665344144, + "grad_norm": 0.2662990391254425, + "learning_rate": 1.899195093905711e-06, + "logits/chosen": 3.3191897869110107, + "logits/rejected": 3.3941640853881836, + "logps/chosen": -160.25543212890625, + "logps/rejected": -173.96646118164062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.101941108703613, + "rewards/margins": 1.420068621635437, + "rewards/rejected": -12.52200984954834, + "step": 991 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 0.3012333810329437, + "learning_rate": 1.9011115369873518e-06, + "logits/chosen": 3.5762155055999756, + "logits/rejected": 3.696743965148926, + "logps/chosen": -159.98489379882812, + "logps/rejected": -172.33334350585938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.138703346252441, + "rewards/margins": 1.1879218816757202, + "rewards/rejected": -12.326624870300293, + "step": 992 + }, + { + "epoch": 0.685181990684837, + "grad_norm": 0.4006446301937103, + "learning_rate": 1.9030279800689922e-06, + "logits/chosen": 3.782784938812256, + "logits/rejected": 3.979494571685791, + "logps/chosen": -170.19728088378906, + "logps/rejected": -182.47488403320312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.569518089294434, + "rewards/margins": 1.246206521987915, + "rewards/rejected": -13.81572437286377, + "step": 993 + }, + { + "epoch": 0.6858720027600483, + "grad_norm": 0.9919675588607788, + "learning_rate": 1.9049444231506326e-06, + "logits/chosen": 3.5413105487823486, + "logits/rejected": 3.589956045150757, + "logps/chosen": -154.9502716064453, + "logps/rejected": -168.68345642089844, + "loss": 0.5269, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.556554794311523, + "rewards/margins": 1.4476830959320068, + "rewards/rejected": -12.00423812866211, + "step": 994 + }, + { + "epoch": 0.6865620148352596, + "grad_norm": 0.38238832354545593, + "learning_rate": 1.906860866232273e-06, + "logits/chosen": 3.683627128601074, + "logits/rejected": 3.683627128601074, + "logps/chosen": -154.07931518554688, + "logps/rejected": -154.07931518554688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.42993450164795, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -10.429933547973633, + "step": 995 + }, + { + "epoch": 0.6872520269104709, + "grad_norm": 2.87152361869812, + "learning_rate": 1.908777309313914e-06, + "logits/chosen": 3.5531439781188965, + "logits/rejected": 3.7338945865631104, + "logps/chosen": -158.33338928222656, + "logps/rejected": -167.99609375, + "loss": 0.5448, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.171712875366211, + "rewards/margins": 0.8944485187530518, + "rewards/rejected": -12.066161155700684, + "step": 996 + }, + { + "epoch": 0.6879420389856823, + "grad_norm": 15.584968566894531, + "learning_rate": 1.910693752395554e-06, + "logits/chosen": 3.3905162811279297, + "logits/rejected": 3.7463157176971436, + "logps/chosen": -156.43157958984375, + "logps/rejected": -174.28012084960938, + "loss": 0.5402, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.913089752197266, + "rewards/margins": 1.6995888948440552, + "rewards/rejected": -12.612678527832031, + "step": 997 + }, + { + "epoch": 0.6886320510608935, + "grad_norm": 1.5143917798995972, + "learning_rate": 1.9126101954771946e-06, + "logits/chosen": 3.7984507083892822, + "logits/rejected": 4.079416751861572, + "logps/chosen": -160.79452514648438, + "logps/rejected": -185.43748474121094, + "loss": 0.4462, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.100050926208496, + "rewards/margins": 2.513693332672119, + "rewards/rejected": -13.613744735717773, + "step": 998 + }, + { + "epoch": 0.6893220631361049, + "grad_norm": 8.610106468200684, + "learning_rate": 1.914526638558835e-06, + "logits/chosen": 3.5841012001037598, + "logits/rejected": 3.62404465675354, + "logps/chosen": -164.9071502685547, + "logps/rejected": -175.89266967773438, + "loss": 0.7467, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.849639892578125, + "rewards/margins": 1.0700145959854126, + "rewards/rejected": -12.919652938842773, + "step": 999 + }, + { + "epoch": 0.6900120752113162, + "grad_norm": 0.42157378792762756, + "learning_rate": 1.9164430816404754e-06, + "logits/chosen": 3.7914464473724365, + "logits/rejected": 3.7914464473724365, + "logps/chosen": -176.6429443359375, + "logps/rejected": -176.6429443359375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.914344787597656, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.914344787597656, + "step": 1000 + }, + { + "epoch": 0.6907020872865275, + "grad_norm": 0.3780723810195923, + "learning_rate": 1.9183595247221158e-06, + "logits/chosen": 3.645864963531494, + "logits/rejected": 3.766730308532715, + "logps/chosen": -151.8603973388672, + "logps/rejected": -165.15811157226562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.54897689819336, + "rewards/margins": 1.2562859058380127, + "rewards/rejected": -11.805262565612793, + "step": 1001 + }, + { + "epoch": 0.6913920993617388, + "grad_norm": 0.27995845675468445, + "learning_rate": 1.920275967803756e-06, + "logits/chosen": 3.6928091049194336, + "logits/rejected": 3.8437435626983643, + "logps/chosen": -169.93710327148438, + "logps/rejected": -184.5060577392578, + "loss": 0.5205, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.194664001464844, + "rewards/margins": 1.5001901388168335, + "rewards/rejected": -13.694854736328125, + "step": 1002 + }, + { + "epoch": 0.6920821114369502, + "grad_norm": 0.29164090752601624, + "learning_rate": 1.922192410885397e-06, + "logits/chosen": 3.948655128479004, + "logits/rejected": 4.045991897583008, + "logps/chosen": -181.38516235351562, + "logps/rejected": -193.42904663085938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.288566589355469, + "rewards/margins": 1.1973217725753784, + "rewards/rejected": -14.485889434814453, + "step": 1003 + }, + { + "epoch": 0.6927721235121614, + "grad_norm": 0.3241160213947296, + "learning_rate": 1.9241088539670374e-06, + "logits/chosen": 3.4316022396087646, + "logits/rejected": 3.4316022396087646, + "logps/chosen": -160.66171264648438, + "logps/rejected": -160.66171264648438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.493703842163086, + "rewards/margins": 0.0, + "rewards/rejected": -11.493703842163086, + "step": 1004 + }, + { + "epoch": 0.6934621355873728, + "grad_norm": 10.331762313842773, + "learning_rate": 1.9260252970486777e-06, + "logits/chosen": 3.711644172668457, + "logits/rejected": 3.5305469036102295, + "logps/chosen": -164.58938598632812, + "logps/rejected": -157.13818359375, + "loss": 1.6117, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.590421676635742, + "rewards/margins": -0.659299910068512, + "rewards/rejected": -10.931123733520508, + "step": 1005 + }, + { + "epoch": 0.694152147662584, + "grad_norm": 0.4250645637512207, + "learning_rate": 1.927941740130318e-06, + "logits/chosen": 3.6987273693084717, + "logits/rejected": 3.727879762649536, + "logps/chosen": -167.43057250976562, + "logps/rejected": -178.813232421875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.997690200805664, + "rewards/margins": 1.1856579780578613, + "rewards/rejected": -13.183347702026367, + "step": 1006 + }, + { + "epoch": 0.6948421597377954, + "grad_norm": 1.2845683097839355, + "learning_rate": 1.9298581832119585e-06, + "logits/chosen": 3.696702003479004, + "logits/rejected": 3.725919723510742, + "logps/chosen": -168.55856323242188, + "logps/rejected": -171.80763244628906, + "loss": 0.615, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.07248306274414, + "rewards/margins": 0.33183813095092773, + "rewards/rejected": -12.404319763183594, + "step": 1007 + }, + { + "epoch": 0.6955321718130067, + "grad_norm": 0.35435616970062256, + "learning_rate": 1.9317746262935993e-06, + "logits/chosen": 4.001852512359619, + "logits/rejected": 4.001852512359619, + "logps/chosen": -178.48178100585938, + "logps/rejected": -178.48178100585938, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.016712188720703, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.016714096069336, + "step": 1008 + }, + { + "epoch": 0.696222183888218, + "grad_norm": 0.35436269640922546, + "learning_rate": 1.9336910693752397e-06, + "logits/chosen": 3.5140414237976074, + "logits/rejected": 3.6709251403808594, + "logps/chosen": -176.28842163085938, + "logps/rejected": -182.2890625, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.898737907409668, + "rewards/margins": 0.623431921005249, + "rewards/rejected": -13.522170066833496, + "step": 1009 + }, + { + "epoch": 0.6969121959634293, + "grad_norm": 0.38079163432121277, + "learning_rate": 1.93560751245688e-06, + "logits/chosen": 3.7647833824157715, + "logits/rejected": 3.7647833824157715, + "logps/chosen": -176.98016357421875, + "logps/rejected": -176.98016357421875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.899399757385254, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.89940071105957, + "step": 1010 + }, + { + "epoch": 0.6976022080386407, + "grad_norm": 1.0847405195236206, + "learning_rate": 1.9375239555385205e-06, + "logits/chosen": 3.6634814739227295, + "logits/rejected": 3.6723666191101074, + "logps/chosen": -164.5996856689453, + "logps/rejected": -166.87474060058594, + "loss": 0.6199, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.65213680267334, + "rewards/margins": 0.2726109027862549, + "rewards/rejected": -11.924747467041016, + "step": 1011 + }, + { + "epoch": 0.698292220113852, + "grad_norm": 0.31316229701042175, + "learning_rate": 1.9394403986201613e-06, + "logits/chosen": 3.7800793647766113, + "logits/rejected": 3.7800793647766113, + "logps/chosen": -173.14706420898438, + "logps/rejected": -173.14706420898438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.529970169067383, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.529970169067383, + "step": 1012 + }, + { + "epoch": 0.6989822321890633, + "grad_norm": 0.36231693625450134, + "learning_rate": 1.9413568417018017e-06, + "logits/chosen": 3.782382011413574, + "logits/rejected": 3.8529233932495117, + "logps/chosen": -180.5698699951172, + "logps/rejected": -187.98854064941406, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.283803939819336, + "rewards/margins": 0.7629169225692749, + "rewards/rejected": -14.046720504760742, + "step": 1013 + }, + { + "epoch": 0.6996722442642747, + "grad_norm": 0.3146992027759552, + "learning_rate": 1.943273284783442e-06, + "logits/chosen": 3.7174293994903564, + "logits/rejected": 3.7174293994903564, + "logps/chosen": -175.15740966796875, + "logps/rejected": -175.15740966796875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.670350074768066, + "rewards/margins": 0.0, + "rewards/rejected": -12.67034912109375, + "step": 1014 + }, + { + "epoch": 0.7003622563394859, + "grad_norm": 1.1185113191604614, + "learning_rate": 1.9451897278650825e-06, + "logits/chosen": 3.5002634525299072, + "logits/rejected": 3.553389072418213, + "logps/chosen": -152.64418029785156, + "logps/rejected": -156.04283142089844, + "loss": 0.6124, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.411815643310547, + "rewards/margins": 0.37827998399734497, + "rewards/rejected": -10.790095329284668, + "step": 1015 + }, + { + "epoch": 0.7010522684146973, + "grad_norm": 0.2784156799316406, + "learning_rate": 1.9471061709467233e-06, + "logits/chosen": 3.689694881439209, + "logits/rejected": 3.6896092891693115, + "logps/chosen": -187.11093139648438, + "logps/rejected": -195.97003173828125, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.77493953704834, + "rewards/margins": 0.9013649225234985, + "rewards/rejected": -14.67630386352539, + "step": 1016 + }, + { + "epoch": 0.7017422804899086, + "grad_norm": 0.26111674308776855, + "learning_rate": 1.9490226140283637e-06, + "logits/chosen": 4.378222942352295, + "logits/rejected": 4.378222942352295, + "logps/chosen": -195.7812957763672, + "logps/rejected": -195.7812957763672, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.722466468811035, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.722466468811035, + "step": 1017 + }, + { + "epoch": 0.7024322925651199, + "grad_norm": 0.2783906161785126, + "learning_rate": 1.950939057110004e-06, + "logits/chosen": 3.868830919265747, + "logits/rejected": 3.942903757095337, + "logps/chosen": -187.44676208496094, + "logps/rejected": -199.83856201171875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.874903678894043, + "rewards/margins": 1.2706695795059204, + "rewards/rejected": -15.145573616027832, + "step": 1018 + }, + { + "epoch": 0.7031223046403312, + "grad_norm": 14.936118125915527, + "learning_rate": 1.9528555001916445e-06, + "logits/chosen": 3.923799991607666, + "logits/rejected": 3.976259708404541, + "logps/chosen": -171.54624938964844, + "logps/rejected": -179.23301696777344, + "loss": 0.8014, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.411515235900879, + "rewards/margins": 0.6848004460334778, + "rewards/rejected": -13.096315383911133, + "step": 1019 + }, + { + "epoch": 0.7038123167155426, + "grad_norm": 0.3578755259513855, + "learning_rate": 1.954771943273285e-06, + "logits/chosen": 4.083935260772705, + "logits/rejected": 4.083935260772705, + "logps/chosen": -181.59115600585938, + "logps/rejected": -181.59115600585938, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.378150939941406, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.37814998626709, + "step": 1020 + }, + { + "epoch": 0.7045023287907538, + "grad_norm": 0.3972950875759125, + "learning_rate": 1.9566883863549253e-06, + "logits/chosen": 3.903857946395874, + "logits/rejected": 3.928431272506714, + "logps/chosen": -172.5343017578125, + "logps/rejected": -181.78660583496094, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.392084121704102, + "rewards/margins": 0.9509592056274414, + "rewards/rejected": -13.343042373657227, + "step": 1021 + }, + { + "epoch": 0.7051923408659652, + "grad_norm": 0.33282655477523804, + "learning_rate": 1.9586048294365657e-06, + "logits/chosen": 3.510727643966675, + "logits/rejected": 3.510727643966675, + "logps/chosen": -171.66358947753906, + "logps/rejected": -171.66358947753906, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.31853199005127, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.31853199005127, + "step": 1022 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.2935638725757599, + "learning_rate": 1.9605212725182065e-06, + "logits/chosen": 3.9334542751312256, + "logits/rejected": 3.9334542751312256, + "logps/chosen": -177.7769317626953, + "logps/rejected": -177.77694702148438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.955438613891602, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.955439567565918, + "step": 1023 + }, + { + "epoch": 0.7065723650163878, + "grad_norm": 0.3750779628753662, + "learning_rate": 1.962437715599847e-06, + "logits/chosen": 3.97182559967041, + "logits/rejected": 3.97182559967041, + "logps/chosen": -185.65109252929688, + "logps/rejected": -185.65109252929688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.646262168884277, + "rewards/margins": 0.0, + "rewards/rejected": -13.646262168884277, + "step": 1024 + }, + { + "epoch": 0.7072623770915991, + "grad_norm": 0.3414618670940399, + "learning_rate": 1.9643541586814873e-06, + "logits/chosen": 3.882199764251709, + "logits/rejected": 3.882199764251709, + "logps/chosen": -187.11740112304688, + "logps/rejected": -187.11740112304688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.83692741394043, + "rewards/margins": 0.0, + "rewards/rejected": -13.83692741394043, + "step": 1025 + }, + { + "epoch": 0.7079523891668105, + "grad_norm": 0.34566664695739746, + "learning_rate": 1.9662706017631277e-06, + "logits/chosen": 3.7652814388275146, + "logits/rejected": 3.7652814388275146, + "logps/chosen": -172.30625915527344, + "logps/rejected": -172.3062744140625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.44469165802002, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.44469165802002, + "step": 1026 + }, + { + "epoch": 0.7086424012420217, + "grad_norm": 0.2794359624385834, + "learning_rate": 1.968187044844768e-06, + "logits/chosen": 3.975595474243164, + "logits/rejected": 4.058375358581543, + "logps/chosen": -181.36233520507812, + "logps/rejected": -192.8677520751953, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.339366912841797, + "rewards/margins": 1.1934269666671753, + "rewards/rejected": -14.532794952392578, + "step": 1027 + }, + { + "epoch": 0.7093324133172331, + "grad_norm": 0.3298832178115845, + "learning_rate": 1.970103487926409e-06, + "logits/chosen": 3.8099350929260254, + "logits/rejected": 4.090336322784424, + "logps/chosen": -167.21231079101562, + "logps/rejected": -185.52035522460938, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.816373825073242, + "rewards/margins": 1.8582714796066284, + "rewards/rejected": -13.674644470214844, + "step": 1028 + }, + { + "epoch": 0.7100224253924444, + "grad_norm": 0.3749243915081024, + "learning_rate": 1.9720199310080493e-06, + "logits/chosen": 3.579730749130249, + "logits/rejected": 3.579730749130249, + "logps/chosen": -170.85406494140625, + "logps/rejected": -170.85406494140625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.450044631958008, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.450044631958008, + "step": 1029 + }, + { + "epoch": 0.7107124374676557, + "grad_norm": 0.2970142662525177, + "learning_rate": 1.9739363740896897e-06, + "logits/chosen": 4.02715539932251, + "logits/rejected": 4.02715539932251, + "logps/chosen": -192.73876953125, + "logps/rejected": -192.73876953125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.458610534667969, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.458610534667969, + "step": 1030 + }, + { + "epoch": 0.711402449542867, + "grad_norm": 0.3077709674835205, + "learning_rate": 1.97585281717133e-06, + "logits/chosen": 3.92901611328125, + "logits/rejected": 3.92901611328125, + "logps/chosen": -187.05416870117188, + "logps/rejected": -187.05416870117188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.748985290527344, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.748984336853027, + "step": 1031 + }, + { + "epoch": 0.7120924616180783, + "grad_norm": 0.26115599274635315, + "learning_rate": 1.977769260252971e-06, + "logits/chosen": 3.842547655105591, + "logits/rejected": 4.050209045410156, + "logps/chosen": -149.71258544921875, + "logps/rejected": -181.004638671875, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.02278995513916, + "rewards/margins": 3.1075477600097656, + "rewards/rejected": -13.130338668823242, + "step": 1032 + }, + { + "epoch": 0.7127824736932896, + "grad_norm": 0.2775585651397705, + "learning_rate": 1.9796857033346113e-06, + "logits/chosen": 3.552689552307129, + "logits/rejected": 3.6064529418945312, + "logps/chosen": -176.48789978027344, + "logps/rejected": -199.83551025390625, + "loss": 0.4342, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.816702842712402, + "rewards/margins": 2.53200626373291, + "rewards/rejected": -15.348710060119629, + "step": 1033 + }, + { + "epoch": 0.713472485768501, + "grad_norm": 4.5443501472473145, + "learning_rate": 1.9816021464162516e-06, + "logits/chosen": 3.950737953186035, + "logits/rejected": 3.98330020904541, + "logps/chosen": -172.1582794189453, + "logps/rejected": -173.4862823486328, + "loss": 0.6346, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.432825088500977, + "rewards/margins": 0.17249858379364014, + "rewards/rejected": -12.60532283782959, + "step": 1034 + }, + { + "epoch": 0.7141624978437122, + "grad_norm": 0.34423208236694336, + "learning_rate": 1.983518589497892e-06, + "logits/chosen": 3.6351001262664795, + "logits/rejected": 3.6351001262664795, + "logps/chosen": -176.5612030029297, + "logps/rejected": -176.5612030029297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.75214672088623, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.752145767211914, + "step": 1035 + }, + { + "epoch": 0.7148525099189236, + "grad_norm": 0.3445097506046295, + "learning_rate": 1.985435032579533e-06, + "logits/chosen": 3.6834263801574707, + "logits/rejected": 3.7245092391967773, + "logps/chosen": -193.3917236328125, + "logps/rejected": -200.2981414794922, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.666082382202148, + "rewards/margins": 0.7095593214035034, + "rewards/rejected": -15.375641822814941, + "step": 1036 + }, + { + "epoch": 0.7155425219941349, + "grad_norm": 13.3444185256958, + "learning_rate": 1.9873514756611732e-06, + "logits/chosen": 3.9565305709838867, + "logits/rejected": 3.8764724731445312, + "logps/chosen": -179.50613403320312, + "logps/rejected": -178.0301971435547, + "loss": 0.793, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.229377746582031, + "rewards/margins": -0.15465128421783447, + "rewards/rejected": -13.074727058410645, + "step": 1037 + }, + { + "epoch": 0.7162325340693462, + "grad_norm": 0.3461894392967224, + "learning_rate": 1.9892679187428136e-06, + "logits/chosen": 3.5872702598571777, + "logits/rejected": 3.7000985145568848, + "logps/chosen": -172.61752319335938, + "logps/rejected": -190.44125366210938, + "loss": 0.5207, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.221118927001953, + "rewards/margins": 1.8173317909240723, + "rewards/rejected": -14.0384521484375, + "step": 1038 + }, + { + "epoch": 0.7169225461445575, + "grad_norm": 12.098941802978516, + "learning_rate": 1.991184361824454e-06, + "logits/chosen": 3.973316192626953, + "logits/rejected": 4.009685516357422, + "logps/chosen": -172.1885986328125, + "logps/rejected": -181.35562133789062, + "loss": 0.5899, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.310901641845703, + "rewards/margins": 0.9460890293121338, + "rewards/rejected": -13.256990432739258, + "step": 1039 + }, + { + "epoch": 0.7176125582197689, + "grad_norm": 0.2942649722099304, + "learning_rate": 1.9931008049060944e-06, + "logits/chosen": 3.8644843101501465, + "logits/rejected": 3.9151272773742676, + "logps/chosen": -172.04466247558594, + "logps/rejected": -187.1470184326172, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.328751564025879, + "rewards/margins": 1.4400402307510376, + "rewards/rejected": -13.768792152404785, + "step": 1040 + }, + { + "epoch": 0.7183025702949801, + "grad_norm": 0.4078068733215332, + "learning_rate": 1.995017247987735e-06, + "logits/chosen": 3.8064162731170654, + "logits/rejected": 3.9604835510253906, + "logps/chosen": -168.3382568359375, + "logps/rejected": -184.16445922851562, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.1045560836792, + "rewards/margins": 1.5764617919921875, + "rewards/rejected": -13.681017875671387, + "step": 1041 + }, + { + "epoch": 0.7189925823701915, + "grad_norm": 0.3446521759033203, + "learning_rate": 1.996933691069375e-06, + "logits/chosen": 3.749105930328369, + "logits/rejected": 3.9864730834960938, + "logps/chosen": -174.16265869140625, + "logps/rejected": -191.49462890625, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.567197799682617, + "rewards/margins": 1.76945161819458, + "rewards/rejected": -14.336649894714355, + "step": 1042 + }, + { + "epoch": 0.7196825944454028, + "grad_norm": 0.31524500250816345, + "learning_rate": 1.998850134151016e-06, + "logits/chosen": 3.899536609649658, + "logits/rejected": 3.973809242248535, + "logps/chosen": -167.81300354003906, + "logps/rejected": -173.47506713867188, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.045943260192871, + "rewards/margins": 0.6086492538452148, + "rewards/rejected": -12.65459156036377, + "step": 1043 + }, + { + "epoch": 0.7203726065206141, + "grad_norm": 1.676964282989502, + "learning_rate": 2.0007665772326564e-06, + "logits/chosen": 4.125025272369385, + "logits/rejected": 4.182805061340332, + "logps/chosen": -171.19789123535156, + "logps/rejected": -183.55471801757812, + "loss": 0.548, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.096440315246582, + "rewards/margins": 1.2454862594604492, + "rewards/rejected": -13.341926574707031, + "step": 1044 + }, + { + "epoch": 0.7210626185958254, + "grad_norm": 9.963335037231445, + "learning_rate": 2.002683020314297e-06, + "logits/chosen": 4.016641616821289, + "logits/rejected": 3.992962598800659, + "logps/chosen": -169.86306762695312, + "logps/rejected": -173.27706909179688, + "loss": 0.6976, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.289682388305664, + "rewards/margins": 0.3784449100494385, + "rewards/rejected": -12.66812801361084, + "step": 1045 + }, + { + "epoch": 0.7217526306710368, + "grad_norm": 0.41151025891304016, + "learning_rate": 2.004599463395937e-06, + "logits/chosen": 3.800274610519409, + "logits/rejected": 3.800274610519409, + "logps/chosen": -181.90122985839844, + "logps/rejected": -181.90122985839844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.276110649108887, + "rewards/margins": 0.0, + "rewards/rejected": -13.276110649108887, + "step": 1046 + }, + { + "epoch": 0.722442642746248, + "grad_norm": 0.35205844044685364, + "learning_rate": 2.0065159064775776e-06, + "logits/chosen": 3.8693065643310547, + "logits/rejected": 3.9782652854919434, + "logps/chosen": -168.61180114746094, + "logps/rejected": -180.68748474121094, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.011301040649414, + "rewards/margins": 1.2131232023239136, + "rewards/rejected": -13.224424362182617, + "step": 1047 + }, + { + "epoch": 0.7231326548214594, + "grad_norm": 0.3387260138988495, + "learning_rate": 2.008432349559218e-06, + "logits/chosen": 3.78181791305542, + "logits/rejected": 4.005454063415527, + "logps/chosen": -163.25265502929688, + "logps/rejected": -186.71835327148438, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.806589126586914, + "rewards/margins": 2.245882511138916, + "rewards/rejected": -14.052472114562988, + "step": 1048 + }, + { + "epoch": 0.7238226668966707, + "grad_norm": 0.48735982179641724, + "learning_rate": 2.0103487926408588e-06, + "logits/chosen": 3.6561951637268066, + "logits/rejected": 3.7350282669067383, + "logps/chosen": -173.6302490234375, + "logps/rejected": -179.0948486328125, + "loss": 0.6078, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.571066856384277, + "rewards/margins": 0.5751690864562988, + "rewards/rejected": -13.146236419677734, + "step": 1049 + }, + { + "epoch": 0.724512678971882, + "grad_norm": 6.544926643371582, + "learning_rate": 2.012265235722499e-06, + "logits/chosen": 3.4199910163879395, + "logits/rejected": 3.5893971920013428, + "logps/chosen": -156.79852294921875, + "logps/rejected": -179.4228057861328, + "loss": 0.5052, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.075691223144531, + "rewards/margins": 2.2427964210510254, + "rewards/rejected": -13.318488121032715, + "step": 1050 + }, + { + "epoch": 0.7252026910470933, + "grad_norm": 0.33550578355789185, + "learning_rate": 2.0141816788041396e-06, + "logits/chosen": 3.904223918914795, + "logits/rejected": 3.904223918914795, + "logps/chosen": -181.0430450439453, + "logps/rejected": -181.0430450439453, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.264473915100098, + "rewards/margins": 0.0, + "rewards/rejected": -13.264473915100098, + "step": 1051 + }, + { + "epoch": 0.7258927031223047, + "grad_norm": 0.37725067138671875, + "learning_rate": 2.01609812188578e-06, + "logits/chosen": 4.196564197540283, + "logits/rejected": 4.196564197540283, + "logps/chosen": -188.77264404296875, + "logps/rejected": -188.77264404296875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.162139892578125, + "rewards/margins": 0.0, + "rewards/rejected": -14.162139892578125, + "step": 1052 + }, + { + "epoch": 0.7265827151975159, + "grad_norm": 0.2653055787086487, + "learning_rate": 2.0180145649674208e-06, + "logits/chosen": 3.7353224754333496, + "logits/rejected": 3.937715530395508, + "logps/chosen": -159.35086059570312, + "logps/rejected": -166.49185180664062, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.229494094848633, + "rewards/margins": 0.6769622564315796, + "rewards/rejected": -11.90645694732666, + "step": 1053 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.40287449955940247, + "learning_rate": 2.019931008049061e-06, + "logits/chosen": 3.6757454872131348, + "logits/rejected": 3.754178524017334, + "logps/chosen": -163.66152954101562, + "logps/rejected": -170.75177001953125, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.822074890136719, + "rewards/margins": 0.7094682455062866, + "rewards/rejected": -12.531543731689453, + "step": 1054 + }, + { + "epoch": 0.7279627393479385, + "grad_norm": 0.36433395743370056, + "learning_rate": 2.0218474511307016e-06, + "logits/chosen": 4.068850517272949, + "logits/rejected": 4.12248420715332, + "logps/chosen": -178.55728149414062, + "logps/rejected": -186.2308807373047, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.961122512817383, + "rewards/margins": 0.7936204671859741, + "rewards/rejected": -13.754743576049805, + "step": 1055 + }, + { + "epoch": 0.7286527514231499, + "grad_norm": 0.42184537649154663, + "learning_rate": 2.023763894212342e-06, + "logits/chosen": 4.012669563293457, + "logits/rejected": 4.288491249084473, + "logps/chosen": -172.31793212890625, + "logps/rejected": -188.02584838867188, + "loss": 0.5224, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.334270477294922, + "rewards/margins": 1.5863510370254517, + "rewards/rejected": -13.92061996459961, + "step": 1056 + }, + { + "epoch": 0.7293427634983612, + "grad_norm": 0.3803234398365021, + "learning_rate": 2.0256803372939828e-06, + "logits/chosen": 3.842649221420288, + "logits/rejected": 3.842649221420288, + "logps/chosen": -182.90606689453125, + "logps/rejected": -182.90606689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.597599983215332, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.597599983215332, + "step": 1057 + }, + { + "epoch": 0.7300327755735725, + "grad_norm": 0.42279288172721863, + "learning_rate": 2.027596780375623e-06, + "logits/chosen": 3.929868221282959, + "logits/rejected": 3.929868221282959, + "logps/chosen": -176.11236572265625, + "logps/rejected": -176.11233520507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.72204303741455, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.722041130065918, + "step": 1058 + }, + { + "epoch": 0.7307227876487838, + "grad_norm": 0.38423430919647217, + "learning_rate": 2.0295132234572635e-06, + "logits/chosen": 3.979902744293213, + "logits/rejected": 3.979902744293213, + "logps/chosen": -178.65211486816406, + "logps/rejected": -178.65211486816406, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.016796112060547, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -13.016796112060547, + "step": 1059 + }, + { + "epoch": 0.7314127997239952, + "grad_norm": 0.29162687063217163, + "learning_rate": 2.031429666538904e-06, + "logits/chosen": 4.077550888061523, + "logits/rejected": 4.077550888061523, + "logps/chosen": -186.77749633789062, + "logps/rejected": -186.77749633789062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.913537979125977, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.91353988647461, + "step": 1060 + }, + { + "epoch": 0.7321028117992064, + "grad_norm": 0.40606164932250977, + "learning_rate": 2.0333461096205443e-06, + "logits/chosen": 3.7095932960510254, + "logits/rejected": 3.8019394874572754, + "logps/chosen": -171.40936279296875, + "logps/rejected": -182.99740600585938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.48742961883545, + "rewards/margins": 1.178084135055542, + "rewards/rejected": -13.66551399230957, + "step": 1061 + }, + { + "epoch": 0.7327928238744178, + "grad_norm": 0.37569859623908997, + "learning_rate": 2.0352625527021847e-06, + "logits/chosen": 3.9422597885131836, + "logits/rejected": 3.9422597885131836, + "logps/chosen": -192.38389587402344, + "logps/rejected": -192.38389587402344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.292777061462402, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.292776107788086, + "step": 1062 + }, + { + "epoch": 0.7334828359496292, + "grad_norm": 0.3227461874485016, + "learning_rate": 2.0371789957838255e-06, + "logits/chosen": 3.503171682357788, + "logits/rejected": 3.503171682357788, + "logps/chosen": -149.9434051513672, + "logps/rejected": -149.9434051513672, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.369941711425781, + "rewards/margins": 0.0, + "rewards/rejected": -10.369941711425781, + "step": 1063 + }, + { + "epoch": 0.7341728480248404, + "grad_norm": 0.4488525986671448, + "learning_rate": 2.039095438865466e-06, + "logits/chosen": 3.4627091884613037, + "logits/rejected": 3.541365385055542, + "logps/chosen": -150.5323028564453, + "logps/rejected": -160.81085205078125, + "loss": 0.5236, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.279383659362793, + "rewards/margins": 1.0576696395874023, + "rewards/rejected": -11.337053298950195, + "step": 1064 + }, + { + "epoch": 0.7348628601000518, + "grad_norm": 1.2171894311904907, + "learning_rate": 2.0410118819471063e-06, + "logits/chosen": 3.7932052612304688, + "logits/rejected": 4.000758171081543, + "logps/chosen": -166.1935577392578, + "logps/rejected": -184.5108642578125, + "loss": 0.5234, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.871612548828125, + "rewards/margins": 1.7276341915130615, + "rewards/rejected": -13.599246978759766, + "step": 1065 + }, + { + "epoch": 0.7355528721752631, + "grad_norm": 0.30484169721603394, + "learning_rate": 2.0429283250287467e-06, + "logits/chosen": 3.5641353130340576, + "logits/rejected": 3.773838520050049, + "logps/chosen": -162.42385864257812, + "logps/rejected": -190.837158203125, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.646881103515625, + "rewards/margins": 2.6886210441589355, + "rewards/rejected": -14.335502624511719, + "step": 1066 + }, + { + "epoch": 0.7362428842504743, + "grad_norm": 6.160388946533203, + "learning_rate": 2.044844768110387e-06, + "logits/chosen": 3.8506999015808105, + "logits/rejected": 3.9211487770080566, + "logps/chosen": -177.47869873046875, + "logps/rejected": -184.60104370117188, + "loss": 0.5688, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.91517162322998, + "rewards/margins": 0.7593156099319458, + "rewards/rejected": -13.67448616027832, + "step": 1067 + }, + { + "epoch": 0.7369328963256857, + "grad_norm": 0.3135848641395569, + "learning_rate": 2.0467612111920275e-06, + "logits/chosen": 4.027390480041504, + "logits/rejected": 4.1692280769348145, + "logps/chosen": -165.86434936523438, + "logps/rejected": -183.38128662109375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.846423149108887, + "rewards/margins": 1.592075228691101, + "rewards/rejected": -13.438498497009277, + "step": 1068 + }, + { + "epoch": 0.7376229084008971, + "grad_norm": 9.8374605178833, + "learning_rate": 2.0486776542736683e-06, + "logits/chosen": 3.888993740081787, + "logits/rejected": 4.026101112365723, + "logps/chosen": -183.00726318359375, + "logps/rejected": -186.14939880371094, + "loss": 0.6909, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.478927612304688, + "rewards/margins": 0.33357977867126465, + "rewards/rejected": -13.812506675720215, + "step": 1069 + }, + { + "epoch": 0.7383129204761083, + "grad_norm": 6.582334995269775, + "learning_rate": 2.0505940973553087e-06, + "logits/chosen": 3.65403413772583, + "logits/rejected": 3.73602557182312, + "logps/chosen": -175.98870849609375, + "logps/rejected": -189.28744506835938, + "loss": 0.5925, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.787199020385742, + "rewards/margins": 1.34361732006073, + "rewards/rejected": -14.130817413330078, + "step": 1070 + }, + { + "epoch": 0.7390029325513197, + "grad_norm": 9.517340660095215, + "learning_rate": 2.052510540436949e-06, + "logits/chosen": 4.015566825866699, + "logits/rejected": 4.021965026855469, + "logps/chosen": -173.53524780273438, + "logps/rejected": -172.2740478515625, + "loss": 0.7601, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.629426002502441, + "rewards/margins": -0.11028218269348145, + "rewards/rejected": -12.519144058227539, + "step": 1071 + }, + { + "epoch": 0.739692944626531, + "grad_norm": 0.3216734826564789, + "learning_rate": 2.0544269835185895e-06, + "logits/chosen": 3.8454055786132812, + "logits/rejected": 3.842899799346924, + "logps/chosen": -174.01409912109375, + "logps/rejected": -183.86581420898438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.447843551635742, + "rewards/margins": 1.0612934827804565, + "rewards/rejected": -13.509136199951172, + "step": 1072 + }, + { + "epoch": 0.7403829567017423, + "grad_norm": 0.27688294649124146, + "learning_rate": 2.0563434266002303e-06, + "logits/chosen": 3.869354009628296, + "logits/rejected": 4.069801330566406, + "logps/chosen": -157.6419677734375, + "logps/rejected": -176.1190643310547, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.836429595947266, + "rewards/margins": 2.000882148742676, + "rewards/rejected": -12.837311744689941, + "step": 1073 + }, + { + "epoch": 0.7410729687769536, + "grad_norm": 0.30736616253852844, + "learning_rate": 2.0582598696818707e-06, + "logits/chosen": 4.032713413238525, + "logits/rejected": 4.032713413238525, + "logps/chosen": -184.6936492919922, + "logps/rejected": -184.6936492919922, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.48746395111084, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.487464904785156, + "step": 1074 + }, + { + "epoch": 0.741762980852165, + "grad_norm": 0.35121646523475647, + "learning_rate": 2.060176312763511e-06, + "logits/chosen": 3.6373348236083984, + "logits/rejected": 3.765958786010742, + "logps/chosen": -160.9340362548828, + "logps/rejected": -170.15615844726562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.39763069152832, + "rewards/margins": 0.9105479121208191, + "rewards/rejected": -12.308177947998047, + "step": 1075 + }, + { + "epoch": 0.7424529929273762, + "grad_norm": 0.3873668909072876, + "learning_rate": 2.0620927558451515e-06, + "logits/chosen": 3.6914496421813965, + "logits/rejected": 3.6914496421813965, + "logps/chosen": -163.07037353515625, + "logps/rejected": -163.07037353515625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.589509010314941, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.589509010314941, + "step": 1076 + }, + { + "epoch": 0.7431430050025876, + "grad_norm": 0.3831768333911896, + "learning_rate": 2.0640091989267923e-06, + "logits/chosen": 3.7919952869415283, + "logits/rejected": 3.895972967147827, + "logps/chosen": -163.63494873046875, + "logps/rejected": -182.2392120361328, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.590458869934082, + "rewards/margins": 1.712195873260498, + "rewards/rejected": -13.302654266357422, + "step": 1077 + }, + { + "epoch": 0.7438330170777988, + "grad_norm": 0.3960213363170624, + "learning_rate": 2.0659256420084327e-06, + "logits/chosen": 3.708688735961914, + "logits/rejected": 3.708688735961914, + "logps/chosen": -182.570556640625, + "logps/rejected": -182.570556640625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.313212394714355, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.313213348388672, + "step": 1078 + }, + { + "epoch": 0.7445230291530102, + "grad_norm": 2.896665573120117, + "learning_rate": 2.067842085090073e-06, + "logits/chosen": 3.9933829307556152, + "logits/rejected": 4.042393684387207, + "logps/chosen": -186.7991485595703, + "logps/rejected": -196.5033416748047, + "loss": 0.5481, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.745341300964355, + "rewards/margins": 1.0054057836532593, + "rewards/rejected": -14.750747680664062, + "step": 1079 + }, + { + "epoch": 0.7452130412282215, + "grad_norm": 0.3004795014858246, + "learning_rate": 2.0697585281717135e-06, + "logits/chosen": 4.137165546417236, + "logits/rejected": 4.137165546417236, + "logps/chosen": -195.51939392089844, + "logps/rejected": -195.51939392089844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.689349174499512, + "rewards/margins": 0.0, + "rewards/rejected": -14.689349174499512, + "step": 1080 + }, + { + "epoch": 0.7459030533034328, + "grad_norm": 0.33413922786712646, + "learning_rate": 2.071674971253354e-06, + "logits/chosen": 3.733703136444092, + "logits/rejected": 3.9465222358703613, + "logps/chosen": -177.2682342529297, + "logps/rejected": -190.74530029296875, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.933152198791504, + "rewards/margins": 1.4232771396636963, + "rewards/rejected": -14.356430053710938, + "step": 1081 + }, + { + "epoch": 0.7465930653786441, + "grad_norm": 3.1478888988494873, + "learning_rate": 2.0735914143349942e-06, + "logits/chosen": 3.9740800857543945, + "logits/rejected": 3.8367042541503906, + "logps/chosen": -194.36972045898438, + "logps/rejected": -195.96214294433594, + "loss": 0.629, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.504472732543945, + "rewards/margins": 0.2031393051147461, + "rewards/rejected": -14.707611083984375, + "step": 1082 + }, + { + "epoch": 0.7472830774538555, + "grad_norm": 1.2721989154815674, + "learning_rate": 2.075507857416635e-06, + "logits/chosen": 3.5677967071533203, + "logits/rejected": 3.7170658111572266, + "logps/chosen": -163.64947509765625, + "logps/rejected": -179.10092163085938, + "loss": 0.5258, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.753173828125, + "rewards/margins": 1.5955933332443237, + "rewards/rejected": -13.348766326904297, + "step": 1083 + }, + { + "epoch": 0.7479730895290667, + "grad_norm": 8.697566032409668, + "learning_rate": 2.0774243004982755e-06, + "logits/chosen": 4.179056644439697, + "logits/rejected": 4.212874889373779, + "logps/chosen": -177.57168579101562, + "logps/rejected": -176.85064697265625, + "loss": 0.7376, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.782655715942383, + "rewards/margins": -0.07721090316772461, + "rewards/rejected": -12.7054443359375, + "step": 1084 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 0.3146958351135254, + "learning_rate": 2.079340743579916e-06, + "logits/chosen": 4.119006156921387, + "logits/rejected": 4.198330402374268, + "logps/chosen": -177.2919464111328, + "logps/rejected": -189.43603515625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.719738960266113, + "rewards/margins": 1.238796591758728, + "rewards/rejected": -13.958535194396973, + "step": 1085 + }, + { + "epoch": 0.7493531136794894, + "grad_norm": 0.29854947328567505, + "learning_rate": 2.0812571866615562e-06, + "logits/chosen": 4.365060329437256, + "logits/rejected": 4.365060329437256, + "logps/chosen": -183.263916015625, + "logps/rejected": -183.263916015625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.661216735839844, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.661215782165527, + "step": 1086 + }, + { + "epoch": 0.7500431257547007, + "grad_norm": 0.8219714760780334, + "learning_rate": 2.0831736297431966e-06, + "logits/chosen": 4.201374053955078, + "logits/rejected": 4.281074047088623, + "logps/chosen": -185.06756591796875, + "logps/rejected": -202.9090118408203, + "loss": 0.4365, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.78880500793457, + "rewards/margins": 1.8269065618515015, + "rewards/rejected": -15.615711212158203, + "step": 1087 + }, + { + "epoch": 0.750733137829912, + "grad_norm": 0.43614310026168823, + "learning_rate": 2.085090072824837e-06, + "logits/chosen": 3.822413206100464, + "logits/rejected": 3.821242570877075, + "logps/chosen": -173.27442932128906, + "logps/rejected": -178.7091064453125, + "loss": 0.6086, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.911212921142578, + "rewards/margins": 0.5126527547836304, + "rewards/rejected": -13.423866271972656, + "step": 1088 + }, + { + "epoch": 0.7514231499051234, + "grad_norm": 0.3290949761867523, + "learning_rate": 2.087006515906478e-06, + "logits/chosen": 4.01484489440918, + "logits/rejected": 4.01484489440918, + "logps/chosen": -182.72525024414062, + "logps/rejected": -182.72525024414062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.36196517944336, + "rewards/margins": 0.0, + "rewards/rejected": -13.36196517944336, + "step": 1089 + }, + { + "epoch": 0.7521131619803346, + "grad_norm": 2.755995988845825, + "learning_rate": 2.0889229589881182e-06, + "logits/chosen": 3.677506446838379, + "logits/rejected": 3.7116312980651855, + "logps/chosen": -169.25196838378906, + "logps/rejected": -176.6649169921875, + "loss": 0.5411, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.41765308380127, + "rewards/margins": 0.695404052734375, + "rewards/rejected": -13.113056182861328, + "step": 1090 + }, + { + "epoch": 0.752803174055546, + "grad_norm": 0.3600653111934662, + "learning_rate": 2.0908394020697586e-06, + "logits/chosen": 3.812636137008667, + "logits/rejected": 3.8728134632110596, + "logps/chosen": -169.30560302734375, + "logps/rejected": -177.48818969726562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.147256851196289, + "rewards/margins": 0.8470783233642578, + "rewards/rejected": -12.994335174560547, + "step": 1091 + }, + { + "epoch": 0.7534931861307573, + "grad_norm": 0.3479941189289093, + "learning_rate": 2.092755845151399e-06, + "logits/chosen": 3.93361496925354, + "logits/rejected": 3.936354875564575, + "logps/chosen": -186.29376220703125, + "logps/rejected": -195.951904296875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.73595905303955, + "rewards/margins": 1.0103503465652466, + "rewards/rejected": -14.746309280395508, + "step": 1092 + }, + { + "epoch": 0.7541831982059686, + "grad_norm": 8.580744743347168, + "learning_rate": 2.09467228823304e-06, + "logits/chosen": 3.6853814125061035, + "logits/rejected": 3.8434181213378906, + "logps/chosen": -168.4157257080078, + "logps/rejected": -182.3726806640625, + "loss": 0.5206, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.00778579711914, + "rewards/margins": 1.3236594200134277, + "rewards/rejected": -13.33144474029541, + "step": 1093 + }, + { + "epoch": 0.7548732102811799, + "grad_norm": 0.9801022410392761, + "learning_rate": 2.0965887313146802e-06, + "logits/chosen": 3.8148415088653564, + "logits/rejected": 4.019052982330322, + "logps/chosen": -166.6778564453125, + "logps/rejected": -178.7611083984375, + "loss": 0.5281, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.97055435180664, + "rewards/margins": 1.168241262435913, + "rewards/rejected": -13.138795852661133, + "step": 1094 + }, + { + "epoch": 0.7555632223563913, + "grad_norm": 0.4043503701686859, + "learning_rate": 2.0985051743963206e-06, + "logits/chosen": 3.7077364921569824, + "logits/rejected": 3.7077364921569824, + "logps/chosen": -168.11277770996094, + "logps/rejected": -168.11277770996094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.116044998168945, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -12.116044998168945, + "step": 1095 + }, + { + "epoch": 0.7562532344316025, + "grad_norm": 0.3158372640609741, + "learning_rate": 2.100421617477961e-06, + "logits/chosen": 3.7714149951934814, + "logits/rejected": 3.8421196937561035, + "logps/chosen": -187.5209197998047, + "logps/rejected": -193.88009643554688, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.896249771118164, + "rewards/margins": 0.6542412042617798, + "rewards/rejected": -14.550491333007812, + "step": 1096 + }, + { + "epoch": 0.7569432465068139, + "grad_norm": 0.33550527691841125, + "learning_rate": 2.102338060559602e-06, + "logits/chosen": 3.865039348602295, + "logits/rejected": 3.865039348602295, + "logps/chosen": -175.0697021484375, + "logps/rejected": -175.0697021484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.663847923278809, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -12.663847923278809, + "step": 1097 + }, + { + "epoch": 0.7576332585820252, + "grad_norm": 32.442481994628906, + "learning_rate": 2.104254503641242e-06, + "logits/chosen": 3.6513519287109375, + "logits/rejected": 3.677272319793701, + "logps/chosen": -166.97413635253906, + "logps/rejected": -177.76284790039062, + "loss": 1.2866, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.861117362976074, + "rewards/margins": 1.0312933921813965, + "rewards/rejected": -12.892410278320312, + "step": 1098 + }, + { + "epoch": 0.7583232706572365, + "grad_norm": 0.3890313506126404, + "learning_rate": 2.1061709467228826e-06, + "logits/chosen": 3.5231995582580566, + "logits/rejected": 3.5231995582580566, + "logps/chosen": -184.981201171875, + "logps/rejected": -184.981201171875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.741777420043945, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.741778373718262, + "step": 1099 + }, + { + "epoch": 0.7590132827324478, + "grad_norm": 12.371565818786621, + "learning_rate": 2.108087389804523e-06, + "logits/chosen": 3.542290687561035, + "logits/rejected": 3.5119199752807617, + "logps/chosen": -149.81143188476562, + "logps/rejected": -147.2770538330078, + "loss": 0.8561, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.294285774230957, + "rewards/margins": -0.23140710592269897, + "rewards/rejected": -10.062878608703613, + "step": 1100 + }, + { + "epoch": 0.7597032948076591, + "grad_norm": 7.790571212768555, + "learning_rate": 2.1100038328861634e-06, + "logits/chosen": 3.4995455741882324, + "logits/rejected": 3.603844165802002, + "logps/chosen": -166.6759033203125, + "logps/rejected": -167.82733154296875, + "loss": 0.6482, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.817743301391602, + "rewards/margins": 0.1156541109085083, + "rewards/rejected": -11.933398246765137, + "step": 1101 + }, + { + "epoch": 0.7603933068828704, + "grad_norm": 1.875795841217041, + "learning_rate": 2.1119202759678038e-06, + "logits/chosen": 3.5733723640441895, + "logits/rejected": 3.6454341411590576, + "logps/chosen": -160.00119018554688, + "logps/rejected": -175.78440856933594, + "loss": 0.5354, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.348596572875977, + "rewards/margins": 1.5665780305862427, + "rewards/rejected": -12.91517448425293, + "step": 1102 + }, + { + "epoch": 0.7610833189580818, + "grad_norm": 0.40452972054481506, + "learning_rate": 2.1138367190494446e-06, + "logits/chosen": 3.831092119216919, + "logits/rejected": 3.9195635318756104, + "logps/chosen": -174.30459594726562, + "logps/rejected": -186.73284912109375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.597352981567383, + "rewards/margins": 1.2632486820220947, + "rewards/rejected": -13.860601425170898, + "step": 1103 + }, + { + "epoch": 0.761773331033293, + "grad_norm": 0.32083943486213684, + "learning_rate": 2.115753162131085e-06, + "logits/chosen": 3.6273725032806396, + "logits/rejected": 3.732914447784424, + "logps/chosen": -163.03811645507812, + "logps/rejected": -182.69949340820312, + "loss": 0.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.618706703186035, + "rewards/margins": 1.984749436378479, + "rewards/rejected": -13.603455543518066, + "step": 1104 + }, + { + "epoch": 0.7624633431085044, + "grad_norm": 0.36654311418533325, + "learning_rate": 2.1176696052127254e-06, + "logits/chosen": 4.263819694519043, + "logits/rejected": 4.2484331130981445, + "logps/chosen": -183.0603790283203, + "logps/rejected": -189.22640991210938, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.431076049804688, + "rewards/margins": 0.6536459922790527, + "rewards/rejected": -14.084721565246582, + "step": 1105 + }, + { + "epoch": 0.7631533551837157, + "grad_norm": 0.3991377055644989, + "learning_rate": 2.1195860482943658e-06, + "logits/chosen": 4.01750373840332, + "logits/rejected": 4.01750373840332, + "logps/chosen": -177.53280639648438, + "logps/rejected": -177.53280639648438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.154316902160645, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.154316902160645, + "step": 1106 + }, + { + "epoch": 0.763843367258927, + "grad_norm": 0.30748093128204346, + "learning_rate": 2.121502491376006e-06, + "logits/chosen": 3.961897373199463, + "logits/rejected": 4.0054216384887695, + "logps/chosen": -169.77027893066406, + "logps/rejected": -177.48797607421875, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.126924514770508, + "rewards/margins": 0.7929449677467346, + "rewards/rejected": -12.919870376586914, + "step": 1107 + }, + { + "epoch": 0.7645333793341383, + "grad_norm": 0.3511910140514374, + "learning_rate": 2.1234189344576465e-06, + "logits/chosen": 3.699842691421509, + "logits/rejected": 3.699842691421509, + "logps/chosen": -177.40869140625, + "logps/rejected": -177.40869140625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.9668607711792, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.9668607711792, + "step": 1108 + }, + { + "epoch": 0.7652233914093497, + "grad_norm": 0.340991735458374, + "learning_rate": 2.1253353775392874e-06, + "logits/chosen": 3.8638367652893066, + "logits/rejected": 3.8638367652893066, + "logps/chosen": -167.83145141601562, + "logps/rejected": -167.83145141601562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.925209045410156, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.92520809173584, + "step": 1109 + }, + { + "epoch": 0.7659134034845609, + "grad_norm": 12.207788467407227, + "learning_rate": 2.1272518206209278e-06, + "logits/chosen": 3.7753493785858154, + "logits/rejected": 3.8112921714782715, + "logps/chosen": -150.20562744140625, + "logps/rejected": -162.98049926757812, + "loss": 1.4067, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.413338661193848, + "rewards/margins": 1.1750519275665283, + "rewards/rejected": -11.588391304016113, + "step": 1110 + }, + { + "epoch": 0.7666034155597723, + "grad_norm": 0.35165485739707947, + "learning_rate": 2.129168263702568e-06, + "logits/chosen": 3.998410940170288, + "logits/rejected": 3.998410940170288, + "logps/chosen": -179.15786743164062, + "logps/rejected": -179.15786743164062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.113370895385742, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.11336898803711, + "step": 1111 + }, + { + "epoch": 0.7672934276349836, + "grad_norm": 0.4122745394706726, + "learning_rate": 2.1310847067842085e-06, + "logits/chosen": 4.050069332122803, + "logits/rejected": 4.231446266174316, + "logps/chosen": -164.078369140625, + "logps/rejected": -189.30938720703125, + "loss": 0.4345, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.655046463012695, + "rewards/margins": 2.504209518432617, + "rewards/rejected": -14.159257888793945, + "step": 1112 + }, + { + "epoch": 0.7679834397101949, + "grad_norm": 0.5793203711509705, + "learning_rate": 2.1330011498658493e-06, + "logits/chosen": 3.769375801086426, + "logits/rejected": 4.182580947875977, + "logps/chosen": -146.2366485595703, + "logps/rejected": -174.30250549316406, + "loss": 0.52, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.111858367919922, + "rewards/margins": 2.6476686000823975, + "rewards/rejected": -12.759526252746582, + "step": 1113 + }, + { + "epoch": 0.7686734517854062, + "grad_norm": 0.28562280535697937, + "learning_rate": 2.1349175929474897e-06, + "logits/chosen": 3.7598392963409424, + "logits/rejected": 3.818796157836914, + "logps/chosen": -176.74508666992188, + "logps/rejected": -184.8306427001953, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.700066566467285, + "rewards/margins": 0.8409106731414795, + "rewards/rejected": -13.540977478027344, + "step": 1114 + }, + { + "epoch": 0.7693634638606176, + "grad_norm": 0.44310277700424194, + "learning_rate": 2.13683403602913e-06, + "logits/chosen": 3.7441792488098145, + "logits/rejected": 3.7441792488098145, + "logps/chosen": -164.62403869628906, + "logps/rejected": -164.62405395507812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.6259765625, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -11.6259765625, + "step": 1115 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 2.6013052463531494, + "learning_rate": 2.1387504791107705e-06, + "logits/chosen": 3.8986716270446777, + "logits/rejected": 3.8640451431274414, + "logps/chosen": -153.95968627929688, + "logps/rejected": -156.7919158935547, + "loss": 0.6295, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.783830642700195, + "rewards/margins": 0.1997147798538208, + "rewards/rejected": -10.983545303344727, + "step": 1116 + }, + { + "epoch": 0.7707434880110402, + "grad_norm": 0.40180304646492004, + "learning_rate": 2.1406669221924113e-06, + "logits/chosen": 4.29714822769165, + "logits/rejected": 4.29714822769165, + "logps/chosen": -185.27767944335938, + "logps/rejected": -185.27767944335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.649065017700195, + "rewards/margins": 0.0, + "rewards/rejected": -13.649065017700195, + "step": 1117 + }, + { + "epoch": 0.7714335000862516, + "grad_norm": 12.43107795715332, + "learning_rate": 2.1425833652740517e-06, + "logits/chosen": 4.099274635314941, + "logits/rejected": 4.090878963470459, + "logps/chosen": -174.32345581054688, + "logps/rejected": -171.9921112060547, + "loss": 0.8503, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.788932800292969, + "rewards/margins": -0.22458386421203613, + "rewards/rejected": -12.564350128173828, + "step": 1118 + }, + { + "epoch": 0.7721235121614628, + "grad_norm": 0.3058086335659027, + "learning_rate": 2.144499808355692e-06, + "logits/chosen": 3.593409538269043, + "logits/rejected": 3.743194103240967, + "logps/chosen": -174.33401489257812, + "logps/rejected": -184.2542724609375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.486772537231445, + "rewards/margins": 0.9723888039588928, + "rewards/rejected": -13.459161758422852, + "step": 1119 + }, + { + "epoch": 0.7728135242366742, + "grad_norm": 3.908848524093628, + "learning_rate": 2.1464162514373325e-06, + "logits/chosen": 3.6567397117614746, + "logits/rejected": 3.7129933834075928, + "logps/chosen": -148.27310180664062, + "logps/rejected": -173.53927612304688, + "loss": 0.4614, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.135522842407227, + "rewards/margins": 2.385698080062866, + "rewards/rejected": -12.521221160888672, + "step": 1120 + }, + { + "epoch": 0.7735035363118855, + "grad_norm": 0.34296298027038574, + "learning_rate": 2.148332694518973e-06, + "logits/chosen": 4.227759838104248, + "logits/rejected": 4.227759838104248, + "logps/chosen": -189.8544921875, + "logps/rejected": -189.85452270507812, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.297872543334961, + "rewards/margins": 9.5367431640625e-07, + "rewards/rejected": -14.297872543334961, + "step": 1121 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.3559629023075104, + "learning_rate": 2.1502491376006133e-06, + "logits/chosen": 3.9764456748962402, + "logits/rejected": 3.9764456748962402, + "logps/chosen": -176.3410186767578, + "logps/rejected": -176.3410186767578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.93873119354248, + "rewards/margins": 0.0, + "rewards/rejected": -12.93873119354248, + "step": 1122 + }, + { + "epoch": 0.7748835604623081, + "grad_norm": 0.31489646434783936, + "learning_rate": 2.1521655806822537e-06, + "logits/chosen": 4.119733810424805, + "logits/rejected": 4.119733810424805, + "logps/chosen": -181.12451171875, + "logps/rejected": -181.12451171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.320837020874023, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.320837020874023, + "step": 1123 + }, + { + "epoch": 0.7755735725375194, + "grad_norm": 0.3714694380760193, + "learning_rate": 2.1540820237638945e-06, + "logits/chosen": 4.029453277587891, + "logits/rejected": 4.029453277587891, + "logps/chosen": -169.20262145996094, + "logps/rejected": -169.20262145996094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.061443328857422, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.061443328857422, + "step": 1124 + }, + { + "epoch": 0.7762635846127307, + "grad_norm": 0.32739415764808655, + "learning_rate": 2.155998466845535e-06, + "logits/chosen": 4.105393409729004, + "logits/rejected": 4.105393409729004, + "logps/chosen": -163.8635711669922, + "logps/rejected": -163.86355590820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.707084655761719, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -11.707084655761719, + "step": 1125 + }, + { + "epoch": 0.7769535966879421, + "grad_norm": 5.791823387145996, + "learning_rate": 2.1579149099271753e-06, + "logits/chosen": 3.9501471519470215, + "logits/rejected": 3.9435935020446777, + "logps/chosen": -168.96694946289062, + "logps/rejected": -179.22625732421875, + "loss": 0.5775, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.173027038574219, + "rewards/margins": 1.0907351970672607, + "rewards/rejected": -13.263761520385742, + "step": 1126 + }, + { + "epoch": 0.7776436087631533, + "grad_norm": 0.3203333914279938, + "learning_rate": 2.1598313530088157e-06, + "logits/chosen": 4.154695510864258, + "logits/rejected": 4.239513397216797, + "logps/chosen": -179.16812133789062, + "logps/rejected": -191.48382568359375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.04155445098877, + "rewards/margins": 1.2553741931915283, + "rewards/rejected": -14.296928405761719, + "step": 1127 + }, + { + "epoch": 0.7783336208383647, + "grad_norm": 1.9604923725128174, + "learning_rate": 2.161747796090456e-06, + "logits/chosen": 4.035339832305908, + "logits/rejected": 4.107280731201172, + "logps/chosen": -184.2862548828125, + "logps/rejected": -186.7843475341797, + "loss": 0.6154, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.47024917602539, + "rewards/margins": 0.3254411220550537, + "rewards/rejected": -13.795690536499023, + "step": 1128 + }, + { + "epoch": 0.779023632913576, + "grad_norm": 0.314098060131073, + "learning_rate": 2.163664239172097e-06, + "logits/chosen": 3.897460699081421, + "logits/rejected": 3.897460699081421, + "logps/chosen": -174.31640625, + "logps/rejected": -174.31640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.791173934936523, + "rewards/margins": 0.0, + "rewards/rejected": -12.791173934936523, + "step": 1129 + }, + { + "epoch": 0.7797136449887873, + "grad_norm": 22.34836196899414, + "learning_rate": 2.1655806822537373e-06, + "logits/chosen": 4.167272090911865, + "logits/rejected": 4.142423629760742, + "logps/chosen": -183.40667724609375, + "logps/rejected": -185.73358154296875, + "loss": 1.3763, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.536384582519531, + "rewards/margins": 0.23275232315063477, + "rewards/rejected": -13.76913833618164, + "step": 1130 + }, + { + "epoch": 0.7804036570639986, + "grad_norm": 0.4259466230869293, + "learning_rate": 2.1674971253353777e-06, + "logits/chosen": 3.8346753120422363, + "logits/rejected": 3.9901316165924072, + "logps/chosen": -157.23117065429688, + "logps/rejected": -172.1787872314453, + "loss": 0.5233, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.920166015625, + "rewards/margins": 1.4869638681411743, + "rewards/rejected": -12.407129287719727, + "step": 1131 + }, + { + "epoch": 0.78109366913921, + "grad_norm": 7.958653926849365, + "learning_rate": 2.169413568417018e-06, + "logits/chosen": 4.122259616851807, + "logits/rejected": 4.151723384857178, + "logps/chosen": -159.20565795898438, + "logps/rejected": -160.4965057373047, + "loss": 0.6472, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.225215911865234, + "rewards/margins": 0.11948388814926147, + "rewards/rejected": -11.34469985961914, + "step": 1132 + }, + { + "epoch": 0.7817836812144212, + "grad_norm": 19.921323776245117, + "learning_rate": 2.171330011498659e-06, + "logits/chosen": 4.138033866882324, + "logits/rejected": 4.14396333694458, + "logps/chosen": -184.6510009765625, + "logps/rejected": -184.11944580078125, + "loss": 1.0049, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.77499771118164, + "rewards/margins": -0.06739974021911621, + "rewards/rejected": -13.707597732543945, + "step": 1133 + }, + { + "epoch": 0.7824736932896326, + "grad_norm": 0.3552795350551605, + "learning_rate": 2.1732464545802993e-06, + "logits/chosen": 3.9823458194732666, + "logits/rejected": 4.105109691619873, + "logps/chosen": -160.8892822265625, + "logps/rejected": -173.56744384765625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.477690696716309, + "rewards/margins": 1.267368197441101, + "rewards/rejected": -12.7450590133667, + "step": 1134 + }, + { + "epoch": 0.7831637053648439, + "grad_norm": 0.35069531202316284, + "learning_rate": 2.1751628976619397e-06, + "logits/chosen": 4.263228416442871, + "logits/rejected": 4.263228416442871, + "logps/chosen": -186.12538146972656, + "logps/rejected": -186.12538146972656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.710824966430664, + "rewards/margins": 0.0, + "rewards/rejected": -13.710824966430664, + "step": 1135 + }, + { + "epoch": 0.7838537174400552, + "grad_norm": 0.31093931198120117, + "learning_rate": 2.17707934074358e-06, + "logits/chosen": 3.6380598545074463, + "logits/rejected": 3.6380598545074463, + "logps/chosen": -179.03273010253906, + "logps/rejected": -179.03273010253906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.139853477478027, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.139853477478027, + "step": 1136 + }, + { + "epoch": 0.7845437295152665, + "grad_norm": 0.23528635501861572, + "learning_rate": 2.178995783825221e-06, + "logits/chosen": 4.071063995361328, + "logits/rejected": 4.275329113006592, + "logps/chosen": -175.49429321289062, + "logps/rejected": -183.56674194335938, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.815530776977539, + "rewards/margins": 0.8050766587257385, + "rewards/rejected": -13.620607376098633, + "step": 1137 + }, + { + "epoch": 0.7852337415904779, + "grad_norm": 0.3536698818206787, + "learning_rate": 2.1809122269068613e-06, + "logits/chosen": 4.012888431549072, + "logits/rejected": 4.012888431549072, + "logps/chosen": -176.03814697265625, + "logps/rejected": -176.03814697265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.616634368896484, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.616634368896484, + "step": 1138 + }, + { + "epoch": 0.7859237536656891, + "grad_norm": 0.37319886684417725, + "learning_rate": 2.1828286699885016e-06, + "logits/chosen": 3.73214054107666, + "logits/rejected": 3.73214054107666, + "logps/chosen": -175.89645385742188, + "logps/rejected": -175.89645385742188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.658498764038086, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.658498764038086, + "step": 1139 + }, + { + "epoch": 0.7866137657409005, + "grad_norm": 0.2870144844055176, + "learning_rate": 2.184745113070142e-06, + "logits/chosen": 4.134700775146484, + "logits/rejected": 4.344524383544922, + "logps/chosen": -175.6524200439453, + "logps/rejected": -184.21853637695312, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.639030456542969, + "rewards/margins": 0.8330647945404053, + "rewards/rejected": -13.472095489501953, + "step": 1140 + }, + { + "epoch": 0.7873037778161118, + "grad_norm": 0.3906314969062805, + "learning_rate": 2.1866615561517824e-06, + "logits/chosen": 3.8527920246124268, + "logits/rejected": 3.8527920246124268, + "logps/chosen": -170.2761688232422, + "logps/rejected": -170.2761688232422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.180017471313477, + "rewards/margins": 0.0, + "rewards/rejected": -12.180017471313477, + "step": 1141 + }, + { + "epoch": 0.7879937898913231, + "grad_norm": 0.27402350306510925, + "learning_rate": 2.188577999233423e-06, + "logits/chosen": 4.444576263427734, + "logits/rejected": 4.60526180267334, + "logps/chosen": -173.213134765625, + "logps/rejected": -198.02593994140625, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.730603218078613, + "rewards/margins": 2.4400744438171387, + "rewards/rejected": -15.170677185058594, + "step": 1142 + }, + { + "epoch": 0.7886838019665344, + "grad_norm": 0.3161904215812683, + "learning_rate": 2.190494442315063e-06, + "logits/chosen": 4.086836814880371, + "logits/rejected": 4.350276947021484, + "logps/chosen": -161.18194580078125, + "logps/rejected": -182.46209716796875, + "loss": 0.5203, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.412403106689453, + "rewards/margins": 2.1164286136627197, + "rewards/rejected": -13.528831481933594, + "step": 1143 + }, + { + "epoch": 0.7893738140417458, + "grad_norm": 0.30043676495552063, + "learning_rate": 2.192410885396704e-06, + "logits/chosen": 4.215889930725098, + "logits/rejected": 4.293857097625732, + "logps/chosen": -190.09161376953125, + "logps/rejected": -197.19293212890625, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.086421012878418, + "rewards/margins": 0.6952835321426392, + "rewards/rejected": -14.78170394897461, + "step": 1144 + }, + { + "epoch": 0.790063826116957, + "grad_norm": 0.3589024841785431, + "learning_rate": 2.1943273284783444e-06, + "logits/chosen": 4.085790634155273, + "logits/rejected": 4.085790634155273, + "logps/chosen": -181.42385864257812, + "logps/rejected": -181.42385864257812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.410577774047852, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -13.410577774047852, + "step": 1145 + }, + { + "epoch": 0.7907538381921684, + "grad_norm": 0.31354692578315735, + "learning_rate": 2.196243771559985e-06, + "logits/chosen": 3.7889552116394043, + "logits/rejected": 3.83789324760437, + "logps/chosen": -167.00062561035156, + "logps/rejected": -176.65306091308594, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.965822219848633, + "rewards/margins": 0.9462171792984009, + "rewards/rejected": -12.912040710449219, + "step": 1146 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 0.3440845012664795, + "learning_rate": 2.198160214641625e-06, + "logits/chosen": 4.035024166107178, + "logits/rejected": 4.029839515686035, + "logps/chosen": -178.73638916015625, + "logps/rejected": -187.36349487304688, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.033222198486328, + "rewards/margins": 0.9242503643035889, + "rewards/rejected": -13.95747184753418, + "step": 1147 + }, + { + "epoch": 0.792133862342591, + "grad_norm": 0.4253723621368408, + "learning_rate": 2.2000766577232656e-06, + "logits/chosen": 3.959644317626953, + "logits/rejected": 4.108675003051758, + "logps/chosen": -168.72366333007812, + "logps/rejected": -180.5382080078125, + "loss": 0.523, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.063653945922852, + "rewards/margins": 1.2642614841461182, + "rewards/rejected": -13.327916145324707, + "step": 1148 + }, + { + "epoch": 0.7928238744178023, + "grad_norm": 0.36173513531684875, + "learning_rate": 2.201993100804906e-06, + "logits/chosen": 3.955662965774536, + "logits/rejected": 3.9980380535125732, + "logps/chosen": -170.20358276367188, + "logps/rejected": -180.0394744873047, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.199116706848145, + "rewards/margins": 1.0009033679962158, + "rewards/rejected": -13.200019836425781, + "step": 1149 + }, + { + "epoch": 0.7935138864930136, + "grad_norm": 0.3288261592388153, + "learning_rate": 2.203909543886547e-06, + "logits/chosen": 4.041630268096924, + "logits/rejected": 4.114754676818848, + "logps/chosen": -187.6754913330078, + "logps/rejected": -193.51654052734375, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.878639221191406, + "rewards/margins": 0.5897008180618286, + "rewards/rejected": -14.468339920043945, + "step": 1150 + }, + { + "epoch": 0.7942038985682249, + "grad_norm": 0.3979572057723999, + "learning_rate": 2.205825986968187e-06, + "logits/chosen": 4.089015960693359, + "logits/rejected": 4.089015960693359, + "logps/chosen": -168.7867431640625, + "logps/rejected": -168.7867431640625, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.090447425842285, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.090447425842285, + "step": 1151 + }, + { + "epoch": 0.7948939106434363, + "grad_norm": 17.57413673400879, + "learning_rate": 2.2077424300498276e-06, + "logits/chosen": 4.0385026931762695, + "logits/rejected": 4.040680885314941, + "logps/chosen": -161.55087280273438, + "logps/rejected": -171.75979614257812, + "loss": 1.4336, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.369436264038086, + "rewards/margins": 0.9755958914756775, + "rewards/rejected": -12.34503173828125, + "step": 1152 + }, + { + "epoch": 0.7955839227186475, + "grad_norm": 1.8401545286178589, + "learning_rate": 2.209658873131468e-06, + "logits/chosen": 3.7187585830688477, + "logits/rejected": 3.7385387420654297, + "logps/chosen": -150.16485595703125, + "logps/rejected": -168.32505798339844, + "loss": 0.539, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.260107040405273, + "rewards/margins": 1.7765545845031738, + "rewards/rejected": -12.036661148071289, + "step": 1153 + }, + { + "epoch": 0.7962739347938589, + "grad_norm": 0.36127030849456787, + "learning_rate": 2.211575316213109e-06, + "logits/chosen": 4.083345890045166, + "logits/rejected": 4.104700088500977, + "logps/chosen": -168.3495635986328, + "logps/rejected": -174.15078735351562, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.12409496307373, + "rewards/margins": 0.5861450433731079, + "rewards/rejected": -12.710240364074707, + "step": 1154 + }, + { + "epoch": 0.7969639468690702, + "grad_norm": 0.2896864116191864, + "learning_rate": 2.213491759294749e-06, + "logits/chosen": 4.3203840255737305, + "logits/rejected": 4.3203840255737305, + "logps/chosen": -178.9912109375, + "logps/rejected": -178.9912109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.261497497558594, + "rewards/margins": 0.0, + "rewards/rejected": -13.261497497558594, + "step": 1155 + }, + { + "epoch": 0.7976539589442815, + "grad_norm": 0.31962868571281433, + "learning_rate": 2.2154082023763896e-06, + "logits/chosen": 4.345552444458008, + "logits/rejected": 4.345552444458008, + "logps/chosen": -177.06805419921875, + "logps/rejected": -177.06805419921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.219743728637695, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.219743728637695, + "step": 1156 + }, + { + "epoch": 0.7983439710194928, + "grad_norm": 18.134164810180664, + "learning_rate": 2.21732464545803e-06, + "logits/chosen": 3.850881338119507, + "logits/rejected": 4.14225959777832, + "logps/chosen": -162.40444946289062, + "logps/rejected": -172.09487915039062, + "loss": 0.7223, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.509522438049316, + "rewards/margins": 1.0741926431655884, + "rewards/rejected": -12.583715438842773, + "step": 1157 + }, + { + "epoch": 0.7990339830947042, + "grad_norm": 12.776606559753418, + "learning_rate": 2.2192410885396708e-06, + "logits/chosen": 3.9006543159484863, + "logits/rejected": 3.8282976150512695, + "logps/chosen": -178.07379150390625, + "logps/rejected": -170.09149169921875, + "loss": 1.4416, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.130962371826172, + "rewards/margins": -0.8349695205688477, + "rewards/rejected": -12.295992851257324, + "step": 1158 + }, + { + "epoch": 0.7997239951699154, + "grad_norm": 0.3466345965862274, + "learning_rate": 2.221157531621311e-06, + "logits/chosen": 4.141120910644531, + "logits/rejected": 4.16799783706665, + "logps/chosen": -168.31988525390625, + "logps/rejected": -176.7181396484375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.252033233642578, + "rewards/margins": 0.8733524680137634, + "rewards/rejected": -13.125386238098145, + "step": 1159 + }, + { + "epoch": 0.8004140072451268, + "grad_norm": 0.32982951402664185, + "learning_rate": 2.2230739747029516e-06, + "logits/chosen": 3.932786464691162, + "logits/rejected": 4.1602606773376465, + "logps/chosen": -156.69569396972656, + "logps/rejected": -178.31153869628906, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.946227073669434, + "rewards/margins": 2.1495578289031982, + "rewards/rejected": -13.095785140991211, + "step": 1160 + }, + { + "epoch": 0.8011040193203381, + "grad_norm": 0.49341464042663574, + "learning_rate": 2.224990417784592e-06, + "logits/chosen": 3.8369548320770264, + "logits/rejected": 3.894228458404541, + "logps/chosen": -157.80929565429688, + "logps/rejected": -170.87026977539062, + "loss": 0.5232, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.117807388305664, + "rewards/margins": 1.2540650367736816, + "rewards/rejected": -12.371871948242188, + "step": 1161 + }, + { + "epoch": 0.8017940313955494, + "grad_norm": 0.3931719958782196, + "learning_rate": 2.2269068608662323e-06, + "logits/chosen": 4.256863594055176, + "logits/rejected": 4.256863594055176, + "logps/chosen": -172.4254913330078, + "logps/rejected": -172.4254913330078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.360870361328125, + "rewards/margins": 0.0, + "rewards/rejected": -12.360870361328125, + "step": 1162 + }, + { + "epoch": 0.8024840434707607, + "grad_norm": 0.3396580219268799, + "learning_rate": 2.2288233039478727e-06, + "logits/chosen": 4.046815872192383, + "logits/rejected": 4.069516181945801, + "logps/chosen": -170.86611938476562, + "logps/rejected": -177.73410034179688, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.294334411621094, + "rewards/margins": 0.7146047353744507, + "rewards/rejected": -13.008938789367676, + "step": 1163 + }, + { + "epoch": 0.8031740555459721, + "grad_norm": 0.3505268394947052, + "learning_rate": 2.2307397470295136e-06, + "logits/chosen": 4.09156608581543, + "logits/rejected": 4.09156608581543, + "logps/chosen": -166.7716064453125, + "logps/rejected": -166.7716064453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.892854690551758, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.892854690551758, + "step": 1164 + }, + { + "epoch": 0.8038640676211833, + "grad_norm": 11.084898948669434, + "learning_rate": 2.232656190111154e-06, + "logits/chosen": 4.074986934661865, + "logits/rejected": 4.097744464874268, + "logps/chosen": -169.91061401367188, + "logps/rejected": -172.31283569335938, + "loss": 0.622, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.27631950378418, + "rewards/margins": 0.24765384197235107, + "rewards/rejected": -12.52397346496582, + "step": 1165 + }, + { + "epoch": 0.8045540796963947, + "grad_norm": 0.3037528693675995, + "learning_rate": 2.2345726331927943e-06, + "logits/chosen": 4.0772600173950195, + "logits/rejected": 4.104818344116211, + "logps/chosen": -174.9204559326172, + "logps/rejected": -181.34640502929688, + "loss": 0.6073, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.738061904907227, + "rewards/margins": 0.6377978324890137, + "rewards/rejected": -13.375860214233398, + "step": 1166 + }, + { + "epoch": 0.805244091771606, + "grad_norm": 1.7422740459442139, + "learning_rate": 2.2364890762744347e-06, + "logits/chosen": 4.212123870849609, + "logits/rejected": 4.379250526428223, + "logps/chosen": -178.4366912841797, + "logps/rejected": -180.68185424804688, + "loss": 0.6215, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.059480667114258, + "rewards/margins": 0.2573697566986084, + "rewards/rejected": -13.316850662231445, + "step": 1167 + }, + { + "epoch": 0.8059341038468173, + "grad_norm": 0.3473019003868103, + "learning_rate": 2.238405519356075e-06, + "logits/chosen": 4.30033016204834, + "logits/rejected": 4.30033016204834, + "logps/chosen": -183.33509826660156, + "logps/rejected": -183.33509826660156, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.45519733428955, + "rewards/margins": 0.0, + "rewards/rejected": -13.455196380615234, + "step": 1168 + }, + { + "epoch": 0.8066241159220287, + "grad_norm": 0.2685372829437256, + "learning_rate": 2.2403219624377155e-06, + "logits/chosen": 4.055266857147217, + "logits/rejected": 4.183692455291748, + "logps/chosen": -177.7921142578125, + "logps/rejected": -185.77691650390625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.891481399536133, + "rewards/margins": 0.8479773998260498, + "rewards/rejected": -13.739459991455078, + "step": 1169 + }, + { + "epoch": 0.80731412799724, + "grad_norm": 0.30451685190200806, + "learning_rate": 2.2422384055193563e-06, + "logits/chosen": 4.020512580871582, + "logits/rejected": 4.210159778594971, + "logps/chosen": -166.33587646484375, + "logps/rejected": -182.1763458251953, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.836950302124023, + "rewards/margins": 1.6548030376434326, + "rewards/rejected": -13.491752624511719, + "step": 1170 + }, + { + "epoch": 0.8080041400724512, + "grad_norm": 0.7067708373069763, + "learning_rate": 2.2441548486009967e-06, + "logits/chosen": 4.172672271728516, + "logits/rejected": 4.289368152618408, + "logps/chosen": -169.48684692382812, + "logps/rejected": -182.61825561523438, + "loss": 0.5234, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.33945083618164, + "rewards/margins": 1.3353177309036255, + "rewards/rejected": -13.674768447875977, + "step": 1171 + }, + { + "epoch": 0.8086941521476626, + "grad_norm": 0.25797438621520996, + "learning_rate": 2.246071291682637e-06, + "logits/chosen": 3.7040648460388184, + "logits/rejected": 3.8488588333129883, + "logps/chosen": -172.61825561523438, + "logps/rejected": -183.68783569335938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.486522674560547, + "rewards/margins": 1.1074755191802979, + "rewards/rejected": -13.593997955322266, + "step": 1172 + }, + { + "epoch": 0.8093841642228738, + "grad_norm": 0.24030627310276031, + "learning_rate": 2.2479877347642775e-06, + "logits/chosen": 4.170450210571289, + "logits/rejected": 4.170450210571289, + "logps/chosen": -197.16114807128906, + "logps/rejected": -197.16114807128906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.902753829956055, + "rewards/margins": 0.0, + "rewards/rejected": -14.902753829956055, + "step": 1173 + }, + { + "epoch": 0.8100741762980852, + "grad_norm": 0.2651205062866211, + "learning_rate": 2.2499041778459183e-06, + "logits/chosen": 3.8023574352264404, + "logits/rejected": 4.076988220214844, + "logps/chosen": -166.60018920898438, + "logps/rejected": -194.71641540527344, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.872238159179688, + "rewards/margins": 2.9154109954833984, + "rewards/rejected": -14.787649154663086, + "step": 1174 + }, + { + "epoch": 0.8107641883732966, + "grad_norm": 0.25995194911956787, + "learning_rate": 2.2518206209275587e-06, + "logits/chosen": 3.931382656097412, + "logits/rejected": 4.128393173217773, + "logps/chosen": -168.71542358398438, + "logps/rejected": -180.03097534179688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.983915328979492, + "rewards/margins": 1.1625820398330688, + "rewards/rejected": -13.146496772766113, + "step": 1175 + }, + { + "epoch": 0.8114542004485078, + "grad_norm": 0.4198318421840668, + "learning_rate": 2.253737064009199e-06, + "logits/chosen": 4.011165142059326, + "logits/rejected": 4.011165142059326, + "logps/chosen": -173.38414001464844, + "logps/rejected": -173.38412475585938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.56353759765625, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -12.563536643981934, + "step": 1176 + }, + { + "epoch": 0.8121442125237192, + "grad_norm": 0.31357795000076294, + "learning_rate": 2.2556535070908395e-06, + "logits/chosen": 3.8421943187713623, + "logits/rejected": 3.935580015182495, + "logps/chosen": -151.45156860351562, + "logps/rejected": -170.4777069091797, + "loss": 0.5208, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.435551643371582, + "rewards/margins": 1.904111385345459, + "rewards/rejected": -12.339662551879883, + "step": 1177 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 0.2953084409236908, + "learning_rate": 2.2575699501724803e-06, + "logits/chosen": 4.115677833557129, + "logits/rejected": 4.115677833557129, + "logps/chosen": -176.15402221679688, + "logps/rejected": -176.15402221679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.78469467163086, + "rewards/margins": 0.0, + "rewards/rejected": -12.78469467163086, + "step": 1178 + }, + { + "epoch": 0.8135242366741418, + "grad_norm": 6.306047439575195, + "learning_rate": 2.2594863932541207e-06, + "logits/chosen": 4.001429080963135, + "logits/rejected": 4.088857173919678, + "logps/chosen": -177.62191772460938, + "logps/rejected": -186.86293029785156, + "loss": 0.629, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.640122413635254, + "rewards/margins": 0.9341204166412354, + "rewards/rejected": -13.574243545532227, + "step": 1179 + }, + { + "epoch": 0.8142142487493531, + "grad_norm": 0.30578911304473877, + "learning_rate": 2.261402836335761e-06, + "logits/chosen": 4.273906707763672, + "logits/rejected": 4.273906707763672, + "logps/chosen": -177.20860290527344, + "logps/rejected": -177.20860290527344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.012197494506836, + "rewards/margins": 0.0, + "rewards/rejected": -13.012197494506836, + "step": 1180 + }, + { + "epoch": 0.8149042608245645, + "grad_norm": 4.228540897369385, + "learning_rate": 2.2633192794174015e-06, + "logits/chosen": 4.026158332824707, + "logits/rejected": 4.113190650939941, + "logps/chosen": -181.15675354003906, + "logps/rejected": -188.36264038085938, + "loss": 0.577, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.4181489944458, + "rewards/margins": 0.6712538003921509, + "rewards/rejected": -14.089402198791504, + "step": 1181 + }, + { + "epoch": 0.8155942728997757, + "grad_norm": 0.3720669150352478, + "learning_rate": 2.265235722499042e-06, + "logits/chosen": 4.168859004974365, + "logits/rejected": 4.251379013061523, + "logps/chosen": -180.56280517578125, + "logps/rejected": -186.1248779296875, + "loss": 0.6081, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.3505277633667, + "rewards/margins": 0.5421586036682129, + "rewards/rejected": -13.89268684387207, + "step": 1182 + }, + { + "epoch": 0.8162842849749871, + "grad_norm": 0.3193899393081665, + "learning_rate": 2.2671521655806823e-06, + "logits/chosen": 3.9553050994873047, + "logits/rejected": 4.071456432342529, + "logps/chosen": -183.59515380859375, + "logps/rejected": -195.78582763671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.608860969543457, + "rewards/margins": 1.2204971313476562, + "rewards/rejected": -14.829358100891113, + "step": 1183 + }, + { + "epoch": 0.8169742970501984, + "grad_norm": 19.395263671875, + "learning_rate": 2.269068608662323e-06, + "logits/chosen": 3.834688186645508, + "logits/rejected": 3.7356607913970947, + "logps/chosen": -154.96044921875, + "logps/rejected": -163.1656494140625, + "loss": 1.3152, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.829643249511719, + "rewards/margins": 0.8264205455780029, + "rewards/rejected": -11.656063079833984, + "step": 1184 + }, + { + "epoch": 0.8176643091254097, + "grad_norm": 0.2785923480987549, + "learning_rate": 2.2709850517439635e-06, + "logits/chosen": 3.892345428466797, + "logits/rejected": 4.063638687133789, + "logps/chosen": -154.87245178222656, + "logps/rejected": -181.7186279296875, + "loss": 0.435, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.831055641174316, + "rewards/margins": 2.6483263969421387, + "rewards/rejected": -13.47938060760498, + "step": 1185 + }, + { + "epoch": 0.818354321200621, + "grad_norm": 0.365547239780426, + "learning_rate": 2.272901494825604e-06, + "logits/chosen": 3.6636486053466797, + "logits/rejected": 3.920142412185669, + "logps/chosen": -151.11236572265625, + "logps/rejected": -168.38491821289062, + "loss": 0.522, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.325653076171875, + "rewards/margins": 1.79088294506073, + "rewards/rejected": -12.116537094116211, + "step": 1186 + }, + { + "epoch": 0.8190443332758324, + "grad_norm": 0.2698531448841095, + "learning_rate": 2.2748179379072442e-06, + "logits/chosen": 4.22537899017334, + "logits/rejected": 4.376053333282471, + "logps/chosen": -179.70545959472656, + "logps/rejected": -185.49560546875, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.119894027709961, + "rewards/margins": 0.654695987701416, + "rewards/rejected": -13.774590492248535, + "step": 1187 + }, + { + "epoch": 0.8197343453510436, + "grad_norm": 0.34673187136650085, + "learning_rate": 2.2767343809888846e-06, + "logits/chosen": 4.221607208251953, + "logits/rejected": 4.221607208251953, + "logps/chosen": -189.14549255371094, + "logps/rejected": -189.14549255371094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.063928604125977, + "rewards/margins": 0.0, + "rewards/rejected": -14.063928604125977, + "step": 1188 + }, + { + "epoch": 0.820424357426255, + "grad_norm": 0.34675610065460205, + "learning_rate": 2.278650824070525e-06, + "logits/chosen": 4.186360836029053, + "logits/rejected": 4.186360836029053, + "logps/chosen": -171.8861541748047, + "logps/rejected": -171.8861541748047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.30312728881836, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.303125381469727, + "step": 1189 + }, + { + "epoch": 0.8211143695014663, + "grad_norm": 2.6087450981140137, + "learning_rate": 2.280567267152166e-06, + "logits/chosen": 4.327525615692139, + "logits/rejected": 4.2096428871154785, + "logps/chosen": -163.1231689453125, + "logps/rejected": -164.8203125, + "loss": 0.6335, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.65778923034668, + "rewards/margins": 0.1780184507369995, + "rewards/rejected": -11.835807800292969, + "step": 1190 + }, + { + "epoch": 0.8218043815766776, + "grad_norm": 0.3229818344116211, + "learning_rate": 2.2824837102338062e-06, + "logits/chosen": 4.02106237411499, + "logits/rejected": 4.149102687835693, + "logps/chosen": -166.45285034179688, + "logps/rejected": -184.06192016601562, + "loss": 0.5205, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.994665145874023, + "rewards/margins": 1.8059982061386108, + "rewards/rejected": -13.800662994384766, + "step": 1191 + }, + { + "epoch": 0.8224943936518889, + "grad_norm": 0.32993635535240173, + "learning_rate": 2.2844001533154466e-06, + "logits/chosen": 4.185545921325684, + "logits/rejected": 4.185545921325684, + "logps/chosen": -182.9010772705078, + "logps/rejected": -182.9010772705078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.386281967163086, + "rewards/margins": 0.0, + "rewards/rejected": -13.386281967163086, + "step": 1192 + }, + { + "epoch": 0.8231844057271003, + "grad_norm": 0.2915858030319214, + "learning_rate": 2.286316596397087e-06, + "logits/chosen": 4.148767471313477, + "logits/rejected": 4.148767471313477, + "logps/chosen": -195.00106811523438, + "logps/rejected": -195.00106811523438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.614510536193848, + "rewards/margins": 0.0, + "rewards/rejected": -14.614510536193848, + "step": 1193 + }, + { + "epoch": 0.8238744178023115, + "grad_norm": 0.30870065093040466, + "learning_rate": 2.288233039478728e-06, + "logits/chosen": 4.279206275939941, + "logits/rejected": 4.279206275939941, + "logps/chosen": -193.63275146484375, + "logps/rejected": -193.63275146484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.650620460510254, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -14.650619506835938, + "step": 1194 + }, + { + "epoch": 0.8245644298775229, + "grad_norm": 0.2947782874107361, + "learning_rate": 2.2901494825603682e-06, + "logits/chosen": 3.8607423305511475, + "logits/rejected": 4.137571811676025, + "logps/chosen": -163.47242736816406, + "logps/rejected": -188.81146240234375, + "loss": 0.4339, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.540857315063477, + "rewards/margins": 2.5480144023895264, + "rewards/rejected": -14.088871002197266, + "step": 1195 + }, + { + "epoch": 0.8252544419527341, + "grad_norm": 0.34571319818496704, + "learning_rate": 2.2920659256420086e-06, + "logits/chosen": 4.572519302368164, + "logits/rejected": 4.572519302368164, + "logps/chosen": -185.92642211914062, + "logps/rejected": -185.92642211914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.7027006149292, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.702699661254883, + "step": 1196 + }, + { + "epoch": 0.8259444540279455, + "grad_norm": 0.27929097414016724, + "learning_rate": 2.293982368723649e-06, + "logits/chosen": 3.994483232498169, + "logits/rejected": 4.0994157791137695, + "logps/chosen": -191.32196044921875, + "logps/rejected": -207.7943878173828, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.359153747558594, + "rewards/margins": 1.6932270526885986, + "rewards/rejected": -16.05238151550293, + "step": 1197 + }, + { + "epoch": 0.8266344661031568, + "grad_norm": 0.2756359875202179, + "learning_rate": 2.29589881180529e-06, + "logits/chosen": 4.24394416809082, + "logits/rejected": 4.40260648727417, + "logps/chosen": -166.0724334716797, + "logps/rejected": -185.5845184326172, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.999457359313965, + "rewards/margins": 1.989020586013794, + "rewards/rejected": -13.98847770690918, + "step": 1198 + }, + { + "epoch": 0.8273244781783681, + "grad_norm": 0.3637833595275879, + "learning_rate": 2.2978152548869302e-06, + "logits/chosen": 4.053462982177734, + "logits/rejected": 4.053462982177734, + "logps/chosen": -187.09153747558594, + "logps/rejected": -187.09153747558594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.07606315612793, + "rewards/margins": 0.0, + "rewards/rejected": -14.07606315612793, + "step": 1199 + }, + { + "epoch": 0.8280144902535794, + "grad_norm": 2.023141384124756, + "learning_rate": 2.2997316979685706e-06, + "logits/chosen": 4.210182189941406, + "logits/rejected": 4.2999348640441895, + "logps/chosen": -176.64447021484375, + "logps/rejected": -179.39952087402344, + "loss": 0.6226, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.748359680175781, + "rewards/margins": 0.24842119216918945, + "rewards/rejected": -12.996781349182129, + "step": 1200 + }, + { + "epoch": 0.8287045023287908, + "grad_norm": 0.32197123765945435, + "learning_rate": 2.301648141050211e-06, + "logits/chosen": 4.375729084014893, + "logits/rejected": 4.410757064819336, + "logps/chosen": -172.0775146484375, + "logps/rejected": -179.9259490966797, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.420656204223633, + "rewards/margins": 0.8099328279495239, + "rewards/rejected": -13.230589866638184, + "step": 1201 + }, + { + "epoch": 0.829394514404002, + "grad_norm": 0.2885468304157257, + "learning_rate": 2.3035645841318514e-06, + "logits/chosen": 4.024230003356934, + "logits/rejected": 4.145138263702393, + "logps/chosen": -181.99440002441406, + "logps/rejected": -193.33297729492188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.47921371459961, + "rewards/margins": 1.1544657945632935, + "rewards/rejected": -14.63368034362793, + "step": 1202 + }, + { + "epoch": 0.8300845264792134, + "grad_norm": 0.28287211060523987, + "learning_rate": 2.3054810272134918e-06, + "logits/chosen": 3.7250280380249023, + "logits/rejected": 3.9043941497802734, + "logps/chosen": -150.02012634277344, + "logps/rejected": -173.99977111816406, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.051140785217285, + "rewards/margins": 2.386727809906006, + "rewards/rejected": -12.437868118286133, + "step": 1203 + }, + { + "epoch": 0.8307745385544247, + "grad_norm": 0.7114261984825134, + "learning_rate": 2.3073974702951326e-06, + "logits/chosen": 4.05922269821167, + "logits/rejected": 4.098392963409424, + "logps/chosen": -153.53912353515625, + "logps/rejected": -173.84178161621094, + "loss": 0.4408, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.7472505569458, + "rewards/margins": 1.873275637626648, + "rewards/rejected": -12.620526313781738, + "step": 1204 + }, + { + "epoch": 0.831464550629636, + "grad_norm": 0.8374214768409729, + "learning_rate": 2.309313913376773e-06, + "logits/chosen": 3.923943519592285, + "logits/rejected": 4.061413288116455, + "logps/chosen": -166.021728515625, + "logps/rejected": -178.1210174560547, + "loss": 0.5238, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.822938919067383, + "rewards/margins": 1.2209619283676147, + "rewards/rejected": -13.043901443481445, + "step": 1205 + }, + { + "epoch": 0.8321545627048473, + "grad_norm": 0.35193970799446106, + "learning_rate": 2.3112303564584134e-06, + "logits/chosen": 4.115898132324219, + "logits/rejected": 4.115898132324219, + "logps/chosen": -175.42347717285156, + "logps/rejected": -175.42347717285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.70497989654541, + "rewards/margins": 0.0, + "rewards/rejected": -12.70497989654541, + "step": 1206 + }, + { + "epoch": 0.8328445747800587, + "grad_norm": 0.3358634412288666, + "learning_rate": 2.3131467995400538e-06, + "logits/chosen": 3.846400737762451, + "logits/rejected": 3.924480438232422, + "logps/chosen": -181.47601318359375, + "logps/rejected": -188.245849609375, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.356466293334961, + "rewards/margins": 0.6538845300674438, + "rewards/rejected": -14.01034927368164, + "step": 1207 + }, + { + "epoch": 0.8335345868552699, + "grad_norm": 0.3107414245605469, + "learning_rate": 2.315063242621694e-06, + "logits/chosen": 3.7451255321502686, + "logits/rejected": 3.8230364322662354, + "logps/chosen": -155.02139282226562, + "logps/rejected": -165.2827606201172, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.900055885314941, + "rewards/margins": 1.0533112287521362, + "rewards/rejected": -11.953367233276367, + "step": 1208 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 0.31849366426467896, + "learning_rate": 2.3169796857033346e-06, + "logits/chosen": 4.173074722290039, + "logits/rejected": 4.173074722290039, + "logps/chosen": -196.27841186523438, + "logps/rejected": -196.27841186523438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.87620735168457, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.87620735168457, + "step": 1209 + }, + { + "epoch": 0.8349146110056926, + "grad_norm": 0.36580803990364075, + "learning_rate": 2.3188961287849754e-06, + "logits/chosen": 4.395401954650879, + "logits/rejected": 4.395401954650879, + "logps/chosen": -179.07888793945312, + "logps/rejected": -179.07888793945312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.188922882080078, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.188921928405762, + "step": 1210 + }, + { + "epoch": 0.8356046230809039, + "grad_norm": 0.35949286818504333, + "learning_rate": 2.3208125718666158e-06, + "logits/chosen": 4.040815353393555, + "logits/rejected": 4.040815353393555, + "logps/chosen": -172.57461547851562, + "logps/rejected": -172.5746307373047, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.553380966186523, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.553380966186523, + "step": 1211 + }, + { + "epoch": 0.8362946351561152, + "grad_norm": 0.3661092221736908, + "learning_rate": 2.322729014948256e-06, + "logits/chosen": 4.237571716308594, + "logits/rejected": 4.237571716308594, + "logps/chosen": -168.28863525390625, + "logps/rejected": -168.28863525390625, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.994744300842285, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.994745254516602, + "step": 1212 + }, + { + "epoch": 0.8369846472313266, + "grad_norm": 0.33310726284980774, + "learning_rate": 2.3246454580298965e-06, + "logits/chosen": 4.2875142097473145, + "logits/rejected": 4.2875142097473145, + "logps/chosen": -187.9098663330078, + "logps/rejected": -187.9098663330078, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.041786193847656, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -14.041788101196289, + "step": 1213 + }, + { + "epoch": 0.8376746593065378, + "grad_norm": 0.30548644065856934, + "learning_rate": 2.3265619011115374e-06, + "logits/chosen": 4.3559675216674805, + "logits/rejected": 4.3559675216674805, + "logps/chosen": -187.031005859375, + "logps/rejected": -187.031005859375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.8120698928833, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.812070846557617, + "step": 1214 + }, + { + "epoch": 0.8383646713817492, + "grad_norm": 0.3441934883594513, + "learning_rate": 2.3284783441931778e-06, + "logits/chosen": 4.423918724060059, + "logits/rejected": 4.423918724060059, + "logps/chosen": -188.70603942871094, + "logps/rejected": -188.7060546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.129924774169922, + "rewards/margins": 0.0, + "rewards/rejected": -14.129924774169922, + "step": 1215 + }, + { + "epoch": 0.8390546834569605, + "grad_norm": 0.3207920789718628, + "learning_rate": 2.330394787274818e-06, + "logits/chosen": 4.272557735443115, + "logits/rejected": 4.272557735443115, + "logps/chosen": -188.84449768066406, + "logps/rejected": -188.84451293945312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.194494247436523, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.194494247436523, + "step": 1216 + }, + { + "epoch": 0.8397446955321718, + "grad_norm": 0.2855813503265381, + "learning_rate": 2.3323112303564585e-06, + "logits/chosen": 4.024970531463623, + "logits/rejected": 4.3540568351745605, + "logps/chosen": -167.49716186523438, + "logps/rejected": -188.14788818359375, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.99044418334961, + "rewards/margins": 2.107462167739868, + "rewards/rejected": -14.097906112670898, + "step": 1217 + }, + { + "epoch": 0.8404347076073831, + "grad_norm": 0.2895805239677429, + "learning_rate": 2.3342276734380994e-06, + "logits/chosen": 4.437131404876709, + "logits/rejected": 4.556315898895264, + "logps/chosen": -170.7140350341797, + "logps/rejected": -183.87403869628906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.356892585754395, + "rewards/margins": 1.243051528930664, + "rewards/rejected": -13.599944114685059, + "step": 1218 + }, + { + "epoch": 0.8411247196825944, + "grad_norm": 0.2880151569843292, + "learning_rate": 2.3361441165197397e-06, + "logits/chosen": 3.8384857177734375, + "logits/rejected": 3.8384857177734375, + "logps/chosen": -162.2538299560547, + "logps/rejected": -162.2538299560547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.577922821044922, + "rewards/margins": 0.0, + "rewards/rejected": -11.577922821044922, + "step": 1219 + }, + { + "epoch": 0.8418147317578057, + "grad_norm": 0.349295973777771, + "learning_rate": 2.33806055960138e-06, + "logits/chosen": 3.9870691299438477, + "logits/rejected": 3.9870691299438477, + "logps/chosen": -175.38865661621094, + "logps/rejected": -175.38865661621094, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.753999710083008, + "rewards/margins": 4.172325134277344e-07, + "rewards/rejected": -12.753999710083008, + "step": 1220 + }, + { + "epoch": 0.8425047438330171, + "grad_norm": 0.35037004947662354, + "learning_rate": 2.3399770026830205e-06, + "logits/chosen": 4.225088596343994, + "logits/rejected": 4.225088596343994, + "logps/chosen": -185.0673828125, + "logps/rejected": -185.0673828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.82302474975586, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.82302474975586, + "step": 1221 + }, + { + "epoch": 0.8431947559082283, + "grad_norm": 0.3490009307861328, + "learning_rate": 2.341893445764661e-06, + "logits/chosen": 3.9399819374084473, + "logits/rejected": 3.9583466053009033, + "logps/chosen": -176.8119354248047, + "logps/rejected": -185.46453857421875, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.869938850402832, + "rewards/margins": 0.8736026287078857, + "rewards/rejected": -13.743541717529297, + "step": 1222 + }, + { + "epoch": 0.8438847679834397, + "grad_norm": 0.32094806432724, + "learning_rate": 2.3438098888463013e-06, + "logits/chosen": 4.033304691314697, + "logits/rejected": 4.033304691314697, + "logps/chosen": -196.4792938232422, + "logps/rejected": -196.4792938232422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.705698013305664, + "rewards/margins": 0.0, + "rewards/rejected": -14.705698013305664, + "step": 1223 + }, + { + "epoch": 0.844574780058651, + "grad_norm": 0.3075718879699707, + "learning_rate": 2.345726331927942e-06, + "logits/chosen": 4.242884159088135, + "logits/rejected": 4.242884159088135, + "logps/chosen": -195.16104125976562, + "logps/rejected": -195.16104125976562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.821956634521484, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.821956634521484, + "step": 1224 + }, + { + "epoch": 0.8452647921338623, + "grad_norm": 0.3480469584465027, + "learning_rate": 2.3476427750095825e-06, + "logits/chosen": 4.193185806274414, + "logits/rejected": 4.304584503173828, + "logps/chosen": -161.61622619628906, + "logps/rejected": -175.03271484375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.272796630859375, + "rewards/margins": 1.3868839740753174, + "rewards/rejected": -12.65968132019043, + "step": 1225 + }, + { + "epoch": 0.8459548042090737, + "grad_norm": 0.3449055850505829, + "learning_rate": 2.349559218091223e-06, + "logits/chosen": 3.8710944652557373, + "logits/rejected": 3.9812936782836914, + "logps/chosen": -168.60269165039062, + "logps/rejected": -186.00184631347656, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.257368087768555, + "rewards/margins": 1.7843017578125, + "rewards/rejected": -14.041669845581055, + "step": 1226 + }, + { + "epoch": 0.846644816284285, + "grad_norm": 1.0324608087539673, + "learning_rate": 2.3514756611728633e-06, + "logits/chosen": 4.192251682281494, + "logits/rejected": 4.40867280960083, + "logps/chosen": -174.90377807617188, + "logps/rejected": -190.07916259765625, + "loss": 0.5281, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.617538452148438, + "rewards/margins": 1.5414254665374756, + "rewards/rejected": -14.158965110778809, + "step": 1227 + }, + { + "epoch": 0.8473348283594963, + "grad_norm": 1.7320950031280518, + "learning_rate": 2.3533921042545037e-06, + "logits/chosen": 3.593902826309204, + "logits/rejected": 4.109328746795654, + "logps/chosen": -148.58616638183594, + "logps/rejected": -169.06529235839844, + "loss": 0.4397, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.050167083740234, + "rewards/margins": 2.1223058700561523, + "rewards/rejected": -12.172472953796387, + "step": 1228 + }, + { + "epoch": 0.8480248404347076, + "grad_norm": 0.35986167192459106, + "learning_rate": 2.355308547336144e-06, + "logits/chosen": 4.316693305969238, + "logits/rejected": 4.316693305969238, + "logps/chosen": -185.3483123779297, + "logps/rejected": -185.34832763671875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.893815994262695, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -13.893815994262695, + "step": 1229 + }, + { + "epoch": 0.848714852509919, + "grad_norm": 0.2956233322620392, + "learning_rate": 2.357224990417785e-06, + "logits/chosen": 3.892146587371826, + "logits/rejected": 3.952320098876953, + "logps/chosen": -183.08868408203125, + "logps/rejected": -190.16738891601562, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.55225944519043, + "rewards/margins": 0.7318673133850098, + "rewards/rejected": -14.284126281738281, + "step": 1230 + }, + { + "epoch": 0.8494048645851302, + "grad_norm": 0.3694020211696625, + "learning_rate": 2.3591414334994253e-06, + "logits/chosen": 4.065553665161133, + "logits/rejected": 4.065553665161133, + "logps/chosen": -174.18902587890625, + "logps/rejected": -174.18902587890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.60948371887207, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.60948371887207, + "step": 1231 + }, + { + "epoch": 0.8500948766603416, + "grad_norm": 0.30280765891075134, + "learning_rate": 2.3610578765810657e-06, + "logits/chosen": 3.6266117095947266, + "logits/rejected": 3.970078468322754, + "logps/chosen": -148.160400390625, + "logps/rejected": -176.85174560546875, + "loss": 0.3495, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.129349708557129, + "rewards/margins": 3.0022313594818115, + "rewards/rejected": -13.13158130645752, + "step": 1232 + }, + { + "epoch": 0.8507848887355529, + "grad_norm": 0.32571837306022644, + "learning_rate": 2.362974319662706e-06, + "logits/chosen": 3.746419668197632, + "logits/rejected": 4.015650749206543, + "logps/chosen": -146.8272705078125, + "logps/rejected": -180.3065185546875, + "loss": 0.4341, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.78534984588623, + "rewards/margins": 3.2970123291015625, + "rewards/rejected": -13.08236312866211, + "step": 1233 + }, + { + "epoch": 0.8514749008107642, + "grad_norm": 0.35364046692848206, + "learning_rate": 2.364890762744347e-06, + "logits/chosen": 3.922494411468506, + "logits/rejected": 3.9322619438171387, + "logps/chosen": -190.05673217773438, + "logps/rejected": -201.24630737304688, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.110127449035645, + "rewards/margins": 1.1796927452087402, + "rewards/rejected": -15.289819717407227, + "step": 1234 + }, + { + "epoch": 0.8521649128859755, + "grad_norm": 7.954428672790527, + "learning_rate": 2.3668072058259873e-06, + "logits/chosen": 3.8793892860412598, + "logits/rejected": 3.991286277770996, + "logps/chosen": -174.46795654296875, + "logps/rejected": -183.0003662109375, + "loss": 0.5639, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.730732917785645, + "rewards/margins": 0.8681147694587708, + "rewards/rejected": -13.598847389221191, + "step": 1235 + }, + { + "epoch": 0.8528549249611869, + "grad_norm": 0.33888503909111023, + "learning_rate": 2.3687236489076277e-06, + "logits/chosen": 4.037179946899414, + "logits/rejected": 4.037179946899414, + "logps/chosen": -174.236572265625, + "logps/rejected": -174.236572265625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.513577461242676, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.513578414916992, + "step": 1236 + }, + { + "epoch": 0.8535449370363981, + "grad_norm": 0.24342313408851624, + "learning_rate": 2.370640091989268e-06, + "logits/chosen": 3.7974698543548584, + "logits/rejected": 3.9961447715759277, + "logps/chosen": -184.7515869140625, + "logps/rejected": -209.45425415039062, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.73194408416748, + "rewards/margins": 2.429800033569336, + "rewards/rejected": -16.161745071411133, + "step": 1237 + }, + { + "epoch": 0.8542349491116095, + "grad_norm": 0.2751787602901459, + "learning_rate": 2.372556535070909e-06, + "logits/chosen": 4.128932476043701, + "logits/rejected": 4.164791584014893, + "logps/chosen": -176.83888244628906, + "logps/rejected": -196.33316040039062, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.883221626281738, + "rewards/margins": 1.9632964134216309, + "rewards/rejected": -14.846517562866211, + "step": 1238 + }, + { + "epoch": 0.8549249611868208, + "grad_norm": 0.3054389953613281, + "learning_rate": 2.3744729781525493e-06, + "logits/chosen": 4.130302429199219, + "logits/rejected": 4.244381427764893, + "logps/chosen": -167.27713012695312, + "logps/rejected": -177.7019805908203, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.032072067260742, + "rewards/margins": 1.0026379823684692, + "rewards/rejected": -13.034709930419922, + "step": 1239 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 10.189393997192383, + "learning_rate": 2.3763894212341897e-06, + "logits/chosen": 4.139307498931885, + "logits/rejected": 4.114058017730713, + "logps/chosen": -155.78762817382812, + "logps/rejected": -156.55174255371094, + "loss": 0.6478, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.62173843383789, + "rewards/margins": 0.11724340915679932, + "rewards/rejected": -10.738982200622559, + "step": 1240 + }, + { + "epoch": 0.8563049853372434, + "grad_norm": 0.3072119355201721, + "learning_rate": 2.37830586431583e-06, + "logits/chosen": 3.906528949737549, + "logits/rejected": 4.21790885925293, + "logps/chosen": -161.46624755859375, + "logps/rejected": -181.0057373046875, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.249275207519531, + "rewards/margins": 1.9946613311767578, + "rewards/rejected": -13.243936538696289, + "step": 1241 + }, + { + "epoch": 0.8569949974124548, + "grad_norm": 6.635034561157227, + "learning_rate": 2.3802223073974704e-06, + "logits/chosen": 4.048306941986084, + "logits/rejected": 4.329343318939209, + "logps/chosen": -167.01576232910156, + "logps/rejected": -177.69091796875, + "loss": 0.5509, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.91839599609375, + "rewards/margins": 1.054558277130127, + "rewards/rejected": -12.972955703735352, + "step": 1242 + }, + { + "epoch": 0.857685009487666, + "grad_norm": 0.31746259331703186, + "learning_rate": 2.382138750479111e-06, + "logits/chosen": 4.396036624908447, + "logits/rejected": 4.396036624908447, + "logps/chosen": -189.979736328125, + "logps/rejected": -189.979736328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.14564037322998, + "rewards/margins": 0.0, + "rewards/rejected": -14.14564037322998, + "step": 1243 + }, + { + "epoch": 0.8583750215628774, + "grad_norm": 0.3440495729446411, + "learning_rate": 2.3840551935607512e-06, + "logits/chosen": 3.9025228023529053, + "logits/rejected": 3.992650270462036, + "logps/chosen": -166.79583740234375, + "logps/rejected": -174.82945251464844, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.009529113769531, + "rewards/margins": 0.806520938873291, + "rewards/rejected": -12.816049575805664, + "step": 1244 + }, + { + "epoch": 0.8590650336380886, + "grad_norm": 0.6991493105888367, + "learning_rate": 2.385971636642392e-06, + "logits/chosen": 4.02758264541626, + "logits/rejected": 4.003774642944336, + "logps/chosen": -170.52459716796875, + "logps/rejected": -174.1786651611328, + "loss": 0.6112, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.283559799194336, + "rewards/margins": 0.4091600179672241, + "rewards/rejected": -12.692720413208008, + "step": 1245 + }, + { + "epoch": 0.8597550457133, + "grad_norm": 16.614477157592773, + "learning_rate": 2.3878880797240324e-06, + "logits/chosen": 3.99884033203125, + "logits/rejected": 4.08255672454834, + "logps/chosen": -159.67587280273438, + "logps/rejected": -163.40570068359375, + "loss": 0.8377, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.34611701965332, + "rewards/margins": 0.2533003091812134, + "rewards/rejected": -11.599418640136719, + "step": 1246 + }, + { + "epoch": 0.8604450577885113, + "grad_norm": 0.32205283641815186, + "learning_rate": 2.389804522805673e-06, + "logits/chosen": 3.720956563949585, + "logits/rejected": 3.8273444175720215, + "logps/chosen": -174.2999267578125, + "logps/rejected": -182.42333984375, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.599815368652344, + "rewards/margins": 0.8230571150779724, + "rewards/rejected": -13.422873497009277, + "step": 1247 + }, + { + "epoch": 0.8611350698637226, + "grad_norm": 0.30736011266708374, + "learning_rate": 2.3917209658873132e-06, + "logits/chosen": 4.204251289367676, + "logits/rejected": 4.29058837890625, + "logps/chosen": -170.16787719726562, + "logps/rejected": -179.08262634277344, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.301862716674805, + "rewards/margins": 0.8633429408073425, + "rewards/rejected": -13.165205001831055, + "step": 1248 + }, + { + "epoch": 0.8618250819389339, + "grad_norm": 17.519624710083008, + "learning_rate": 2.3936374089689536e-06, + "logits/chosen": 4.109344005584717, + "logits/rejected": 4.157627582550049, + "logps/chosen": -169.42047119140625, + "logps/rejected": -174.78793334960938, + "loss": 0.6042, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.511785507202148, + "rewards/margins": 0.4694017171859741, + "rewards/rejected": -12.98118782043457, + "step": 1249 + }, + { + "epoch": 0.8625150940141453, + "grad_norm": 0.405771404504776, + "learning_rate": 2.3955538520505944e-06, + "logits/chosen": 4.041942596435547, + "logits/rejected": 4.041942596435547, + "logps/chosen": -176.39788818359375, + "logps/rejected": -176.39788818359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.83578872680664, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.83578872680664, + "step": 1250 + }, + { + "epoch": 0.8632051060893565, + "grad_norm": 3.1905367374420166, + "learning_rate": 2.397470295132235e-06, + "logits/chosen": 3.8356573581695557, + "logits/rejected": 4.400703430175781, + "logps/chosen": -166.20932006835938, + "logps/rejected": -180.407958984375, + "loss": 0.4616, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.67304801940918, + "rewards/margins": 1.3738248348236084, + "rewards/rejected": -13.046873092651367, + "step": 1251 + }, + { + "epoch": 0.8638951181645679, + "grad_norm": 0.5100119113922119, + "learning_rate": 2.399386738213875e-06, + "logits/chosen": 4.163896560668945, + "logits/rejected": 4.270790100097656, + "logps/chosen": -181.33009338378906, + "logps/rejected": -185.8479766845703, + "loss": 0.6099, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.20784854888916, + "rewards/margins": 0.4474012851715088, + "rewards/rejected": -13.65524959564209, + "step": 1252 + }, + { + "epoch": 0.8645851302397792, + "grad_norm": 0.30333688855171204, + "learning_rate": 2.4013031812955156e-06, + "logits/chosen": 4.0102009773254395, + "logits/rejected": 4.231925964355469, + "logps/chosen": -162.9400177001953, + "logps/rejected": -184.48304748535156, + "loss": 0.4347, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.45889663696289, + "rewards/margins": 2.2486181259155273, + "rewards/rejected": -13.707513809204102, + "step": 1253 + }, + { + "epoch": 0.8652751423149905, + "grad_norm": 0.3590581715106964, + "learning_rate": 2.403219624377156e-06, + "logits/chosen": 4.261439323425293, + "logits/rejected": 4.405585765838623, + "logps/chosen": -175.51596069335938, + "logps/rejected": -185.591064453125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.787793159484863, + "rewards/margins": 1.0333813428878784, + "rewards/rejected": -13.821174621582031, + "step": 1254 + }, + { + "epoch": 0.8659651543902018, + "grad_norm": 7.580277442932129, + "learning_rate": 2.405136067458797e-06, + "logits/chosen": 4.262002468109131, + "logits/rejected": 4.481973648071289, + "logps/chosen": -166.1484375, + "logps/rejected": -188.03158569335938, + "loss": 0.5287, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.962271690368652, + "rewards/margins": 2.155726432800293, + "rewards/rejected": -14.117998123168945, + "step": 1255 + }, + { + "epoch": 0.8666551664654132, + "grad_norm": 0.3352581262588501, + "learning_rate": 2.407052510540437e-06, + "logits/chosen": 4.637488842010498, + "logits/rejected": 4.637488842010498, + "logps/chosen": -192.53836059570312, + "logps/rejected": -192.53836059570312, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.406856536865234, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.40685749053955, + "step": 1256 + }, + { + "epoch": 0.8673451785406244, + "grad_norm": 0.31149357557296753, + "learning_rate": 2.4089689536220776e-06, + "logits/chosen": 4.36845588684082, + "logits/rejected": 4.561179161071777, + "logps/chosen": -169.05618286132812, + "logps/rejected": -180.17005920410156, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.08990478515625, + "rewards/margins": 1.1692683696746826, + "rewards/rejected": -13.259173393249512, + "step": 1257 + }, + { + "epoch": 0.8680351906158358, + "grad_norm": 10.305549621582031, + "learning_rate": 2.410885396703718e-06, + "logits/chosen": 4.1406731605529785, + "logits/rejected": 4.1145524978637695, + "logps/chosen": -158.1810760498047, + "logps/rejected": -162.51637268066406, + "loss": 0.6195, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.097232818603516, + "rewards/margins": 0.4377114772796631, + "rewards/rejected": -11.534944534301758, + "step": 1258 + }, + { + "epoch": 0.8687252026910471, + "grad_norm": 0.3097033202648163, + "learning_rate": 2.412801839785359e-06, + "logits/chosen": 4.158917427062988, + "logits/rejected": 4.1901421546936035, + "logps/chosen": -176.83087158203125, + "logps/rejected": -183.78704833984375, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.98364543914795, + "rewards/margins": 0.7252302169799805, + "rewards/rejected": -13.70887565612793, + "step": 1259 + }, + { + "epoch": 0.8694152147662584, + "grad_norm": 0.2962169349193573, + "learning_rate": 2.414718282866999e-06, + "logits/chosen": 4.175595283508301, + "logits/rejected": 4.39500093460083, + "logps/chosen": -187.64476013183594, + "logps/rejected": -199.89051818847656, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.937430381774902, + "rewards/margins": 1.1753305196762085, + "rewards/rejected": -15.112760543823242, + "step": 1260 + }, + { + "epoch": 0.8701052268414697, + "grad_norm": 0.2917456328868866, + "learning_rate": 2.4166347259486396e-06, + "logits/chosen": 4.705911636352539, + "logits/rejected": 4.705911636352539, + "logps/chosen": -168.59695434570312, + "logps/rejected": -168.59695434570312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.047110557556152, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.047109603881836, + "step": 1261 + }, + { + "epoch": 0.8707952389166811, + "grad_norm": 0.3145409822463989, + "learning_rate": 2.41855116903028e-06, + "logits/chosen": 4.058206081390381, + "logits/rejected": 4.1898651123046875, + "logps/chosen": -153.8986358642578, + "logps/rejected": -180.99215698242188, + "loss": 0.4344, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.547168731689453, + "rewards/margins": 2.665497303009033, + "rewards/rejected": -13.212665557861328, + "step": 1262 + }, + { + "epoch": 0.8714852509918923, + "grad_norm": 0.44205379486083984, + "learning_rate": 2.4204676121119204e-06, + "logits/chosen": 4.2892279624938965, + "logits/rejected": 4.352926254272461, + "logps/chosen": -165.893798828125, + "logps/rejected": -181.37188720703125, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.822673797607422, + "rewards/margins": 1.5725085735321045, + "rewards/rejected": -13.395182609558105, + "step": 1263 + }, + { + "epoch": 0.8721752630671037, + "grad_norm": 0.3206360340118408, + "learning_rate": 2.4223840551935607e-06, + "logits/chosen": 4.181121826171875, + "logits/rejected": 4.229727745056152, + "logps/chosen": -172.68710327148438, + "logps/rejected": -182.22923278808594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.49902057647705, + "rewards/margins": 0.9922611117362976, + "rewards/rejected": -13.491279602050781, + "step": 1264 + }, + { + "epoch": 0.872865275142315, + "grad_norm": 0.3496232032775879, + "learning_rate": 2.4243004982752016e-06, + "logits/chosen": 4.106175899505615, + "logits/rejected": 4.298008441925049, + "logps/chosen": -176.79721069335938, + "logps/rejected": -185.46084594726562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.027828216552734, + "rewards/margins": 0.8792619705200195, + "rewards/rejected": -13.907090187072754, + "step": 1265 + }, + { + "epoch": 0.8735552872175263, + "grad_norm": 0.34649184346199036, + "learning_rate": 2.426216941356842e-06, + "logits/chosen": 4.091391086578369, + "logits/rejected": 4.294764518737793, + "logps/chosen": -175.23318481445312, + "logps/rejected": -188.21768188476562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.579388618469238, + "rewards/margins": 1.2576802968978882, + "rewards/rejected": -13.837068557739258, + "step": 1266 + }, + { + "epoch": 0.8742452992927376, + "grad_norm": 0.4610278904438019, + "learning_rate": 2.4281333844384823e-06, + "logits/chosen": 4.112835884094238, + "logits/rejected": 4.112835884094238, + "logps/chosen": -162.81671142578125, + "logps/rejected": -162.81671142578125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.701565742492676, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -11.701565742492676, + "step": 1267 + }, + { + "epoch": 0.8749353113679489, + "grad_norm": 1.1891776323318481, + "learning_rate": 2.4300498275201227e-06, + "logits/chosen": 4.463886260986328, + "logits/rejected": 4.566933631896973, + "logps/chosen": -176.1179962158203, + "logps/rejected": -179.87709045410156, + "loss": 0.6113, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.935253143310547, + "rewards/margins": 0.40521132946014404, + "rewards/rejected": -13.340463638305664, + "step": 1268 + }, + { + "epoch": 0.8756253234431602, + "grad_norm": 0.3325238823890686, + "learning_rate": 2.431966270601763e-06, + "logits/chosen": 4.4024810791015625, + "logits/rejected": 4.4024810791015625, + "logps/chosen": -186.71896362304688, + "logps/rejected": -186.71896362304688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.915990829467773, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.915990829467773, + "step": 1269 + }, + { + "epoch": 0.8763153355183716, + "grad_norm": 0.38782018423080444, + "learning_rate": 2.4338827136834035e-06, + "logits/chosen": 4.095740795135498, + "logits/rejected": 4.095740795135498, + "logps/chosen": -161.31442260742188, + "logps/rejected": -161.31442260742188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.537585258483887, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.537585258483887, + "step": 1270 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 0.3278772830963135, + "learning_rate": 2.4357991567650443e-06, + "logits/chosen": 4.426251411437988, + "logits/rejected": 4.426251411437988, + "logps/chosen": -183.18212890625, + "logps/rejected": -183.18212890625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.556802749633789, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.556802749633789, + "step": 1271 + }, + { + "epoch": 0.8776953596687942, + "grad_norm": 0.28764039278030396, + "learning_rate": 2.4377155998466847e-06, + "logits/chosen": 4.282812118530273, + "logits/rejected": 4.265713214874268, + "logps/chosen": -172.12936401367188, + "logps/rejected": -182.98512268066406, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.502313613891602, + "rewards/margins": 1.0517935752868652, + "rewards/rejected": -13.554107666015625, + "step": 1272 + }, + { + "epoch": 0.8783853717440055, + "grad_norm": 0.3391878306865692, + "learning_rate": 2.439632042928325e-06, + "logits/chosen": 4.453338146209717, + "logits/rejected": 4.453338146209717, + "logps/chosen": -197.8320770263672, + "logps/rejected": -197.8320770263672, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.951988220214844, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.951988220214844, + "step": 1273 + }, + { + "epoch": 0.8790753838192168, + "grad_norm": 0.30446311831474304, + "learning_rate": 2.4415484860099655e-06, + "logits/chosen": 4.192585468292236, + "logits/rejected": 4.386507511138916, + "logps/chosen": -162.27212524414062, + "logps/rejected": -183.23846435546875, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.487539291381836, + "rewards/margins": 2.076185941696167, + "rewards/rejected": -13.563725471496582, + "step": 1274 + }, + { + "epoch": 0.8797653958944281, + "grad_norm": 22.737018585205078, + "learning_rate": 2.4434649290916063e-06, + "logits/chosen": 4.09974479675293, + "logits/rejected": 4.056926727294922, + "logps/chosen": -166.5511932373047, + "logps/rejected": -164.68136596679688, + "loss": 0.8309, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.9769287109375, + "rewards/margins": -0.20174378156661987, + "rewards/rejected": -11.775185585021973, + "step": 1275 + }, + { + "epoch": 0.8804554079696395, + "grad_norm": 0.2815355956554413, + "learning_rate": 2.4453813721732467e-06, + "logits/chosen": 4.394782066345215, + "logits/rejected": 4.577112674713135, + "logps/chosen": -185.14559936523438, + "logps/rejected": -193.948974609375, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.805453300476074, + "rewards/margins": 0.8778603076934814, + "rewards/rejected": -14.683313369750977, + "step": 1276 + }, + { + "epoch": 0.8811454200448507, + "grad_norm": 0.3279244005680084, + "learning_rate": 2.447297815254887e-06, + "logits/chosen": 4.3194379806518555, + "logits/rejected": 4.3194379806518555, + "logps/chosen": -191.30999755859375, + "logps/rejected": -191.30999755859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.341690063476562, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -14.341690063476562, + "step": 1277 + }, + { + "epoch": 0.8818354321200621, + "grad_norm": 1.9833678007125854, + "learning_rate": 2.4492142583365275e-06, + "logits/chosen": 4.283015251159668, + "logits/rejected": 4.440306663513184, + "logps/chosen": -169.61483764648438, + "logps/rejected": -181.65402221679688, + "loss": 0.53, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.272192001342773, + "rewards/margins": 1.2687090635299683, + "rewards/rejected": -13.540900230407715, + "step": 1278 + }, + { + "epoch": 0.8825254441952735, + "grad_norm": 0.32882386445999146, + "learning_rate": 2.4511307014181683e-06, + "logits/chosen": 4.086451053619385, + "logits/rejected": 4.32439661026001, + "logps/chosen": -153.57644653320312, + "logps/rejected": -174.92630004882812, + "loss": 0.5202, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.571481704711914, + "rewards/margins": 2.012336254119873, + "rewards/rejected": -12.583818435668945, + "step": 1279 + }, + { + "epoch": 0.8832154562704847, + "grad_norm": 0.791092574596405, + "learning_rate": 2.4530471444998087e-06, + "logits/chosen": 4.153511047363281, + "logits/rejected": 4.305323600769043, + "logps/chosen": -161.05140686035156, + "logps/rejected": -172.41546630859375, + "loss": 0.5241, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.348432540893555, + "rewards/margins": 1.0374882221221924, + "rewards/rejected": -12.385919570922852, + "step": 1280 + }, + { + "epoch": 0.883905468345696, + "grad_norm": 0.32705357670783997, + "learning_rate": 2.454963587581449e-06, + "logits/chosen": 4.595350742340088, + "logits/rejected": 4.595350742340088, + "logps/chosen": -182.61276245117188, + "logps/rejected": -182.61276245117188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.614036560058594, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -13.61403751373291, + "step": 1281 + }, + { + "epoch": 0.8845954804209074, + "grad_norm": 0.32851582765579224, + "learning_rate": 2.4568800306630895e-06, + "logits/chosen": 4.037549018859863, + "logits/rejected": 4.279472351074219, + "logps/chosen": -167.08001708984375, + "logps/rejected": -185.51626586914062, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.812910079956055, + "rewards/margins": 1.8474925756454468, + "rewards/rejected": -13.660404205322266, + "step": 1282 + }, + { + "epoch": 0.8852854924961187, + "grad_norm": 0.39110901951789856, + "learning_rate": 2.45879647374473e-06, + "logits/chosen": 4.351410865783691, + "logits/rejected": 4.409853458404541, + "logps/chosen": -170.0176544189453, + "logps/rejected": -177.35882568359375, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.101293563842773, + "rewards/margins": 0.7883018255233765, + "rewards/rejected": -12.889596939086914, + "step": 1283 + }, + { + "epoch": 0.88597550457133, + "grad_norm": 0.3920035660266876, + "learning_rate": 2.4607129168263703e-06, + "logits/chosen": 4.46082878112793, + "logits/rejected": 4.46082878112793, + "logps/chosen": -178.07354736328125, + "logps/rejected": -178.07354736328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.001110076904297, + "rewards/margins": -6.556510925292969e-07, + "rewards/rejected": -13.00110912322998, + "step": 1284 + }, + { + "epoch": 0.8866655166465414, + "grad_norm": 0.44745028018951416, + "learning_rate": 2.462629359908011e-06, + "logits/chosen": 4.576826095581055, + "logits/rejected": 4.627938747406006, + "logps/chosen": -188.35952758789062, + "logps/rejected": -193.29617309570312, + "loss": 0.6095, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.022211074829102, + "rewards/margins": 0.46476781368255615, + "rewards/rejected": -14.486979484558105, + "step": 1285 + }, + { + "epoch": 0.8873555287217526, + "grad_norm": 20.863393783569336, + "learning_rate": 2.4645458029896515e-06, + "logits/chosen": 4.036070346832275, + "logits/rejected": 4.441439151763916, + "logps/chosen": -155.92999267578125, + "logps/rejected": -182.765380859375, + "loss": 0.78, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.881596565246582, + "rewards/margins": 2.6578688621520996, + "rewards/rejected": -13.539464950561523, + "step": 1286 + }, + { + "epoch": 0.888045540796964, + "grad_norm": 3.824181318283081, + "learning_rate": 2.466462246071292e-06, + "logits/chosen": 4.43246603012085, + "logits/rejected": 4.475803852081299, + "logps/chosen": -174.69671630859375, + "logps/rejected": -176.59353637695312, + "loss": 0.6257, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.3942232131958, + "rewards/margins": 0.2245655059814453, + "rewards/rejected": -12.618788719177246, + "step": 1287 + }, + { + "epoch": 0.8887355528721753, + "grad_norm": 0.9138690233230591, + "learning_rate": 2.4683786891529323e-06, + "logits/chosen": 4.611201286315918, + "logits/rejected": 4.673182487487793, + "logps/chosen": -185.45169067382812, + "logps/rejected": -193.77017211914062, + "loss": 0.5256, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.465967178344727, + "rewards/margins": 0.9600439071655273, + "rewards/rejected": -14.42601203918457, + "step": 1288 + }, + { + "epoch": 0.8894255649473866, + "grad_norm": 0.3146876394748688, + "learning_rate": 2.4702951322345727e-06, + "logits/chosen": 4.087219715118408, + "logits/rejected": 4.112563610076904, + "logps/chosen": -166.1857147216797, + "logps/rejected": -174.80906677246094, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.715876579284668, + "rewards/margins": 0.8238128423690796, + "rewards/rejected": -12.539690017700195, + "step": 1289 + }, + { + "epoch": 0.8901155770225979, + "grad_norm": 0.32282206416130066, + "learning_rate": 2.472211575316213e-06, + "logits/chosen": 4.210185527801514, + "logits/rejected": 4.349288463592529, + "logps/chosen": -169.2001953125, + "logps/rejected": -187.85736083984375, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.340619087219238, + "rewards/margins": 1.8973467350006104, + "rewards/rejected": -14.23796558380127, + "step": 1290 + }, + { + "epoch": 0.8908055890978092, + "grad_norm": 0.4306529760360718, + "learning_rate": 2.474128018397854e-06, + "logits/chosen": 4.330923080444336, + "logits/rejected": 4.330923080444336, + "logps/chosen": -161.19668579101562, + "logps/rejected": -161.19668579101562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.414772033691406, + "rewards/margins": 0.0, + "rewards/rejected": -11.414772033691406, + "step": 1291 + }, + { + "epoch": 0.8914956011730205, + "grad_norm": 0.4085914194583893, + "learning_rate": 2.4760444614794943e-06, + "logits/chosen": 3.9748010635375977, + "logits/rejected": 3.9748010635375977, + "logps/chosen": -159.90792846679688, + "logps/rejected": -159.90792846679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.231266021728516, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.231266021728516, + "step": 1292 + }, + { + "epoch": 0.8921856132482319, + "grad_norm": 0.3081630766391754, + "learning_rate": 2.4779609045611346e-06, + "logits/chosen": 4.637977600097656, + "logits/rejected": 4.637977600097656, + "logps/chosen": -194.55764770507812, + "logps/rejected": -194.55764770507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.352925300598145, + "rewards/margins": 0.0, + "rewards/rejected": -14.352925300598145, + "step": 1293 + }, + { + "epoch": 0.8928756253234431, + "grad_norm": 0.35322102904319763, + "learning_rate": 2.479877347642775e-06, + "logits/chosen": 4.38900089263916, + "logits/rejected": 4.38900089263916, + "logps/chosen": -187.07867431640625, + "logps/rejected": -187.07867431640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.924663543701172, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.924662590026855, + "step": 1294 + }, + { + "epoch": 0.8935656373986545, + "grad_norm": 0.4079078435897827, + "learning_rate": 2.481793790724416e-06, + "logits/chosen": 4.176311016082764, + "logits/rejected": 4.278666019439697, + "logps/chosen": -152.17320251464844, + "logps/rejected": -159.04196166992188, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.453394889831543, + "rewards/margins": 0.6338387727737427, + "rewards/rejected": -11.087233543395996, + "step": 1295 + }, + { + "epoch": 0.8942556494738658, + "grad_norm": 0.3226391673088074, + "learning_rate": 2.4837102338060562e-06, + "logits/chosen": 4.470174789428711, + "logits/rejected": 4.509627819061279, + "logps/chosen": -169.53512573242188, + "logps/rejected": -177.8068389892578, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.159005165100098, + "rewards/margins": 0.8243013620376587, + "rewards/rejected": -12.983306884765625, + "step": 1296 + }, + { + "epoch": 0.8949456615490771, + "grad_norm": 0.344167023897171, + "learning_rate": 2.4856266768876966e-06, + "logits/chosen": 4.312934398651123, + "logits/rejected": 4.312934398651123, + "logps/chosen": -180.26290893554688, + "logps/rejected": -180.26290893554688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.185545921325684, + "rewards/margins": 0.0, + "rewards/rejected": -13.185545921325684, + "step": 1297 + }, + { + "epoch": 0.8956356736242884, + "grad_norm": 0.33718088269233704, + "learning_rate": 2.487543119969337e-06, + "logits/chosen": 4.521151542663574, + "logits/rejected": 4.547398567199707, + "logps/chosen": -172.15774536132812, + "logps/rejected": -178.84408569335938, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.481132507324219, + "rewards/margins": 0.6907361745834351, + "rewards/rejected": -13.171868324279785, + "step": 1298 + }, + { + "epoch": 0.8963256856994998, + "grad_norm": 1.9642343521118164, + "learning_rate": 2.489459563050978e-06, + "logits/chosen": 4.42565393447876, + "logits/rejected": 4.409845352172852, + "logps/chosen": -170.1402587890625, + "logps/rejected": -180.6546630859375, + "loss": 0.5314, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.254899978637695, + "rewards/margins": 1.1278952360153198, + "rewards/rejected": -13.382795333862305, + "step": 1299 + }, + { + "epoch": 0.897015697774711, + "grad_norm": 0.5300499796867371, + "learning_rate": 2.4913760061326182e-06, + "logits/chosen": 4.235281944274902, + "logits/rejected": 4.279865264892578, + "logps/chosen": -170.25668334960938, + "logps/rejected": -174.8201904296875, + "loss": 0.6093, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.254331588745117, + "rewards/margins": 0.47285354137420654, + "rewards/rejected": -12.727185249328613, + "step": 1300 + }, + { + "epoch": 0.8977057098499224, + "grad_norm": 0.3218652009963989, + "learning_rate": 2.4932924492142586e-06, + "logits/chosen": 4.12419319152832, + "logits/rejected": 4.299347877502441, + "logps/chosen": -175.61854553222656, + "logps/rejected": -183.7637939453125, + "loss": 0.6067, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.835681915283203, + "rewards/margins": 0.8144958019256592, + "rewards/rejected": -13.650178909301758, + "step": 1301 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 0.49051299691200256, + "learning_rate": 2.495208892295899e-06, + "logits/chosen": 4.1168317794799805, + "logits/rejected": 4.2547407150268555, + "logps/chosen": -160.99423217773438, + "logps/rejected": -178.554443359375, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.368629455566406, + "rewards/margins": 1.7878217697143555, + "rewards/rejected": -13.156450271606445, + "step": 1302 + }, + { + "epoch": 0.899085734000345, + "grad_norm": 0.36481615900993347, + "learning_rate": 2.4971253353775394e-06, + "logits/chosen": 4.14975643157959, + "logits/rejected": 4.140446186065674, + "logps/chosen": -172.49392700195312, + "logps/rejected": -177.9036865234375, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.466856002807617, + "rewards/margins": 0.6058129072189331, + "rewards/rejected": -13.07266902923584, + "step": 1303 + }, + { + "epoch": 0.8997757460755563, + "grad_norm": 0.3643089532852173, + "learning_rate": 2.49904177845918e-06, + "logits/chosen": 4.600375175476074, + "logits/rejected": 4.600375175476074, + "logps/chosen": -177.58547973632812, + "logps/rejected": -177.58546447753906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.98046875, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.980467796325684, + "step": 1304 + }, + { + "epoch": 0.9004657581507677, + "grad_norm": 0.37500429153442383, + "learning_rate": 2.5009582215408206e-06, + "logits/chosen": 4.208690166473389, + "logits/rejected": 4.419601917266846, + "logps/chosen": -159.90145874023438, + "logps/rejected": -167.52078247070312, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.276259422302246, + "rewards/margins": 0.7755271792411804, + "rewards/rejected": -12.051786422729492, + "step": 1305 + }, + { + "epoch": 0.9011557702259789, + "grad_norm": 0.325812429189682, + "learning_rate": 2.502874664622461e-06, + "logits/chosen": 4.219998359680176, + "logits/rejected": 4.219998359680176, + "logps/chosen": -179.85140991210938, + "logps/rejected": -179.85140991210938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.232030868530273, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.232030868530273, + "step": 1306 + }, + { + "epoch": 0.9018457823011903, + "grad_norm": 0.3708207309246063, + "learning_rate": 2.5047911077041014e-06, + "logits/chosen": 4.293846130371094, + "logits/rejected": 4.293846130371094, + "logps/chosen": -191.65760803222656, + "logps/rejected": -191.65760803222656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.378087043762207, + "rewards/margins": 0.0, + "rewards/rejected": -14.378087043762207, + "step": 1307 + }, + { + "epoch": 0.9025357943764016, + "grad_norm": 0.5783950090408325, + "learning_rate": 2.5067075507857418e-06, + "logits/chosen": 4.243620872497559, + "logits/rejected": 4.3118696212768555, + "logps/chosen": -178.78106689453125, + "logps/rejected": -184.72024536132812, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.148120880126953, + "rewards/margins": 0.6016908288002014, + "rewards/rejected": -13.749811172485352, + "step": 1308 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.5067310333251953, + "learning_rate": 2.508623993867382e-06, + "logits/chosen": 4.02876091003418, + "logits/rejected": 4.340898513793945, + "logps/chosen": -145.56333923339844, + "logps/rejected": -184.0858917236328, + "loss": 0.3499, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.788816452026367, + "rewards/margins": 3.806124687194824, + "rewards/rejected": -13.594942092895508, + "step": 1309 + }, + { + "epoch": 0.9039158185268242, + "grad_norm": 16.032047271728516, + "learning_rate": 2.5105404369490226e-06, + "logits/chosen": 4.1867218017578125, + "logits/rejected": 4.083689212799072, + "logps/chosen": -164.10922241210938, + "logps/rejected": -168.99188232421875, + "loss": 0.9429, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.717290878295898, + "rewards/margins": 0.5248500108718872, + "rewards/rejected": -12.242140769958496, + "step": 1310 + }, + { + "epoch": 0.9046058306020356, + "grad_norm": 0.3393704295158386, + "learning_rate": 2.512456880030663e-06, + "logits/chosen": 4.343820095062256, + "logits/rejected": 4.343820095062256, + "logps/chosen": -181.27474975585938, + "logps/rejected": -181.27474975585938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.269646644592285, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.269646644592285, + "step": 1311 + }, + { + "epoch": 0.9052958426772468, + "grad_norm": 0.3479856550693512, + "learning_rate": 2.5143733231123034e-06, + "logits/chosen": 4.2473320960998535, + "logits/rejected": 4.2473320960998535, + "logps/chosen": -182.83157348632812, + "logps/rejected": -182.83157348632812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.497060775756836, + "rewards/margins": 0.0, + "rewards/rejected": -13.497060775756836, + "step": 1312 + }, + { + "epoch": 0.9059858547524582, + "grad_norm": 0.2990933358669281, + "learning_rate": 2.5162897661939446e-06, + "logits/chosen": 4.286516189575195, + "logits/rejected": 4.286516189575195, + "logps/chosen": -160.99603271484375, + "logps/rejected": -160.99603271484375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.183319091796875, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.183319091796875, + "step": 1313 + }, + { + "epoch": 0.9066758668276694, + "grad_norm": 0.3495821952819824, + "learning_rate": 2.518206209275585e-06, + "logits/chosen": 4.876380920410156, + "logits/rejected": 4.876380920410156, + "logps/chosen": -188.31167602539062, + "logps/rejected": -188.3116912841797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.209760665893555, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.209760665893555, + "step": 1314 + }, + { + "epoch": 0.9073658789028808, + "grad_norm": 0.3641141951084137, + "learning_rate": 2.5201226523572254e-06, + "logits/chosen": 4.575431823730469, + "logits/rejected": 4.575431823730469, + "logps/chosen": -179.58094787597656, + "logps/rejected": -179.58094787597656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.24955940246582, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.24955940246582, + "step": 1315 + }, + { + "epoch": 0.9080558909780921, + "grad_norm": 0.3185564875602722, + "learning_rate": 2.5220390954388658e-06, + "logits/chosen": 4.09471321105957, + "logits/rejected": 4.3015289306640625, + "logps/chosen": -146.5609130859375, + "logps/rejected": -158.44224548339844, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.942566871643066, + "rewards/margins": 1.1741554737091064, + "rewards/rejected": -11.116722106933594, + "step": 1316 + }, + { + "epoch": 0.9087459030533034, + "grad_norm": 0.279235303401947, + "learning_rate": 2.523955538520506e-06, + "logits/chosen": 4.110337734222412, + "logits/rejected": 4.4199419021606445, + "logps/chosen": -134.70172119140625, + "logps/rejected": -167.18994140625, + "loss": 0.3477, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.814702033996582, + "rewards/margins": 3.263185501098633, + "rewards/rejected": -12.077887535095215, + "step": 1317 + }, + { + "epoch": 0.9094359151285147, + "grad_norm": 0.3483198583126068, + "learning_rate": 2.5258719816021465e-06, + "logits/chosen": 4.612261772155762, + "logits/rejected": 4.573214530944824, + "logps/chosen": -173.7276611328125, + "logps/rejected": -180.69244384765625, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.441920280456543, + "rewards/margins": 0.6321921348571777, + "rewards/rejected": -13.074111938476562, + "step": 1318 + }, + { + "epoch": 0.9101259272037261, + "grad_norm": 0.32430222630500793, + "learning_rate": 2.527788424683787e-06, + "logits/chosen": 4.140262126922607, + "logits/rejected": 4.208015441894531, + "logps/chosen": -153.4322052001953, + "logps/rejected": -164.8095703125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.505716323852539, + "rewards/margins": 1.145318865776062, + "rewards/rejected": -11.65103530883789, + "step": 1319 + }, + { + "epoch": 0.9108159392789373, + "grad_norm": 0.3330170810222626, + "learning_rate": 2.5297048677654273e-06, + "logits/chosen": 4.630241870880127, + "logits/rejected": 4.770220756530762, + "logps/chosen": -177.4722900390625, + "logps/rejected": -189.434326171875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.143168449401855, + "rewards/margins": 1.2024940252304077, + "rewards/rejected": -14.345662117004395, + "step": 1320 + }, + { + "epoch": 0.9115059513541487, + "grad_norm": 0.30596524477005005, + "learning_rate": 2.531621310847068e-06, + "logits/chosen": 4.244617462158203, + "logits/rejected": 4.346353530883789, + "logps/chosen": -157.97866821289062, + "logps/rejected": -174.23583984375, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.129379272460938, + "rewards/margins": 1.6476550102233887, + "rewards/rejected": -12.777032852172852, + "step": 1321 + }, + { + "epoch": 0.91219596342936, + "grad_norm": 0.3693171441555023, + "learning_rate": 2.5335377539287085e-06, + "logits/chosen": 4.376736164093018, + "logits/rejected": 4.582945346832275, + "logps/chosen": -173.74453735351562, + "logps/rejected": -181.20355224609375, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.498785018920898, + "rewards/margins": 0.7066576480865479, + "rewards/rejected": -13.205442428588867, + "step": 1322 + }, + { + "epoch": 0.9128859755045713, + "grad_norm": 0.5576413869857788, + "learning_rate": 2.535454197010349e-06, + "logits/chosen": 4.2389349937438965, + "logits/rejected": 4.37794303894043, + "logps/chosen": -160.83450317382812, + "logps/rejected": -179.03573608398438, + "loss": 0.5219, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.411120414733887, + "rewards/margins": 1.8275938034057617, + "rewards/rejected": -13.238714218139648, + "step": 1323 + }, + { + "epoch": 0.9135759875797826, + "grad_norm": 0.38632968068122864, + "learning_rate": 2.5373706400919893e-06, + "logits/chosen": 4.2081618309021, + "logits/rejected": 4.477814674377441, + "logps/chosen": -148.7834930419922, + "logps/rejected": -169.4439697265625, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.116866111755371, + "rewards/margins": 1.987439513206482, + "rewards/rejected": -12.104305267333984, + "step": 1324 + }, + { + "epoch": 0.914265999654994, + "grad_norm": 0.3325524628162384, + "learning_rate": 2.53928708317363e-06, + "logits/chosen": 4.286393165588379, + "logits/rejected": 4.399292469024658, + "logps/chosen": -172.34603881835938, + "logps/rejected": -182.3975830078125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.525724411010742, + "rewards/margins": 0.9984577894210815, + "rewards/rejected": -13.524181365966797, + "step": 1325 + }, + { + "epoch": 0.9149560117302052, + "grad_norm": 0.49030861258506775, + "learning_rate": 2.5412035262552705e-06, + "logits/chosen": 4.190550804138184, + "logits/rejected": 4.370148658752441, + "logps/chosen": -152.76376342773438, + "logps/rejected": -164.82247924804688, + "loss": 0.5225, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.612889289855957, + "rewards/margins": 1.2295527458190918, + "rewards/rejected": -11.84244155883789, + "step": 1326 + }, + { + "epoch": 0.9156460238054166, + "grad_norm": 0.43951964378356934, + "learning_rate": 2.543119969336911e-06, + "logits/chosen": 4.432568550109863, + "logits/rejected": 4.597115516662598, + "logps/chosen": -182.16636657714844, + "logps/rejected": -188.11109924316406, + "loss": 0.6077, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.322182655334473, + "rewards/margins": 0.5836706161499023, + "rewards/rejected": -13.905853271484375, + "step": 1327 + }, + { + "epoch": 0.916336035880628, + "grad_norm": 0.3029120862483978, + "learning_rate": 2.5450364124185513e-06, + "logits/chosen": 4.370926856994629, + "logits/rejected": 4.370926856994629, + "logps/chosen": -186.80307006835938, + "logps/rejected": -186.80307006835938, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.010496139526367, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.010498046875, + "step": 1328 + }, + { + "epoch": 0.9170260479558392, + "grad_norm": 26.75121307373047, + "learning_rate": 2.546952855500192e-06, + "logits/chosen": 4.601480484008789, + "logits/rejected": 4.5167741775512695, + "logps/chosen": -179.74267578125, + "logps/rejected": -171.51710510253906, + "loss": 1.4151, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.361315727233887, + "rewards/margins": -0.8196285963058472, + "rewards/rejected": -12.541687965393066, + "step": 1329 + }, + { + "epoch": 0.9177160600310506, + "grad_norm": 0.4185536503791809, + "learning_rate": 2.5488692985818325e-06, + "logits/chosen": 4.335102558135986, + "logits/rejected": 4.363118648529053, + "logps/chosen": -174.4199676513672, + "logps/rejected": -179.36134338378906, + "loss": 0.6087, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.788113594055176, + "rewards/margins": 0.5023375749588013, + "rewards/rejected": -13.290452003479004, + "step": 1330 + }, + { + "epoch": 0.9184060721062619, + "grad_norm": 0.9359927773475647, + "learning_rate": 2.550785741663473e-06, + "logits/chosen": 4.105798721313477, + "logits/rejected": 4.122730255126953, + "logps/chosen": -168.9534912109375, + "logps/rejected": -172.46722412109375, + "loss": 0.6128, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.042851448059082, + "rewards/margins": 0.3696131110191345, + "rewards/rejected": -12.412464141845703, + "step": 1331 + }, + { + "epoch": 0.9190960841814731, + "grad_norm": 0.4252376854419708, + "learning_rate": 2.5527021847451133e-06, + "logits/chosen": 4.201601982116699, + "logits/rejected": 4.201601982116699, + "logps/chosen": -184.97064208984375, + "logps/rejected": -184.97064208984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.653436660766602, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.653436660766602, + "step": 1332 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 7.078742980957031, + "learning_rate": 2.5546186278267537e-06, + "logits/chosen": 4.3808674812316895, + "logits/rejected": 4.414941787719727, + "logps/chosen": -188.36520385742188, + "logps/rejected": -187.97518920898438, + "loss": 0.7305, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.059015274047852, + "rewards/margins": -0.06600666046142578, + "rewards/rejected": -13.993008613586426, + "step": 1333 + }, + { + "epoch": 0.9204761083318959, + "grad_norm": 23.528600692749023, + "learning_rate": 2.556535070908394e-06, + "logits/chosen": 4.099719524383545, + "logits/rejected": 4.036689281463623, + "logps/chosen": -162.94363403320312, + "logps/rejected": -161.69403076171875, + "loss": 1.3385, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.526829719543457, + "rewards/margins": -0.14329558610916138, + "rewards/rejected": -11.383533477783203, + "step": 1334 + }, + { + "epoch": 0.9211661204071071, + "grad_norm": 0.4233867824077606, + "learning_rate": 2.5584515139900345e-06, + "logits/chosen": 4.110930442810059, + "logits/rejected": 4.212163925170898, + "logps/chosen": -160.1214141845703, + "logps/rejected": -171.49310302734375, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.217495918273926, + "rewards/margins": 1.1299400329589844, + "rewards/rejected": -12.34743595123291, + "step": 1335 + }, + { + "epoch": 0.9218561324823185, + "grad_norm": 0.5343468189239502, + "learning_rate": 2.560367957071675e-06, + "logits/chosen": 4.449089527130127, + "logits/rejected": 4.5850510597229, + "logps/chosen": -169.38470458984375, + "logps/rejected": -188.8323974609375, + "loss": 0.5218, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.098388671875, + "rewards/margins": 1.9554328918457031, + "rewards/rejected": -14.053821563720703, + "step": 1336 + }, + { + "epoch": 0.9225461445575297, + "grad_norm": 0.26012784242630005, + "learning_rate": 2.562284400153316e-06, + "logits/chosen": 4.233506679534912, + "logits/rejected": 4.43470573425293, + "logps/chosen": -142.042724609375, + "logps/rejected": -179.7047882080078, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.480613708496094, + "rewards/margins": 3.7599024772644043, + "rewards/rejected": -13.240516662597656, + "step": 1337 + }, + { + "epoch": 0.9232361566327411, + "grad_norm": 0.37022095918655396, + "learning_rate": 2.5642008432349565e-06, + "logits/chosen": 4.414337158203125, + "logits/rejected": 4.414337158203125, + "logps/chosen": -171.20053100585938, + "logps/rejected": -171.20053100585938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.530009269714355, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.530010223388672, + "step": 1338 + }, + { + "epoch": 0.9239261687079524, + "grad_norm": 0.4036596417427063, + "learning_rate": 2.566117286316597e-06, + "logits/chosen": 4.518241882324219, + "logits/rejected": 4.549322605133057, + "logps/chosen": -158.08628845214844, + "logps/rejected": -167.92478942871094, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.224431037902832, + "rewards/margins": 0.9237262606620789, + "rewards/rejected": -12.148157119750977, + "step": 1339 + }, + { + "epoch": 0.9246161807831637, + "grad_norm": 0.4638528823852539, + "learning_rate": 2.5680337293982373e-06, + "logits/chosen": 4.102719783782959, + "logits/rejected": 4.2647929191589355, + "logps/chosen": -145.443359375, + "logps/rejected": -163.99981689453125, + "loss": 0.5224, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.771637916564941, + "rewards/margins": 1.9060120582580566, + "rewards/rejected": -11.677650451660156, + "step": 1340 + }, + { + "epoch": 0.925306192858375, + "grad_norm": 0.30311691761016846, + "learning_rate": 2.5699501724798777e-06, + "logits/chosen": 4.365791320800781, + "logits/rejected": 4.365791320800781, + "logps/chosen": -185.27261352539062, + "logps/rejected": -185.27261352539062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.682527542114258, + "rewards/margins": 0.0, + "rewards/rejected": -13.682527542114258, + "step": 1341 + }, + { + "epoch": 0.9259962049335864, + "grad_norm": 0.4030858278274536, + "learning_rate": 2.571866615561518e-06, + "logits/chosen": 4.335644245147705, + "logits/rejected": 4.378016948699951, + "logps/chosen": -161.10122680664062, + "logps/rejected": -176.48121643066406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.401138305664062, + "rewards/margins": 1.4695978164672852, + "rewards/rejected": -12.870737075805664, + "step": 1342 + }, + { + "epoch": 0.9266862170087976, + "grad_norm": 5.317934036254883, + "learning_rate": 2.5737830586431585e-06, + "logits/chosen": 3.790313720703125, + "logits/rejected": 3.8180103302001953, + "logps/chosen": -146.21923828125, + "logps/rejected": -162.03854370117188, + "loss": 0.5773, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.654552459716797, + "rewards/margins": 1.5605363845825195, + "rewards/rejected": -11.215089797973633, + "step": 1343 + }, + { + "epoch": 0.927376229084009, + "grad_norm": 0.3203530013561249, + "learning_rate": 2.575699501724799e-06, + "logits/chosen": 4.134928226470947, + "logits/rejected": 4.224915504455566, + "logps/chosen": -173.5297088623047, + "logps/rejected": -180.4145965576172, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.530200958251953, + "rewards/margins": 0.6845965385437012, + "rewards/rejected": -13.214797973632812, + "step": 1344 + }, + { + "epoch": 0.9280662411592203, + "grad_norm": 0.3291870057582855, + "learning_rate": 2.5776159448064397e-06, + "logits/chosen": 4.632022380828857, + "logits/rejected": 4.723866939544678, + "logps/chosen": -168.2821807861328, + "logps/rejected": -181.65191650390625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.09688949584961, + "rewards/margins": 1.3197412490844727, + "rewards/rejected": -13.416630744934082, + "step": 1345 + }, + { + "epoch": 0.9287562532344316, + "grad_norm": 0.28770357370376587, + "learning_rate": 2.57953238788808e-06, + "logits/chosen": 4.461379051208496, + "logits/rejected": 4.540327072143555, + "logps/chosen": -164.84197998046875, + "logps/rejected": -174.092041015625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.748796463012695, + "rewards/margins": 0.8961683511734009, + "rewards/rejected": -12.644964218139648, + "step": 1346 + }, + { + "epoch": 0.9294462653096429, + "grad_norm": 0.3862048387527466, + "learning_rate": 2.5814488309697204e-06, + "logits/chosen": 4.704195022583008, + "logits/rejected": 4.704195022583008, + "logps/chosen": -188.37890625, + "logps/rejected": -188.37890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.889847755432129, + "rewards/margins": 0.0, + "rewards/rejected": -13.889847755432129, + "step": 1347 + }, + { + "epoch": 0.9301362773848543, + "grad_norm": 0.3709765076637268, + "learning_rate": 2.583365274051361e-06, + "logits/chosen": 4.042741775512695, + "logits/rejected": 4.1359453201293945, + "logps/chosen": -165.56784057617188, + "logps/rejected": -176.9481964111328, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.754129409790039, + "rewards/margins": 1.166011929512024, + "rewards/rejected": -12.920141220092773, + "step": 1348 + }, + { + "epoch": 0.9308262894600655, + "grad_norm": 2.0023984909057617, + "learning_rate": 2.5852817171330012e-06, + "logits/chosen": 4.178206443786621, + "logits/rejected": 4.356955051422119, + "logps/chosen": -163.88284301757812, + "logps/rejected": -187.0378875732422, + "loss": 0.445, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.595271110534668, + "rewards/margins": 2.3454039096832275, + "rewards/rejected": -13.940674781799316, + "step": 1349 + }, + { + "epoch": 0.9315163015352769, + "grad_norm": 0.33836302161216736, + "learning_rate": 2.5871981602146416e-06, + "logits/chosen": 4.53660249710083, + "logits/rejected": 4.600231170654297, + "logps/chosen": -171.54931640625, + "logps/rejected": -182.55026245117188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.255416870117188, + "rewards/margins": 1.1109168529510498, + "rewards/rejected": -13.366333961486816, + "step": 1350 + }, + { + "epoch": 0.9322063136104882, + "grad_norm": 0.32761350274086, + "learning_rate": 2.589114603296282e-06, + "logits/chosen": 4.246192932128906, + "logits/rejected": 4.246192932128906, + "logps/chosen": -180.45799255371094, + "logps/rejected": -180.45797729492188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.269340515136719, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.269340515136719, + "step": 1351 + }, + { + "epoch": 0.9328963256856995, + "grad_norm": 0.32951945066452026, + "learning_rate": 2.5910310463779224e-06, + "logits/chosen": 4.2236552238464355, + "logits/rejected": 4.2236552238464355, + "logps/chosen": -170.9203643798828, + "logps/rejected": -170.9203643798828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.13735580444336, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.13735580444336, + "step": 1352 + }, + { + "epoch": 0.9335863377609108, + "grad_norm": 0.2882882356643677, + "learning_rate": 2.5929474894595636e-06, + "logits/chosen": 4.169986724853516, + "logits/rejected": 4.182120323181152, + "logps/chosen": -164.60739135742188, + "logps/rejected": -175.04342651367188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.548979759216309, + "rewards/margins": 1.0986641645431519, + "rewards/rejected": -12.64764404296875, + "step": 1353 + }, + { + "epoch": 0.9342763498361222, + "grad_norm": 0.3754199743270874, + "learning_rate": 2.594863932541204e-06, + "logits/chosen": 4.334379196166992, + "logits/rejected": 4.334379196166992, + "logps/chosen": -174.59500122070312, + "logps/rejected": -174.5950164794922, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.610963821411133, + "rewards/margins": 0.0, + "rewards/rejected": -12.610963821411133, + "step": 1354 + }, + { + "epoch": 0.9349663619113334, + "grad_norm": 0.35345223546028137, + "learning_rate": 2.5967803756228444e-06, + "logits/chosen": 4.36100435256958, + "logits/rejected": 4.438126087188721, + "logps/chosen": -176.06146240234375, + "logps/rejected": -185.4429931640625, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.712800025939941, + "rewards/margins": 0.9171583652496338, + "rewards/rejected": -13.629958152770996, + "step": 1355 + }, + { + "epoch": 0.9356563739865448, + "grad_norm": 0.3596075475215912, + "learning_rate": 2.598696818704485e-06, + "logits/chosen": 4.317559719085693, + "logits/rejected": 4.317559719085693, + "logps/chosen": -168.5319061279297, + "logps/rejected": -168.5319061279297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.335596084594727, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.335596084594727, + "step": 1356 + }, + { + "epoch": 0.9363463860617561, + "grad_norm": 0.38870060443878174, + "learning_rate": 2.600613261786125e-06, + "logits/chosen": 4.299888610839844, + "logits/rejected": 4.299888610839844, + "logps/chosen": -158.1328582763672, + "logps/rejected": -158.1328582763672, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.926942825317383, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -10.926942825317383, + "step": 1357 + }, + { + "epoch": 0.9370363981369674, + "grad_norm": 0.3796284794807434, + "learning_rate": 2.6025297048677656e-06, + "logits/chosen": 4.241944313049316, + "logits/rejected": 4.334790229797363, + "logps/chosen": -180.93348693847656, + "logps/rejected": -191.58761596679688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.326482772827148, + "rewards/margins": 1.0831719636917114, + "rewards/rejected": -14.409655570983887, + "step": 1358 + }, + { + "epoch": 0.9377264102121787, + "grad_norm": 0.39905616641044617, + "learning_rate": 2.604446147949406e-06, + "logits/chosen": 4.158782005310059, + "logits/rejected": 4.236865997314453, + "logps/chosen": -160.78659057617188, + "logps/rejected": -170.45773315429688, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.465200424194336, + "rewards/margins": 0.9758307933807373, + "rewards/rejected": -12.441030502319336, + "step": 1359 + }, + { + "epoch": 0.9384164222873901, + "grad_norm": 0.42528432607650757, + "learning_rate": 2.6063625910310464e-06, + "logits/chosen": 4.05797815322876, + "logits/rejected": 4.042489051818848, + "logps/chosen": -175.8983154296875, + "logps/rejected": -181.48377990722656, + "loss": 0.6081, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.776814460754395, + "rewards/margins": 0.5408837199211121, + "rewards/rejected": -13.317697525024414, + "step": 1360 + }, + { + "epoch": 0.9391064343626013, + "grad_norm": 0.3389101028442383, + "learning_rate": 2.6082790341126868e-06, + "logits/chosen": 4.235008716583252, + "logits/rejected": 4.388050079345703, + "logps/chosen": -162.39572143554688, + "logps/rejected": -176.32147216796875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.362853050231934, + "rewards/margins": 1.4175931215286255, + "rewards/rejected": -12.78044605255127, + "step": 1361 + }, + { + "epoch": 0.9397964464378127, + "grad_norm": 6.910523414611816, + "learning_rate": 2.6101954771943276e-06, + "logits/chosen": 4.068483352661133, + "logits/rejected": 4.148787498474121, + "logps/chosen": -154.50674438476562, + "logps/rejected": -154.8646240234375, + "loss": 0.6981, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.688454627990723, + "rewards/margins": -0.009646058082580566, + "rewards/rejected": -10.678808212280273, + "step": 1362 + }, + { + "epoch": 0.9404864585130239, + "grad_norm": 4.221713066101074, + "learning_rate": 2.612111920275968e-06, + "logits/chosen": 3.9180212020874023, + "logits/rejected": 4.08856201171875, + "logps/chosen": -156.29763793945312, + "logps/rejected": -183.73211669921875, + "loss": 0.3739, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.084891319274902, + "rewards/margins": 2.6754112243652344, + "rewards/rejected": -13.760302543640137, + "step": 1363 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.7415947914123535, + "learning_rate": 2.6140283633576084e-06, + "logits/chosen": 4.181864261627197, + "logits/rejected": 4.20279598236084, + "logps/chosen": -146.4817352294922, + "logps/rejected": -148.9931182861328, + "loss": 0.6178, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.826022148132324, + "rewards/margins": 0.29503339529037476, + "rewards/rejected": -10.121055603027344, + "step": 1364 + }, + { + "epoch": 0.9418664826634466, + "grad_norm": 0.35828423500061035, + "learning_rate": 2.615944806439249e-06, + "logits/chosen": 3.925734758377075, + "logits/rejected": 3.925734758377075, + "logps/chosen": -168.74957275390625, + "logps/rejected": -168.7495880126953, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.012981414794922, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.012981414794922, + "step": 1365 + }, + { + "epoch": 0.9425564947386579, + "grad_norm": 0.32448193430900574, + "learning_rate": 2.6178612495208896e-06, + "logits/chosen": 4.161332130432129, + "logits/rejected": 4.207962512969971, + "logps/chosen": -159.859619140625, + "logps/rejected": -168.15965270996094, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.927927017211914, + "rewards/margins": 0.8808541297912598, + "rewards/rejected": -11.808780670166016, + "step": 1366 + }, + { + "epoch": 0.9432465068138692, + "grad_norm": 0.26911893486976624, + "learning_rate": 2.61977769260253e-06, + "logits/chosen": 4.424808502197266, + "logits/rejected": 4.53487491607666, + "logps/chosen": -162.51585388183594, + "logps/rejected": -174.5143585205078, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.392364501953125, + "rewards/margins": 1.209010124206543, + "rewards/rejected": -12.601375579833984, + "step": 1367 + }, + { + "epoch": 0.9439365188890806, + "grad_norm": 0.3117417097091675, + "learning_rate": 2.6216941356841704e-06, + "logits/chosen": 4.3167595863342285, + "logits/rejected": 4.404139995574951, + "logps/chosen": -163.76870727539062, + "logps/rejected": -173.84169006347656, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.733907699584961, + "rewards/margins": 0.988986611366272, + "rewards/rejected": -12.722894668579102, + "step": 1368 + }, + { + "epoch": 0.9446265309642918, + "grad_norm": 0.2688639163970947, + "learning_rate": 2.6236105787658108e-06, + "logits/chosen": 4.2221574783325195, + "logits/rejected": 4.401123523712158, + "logps/chosen": -163.56463623046875, + "logps/rejected": -183.68792724609375, + "loss": 0.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.59113883972168, + "rewards/margins": 1.9545615911483765, + "rewards/rejected": -13.545701026916504, + "step": 1369 + }, + { + "epoch": 0.9453165430395032, + "grad_norm": 7.648237228393555, + "learning_rate": 2.6255270218474516e-06, + "logits/chosen": 4.117635726928711, + "logits/rejected": 4.231095790863037, + "logps/chosen": -172.2930908203125, + "logps/rejected": -180.15078735351562, + "loss": 0.6107, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.340194702148438, + "rewards/margins": 0.7396957278251648, + "rewards/rejected": -13.079891204833984, + "step": 1370 + }, + { + "epoch": 0.9460065551147145, + "grad_norm": 0.27770695090293884, + "learning_rate": 2.627443464929092e-06, + "logits/chosen": 4.1555070877075195, + "logits/rejected": 4.243068695068359, + "logps/chosen": -191.89805603027344, + "logps/rejected": -199.72702026367188, + "loss": 0.6067, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.474333763122559, + "rewards/margins": 0.8214836120605469, + "rewards/rejected": -15.295817375183105, + "step": 1371 + }, + { + "epoch": 0.9466965671899258, + "grad_norm": 3.472928047180176, + "learning_rate": 2.6293599080107323e-06, + "logits/chosen": 4.116364002227783, + "logits/rejected": 4.047409534454346, + "logps/chosen": -149.501953125, + "logps/rejected": -164.17156982421875, + "loss": 0.5623, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.209964752197266, + "rewards/margins": 1.4169254302978516, + "rewards/rejected": -11.626890182495117, + "step": 1372 + }, + { + "epoch": 0.9473865792651371, + "grad_norm": 0.3076417148113251, + "learning_rate": 2.6312763510923727e-06, + "logits/chosen": 4.39103889465332, + "logits/rejected": 4.39103889465332, + "logps/chosen": -180.10775756835938, + "logps/rejected": -180.10775756835938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.42281436920166, + "rewards/margins": 0.0, + "rewards/rejected": -13.42281436920166, + "step": 1373 + }, + { + "epoch": 0.9480765913403485, + "grad_norm": 0.30877164006233215, + "learning_rate": 2.633192794174013e-06, + "logits/chosen": 3.715101480484009, + "logits/rejected": 3.9514834880828857, + "logps/chosen": -169.8336639404297, + "logps/rejected": -189.6422576904297, + "loss": 0.5208, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.251006126403809, + "rewards/margins": 1.992753267288208, + "rewards/rejected": -14.243759155273438, + "step": 1374 + }, + { + "epoch": 0.9487666034155597, + "grad_norm": 0.3704124391078949, + "learning_rate": 2.6351092372556535e-06, + "logits/chosen": 3.943415641784668, + "logits/rejected": 4.0651984214782715, + "logps/chosen": -167.04000854492188, + "logps/rejected": -196.06759643554688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.022069931030273, + "rewards/margins": 2.9387590885162354, + "rewards/rejected": -14.960829734802246, + "step": 1375 + }, + { + "epoch": 0.9494566154907711, + "grad_norm": 29.446081161499023, + "learning_rate": 2.637025680337294e-06, + "logits/chosen": 4.5690226554870605, + "logits/rejected": 4.360441207885742, + "logps/chosen": -182.6470184326172, + "logps/rejected": -180.44700622558594, + "loss": 0.8334, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.295470237731934, + "rewards/margins": -0.182794451713562, + "rewards/rejected": -13.112676620483398, + "step": 1376 + }, + { + "epoch": 0.9501466275659824, + "grad_norm": 0.3193773329257965, + "learning_rate": 2.6389421234189343e-06, + "logits/chosen": 4.208681106567383, + "logits/rejected": 4.208681106567383, + "logps/chosen": -178.17584228515625, + "logps/rejected": -178.17584228515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.07133960723877, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.07133960723877, + "step": 1377 + }, + { + "epoch": 0.9508366396411937, + "grad_norm": 0.376655250787735, + "learning_rate": 2.6408585665005755e-06, + "logits/chosen": 4.139534950256348, + "logits/rejected": 4.116791248321533, + "logps/chosen": -172.24285888671875, + "logps/rejected": -179.7261962890625, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.698334693908691, + "rewards/margins": 0.7060679197311401, + "rewards/rejected": -13.404402732849121, + "step": 1378 + }, + { + "epoch": 0.951526651716405, + "grad_norm": 0.3428686857223511, + "learning_rate": 2.642775009582216e-06, + "logits/chosen": 3.8503150939941406, + "logits/rejected": 3.8503150939941406, + "logps/chosen": -183.10128784179688, + "logps/rejected": -183.10128784179688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.569038391113281, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.569038391113281, + "step": 1379 + }, + { + "epoch": 0.9522166637916164, + "grad_norm": 0.3702099323272705, + "learning_rate": 2.6446914526638563e-06, + "logits/chosen": 4.032794952392578, + "logits/rejected": 4.056596755981445, + "logps/chosen": -163.58131408691406, + "logps/rejected": -172.42140197753906, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.644126892089844, + "rewards/margins": 0.932675838470459, + "rewards/rejected": -12.576803207397461, + "step": 1380 + }, + { + "epoch": 0.9529066758668276, + "grad_norm": 14.354796409606934, + "learning_rate": 2.6466078957454967e-06, + "logits/chosen": 3.8558220863342285, + "logits/rejected": 3.9659342765808105, + "logps/chosen": -168.59390258789062, + "logps/rejected": -173.31224060058594, + "loss": 0.8205, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.055999755859375, + "rewards/margins": 0.47174108028411865, + "rewards/rejected": -12.527740478515625, + "step": 1381 + }, + { + "epoch": 0.953596687942039, + "grad_norm": 0.23529882729053497, + "learning_rate": 2.648524338827137e-06, + "logits/chosen": 4.1596503257751465, + "logits/rejected": 4.1596503257751465, + "logps/chosen": -180.33984375, + "logps/rejected": -180.33984375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.087855339050293, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.08785629272461, + "step": 1382 + }, + { + "epoch": 0.9542867000172504, + "grad_norm": 0.3818678855895996, + "learning_rate": 2.6504407819087775e-06, + "logits/chosen": 4.102474212646484, + "logits/rejected": 4.102474212646484, + "logps/chosen": -173.58419799804688, + "logps/rejected": -173.58421325683594, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.481832504272461, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.481833457946777, + "step": 1383 + }, + { + "epoch": 0.9549767120924616, + "grad_norm": 1.270934820175171, + "learning_rate": 2.652357224990418e-06, + "logits/chosen": 4.097095489501953, + "logits/rejected": 4.124865531921387, + "logps/chosen": -167.7222900390625, + "logps/rejected": -179.00924682617188, + "loss": 0.5344, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.703939437866211, + "rewards/margins": 1.2317402362823486, + "rewards/rejected": -12.935680389404297, + "step": 1384 + }, + { + "epoch": 0.955666724167673, + "grad_norm": 0.32517531514167786, + "learning_rate": 2.6542736680720583e-06, + "logits/chosen": 4.180222511291504, + "logits/rejected": 4.265134811401367, + "logps/chosen": -147.08990478515625, + "logps/rejected": -152.8082733154297, + "loss": 0.608, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.918017387390137, + "rewards/margins": 0.5549001693725586, + "rewards/rejected": -10.472917556762695, + "step": 1385 + }, + { + "epoch": 0.9563567362428842, + "grad_norm": 0.35243725776672363, + "learning_rate": 2.656190111153699e-06, + "logits/chosen": 4.03546142578125, + "logits/rejected": 4.03546142578125, + "logps/chosen": -181.34326171875, + "logps/rejected": -181.34326171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.448065757751465, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.448064804077148, + "step": 1386 + }, + { + "epoch": 0.9570467483180956, + "grad_norm": 1.0467514991760254, + "learning_rate": 2.6581065542353395e-06, + "logits/chosen": 4.175746440887451, + "logits/rejected": 4.252584457397461, + "logps/chosen": -171.5898895263672, + "logps/rejected": -187.2298583984375, + "loss": 0.5251, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.454627990722656, + "rewards/margins": 1.6403417587280273, + "rewards/rejected": -14.094970703125, + "step": 1387 + }, + { + "epoch": 0.9577367603933069, + "grad_norm": 1.7342290878295898, + "learning_rate": 2.66002299731698e-06, + "logits/chosen": 3.8162248134613037, + "logits/rejected": 3.9578397274017334, + "logps/chosen": -132.52224731445312, + "logps/rejected": -149.96347045898438, + "loss": 0.4518, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.525131225585938, + "rewards/margins": 1.8018018007278442, + "rewards/rejected": -10.326932907104492, + "step": 1388 + }, + { + "epoch": 0.9584267724685182, + "grad_norm": 0.32212960720062256, + "learning_rate": 2.6619394403986203e-06, + "logits/chosen": 4.142909526824951, + "logits/rejected": 4.142909526824951, + "logps/chosen": -175.75363159179688, + "logps/rejected": -175.75363159179688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.620182991027832, + "rewards/margins": 0.0, + "rewards/rejected": -12.620182991027832, + "step": 1389 + }, + { + "epoch": 0.9591167845437295, + "grad_norm": 12.510128021240234, + "learning_rate": 2.6638558834802607e-06, + "logits/chosen": 4.058135986328125, + "logits/rejected": 4.007263660430908, + "logps/chosen": -171.69239807128906, + "logps/rejected": -182.5584716796875, + "loss": 0.5963, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.382906913757324, + "rewards/margins": 1.2332571744918823, + "rewards/rejected": -13.616164207458496, + "step": 1390 + }, + { + "epoch": 0.9598067966189409, + "grad_norm": 0.30818116664886475, + "learning_rate": 2.665772326561901e-06, + "logits/chosen": 3.884767532348633, + "logits/rejected": 3.8769984245300293, + "logps/chosen": -174.5006103515625, + "logps/rejected": -183.1588134765625, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.630169868469238, + "rewards/margins": 0.8688452243804932, + "rewards/rejected": -13.499014854431152, + "step": 1391 + }, + { + "epoch": 0.9604968086941521, + "grad_norm": 14.284507751464844, + "learning_rate": 2.6676887696435414e-06, + "logits/chosen": 4.18768835067749, + "logits/rejected": 4.148921966552734, + "logps/chosen": -173.1961669921875, + "logps/rejected": -172.3308563232422, + "loss": 0.7914, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.654924392700195, + "rewards/margins": -0.15254509449005127, + "rewards/rejected": -12.502379417419434, + "step": 1392 + }, + { + "epoch": 0.9611868207693635, + "grad_norm": 20.50948143005371, + "learning_rate": 2.669605212725182e-06, + "logits/chosen": 3.8374900817871094, + "logits/rejected": 3.900562286376953, + "logps/chosen": -174.84805297851562, + "logps/rejected": -182.33489990234375, + "loss": 0.8542, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.695547103881836, + "rewards/margins": 0.7867084741592407, + "rewards/rejected": -13.482255935668945, + "step": 1393 + }, + { + "epoch": 0.9618768328445748, + "grad_norm": 0.3256402611732483, + "learning_rate": 2.671521655806823e-06, + "logits/chosen": 4.138729095458984, + "logits/rejected": 4.203276634216309, + "logps/chosen": -168.30181884765625, + "logps/rejected": -181.031005859375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.015054702758789, + "rewards/margins": 1.2868283987045288, + "rewards/rejected": -13.301881790161133, + "step": 1394 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 0.31302234530448914, + "learning_rate": 2.6734380988884635e-06, + "logits/chosen": 4.227721691131592, + "logits/rejected": 4.377755165100098, + "logps/chosen": -183.4887237548828, + "logps/rejected": -193.162353515625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.488374710083008, + "rewards/margins": 0.9813376069068909, + "rewards/rejected": -14.46971321105957, + "step": 1395 + }, + { + "epoch": 0.9632568569949974, + "grad_norm": 1.3268108367919922, + "learning_rate": 2.675354541970104e-06, + "logits/chosen": 3.839357852935791, + "logits/rejected": 3.911572217941284, + "logps/chosen": -138.59808349609375, + "logps/rejected": -162.18161010742188, + "loss": 0.4408, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.030393600463867, + "rewards/margins": 2.3859152793884277, + "rewards/rejected": -11.416309356689453, + "step": 1396 + }, + { + "epoch": 0.9639468690702088, + "grad_norm": 0.3330242931842804, + "learning_rate": 2.6772709850517443e-06, + "logits/chosen": 4.156269073486328, + "logits/rejected": 4.346987724304199, + "logps/chosen": -166.14065551757812, + "logps/rejected": -182.900634765625, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.801021575927734, + "rewards/margins": 1.6319301128387451, + "rewards/rejected": -13.432951927185059, + "step": 1397 + }, + { + "epoch": 0.96463688114542, + "grad_norm": 5.399084091186523, + "learning_rate": 2.6791874281333846e-06, + "logits/chosen": 3.9413034915924072, + "logits/rejected": 3.9311089515686035, + "logps/chosen": -168.6980743408203, + "logps/rejected": -169.9439697265625, + "loss": 0.6507, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.041789054870605, + "rewards/margins": 0.10723006725311279, + "rewards/rejected": -12.149019241333008, + "step": 1398 + }, + { + "epoch": 0.9653268932206314, + "grad_norm": 0.41963130235671997, + "learning_rate": 2.681103871215025e-06, + "logits/chosen": 3.813260078430176, + "logits/rejected": 3.813260078430176, + "logps/chosen": -158.0673828125, + "logps/rejected": -158.0673828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.050410270690918, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -11.050410270690918, + "step": 1399 + }, + { + "epoch": 0.9660169052958427, + "grad_norm": 0.4136756956577301, + "learning_rate": 2.6830203142966654e-06, + "logits/chosen": 4.274281978607178, + "logits/rejected": 4.274281978607178, + "logps/chosen": -173.34799194335938, + "logps/rejected": -173.34799194335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.629207611083984, + "rewards/margins": 0.0, + "rewards/rejected": -12.629207611083984, + "step": 1400 + }, + { + "epoch": 0.966706917371054, + "grad_norm": 18.53649139404297, + "learning_rate": 2.684936757378306e-06, + "logits/chosen": 4.105966567993164, + "logits/rejected": 4.015472888946533, + "logps/chosen": -162.77728271484375, + "logps/rejected": -160.08120727539062, + "loss": 0.9608, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.425500869750977, + "rewards/margins": -0.2631680369377136, + "rewards/rejected": -11.162332534790039, + "step": 1401 + }, + { + "epoch": 0.9673969294462653, + "grad_norm": 14.225007057189941, + "learning_rate": 2.6868532004599466e-06, + "logits/chosen": 3.794595718383789, + "logits/rejected": 3.8591866493225098, + "logps/chosen": -153.05335998535156, + "logps/rejected": -152.2340545654297, + "loss": 0.7249, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.609004020690918, + "rewards/margins": -0.05705082416534424, + "rewards/rejected": -10.55195426940918, + "step": 1402 + }, + { + "epoch": 0.9680869415214767, + "grad_norm": 0.2634364366531372, + "learning_rate": 2.688769643541587e-06, + "logits/chosen": 3.7838854789733887, + "logits/rejected": 4.042921543121338, + "logps/chosen": -157.7755126953125, + "logps/rejected": -186.91079711914062, + "loss": 0.4336, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.063292503356934, + "rewards/margins": 2.8852953910827637, + "rewards/rejected": -13.948587417602539, + "step": 1403 + }, + { + "epoch": 0.9687769535966879, + "grad_norm": 0.39863741397857666, + "learning_rate": 2.6906860866232274e-06, + "logits/chosen": 3.750274658203125, + "logits/rejected": 3.9317626953125, + "logps/chosen": -159.9061737060547, + "logps/rejected": -165.92874145507812, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.09083080291748, + "rewards/margins": 0.614374041557312, + "rewards/rejected": -11.705204963684082, + "step": 1404 + }, + { + "epoch": 0.9694669656718993, + "grad_norm": 0.8365033268928528, + "learning_rate": 2.692602529704868e-06, + "logits/chosen": 3.945167303085327, + "logits/rejected": 4.027563095092773, + "logps/chosen": -161.237060546875, + "logps/rejected": -170.91038513183594, + "loss": 0.5243, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.335338592529297, + "rewards/margins": 1.0709373950958252, + "rewards/rejected": -12.40627670288086, + "step": 1405 + }, + { + "epoch": 0.9701569777471106, + "grad_norm": 0.4314631223678589, + "learning_rate": 2.6945189727865086e-06, + "logits/chosen": 3.95647931098938, + "logits/rejected": 3.961770534515381, + "logps/chosen": -169.62196350097656, + "logps/rejected": -180.35009765625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.1431884765625, + "rewards/margins": 1.0887261629104614, + "rewards/rejected": -13.231914520263672, + "step": 1406 + }, + { + "epoch": 0.9708469898223219, + "grad_norm": 4.777519226074219, + "learning_rate": 2.696435415868149e-06, + "logits/chosen": 3.9291269779205322, + "logits/rejected": 3.9955620765686035, + "logps/chosen": -141.1295166015625, + "logps/rejected": -166.0811309814453, + "loss": 0.4823, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.447698593139648, + "rewards/margins": 2.5033020973205566, + "rewards/rejected": -11.951000213623047, + "step": 1407 + }, + { + "epoch": 0.9715370018975332, + "grad_norm": 0.45622631907463074, + "learning_rate": 2.6983518589497894e-06, + "logits/chosen": 4.320534706115723, + "logits/rejected": 4.320534706115723, + "logps/chosen": -163.91146850585938, + "logps/rejected": -163.91146850585938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.748960494995117, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -11.748960494995117, + "step": 1408 + }, + { + "epoch": 0.9722270139727445, + "grad_norm": 0.5703312754631042, + "learning_rate": 2.70026830203143e-06, + "logits/chosen": 3.9158425331115723, + "logits/rejected": 3.9591896533966064, + "logps/chosen": -152.49346923828125, + "logps/rejected": -165.5087890625, + "loss": 0.5254, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.31637954711914, + "rewards/margins": 1.3070955276489258, + "rewards/rejected": -11.623476028442383, + "step": 1409 + }, + { + "epoch": 0.9729170260479558, + "grad_norm": 12.273452758789062, + "learning_rate": 2.7021847451130706e-06, + "logits/chosen": 3.656316041946411, + "logits/rejected": 3.7537031173706055, + "logps/chosen": -167.86349487304688, + "logps/rejected": -174.74411010742188, + "loss": 0.9429, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.971400260925293, + "rewards/margins": 0.7052930593490601, + "rewards/rejected": -12.676694869995117, + "step": 1410 + }, + { + "epoch": 0.9736070381231672, + "grad_norm": 0.34725475311279297, + "learning_rate": 2.704101188194711e-06, + "logits/chosen": 3.6151695251464844, + "logits/rejected": 3.663214683532715, + "logps/chosen": -163.494873046875, + "logps/rejected": -172.50607299804688, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.818452835083008, + "rewards/margins": 0.8649225234985352, + "rewards/rejected": -12.68337631225586, + "step": 1411 + }, + { + "epoch": 0.9742970501983784, + "grad_norm": 0.28457632660865784, + "learning_rate": 2.7060176312763514e-06, + "logits/chosen": 4.082350730895996, + "logits/rejected": 4.082350730895996, + "logps/chosen": -187.4647979736328, + "logps/rejected": -187.4647979736328, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.829509735107422, + "rewards/margins": 0.0, + "rewards/rejected": -13.829509735107422, + "step": 1412 + }, + { + "epoch": 0.9749870622735898, + "grad_norm": 0.3810705840587616, + "learning_rate": 2.707934074357992e-06, + "logits/chosen": 3.8586769104003906, + "logits/rejected": 3.8586769104003906, + "logps/chosen": -175.52664184570312, + "logps/rejected": -175.52664184570312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.66215705871582, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.66215705871582, + "step": 1413 + }, + { + "epoch": 0.9756770743488011, + "grad_norm": 10.031157493591309, + "learning_rate": 2.709850517439632e-06, + "logits/chosen": 3.9112894535064697, + "logits/rejected": 3.940331220626831, + "logps/chosen": -165.89187622070312, + "logps/rejected": -165.0823516845703, + "loss": 1.1739, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.617097854614258, + "rewards/margins": -0.0005565881729125977, + "rewards/rejected": -11.61654281616211, + "step": 1414 + }, + { + "epoch": 0.9763670864240124, + "grad_norm": 0.38664668798446655, + "learning_rate": 2.7117669605212726e-06, + "logits/chosen": 3.977461814880371, + "logits/rejected": 4.013411521911621, + "logps/chosen": -148.65289306640625, + "logps/rejected": -160.81614685058594, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.139256477355957, + "rewards/margins": 1.3267228603363037, + "rewards/rejected": -11.46597957611084, + "step": 1415 + }, + { + "epoch": 0.9770570984992237, + "grad_norm": 0.3797755837440491, + "learning_rate": 2.713683403602913e-06, + "logits/chosen": 4.139531135559082, + "logits/rejected": 4.23211145401001, + "logps/chosen": -168.99159240722656, + "logps/rejected": -176.86129760742188, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.066940307617188, + "rewards/margins": 0.8408751487731934, + "rewards/rejected": -12.907815933227539, + "step": 1416 + }, + { + "epoch": 0.9777471105744351, + "grad_norm": 0.33412179350852966, + "learning_rate": 2.7155998466845534e-06, + "logits/chosen": 4.157811164855957, + "logits/rejected": 4.307333946228027, + "logps/chosen": -146.46205139160156, + "logps/rejected": -153.21177673339844, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.753987312316895, + "rewards/margins": 0.6724050045013428, + "rewards/rejected": -10.426392555236816, + "step": 1417 + }, + { + "epoch": 0.9784371226496463, + "grad_norm": 0.281473845243454, + "learning_rate": 2.7175162897661946e-06, + "logits/chosen": 4.323545932769775, + "logits/rejected": 4.442912578582764, + "logps/chosen": -169.7340850830078, + "logps/rejected": -181.55006408691406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.338581085205078, + "rewards/margins": 1.1687332391738892, + "rewards/rejected": -13.50731372833252, + "step": 1418 + }, + { + "epoch": 0.9791271347248577, + "grad_norm": 1.2023046016693115, + "learning_rate": 2.719432732847835e-06, + "logits/chosen": 3.720696449279785, + "logits/rejected": 3.7466578483581543, + "logps/chosen": -160.12252807617188, + "logps/rejected": -162.9051513671875, + "loss": 0.6163, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.309732437133789, + "rewards/margins": 0.3134666681289673, + "rewards/rejected": -11.623198509216309, + "step": 1419 + }, + { + "epoch": 0.979817146800069, + "grad_norm": 0.2942812740802765, + "learning_rate": 2.7213491759294754e-06, + "logits/chosen": 4.311108112335205, + "logits/rejected": 4.311108112335205, + "logps/chosen": -166.52218627929688, + "logps/rejected": -166.52218627929688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.691112518310547, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -11.691112518310547, + "step": 1420 + }, + { + "epoch": 0.9805071588752803, + "grad_norm": 0.27501779794692993, + "learning_rate": 2.7232656190111158e-06, + "logits/chosen": 3.996825695037842, + "logits/rejected": 4.0332231521606445, + "logps/chosen": -153.64004516601562, + "logps/rejected": -167.36184692382812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.617231369018555, + "rewards/margins": 1.3390095233917236, + "rewards/rejected": -11.9562406539917, + "step": 1421 + }, + { + "epoch": 0.9811971709504916, + "grad_norm": 3.9563686847686768, + "learning_rate": 2.725182062092756e-06, + "logits/chosen": 3.5520052909851074, + "logits/rejected": 3.52215313911438, + "logps/chosen": -162.08895874023438, + "logps/rejected": -163.20179748535156, + "loss": 0.6445, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.294477462768555, + "rewards/margins": 0.12926578521728516, + "rewards/rejected": -11.42374324798584, + "step": 1422 + }, + { + "epoch": 0.981887183025703, + "grad_norm": 0.538079023361206, + "learning_rate": 2.7270985051743966e-06, + "logits/chosen": 3.6053125858306885, + "logits/rejected": 3.8918304443359375, + "logps/chosen": -155.23367309570312, + "logps/rejected": -168.11541748046875, + "loss": 0.5226, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.877970695495605, + "rewards/margins": 1.2881619930267334, + "rewards/rejected": -12.166132926940918, + "step": 1423 + }, + { + "epoch": 0.9825771951009142, + "grad_norm": 0.31523412466049194, + "learning_rate": 2.729014948256037e-06, + "logits/chosen": 4.035902976989746, + "logits/rejected": 4.046939849853516, + "logps/chosen": -144.87083435058594, + "logps/rejected": -160.5848388671875, + "loss": 0.5213, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.670476913452148, + "rewards/margins": 1.3784571886062622, + "rewards/rejected": -11.048933029174805, + "step": 1424 + }, + { + "epoch": 0.9832672071761256, + "grad_norm": 0.32285118103027344, + "learning_rate": 2.7309313913376773e-06, + "logits/chosen": 4.143002033233643, + "logits/rejected": 4.230789661407471, + "logps/chosen": -169.84033203125, + "logps/rejected": -182.9134979248047, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.343138694763184, + "rewards/margins": 1.3462110757827759, + "rewards/rejected": -13.689350128173828, + "step": 1425 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 0.31349435448646545, + "learning_rate": 2.732847834419318e-06, + "logits/chosen": 3.9822492599487305, + "logits/rejected": 4.01230001449585, + "logps/chosen": -165.0984649658203, + "logps/rejected": -172.25421142578125, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.789905548095703, + "rewards/margins": 0.7456711530685425, + "rewards/rejected": -12.535577774047852, + "step": 1426 + }, + { + "epoch": 0.9846472313265482, + "grad_norm": 41.57890701293945, + "learning_rate": 2.7347642775009585e-06, + "logits/chosen": 4.183610916137695, + "logits/rejected": 4.085484981536865, + "logps/chosen": -178.2122039794922, + "logps/rejected": -181.40133666992188, + "loss": 0.9139, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.078441619873047, + "rewards/margins": 0.36469531059265137, + "rewards/rejected": -13.443138122558594, + "step": 1427 + }, + { + "epoch": 0.9853372434017595, + "grad_norm": 0.28430935740470886, + "learning_rate": 2.736680720582599e-06, + "logits/chosen": 3.9698753356933594, + "logits/rejected": 3.9698753356933594, + "logps/chosen": -178.78567504882812, + "logps/rejected": -178.78567504882812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.23947525024414, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.23947525024414, + "step": 1428 + }, + { + "epoch": 0.9860272554769709, + "grad_norm": 0.4738975763320923, + "learning_rate": 2.7385971636642393e-06, + "logits/chosen": 3.908668279647827, + "logits/rejected": 4.0182318687438965, + "logps/chosen": -161.22198486328125, + "logps/rejected": -167.7735137939453, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.350011825561523, + "rewards/margins": 0.6401882171630859, + "rewards/rejected": -11.990199089050293, + "step": 1429 + }, + { + "epoch": 0.9867172675521821, + "grad_norm": 3.7799859046936035, + "learning_rate": 2.7405136067458797e-06, + "logits/chosen": 3.9259495735168457, + "logits/rejected": 3.945888042449951, + "logps/chosen": -154.69122314453125, + "logps/rejected": -156.59437561035156, + "loss": 0.6338, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.524889945983887, + "rewards/margins": 0.1763244867324829, + "rewards/rejected": -10.701213836669922, + "step": 1430 + }, + { + "epoch": 0.9874072796273935, + "grad_norm": 0.3211883008480072, + "learning_rate": 2.74243004982752e-06, + "logits/chosen": 4.132708549499512, + "logits/rejected": 4.316928863525391, + "logps/chosen": -167.05184936523438, + "logps/rejected": -174.72344970703125, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.798176765441895, + "rewards/margins": 0.8023774027824402, + "rewards/rejected": -12.600553512573242, + "step": 1431 + }, + { + "epoch": 0.9880972917026047, + "grad_norm": 0.2957701086997986, + "learning_rate": 2.7443464929091605e-06, + "logits/chosen": 4.034802436828613, + "logits/rejected": 4.131900787353516, + "logps/chosen": -159.60601806640625, + "logps/rejected": -180.8137969970703, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.170560836791992, + "rewards/margins": 2.047585964202881, + "rewards/rejected": -13.218147277832031, + "step": 1432 + }, + { + "epoch": 0.9887873037778161, + "grad_norm": 1.1436638832092285, + "learning_rate": 2.746262935990801e-06, + "logits/chosen": 4.00480842590332, + "logits/rejected": 4.017815113067627, + "logps/chosen": -177.96646118164062, + "logps/rejected": -181.63507080078125, + "loss": 0.6138, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.256290435791016, + "rewards/margins": 0.3522477149963379, + "rewards/rejected": -13.608537673950195, + "step": 1433 + }, + { + "epoch": 0.9894773158530275, + "grad_norm": 0.29144105315208435, + "learning_rate": 2.748179379072442e-06, + "logits/chosen": 3.5929317474365234, + "logits/rejected": 3.8634815216064453, + "logps/chosen": -160.61781311035156, + "logps/rejected": -187.0927734375, + "loss": 0.4347, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.118885040283203, + "rewards/margins": 2.7193961143493652, + "rewards/rejected": -13.838279724121094, + "step": 1434 + }, + { + "epoch": 0.9901673279282387, + "grad_norm": 2.5353786945343018, + "learning_rate": 2.7500958221540825e-06, + "logits/chosen": 3.818040370941162, + "logits/rejected": 4.073038101196289, + "logps/chosen": -147.89395141601562, + "logps/rejected": -163.180908203125, + "loss": 0.4506, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.122576713562012, + "rewards/margins": 1.4311909675598145, + "rewards/rejected": -11.553768157958984, + "step": 1435 + }, + { + "epoch": 0.99085734000345, + "grad_norm": 0.29133346676826477, + "learning_rate": 2.752012265235723e-06, + "logits/chosen": 4.226711273193359, + "logits/rejected": 4.40620231628418, + "logps/chosen": -165.6061248779297, + "logps/rejected": -174.39447021484375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.912689208984375, + "rewards/margins": 0.9163705110549927, + "rewards/rejected": -12.829059600830078, + "step": 1436 + }, + { + "epoch": 0.9915473520786614, + "grad_norm": 1.2526490688323975, + "learning_rate": 2.7539287083173633e-06, + "logits/chosen": 3.931419849395752, + "logits/rejected": 3.9545788764953613, + "logps/chosen": -158.20977783203125, + "logps/rejected": -177.60015869140625, + "loss": 0.5304, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.029603958129883, + "rewards/margins": 1.9005131721496582, + "rewards/rejected": -12.930116653442383, + "step": 1437 + }, + { + "epoch": 0.9922373641538726, + "grad_norm": 0.29634997248649597, + "learning_rate": 2.7558451513990037e-06, + "logits/chosen": 3.8452372550964355, + "logits/rejected": 3.9591598510742188, + "logps/chosen": -164.19224548339844, + "logps/rejected": -187.663330078125, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.618999481201172, + "rewards/margins": 2.286299228668213, + "rewards/rejected": -13.90530014038086, + "step": 1438 + }, + { + "epoch": 0.992927376229084, + "grad_norm": 0.2757100760936737, + "learning_rate": 2.757761594480644e-06, + "logits/chosen": 4.100494384765625, + "logits/rejected": 4.221014976501465, + "logps/chosen": -158.137451171875, + "logps/rejected": -183.76483154296875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.086233139038086, + "rewards/margins": 2.435035228729248, + "rewards/rejected": -13.521268844604492, + "step": 1439 + }, + { + "epoch": 0.9936173883042954, + "grad_norm": 0.2561935782432556, + "learning_rate": 2.7596780375622845e-06, + "logits/chosen": 4.021990776062012, + "logits/rejected": 4.105168342590332, + "logps/chosen": -170.32827758789062, + "logps/rejected": -178.5658721923828, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.329815864562988, + "rewards/margins": 0.8415448069572449, + "rewards/rejected": -13.171360969543457, + "step": 1440 + }, + { + "epoch": 0.9943074003795066, + "grad_norm": 0.2652626633644104, + "learning_rate": 2.761594480643925e-06, + "logits/chosen": 4.113903522491455, + "logits/rejected": 4.104273796081543, + "logps/chosen": -175.36944580078125, + "logps/rejected": -184.09750366210938, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.651193618774414, + "rewards/margins": 0.8936048746109009, + "rewards/rejected": -13.544798851013184, + "step": 1441 + }, + { + "epoch": 0.994997412454718, + "grad_norm": 0.6510137319564819, + "learning_rate": 2.7635109237255657e-06, + "logits/chosen": 3.710832118988037, + "logits/rejected": 3.850112199783325, + "logps/chosen": -175.9161376953125, + "logps/rejected": -180.23635864257812, + "loss": 0.6117, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.85224723815918, + "rewards/margins": 0.3943077325820923, + "rewards/rejected": -13.246554374694824, + "step": 1442 + }, + { + "epoch": 0.9956874245299293, + "grad_norm": 0.34079501032829285, + "learning_rate": 2.765427366807206e-06, + "logits/chosen": 4.479079723358154, + "logits/rejected": 4.479079723358154, + "logps/chosen": -176.29156494140625, + "logps/rejected": -176.29156494140625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.670674324035645, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.670673370361328, + "step": 1443 + }, + { + "epoch": 0.9963774366051406, + "grad_norm": 0.38141897320747375, + "learning_rate": 2.7673438098888465e-06, + "logits/chosen": 4.116291046142578, + "logits/rejected": 4.116291046142578, + "logps/chosen": -177.49310302734375, + "logps/rejected": -177.49310302734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.962014198303223, + "rewards/margins": 0.0, + "rewards/rejected": -12.962014198303223, + "step": 1444 + }, + { + "epoch": 0.9970674486803519, + "grad_norm": 0.3503197133541107, + "learning_rate": 2.769260252970487e-06, + "logits/chosen": 4.012975692749023, + "logits/rejected": 4.173627853393555, + "logps/chosen": -164.27508544921875, + "logps/rejected": -171.72354125976562, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.564598083496094, + "rewards/margins": 0.7147186994552612, + "rewards/rejected": -12.279315948486328, + "step": 1445 + }, + { + "epoch": 0.9977574607555633, + "grad_norm": 0.5964245200157166, + "learning_rate": 2.7711766960521277e-06, + "logits/chosen": 3.773772716522217, + "logits/rejected": 4.1196699142456055, + "logps/chosen": -167.88421630859375, + "logps/rejected": -191.18060302734375, + "loss": 0.4363, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.002900123596191, + "rewards/margins": 2.3185317516326904, + "rewards/rejected": -14.321432113647461, + "step": 1446 + }, + { + "epoch": 0.9984474728307745, + "grad_norm": 1.7068397998809814, + "learning_rate": 2.773093139133768e-06, + "logits/chosen": 4.0561203956604, + "logits/rejected": 4.214562892913818, + "logps/chosen": -158.25039672851562, + "logps/rejected": -167.55313110351562, + "loss": 0.5294, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.862346649169922, + "rewards/margins": 1.0649055242538452, + "rewards/rejected": -11.927253723144531, + "step": 1447 + }, + { + "epoch": 0.9991374849059859, + "grad_norm": 0.7030170559883118, + "learning_rate": 2.7750095822154085e-06, + "logits/chosen": 3.8010237216949463, + "logits/rejected": 3.814467668533325, + "logps/chosen": -162.86134338378906, + "logps/rejected": -166.8561248779297, + "loss": 0.6131, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.587238311767578, + "rewards/margins": 0.3646688461303711, + "rewards/rejected": -11.95190715789795, + "step": 1448 + }, + { + "epoch": 0.9998274969811972, + "grad_norm": 0.25843140482902527, + "learning_rate": 2.776926025297049e-06, + "logits/chosen": 3.8779537677764893, + "logits/rejected": 3.9036033153533936, + "logps/chosen": -171.79098510742188, + "logps/rejected": -180.05859375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.33242130279541, + "rewards/margins": 0.8634829521179199, + "rewards/rejected": -13.195903778076172, + "step": 1449 + }, + { + "epoch": 1.0, + "grad_norm": 0.20359039306640625, + "learning_rate": 2.7788424683786897e-06, + "logits/chosen": 3.821326971054077, + "logits/rejected": 3.821326971054077, + "logps/chosen": -185.48789978027344, + "logps/rejected": -185.48788452148438, + "loss": 0.1733, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.654120445251465, + "rewards/margins": -3.814697265625e-06, + "rewards/rejected": -13.6541166305542, + "step": 1450 + }, + { + "epoch": 1.0006900120752114, + "grad_norm": 0.36284080147743225, + "learning_rate": 2.78075891146033e-06, + "logits/chosen": 4.112224102020264, + "logits/rejected": 4.264671802520752, + "logps/chosen": -157.5005645751953, + "logps/rejected": -163.83961486816406, + "loss": 0.6071, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.859132766723633, + "rewards/margins": 0.6750649213790894, + "rewards/rejected": -11.534196853637695, + "step": 1451 + }, + { + "epoch": 1.0013800241504227, + "grad_norm": 13.418360710144043, + "learning_rate": 2.7826753545419704e-06, + "logits/chosen": 3.8561789989471436, + "logits/rejected": 3.9151971340179443, + "logps/chosen": -158.36782836914062, + "logps/rejected": -162.98971557617188, + "loss": 0.8087, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.26130199432373, + "rewards/margins": 0.42302608489990234, + "rewards/rejected": -11.684328079223633, + "step": 1452 + }, + { + "epoch": 1.0020700362256338, + "grad_norm": 0.36012381315231323, + "learning_rate": 2.784591797623611e-06, + "logits/chosen": 3.841134786605835, + "logits/rejected": 3.915256977081299, + "logps/chosen": -166.89111328125, + "logps/rejected": -176.99295043945312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.078927993774414, + "rewards/margins": 1.063673734664917, + "rewards/rejected": -13.142601013183594, + "step": 1453 + }, + { + "epoch": 1.0027600483008452, + "grad_norm": 0.29053449630737305, + "learning_rate": 2.7865082407052512e-06, + "logits/chosen": 4.122166156768799, + "logits/rejected": 4.122166156768799, + "logps/chosen": -179.801513671875, + "logps/rejected": -179.801513671875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.162590026855469, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.162590026855469, + "step": 1454 + }, + { + "epoch": 1.0034500603760566, + "grad_norm": 0.3640584647655487, + "learning_rate": 2.7884246837868916e-06, + "logits/chosen": 3.4245693683624268, + "logits/rejected": 3.6013526916503906, + "logps/chosen": -141.7188720703125, + "logps/rejected": -158.71263122558594, + "loss": 0.5224, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.390166282653809, + "rewards/margins": 1.7402966022491455, + "rewards/rejected": -11.130462646484375, + "step": 1455 + }, + { + "epoch": 1.004140072451268, + "grad_norm": 0.3088219165802002, + "learning_rate": 2.790341126868532e-06, + "logits/chosen": 3.8248400688171387, + "logits/rejected": 3.8368043899536133, + "logps/chosen": -173.692626953125, + "logps/rejected": -181.77349853515625, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.691753387451172, + "rewards/margins": 0.828647792339325, + "rewards/rejected": -13.520401000976562, + "step": 1456 + }, + { + "epoch": 1.0048300845264793, + "grad_norm": 0.35346469283103943, + "learning_rate": 2.7922575699501724e-06, + "logits/chosen": 3.9417948722839355, + "logits/rejected": 3.9417948722839355, + "logps/chosen": -170.22918701171875, + "logps/rejected": -170.22918701171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.333343505859375, + "rewards/margins": 0.0, + "rewards/rejected": -12.333343505859375, + "step": 1457 + }, + { + "epoch": 1.0055200966016906, + "grad_norm": 0.29529085755348206, + "learning_rate": 2.7941740130318136e-06, + "logits/chosen": 3.7593469619750977, + "logits/rejected": 3.9445512294769287, + "logps/chosen": -161.45851135253906, + "logps/rejected": -169.85592651367188, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.368849754333496, + "rewards/margins": 0.824209451675415, + "rewards/rejected": -12.193058967590332, + "step": 1458 + }, + { + "epoch": 1.0062101086769017, + "grad_norm": 0.34407928586006165, + "learning_rate": 2.796090456113454e-06, + "logits/chosen": 4.128990650177002, + "logits/rejected": 4.114391326904297, + "logps/chosen": -162.82574462890625, + "logps/rejected": -173.6080322265625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.604633331298828, + "rewards/margins": 1.118191123008728, + "rewards/rejected": -12.722824096679688, + "step": 1459 + }, + { + "epoch": 1.006900120752113, + "grad_norm": 0.37882399559020996, + "learning_rate": 2.7980068991950944e-06, + "logits/chosen": 4.100772857666016, + "logits/rejected": 4.100772857666016, + "logps/chosen": -170.88134765625, + "logps/rejected": -170.88134765625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.305990219116211, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.305989265441895, + "step": 1460 + }, + { + "epoch": 1.0075901328273245, + "grad_norm": 0.3981424570083618, + "learning_rate": 2.799923342276735e-06, + "logits/chosen": 3.7134623527526855, + "logits/rejected": 3.7345380783081055, + "logps/chosen": -158.50648498535156, + "logps/rejected": -166.8710479736328, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.105327606201172, + "rewards/margins": 0.8483662605285645, + "rewards/rejected": -11.953693389892578, + "step": 1461 + }, + { + "epoch": 1.0082801449025358, + "grad_norm": 0.31716224551200867, + "learning_rate": 2.801839785358375e-06, + "logits/chosen": 3.5128421783447266, + "logits/rejected": 3.7327375411987305, + "logps/chosen": -148.7254180908203, + "logps/rejected": -168.0715789794922, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.122005462646484, + "rewards/margins": 1.85892653465271, + "rewards/rejected": -11.980932235717773, + "step": 1462 + }, + { + "epoch": 1.0089701569777472, + "grad_norm": 0.3482617437839508, + "learning_rate": 2.8037562284400156e-06, + "logits/chosen": 3.437066078186035, + "logits/rejected": 3.525041103363037, + "logps/chosen": -168.94619750976562, + "logps/rejected": -174.01663208007812, + "loss": 0.6082, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.091117858886719, + "rewards/margins": 0.5342929363250732, + "rewards/rejected": -12.625411987304688, + "step": 1463 + }, + { + "epoch": 1.0096601690529585, + "grad_norm": 0.4051046073436737, + "learning_rate": 2.805672671521656e-06, + "logits/chosen": 3.573500156402588, + "logits/rejected": 3.573500156402588, + "logps/chosen": -153.87652587890625, + "logps/rejected": -153.87652587890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.621572494506836, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.621572494506836, + "step": 1464 + }, + { + "epoch": 1.0103501811281697, + "grad_norm": 0.34352219104766846, + "learning_rate": 2.8075891146032964e-06, + "logits/chosen": 3.773454427719116, + "logits/rejected": 3.9319660663604736, + "logps/chosen": -159.1129913330078, + "logps/rejected": -166.61761474609375, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.089102745056152, + "rewards/margins": 0.7085406184196472, + "rewards/rejected": -11.797643661499023, + "step": 1465 + }, + { + "epoch": 1.011040193203381, + "grad_norm": 0.32227009534835815, + "learning_rate": 2.8095055576849368e-06, + "logits/chosen": 3.9579200744628906, + "logits/rejected": 3.9426679611206055, + "logps/chosen": -165.22047424316406, + "logps/rejected": -180.21475219726562, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.65457534790039, + "rewards/margins": 1.5996695756912231, + "rewards/rejected": -13.25424575805664, + "step": 1466 + }, + { + "epoch": 1.0117302052785924, + "grad_norm": 0.27767738699913025, + "learning_rate": 2.8114220007665776e-06, + "logits/chosen": 3.3033511638641357, + "logits/rejected": 3.5552194118499756, + "logps/chosen": -150.10252380371094, + "logps/rejected": -177.6090850830078, + "loss": 0.4336, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.3812837600708, + "rewards/margins": 2.7507355213165283, + "rewards/rejected": -13.13201904296875, + "step": 1467 + }, + { + "epoch": 1.0124202173538037, + "grad_norm": 0.38436663150787354, + "learning_rate": 2.813338443848218e-06, + "logits/chosen": 3.791992425918579, + "logits/rejected": 3.791992425918579, + "logps/chosen": -184.35682678222656, + "logps/rejected": -184.35682678222656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.898488998413086, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.898488998413086, + "step": 1468 + }, + { + "epoch": 1.013110229429015, + "grad_norm": 0.3254653513431549, + "learning_rate": 2.8152548869298584e-06, + "logits/chosen": 3.597503662109375, + "logits/rejected": 3.77523136138916, + "logps/chosen": -149.37509155273438, + "logps/rejected": -171.47109985351562, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.236502647399902, + "rewards/margins": 2.189786911010742, + "rewards/rejected": -12.426288604736328, + "step": 1469 + }, + { + "epoch": 1.0138002415042264, + "grad_norm": 0.25192320346832275, + "learning_rate": 2.8171713300114988e-06, + "logits/chosen": 3.2339720726013184, + "logits/rejected": 3.448529005050659, + "logps/chosen": -117.56037902832031, + "logps/rejected": -160.833251953125, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.362837791442871, + "rewards/margins": 4.152004718780518, + "rewards/rejected": -11.51484203338623, + "step": 1470 + }, + { + "epoch": 1.0144902535794376, + "grad_norm": 0.31873130798339844, + "learning_rate": 2.819087773093139e-06, + "logits/chosen": 3.6465694904327393, + "logits/rejected": 3.8083691596984863, + "logps/chosen": -163.34336853027344, + "logps/rejected": -169.86676025390625, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.60368537902832, + "rewards/margins": 0.6316084265708923, + "rewards/rejected": -12.235292434692383, + "step": 1471 + }, + { + "epoch": 1.015180265654649, + "grad_norm": 0.27377986907958984, + "learning_rate": 2.8210042161747795e-06, + "logits/chosen": 4.199794769287109, + "logits/rejected": 4.355742454528809, + "logps/chosen": -161.5255889892578, + "logps/rejected": -168.5732421875, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.262092590332031, + "rewards/margins": 0.7645716071128845, + "rewards/rejected": -12.026664733886719, + "step": 1472 + }, + { + "epoch": 1.0158702777298603, + "grad_norm": 0.3618207275867462, + "learning_rate": 2.82292065925642e-06, + "logits/chosen": 3.5816073417663574, + "logits/rejected": 3.7673535346984863, + "logps/chosen": -131.9425506591797, + "logps/rejected": -155.62132263183594, + "loss": 0.4362, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.421856880187988, + "rewards/margins": 2.342421531677246, + "rewards/rejected": -10.76427936553955, + "step": 1473 + }, + { + "epoch": 1.0165602898050716, + "grad_norm": 0.27764612436294556, + "learning_rate": 2.8248371023380603e-06, + "logits/chosen": 3.5911037921905518, + "logits/rejected": 3.7114884853363037, + "logps/chosen": -173.4234161376953, + "logps/rejected": -183.89268493652344, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.500619888305664, + "rewards/margins": 1.0591461658477783, + "rewards/rejected": -13.55976676940918, + "step": 1474 + }, + { + "epoch": 1.017250301880283, + "grad_norm": 0.3192034363746643, + "learning_rate": 2.8267535454197016e-06, + "logits/chosen": 3.7796871662139893, + "logits/rejected": 4.035949230194092, + "logps/chosen": -167.0224609375, + "logps/rejected": -184.86727905273438, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.0535306930542, + "rewards/margins": 1.7968167066574097, + "rewards/rejected": -13.850347518920898, + "step": 1475 + }, + { + "epoch": 1.0179403139554941, + "grad_norm": 0.24350681900978088, + "learning_rate": 2.828669988501342e-06, + "logits/chosen": 3.923069953918457, + "logits/rejected": 3.9868621826171875, + "logps/chosen": -182.14028930664062, + "logps/rejected": -195.16180419921875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.40992546081543, + "rewards/margins": 1.289294958114624, + "rewards/rejected": -14.69922161102295, + "step": 1476 + }, + { + "epoch": 1.0186303260307055, + "grad_norm": 0.629822313785553, + "learning_rate": 2.8305864315829824e-06, + "logits/chosen": 3.6882331371307373, + "logits/rejected": 3.716742753982544, + "logps/chosen": -157.64866638183594, + "logps/rejected": -162.05633544921875, + "loss": 0.6099, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.963310241699219, + "rewards/margins": 0.4508376717567444, + "rewards/rejected": -11.41414737701416, + "step": 1477 + }, + { + "epoch": 1.0193203381059168, + "grad_norm": 0.37223684787750244, + "learning_rate": 2.8325028746646227e-06, + "logits/chosen": 3.7859084606170654, + "logits/rejected": 3.8773045539855957, + "logps/chosen": -153.79153442382812, + "logps/rejected": -175.54342651367188, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.509798049926758, + "rewards/margins": 2.2987632751464844, + "rewards/rejected": -12.808562278747559, + "step": 1478 + }, + { + "epoch": 1.0200103501811282, + "grad_norm": 0.34779319167137146, + "learning_rate": 2.834419317746263e-06, + "logits/chosen": 4.010154724121094, + "logits/rejected": 4.184730529785156, + "logps/chosen": -172.04733276367188, + "logps/rejected": -179.08990478515625, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.250959396362305, + "rewards/margins": 0.6658297777175903, + "rewards/rejected": -12.916790008544922, + "step": 1479 + }, + { + "epoch": 1.0207003622563395, + "grad_norm": 0.3752140700817108, + "learning_rate": 2.8363357608279035e-06, + "logits/chosen": 4.09527063369751, + "logits/rejected": 4.103447437286377, + "logps/chosen": -179.4856414794922, + "logps/rejected": -186.8614044189453, + "loss": 0.607, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.10114860534668, + "rewards/margins": 0.6894985437393188, + "rewards/rejected": -13.790645599365234, + "step": 1480 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 0.3413587212562561, + "learning_rate": 2.838252203909544e-06, + "logits/chosen": 3.5421061515808105, + "logits/rejected": 3.5421061515808105, + "logps/chosen": -164.0733642578125, + "logps/rejected": -164.0733642578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.579276084899902, + "rewards/margins": 0.0, + "rewards/rejected": -11.579276084899902, + "step": 1481 + }, + { + "epoch": 1.022080386406762, + "grad_norm": 0.308782160282135, + "learning_rate": 2.8401686469911843e-06, + "logits/chosen": 3.885820150375366, + "logits/rejected": 3.9154274463653564, + "logps/chosen": -165.83241271972656, + "logps/rejected": -171.23655700683594, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.686113357543945, + "rewards/margins": 0.676373302936554, + "rewards/rejected": -12.362485885620117, + "step": 1482 + }, + { + "epoch": 1.0227703984819734, + "grad_norm": 0.42348307371139526, + "learning_rate": 2.842085090072825e-06, + "logits/chosen": 3.9036037921905518, + "logits/rejected": 3.911548137664795, + "logps/chosen": -160.3925018310547, + "logps/rejected": -165.58856201171875, + "loss": 0.6082, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.250730514526367, + "rewards/margins": 0.5361160039901733, + "rewards/rejected": -11.786847114562988, + "step": 1483 + }, + { + "epoch": 1.0234604105571847, + "grad_norm": 0.3229873478412628, + "learning_rate": 2.8440015331544655e-06, + "logits/chosen": 4.008821964263916, + "logits/rejected": 4.008821964263916, + "logps/chosen": -171.37734985351562, + "logps/rejected": -171.37734985351562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.370264053344727, + "rewards/margins": 0.0, + "rewards/rejected": -12.370264053344727, + "step": 1484 + }, + { + "epoch": 1.024150422632396, + "grad_norm": 0.3926430642604828, + "learning_rate": 2.845917976236106e-06, + "logits/chosen": 3.525998830795288, + "logits/rejected": 3.6029434204101562, + "logps/chosen": -161.3733367919922, + "logps/rejected": -180.2672882080078, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.4054594039917, + "rewards/margins": 1.9061965942382812, + "rewards/rejected": -13.311656951904297, + "step": 1485 + }, + { + "epoch": 1.0248404347076074, + "grad_norm": 0.30621129274368286, + "learning_rate": 2.8478344193177467e-06, + "logits/chosen": 3.621777296066284, + "logits/rejected": 3.7515289783477783, + "logps/chosen": -152.43191528320312, + "logps/rejected": -163.11849975585938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.57585620880127, + "rewards/margins": 1.0969384908676147, + "rewards/rejected": -11.672794342041016, + "step": 1486 + }, + { + "epoch": 1.0255304467828188, + "grad_norm": 0.4559832811355591, + "learning_rate": 2.849750862399387e-06, + "logits/chosen": 3.439509630203247, + "logits/rejected": 4.084845542907715, + "logps/chosen": -145.27426147460938, + "logps/rejected": -177.11642456054688, + "loss": 0.3499, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.76061725616455, + "rewards/margins": 3.102666139602661, + "rewards/rejected": -12.863283157348633, + "step": 1487 + }, + { + "epoch": 1.02622045885803, + "grad_norm": 0.38670432567596436, + "learning_rate": 2.8516673054810275e-06, + "logits/chosen": 4.060101509094238, + "logits/rejected": 3.9730350971221924, + "logps/chosen": -164.23915100097656, + "logps/rejected": -171.86410522460938, + "loss": 0.6068, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.594168663024902, + "rewards/margins": 0.7384717464447021, + "rewards/rejected": -12.332640647888184, + "step": 1488 + }, + { + "epoch": 1.0269104709332413, + "grad_norm": 0.3221833407878876, + "learning_rate": 2.853583748562668e-06, + "logits/chosen": 3.610395908355713, + "logits/rejected": 3.821059465408325, + "logps/chosen": -146.20449829101562, + "logps/rejected": -171.31585693359375, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.029939651489258, + "rewards/margins": 2.4355406761169434, + "rewards/rejected": -12.46548080444336, + "step": 1489 + }, + { + "epoch": 1.0276004830084526, + "grad_norm": 0.3526119589805603, + "learning_rate": 2.8555001916443083e-06, + "logits/chosen": 3.9824697971343994, + "logits/rejected": 3.9824697971343994, + "logps/chosen": -167.3211669921875, + "logps/rejected": -167.3211669921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.059379577636719, + "rewards/margins": 0.0, + "rewards/rejected": -12.059379577636719, + "step": 1490 + }, + { + "epoch": 1.028290495083664, + "grad_norm": 1.14559006690979, + "learning_rate": 2.857416634725949e-06, + "logits/chosen": 3.6019999980926514, + "logits/rejected": 3.6075825691223145, + "logps/chosen": -142.68438720703125, + "logps/rejected": -150.6501922607422, + "loss": 0.5322, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.397767066955566, + "rewards/margins": 0.823632538318634, + "rewards/rejected": -10.221399307250977, + "step": 1491 + }, + { + "epoch": 1.0289805071588753, + "grad_norm": 0.32890015840530396, + "learning_rate": 2.8593330778075895e-06, + "logits/chosen": 3.7696332931518555, + "logits/rejected": 3.9003067016601562, + "logps/chosen": -162.01486206054688, + "logps/rejected": -184.1502685546875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.350951194763184, + "rewards/margins": 2.2388863563537598, + "rewards/rejected": -13.589838027954102, + "step": 1492 + }, + { + "epoch": 1.0296705192340867, + "grad_norm": 0.28148141503334045, + "learning_rate": 2.86124952088923e-06, + "logits/chosen": 3.9630258083343506, + "logits/rejected": 4.065276622772217, + "logps/chosen": -172.41226196289062, + "logps/rejected": -183.915283203125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.287155151367188, + "rewards/margins": 1.1944308280944824, + "rewards/rejected": -13.481586456298828, + "step": 1493 + }, + { + "epoch": 1.0303605313092978, + "grad_norm": 0.2947300672531128, + "learning_rate": 2.8631659639708703e-06, + "logits/chosen": 3.9158194065093994, + "logits/rejected": 4.039277076721191, + "logps/chosen": -161.0867919921875, + "logps/rejected": -179.80953979492188, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.35873031616211, + "rewards/margins": 1.9316126108169556, + "rewards/rejected": -13.290342330932617, + "step": 1494 + }, + { + "epoch": 1.0310505433845092, + "grad_norm": 0.36902010440826416, + "learning_rate": 2.8650824070525107e-06, + "logits/chosen": 4.155815124511719, + "logits/rejected": 4.155815124511719, + "logps/chosen": -178.97982788085938, + "logps/rejected": -178.97982788085938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.145485877990723, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.145485877990723, + "step": 1495 + }, + { + "epoch": 1.0317405554597205, + "grad_norm": 0.3707321882247925, + "learning_rate": 2.866998850134151e-06, + "logits/chosen": 3.960358142852783, + "logits/rejected": 4.029971122741699, + "logps/chosen": -160.09823608398438, + "logps/rejected": -171.10179138183594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.383152961730957, + "rewards/margins": 1.102752923965454, + "rewards/rejected": -12.485906600952148, + "step": 1496 + }, + { + "epoch": 1.032430567534932, + "grad_norm": 0.3019746243953705, + "learning_rate": 2.8689152932157915e-06, + "logits/chosen": 3.538447856903076, + "logits/rejected": 3.6538519859313965, + "logps/chosen": -176.12205505371094, + "logps/rejected": -183.44338989257812, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.710395812988281, + "rewards/margins": 0.7026083469390869, + "rewards/rejected": -13.413004875183105, + "step": 1497 + }, + { + "epoch": 1.0331205796101433, + "grad_norm": 0.3753436505794525, + "learning_rate": 2.870831736297432e-06, + "logits/chosen": 4.16666841506958, + "logits/rejected": 4.16666841506958, + "logps/chosen": -180.1271209716797, + "logps/rejected": -180.1271209716797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.162097930908203, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.162097930908203, + "step": 1498 + }, + { + "epoch": 1.0338105916853544, + "grad_norm": 0.3367999196052551, + "learning_rate": 2.872748179379073e-06, + "logits/chosen": 3.999356269836426, + "logits/rejected": 4.0439558029174805, + "logps/chosen": -166.66131591796875, + "logps/rejected": -183.6537628173828, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.027085304260254, + "rewards/margins": 1.6417951583862305, + "rewards/rejected": -13.668880462646484, + "step": 1499 + }, + { + "epoch": 1.0345006037605657, + "grad_norm": 0.3282417953014374, + "learning_rate": 2.8746646224607135e-06, + "logits/chosen": 3.9828944206237793, + "logits/rejected": 4.111169815063477, + "logps/chosen": -155.97494506835938, + "logps/rejected": -173.8303680419922, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.028053283691406, + "rewards/margins": 1.5735350847244263, + "rewards/rejected": -12.60158920288086, + "step": 1500 + }, + { + "epoch": 1.0358806279109884, + "grad_norm": 0.39618542790412903, + "learning_rate": 2.876581065542354e-06, + "logits/chosen": 3.980482578277588, + "logits/rejected": 3.980482578277588, + "logps/chosen": -174.689453125, + "logps/rejected": -174.689453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.758251190185547, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.758251190185547, + "step": 1501 + }, + { + "epoch": 1.0365706399861998, + "grad_norm": 0.4296952784061432, + "learning_rate": 2.8784975086239943e-06, + "logits/chosen": 4.0744428634643555, + "logits/rejected": 4.195071220397949, + "logps/chosen": -165.55113220214844, + "logps/rejected": -179.92721557617188, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.62725830078125, + "rewards/margins": 1.3931002616882324, + "rewards/rejected": -13.02035903930664, + "step": 1502 + }, + { + "epoch": 1.0372606520614112, + "grad_norm": 0.26014965772628784, + "learning_rate": 2.8804139517056346e-06, + "logits/chosen": 3.6285009384155273, + "logits/rejected": 3.7067208290100098, + "logps/chosen": -132.25164794921875, + "logps/rejected": -146.45013427734375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -8.386788368225098, + "rewards/margins": 1.4472312927246094, + "rewards/rejected": -9.834019660949707, + "step": 1503 + }, + { + "epoch": 1.0379506641366223, + "grad_norm": 0.27812764048576355, + "learning_rate": 2.882330394787275e-06, + "logits/chosen": 3.960927963256836, + "logits/rejected": 4.114255428314209, + "logps/chosen": -151.00665283203125, + "logps/rejected": -158.82444763183594, + "loss": 0.6068, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.345622062683105, + "rewards/margins": 0.7779095768928528, + "rewards/rejected": -11.123531341552734, + "step": 1504 + }, + { + "epoch": 1.0386406762118336, + "grad_norm": 0.23263956606388092, + "learning_rate": 2.8842468378689154e-06, + "logits/chosen": 3.755471706390381, + "logits/rejected": 4.259437084197998, + "logps/chosen": -149.94076538085938, + "logps/rejected": -190.1394500732422, + "loss": 0.3467, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.086956977844238, + "rewards/margins": 4.152714252471924, + "rewards/rejected": -14.23967170715332, + "step": 1505 + }, + { + "epoch": 1.039330688287045, + "grad_norm": 0.38163575530052185, + "learning_rate": 2.886163280950556e-06, + "logits/chosen": 3.748331069946289, + "logits/rejected": 3.748331069946289, + "logps/chosen": -171.50689697265625, + "logps/rejected": -171.50689697265625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.289403915405273, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.289403915405273, + "step": 1506 + }, + { + "epoch": 1.0400207003622564, + "grad_norm": 0.3236556053161621, + "learning_rate": 2.8880797240321966e-06, + "logits/chosen": 4.267690658569336, + "logits/rejected": 4.425240993499756, + "logps/chosen": -172.1528778076172, + "logps/rejected": -180.442626953125, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.607465744018555, + "rewards/margins": 0.845537543296814, + "rewards/rejected": -13.4530029296875, + "step": 1507 + }, + { + "epoch": 1.0407107124374677, + "grad_norm": 0.2786690294742584, + "learning_rate": 2.889996167113837e-06, + "logits/chosen": 4.1700358390808105, + "logits/rejected": 4.1700358390808105, + "logps/chosen": -189.48037719726562, + "logps/rejected": -189.48037719726562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.073887825012207, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.073887825012207, + "step": 1508 + }, + { + "epoch": 1.041400724512679, + "grad_norm": 0.35449740290641785, + "learning_rate": 2.8919126101954774e-06, + "logits/chosen": 4.227049350738525, + "logits/rejected": 4.227049350738525, + "logps/chosen": -174.56735229492188, + "logps/rejected": -174.56735229492188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.716915130615234, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.716914176940918, + "step": 1509 + }, + { + "epoch": 1.0420907365878902, + "grad_norm": 0.39543235301971436, + "learning_rate": 2.893829053277118e-06, + "logits/chosen": 4.074232578277588, + "logits/rejected": 4.074232578277588, + "logps/chosen": -158.63836669921875, + "logps/rejected": -158.63836669921875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.992496490478516, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -10.992498397827148, + "step": 1510 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 0.3173534870147705, + "learning_rate": 2.895745496358758e-06, + "logits/chosen": 4.189208984375, + "logits/rejected": 4.222626686096191, + "logps/chosen": -175.20216369628906, + "logps/rejected": -183.80816650390625, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.734105110168457, + "rewards/margins": 0.8256305456161499, + "rewards/rejected": -13.559736251831055, + "step": 1511 + }, + { + "epoch": 1.043470760738313, + "grad_norm": 0.8812817335128784, + "learning_rate": 2.8976619394403986e-06, + "logits/chosen": 4.362222194671631, + "logits/rejected": 4.355042934417725, + "logps/chosen": -182.6986083984375, + "logps/rejected": -185.66104125976562, + "loss": 0.6154, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.474248886108398, + "rewards/margins": 0.32601165771484375, + "rewards/rejected": -13.800260543823242, + "step": 1512 + }, + { + "epoch": 1.0441607728135243, + "grad_norm": 4.212695598602295, + "learning_rate": 2.899578382522039e-06, + "logits/chosen": 3.818296194076538, + "logits/rejected": 4.1795525550842285, + "logps/chosen": -171.45669555664062, + "logps/rejected": -191.1416015625, + "loss": 0.4802, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.165218353271484, + "rewards/margins": 2.038419723510742, + "rewards/rejected": -14.203638076782227, + "step": 1513 + }, + { + "epoch": 1.0448507848887356, + "grad_norm": 0.31539979577064514, + "learning_rate": 2.9014948256036794e-06, + "logits/chosen": 4.060215950012207, + "logits/rejected": 4.060215950012207, + "logps/chosen": -182.5564422607422, + "logps/rejected": -182.5564422607422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.64315414428711, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.64315414428711, + "step": 1514 + }, + { + "epoch": 1.045540796963947, + "grad_norm": 0.29507243633270264, + "learning_rate": 2.9034112686853206e-06, + "logits/chosen": 4.055942058563232, + "logits/rejected": 4.175293445587158, + "logps/chosen": -166.07293701171875, + "logps/rejected": -179.5782470703125, + "loss": 0.5223, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.87121295928955, + "rewards/margins": 1.3613979816436768, + "rewards/rejected": -13.232609748840332, + "step": 1515 + }, + { + "epoch": 1.046230809039158, + "grad_norm": 10.144216537475586, + "learning_rate": 2.905327711766961e-06, + "logits/chosen": 3.599508285522461, + "logits/rejected": 3.7637858390808105, + "logps/chosen": -141.86209106445312, + "logps/rejected": -159.59548950195312, + "loss": 0.4836, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.645439147949219, + "rewards/margins": 1.478027105331421, + "rewards/rejected": -11.123466491699219, + "step": 1516 + }, + { + "epoch": 1.0469208211143695, + "grad_norm": 0.3152676820755005, + "learning_rate": 2.9072441548486014e-06, + "logits/chosen": 4.000053405761719, + "logits/rejected": 4.066262722015381, + "logps/chosen": -168.52871704101562, + "logps/rejected": -184.3726806640625, + "loss": 0.5206, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.027135848999023, + "rewards/margins": 1.6689200401306152, + "rewards/rejected": -13.696054458618164, + "step": 1517 + }, + { + "epoch": 1.0476108331895808, + "grad_norm": 0.3423638641834259, + "learning_rate": 2.909160597930242e-06, + "logits/chosen": 3.5882859230041504, + "logits/rejected": 3.6828196048736572, + "logps/chosen": -142.24066162109375, + "logps/rejected": -162.8803253173828, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.30232048034668, + "rewards/margins": 2.0757617950439453, + "rewards/rejected": -11.378082275390625, + "step": 1518 + }, + { + "epoch": 1.0483008452647922, + "grad_norm": 0.2912023067474365, + "learning_rate": 2.911077041011882e-06, + "logits/chosen": 3.8394393920898438, + "logits/rejected": 4.066675186157227, + "logps/chosen": -165.86077880859375, + "logps/rejected": -180.31838989257812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.74738883972168, + "rewards/margins": 1.4882690906524658, + "rewards/rejected": -13.235658645629883, + "step": 1519 + }, + { + "epoch": 1.0489908573400035, + "grad_norm": 0.3056580126285553, + "learning_rate": 2.9129934840935226e-06, + "logits/chosen": 3.8925232887268066, + "logits/rejected": 3.9382224082946777, + "logps/chosen": -151.50595092773438, + "logps/rejected": -158.58956909179688, + "loss": 0.6071, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.41466236114502, + "rewards/margins": 0.6706889867782593, + "rewards/rejected": -11.085351943969727, + "step": 1520 + }, + { + "epoch": 1.0496808694152149, + "grad_norm": 0.26689788699150085, + "learning_rate": 2.914909927175163e-06, + "logits/chosen": 4.1265764236450195, + "logits/rejected": 4.134848117828369, + "logps/chosen": -177.8087615966797, + "logps/rejected": -188.26004028320312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.937246322631836, + "rewards/margins": 1.11076819896698, + "rewards/rejected": -14.048013687133789, + "step": 1521 + }, + { + "epoch": 1.050370881490426, + "grad_norm": 9.245174407958984, + "learning_rate": 2.9168263702568034e-06, + "logits/chosen": 4.127288818359375, + "logits/rejected": 4.164822578430176, + "logps/chosen": -163.3954315185547, + "logps/rejected": -177.1589813232422, + "loss": 0.5554, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.50864315032959, + "rewards/margins": 1.3263949155807495, + "rewards/rejected": -12.835039138793945, + "step": 1522 + }, + { + "epoch": 1.0510608935656374, + "grad_norm": 0.28416016697883606, + "learning_rate": 2.918742813338444e-06, + "logits/chosen": 4.16273832321167, + "logits/rejected": 4.16273832321167, + "logps/chosen": -195.1432647705078, + "logps/rejected": -195.1432647705078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.84080696105957, + "rewards/margins": 0.0, + "rewards/rejected": -14.84080696105957, + "step": 1523 + }, + { + "epoch": 1.0517509056408487, + "grad_norm": 1.0817245244979858, + "learning_rate": 2.9206592564200846e-06, + "logits/chosen": 3.6315088272094727, + "logits/rejected": 3.7094709873199463, + "logps/chosen": -142.70620727539062, + "logps/rejected": -172.50286865234375, + "loss": 0.2676, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.519641876220703, + "rewards/margins": 3.087184429168701, + "rewards/rejected": -12.606826782226562, + "step": 1524 + }, + { + "epoch": 1.05244091771606, + "grad_norm": 0.32327622175216675, + "learning_rate": 2.922575699501725e-06, + "logits/chosen": 3.8519997596740723, + "logits/rejected": 3.9895853996276855, + "logps/chosen": -175.50167846679688, + "logps/rejected": -186.66455078125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.815197944641113, + "rewards/margins": 1.0845837593078613, + "rewards/rejected": -13.899781227111816, + "step": 1525 + }, + { + "epoch": 1.0531309297912714, + "grad_norm": 0.29674309492111206, + "learning_rate": 2.9244921425833653e-06, + "logits/chosen": 3.5505056381225586, + "logits/rejected": 3.630006790161133, + "logps/chosen": -140.51422119140625, + "logps/rejected": -158.8375244140625, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.34114933013916, + "rewards/margins": 1.9016565084457397, + "rewards/rejected": -11.242805480957031, + "step": 1526 + }, + { + "epoch": 1.0538209418664826, + "grad_norm": 0.3936616778373718, + "learning_rate": 2.926408585665006e-06, + "logits/chosen": 4.0854620933532715, + "logits/rejected": 4.0854620933532715, + "logps/chosen": -173.92550659179688, + "logps/rejected": -173.92550659179688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.708160400390625, + "rewards/margins": 0.0, + "rewards/rejected": -12.708160400390625, + "step": 1527 + }, + { + "epoch": 1.054510953941694, + "grad_norm": 0.25575628876686096, + "learning_rate": 2.9283250287466466e-06, + "logits/chosen": 3.981449842453003, + "logits/rejected": 3.958247661590576, + "logps/chosen": -180.7632598876953, + "logps/rejected": -201.17681884765625, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.341683387756348, + "rewards/margins": 2.1541895866394043, + "rewards/rejected": -15.495872497558594, + "step": 1528 + }, + { + "epoch": 1.0552009660169053, + "grad_norm": 0.5079259276390076, + "learning_rate": 2.930241471828287e-06, + "logits/chosen": 4.047630310058594, + "logits/rejected": 4.024611473083496, + "logps/chosen": -174.9213409423828, + "logps/rejected": -179.50619506835938, + "loss": 0.6093, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.76845932006836, + "rewards/margins": 0.47181200981140137, + "rewards/rejected": -13.240270614624023, + "step": 1529 + }, + { + "epoch": 1.0558909780921166, + "grad_norm": 0.3946928083896637, + "learning_rate": 2.9321579149099273e-06, + "logits/chosen": 4.245185852050781, + "logits/rejected": 4.245185852050781, + "logps/chosen": -180.82449340820312, + "logps/rejected": -180.82449340820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.28433609008789, + "rewards/margins": 0.0, + "rewards/rejected": -13.28433609008789, + "step": 1530 + }, + { + "epoch": 1.056580990167328, + "grad_norm": 0.2806274890899658, + "learning_rate": 2.934074357991568e-06, + "logits/chosen": 3.9721291065216064, + "logits/rejected": 4.015979290008545, + "logps/chosen": -167.47767639160156, + "logps/rejected": -179.93695068359375, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.930212020874023, + "rewards/margins": 1.2840536832809448, + "rewards/rejected": -13.214265823364258, + "step": 1531 + }, + { + "epoch": 1.0572710022425393, + "grad_norm": 0.33172792196273804, + "learning_rate": 2.9359908010732085e-06, + "logits/chosen": 3.663555383682251, + "logits/rejected": 3.6998612880706787, + "logps/chosen": -182.69227600097656, + "logps/rejected": -189.9181671142578, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.516793251037598, + "rewards/margins": 0.7535802125930786, + "rewards/rejected": -14.270374298095703, + "step": 1532 + }, + { + "epoch": 1.0579610143177505, + "grad_norm": 25.59088134765625, + "learning_rate": 2.937907244154849e-06, + "logits/chosen": 4.017070293426514, + "logits/rejected": 4.029088020324707, + "logps/chosen": -189.18557739257812, + "logps/rejected": -186.8567352294922, + "loss": 0.892, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.174724578857422, + "rewards/margins": -0.2721177339553833, + "rewards/rejected": -13.902606964111328, + "step": 1533 + }, + { + "epoch": 1.0586510263929618, + "grad_norm": 0.2785431742668152, + "learning_rate": 2.9398236872364893e-06, + "logits/chosen": 3.9643971920013428, + "logits/rejected": 4.050213813781738, + "logps/chosen": -173.29864501953125, + "logps/rejected": -196.22409057617188, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.702221870422363, + "rewards/margins": 2.2772469520568848, + "rewards/rejected": -14.97946834564209, + "step": 1534 + }, + { + "epoch": 1.0593410384681732, + "grad_norm": 0.24514314532279968, + "learning_rate": 2.9417401303181297e-06, + "logits/chosen": 3.7230405807495117, + "logits/rejected": 3.895897150039673, + "logps/chosen": -145.04147338867188, + "logps/rejected": -184.21359252929688, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.990467071533203, + "rewards/margins": 3.820514440536499, + "rewards/rejected": -13.810981750488281, + "step": 1535 + }, + { + "epoch": 1.0600310505433845, + "grad_norm": 0.3220904469490051, + "learning_rate": 2.94365657339977e-06, + "logits/chosen": 3.9169366359710693, + "logits/rejected": 3.9169366359710693, + "logps/chosen": -154.98867797851562, + "logps/rejected": -154.98867797851562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.578909873962402, + "rewards/margins": 0.0, + "rewards/rejected": -10.578909873962402, + "step": 1536 + }, + { + "epoch": 1.060721062618596, + "grad_norm": 0.32836493849754333, + "learning_rate": 2.9455730164814105e-06, + "logits/chosen": 4.251908302307129, + "logits/rejected": 4.251908302307129, + "logps/chosen": -196.81857299804688, + "logps/rejected": -196.81854248046875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -15.085273742675781, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -15.085273742675781, + "step": 1537 + }, + { + "epoch": 1.0614110746938072, + "grad_norm": 28.762182235717773, + "learning_rate": 2.947489459563051e-06, + "logits/chosen": 3.2510476112365723, + "logits/rejected": 3.7064242362976074, + "logps/chosen": -141.14175415039062, + "logps/rejected": -178.59573364257812, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.381752014160156, + "rewards/margins": 3.559999465942383, + "rewards/rejected": -12.941752433776855, + "step": 1538 + }, + { + "epoch": 1.0621010867690184, + "grad_norm": 0.2799762785434723, + "learning_rate": 2.949405902644692e-06, + "logits/chosen": 4.118659496307373, + "logits/rejected": 4.167483329772949, + "logps/chosen": -178.01962280273438, + "logps/rejected": -193.77471923828125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.000635147094727, + "rewards/margins": 1.5238256454467773, + "rewards/rejected": -14.524459838867188, + "step": 1539 + }, + { + "epoch": 1.0627910988442297, + "grad_norm": 16.339941024780273, + "learning_rate": 2.9513223457263325e-06, + "logits/chosen": 4.033044338226318, + "logits/rejected": 3.8939199447631836, + "logps/chosen": -170.37936401367188, + "logps/rejected": -190.6766815185547, + "loss": 0.588, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.277029991149902, + "rewards/margins": 1.9230766296386719, + "rewards/rejected": -14.20010757446289, + "step": 1540 + }, + { + "epoch": 1.063481110919441, + "grad_norm": 0.37367865443229675, + "learning_rate": 2.953238788807973e-06, + "logits/chosen": 3.7856481075286865, + "logits/rejected": 3.8261353969573975, + "logps/chosen": -179.049072265625, + "logps/rejected": -195.89112854003906, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.11395263671875, + "rewards/margins": 1.692997932434082, + "rewards/rejected": -14.806950569152832, + "step": 1541 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 0.49115556478500366, + "learning_rate": 2.9551552318896133e-06, + "logits/chosen": 3.441265106201172, + "logits/rejected": 3.5361804962158203, + "logps/chosen": -150.5062255859375, + "logps/rejected": -157.6910400390625, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.25229263305664, + "rewards/margins": 0.712218165397644, + "rewards/rejected": -10.964509963989258, + "step": 1542 + }, + { + "epoch": 1.0648611350698638, + "grad_norm": 0.35300207138061523, + "learning_rate": 2.9570716749712537e-06, + "logits/chosen": 4.13484001159668, + "logits/rejected": 4.13484001159668, + "logps/chosen": -178.1407928466797, + "logps/rejected": -178.14077758789062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.12459659576416, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.12459659576416, + "step": 1543 + }, + { + "epoch": 1.065551147145075, + "grad_norm": 0.35096386075019836, + "learning_rate": 2.958988118052894e-06, + "logits/chosen": 3.6051671504974365, + "logits/rejected": 3.662034273147583, + "logps/chosen": -167.9906768798828, + "logps/rejected": -175.70208740234375, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.92487907409668, + "rewards/margins": 0.8536746501922607, + "rewards/rejected": -12.778554916381836, + "step": 1544 + }, + { + "epoch": 1.0662411592202863, + "grad_norm": 14.359066009521484, + "learning_rate": 2.9609045611345345e-06, + "logits/chosen": 3.5473556518554688, + "logits/rejected": 3.657799243927002, + "logps/chosen": -166.52354431152344, + "logps/rejected": -188.65139770507812, + "loss": 0.561, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.823699951171875, + "rewards/margins": 2.1889145374298096, + "rewards/rejected": -14.012615203857422, + "step": 1545 + }, + { + "epoch": 1.0669311712954976, + "grad_norm": 0.41355493664741516, + "learning_rate": 2.962821004216175e-06, + "logits/chosen": 3.7801809310913086, + "logits/rejected": 4.012443542480469, + "logps/chosen": -152.14585876464844, + "logps/rejected": -169.33761596679688, + "loss": 0.5213, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.498164176940918, + "rewards/margins": 1.708701729774475, + "rewards/rejected": -12.206865310668945, + "step": 1546 + }, + { + "epoch": 1.067621183370709, + "grad_norm": 0.308462917804718, + "learning_rate": 2.9647374472978157e-06, + "logits/chosen": 3.87105655670166, + "logits/rejected": 3.87105655670166, + "logps/chosen": -184.85284423828125, + "logps/rejected": -184.85284423828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.642837524414062, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.642837524414062, + "step": 1547 + }, + { + "epoch": 1.0683111954459203, + "grad_norm": 15.95934772491455, + "learning_rate": 2.966653890379456e-06, + "logits/chosen": 3.5124926567077637, + "logits/rejected": 3.7335174083709717, + "logps/chosen": -154.18418884277344, + "logps/rejected": -165.94862365722656, + "loss": 0.5178, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.655291557312012, + "rewards/margins": 1.164367914199829, + "rewards/rejected": -11.819659233093262, + "step": 1548 + }, + { + "epoch": 1.0690012075211317, + "grad_norm": 1.0715172290802002, + "learning_rate": 2.9685703334610965e-06, + "logits/chosen": 3.7021260261535645, + "logits/rejected": 3.668097496032715, + "logps/chosen": -163.03831481933594, + "logps/rejected": -166.24655151367188, + "loss": 0.6124, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.560015678405762, + "rewards/margins": 0.3780784010887146, + "rewards/rejected": -11.938094139099121, + "step": 1549 + }, + { + "epoch": 1.0696912195963428, + "grad_norm": 0.3167480230331421, + "learning_rate": 2.970486776542737e-06, + "logits/chosen": 3.544387102127075, + "logits/rejected": 3.7625813484191895, + "logps/chosen": -132.67111206054688, + "logps/rejected": -164.8491973876953, + "loss": 0.4335, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.655045509338379, + "rewards/margins": 3.2777369022369385, + "rewards/rejected": -11.932783126831055, + "step": 1550 + }, + { + "epoch": 1.0703812316715542, + "grad_norm": 0.4061012268066406, + "learning_rate": 2.9724032196243773e-06, + "logits/chosen": 3.756115436553955, + "logits/rejected": 3.756115436553955, + "logps/chosen": -163.26905822753906, + "logps/rejected": -163.26905822753906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.489654541015625, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.489654541015625, + "step": 1551 + }, + { + "epoch": 1.0710712437467655, + "grad_norm": 0.3114866018295288, + "learning_rate": 2.9743196627060176e-06, + "logits/chosen": 4.443460464477539, + "logits/rejected": 4.443460464477539, + "logps/chosen": -187.40567016601562, + "logps/rejected": -187.4056396484375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.878979682922363, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.878979682922363, + "step": 1552 + }, + { + "epoch": 1.071761255821977, + "grad_norm": 1.6158586740493774, + "learning_rate": 2.976236105787658e-06, + "logits/chosen": 3.7249557971954346, + "logits/rejected": 3.7229018211364746, + "logps/chosen": -180.4600830078125, + "logps/rejected": -183.58050537109375, + "loss": 0.6149, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.390155792236328, + "rewards/margins": 0.33259063959121704, + "rewards/rejected": -13.722745895385742, + "step": 1553 + }, + { + "epoch": 1.0724512678971883, + "grad_norm": 1.793979287147522, + "learning_rate": 2.9781525488692984e-06, + "logits/chosen": 3.8691201210021973, + "logits/rejected": 3.9036829471588135, + "logps/chosen": -151.11074829101562, + "logps/rejected": -161.70880126953125, + "loss": 0.5314, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.351128578186035, + "rewards/margins": 1.1215845346450806, + "rewards/rejected": -11.472713470458984, + "step": 1554 + }, + { + "epoch": 1.0731412799723996, + "grad_norm": 2.756192684173584, + "learning_rate": 2.9800689919509397e-06, + "logits/chosen": 3.4784092903137207, + "logits/rejected": 3.829941511154175, + "logps/chosen": -146.38296508789062, + "logps/rejected": -164.05575561523438, + "loss": 0.4582, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.037307739257812, + "rewards/margins": 1.7081162929534912, + "rewards/rejected": -11.7454252243042, + "step": 1555 + }, + { + "epoch": 1.0738312920476107, + "grad_norm": 2.2700650691986084, + "learning_rate": 2.98198543503258e-06, + "logits/chosen": 3.7735114097595215, + "logits/rejected": 3.9220447540283203, + "logps/chosen": -167.59112548828125, + "logps/rejected": -187.12135314941406, + "loss": 0.5307, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.961496353149414, + "rewards/margins": 1.9970966577529907, + "rewards/rejected": -13.95859146118164, + "step": 1556 + }, + { + "epoch": 1.074521304122822, + "grad_norm": 0.2933208644390106, + "learning_rate": 2.9839018781142204e-06, + "logits/chosen": 3.9130237102508545, + "logits/rejected": 3.896467924118042, + "logps/chosen": -181.0247344970703, + "logps/rejected": -192.5537109375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.442962646484375, + "rewards/margins": 1.0957568883895874, + "rewards/rejected": -14.538719177246094, + "step": 1557 + }, + { + "epoch": 1.0752113161980335, + "grad_norm": 0.3163449764251709, + "learning_rate": 2.985818321195861e-06, + "logits/chosen": 3.557405471801758, + "logits/rejected": 3.5420496463775635, + "logps/chosen": -156.4971466064453, + "logps/rejected": -168.39955139160156, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.95030689239502, + "rewards/margins": 1.2337126731872559, + "rewards/rejected": -12.184020042419434, + "step": 1558 + }, + { + "epoch": 1.0759013282732448, + "grad_norm": 0.3238767683506012, + "learning_rate": 2.9877347642775012e-06, + "logits/chosen": 3.617431163787842, + "logits/rejected": 3.617431163787842, + "logps/chosen": -172.30548095703125, + "logps/rejected": -172.30548095703125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.379289627075195, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.379289627075195, + "step": 1559 + }, + { + "epoch": 1.0765913403484562, + "grad_norm": 0.5124818086624146, + "learning_rate": 2.9896512073591416e-06, + "logits/chosen": 3.7228400707244873, + "logits/rejected": 3.6881117820739746, + "logps/chosen": -167.7424774169922, + "logps/rejected": -173.04278564453125, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.952096939086914, + "rewards/margins": 0.5442211031913757, + "rewards/rejected": -12.496317863464355, + "step": 1560 + }, + { + "epoch": 1.0772813524236675, + "grad_norm": 0.3595713675022125, + "learning_rate": 2.991567650440782e-06, + "logits/chosen": 3.959102153778076, + "logits/rejected": 4.008647441864014, + "logps/chosen": -164.5550537109375, + "logps/rejected": -178.0158233642578, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.562742233276367, + "rewards/margins": 1.3787859678268433, + "rewards/rejected": -12.9415283203125, + "step": 1561 + }, + { + "epoch": 1.0779713644988786, + "grad_norm": 0.2791968286037445, + "learning_rate": 2.9934840935224224e-06, + "logits/chosen": 3.5337722301483154, + "logits/rejected": 3.8192522525787354, + "logps/chosen": -172.53805541992188, + "logps/rejected": -206.6414794921875, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.426299095153809, + "rewards/margins": 3.471895217895508, + "rewards/rejected": -15.898195266723633, + "step": 1562 + }, + { + "epoch": 1.07866137657409, + "grad_norm": 39.43358612060547, + "learning_rate": 2.9954005366040632e-06, + "logits/chosen": 3.9090590476989746, + "logits/rejected": 3.9464778900146484, + "logps/chosen": -157.0230255126953, + "logps/rejected": -170.36083984375, + "loss": 1.1896, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.959205627441406, + "rewards/margins": 1.3432316780090332, + "rewards/rejected": -12.302436828613281, + "step": 1563 + }, + { + "epoch": 1.0793513886493014, + "grad_norm": 5.067192077636719, + "learning_rate": 2.9973169796857036e-06, + "logits/chosen": 3.826517343521118, + "logits/rejected": 4.076174736022949, + "logps/chosen": -173.8111114501953, + "logps/rejected": -189.14125061035156, + "loss": 0.5068, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.684300422668457, + "rewards/margins": 1.5576424598693848, + "rewards/rejected": -14.241942405700684, + "step": 1564 + }, + { + "epoch": 1.0800414007245127, + "grad_norm": 0.36506885290145874, + "learning_rate": 2.999233422767344e-06, + "logits/chosen": 3.548417806625366, + "logits/rejected": 3.663630247116089, + "logps/chosen": -161.25909423828125, + "logps/rejected": -175.44113159179688, + "loss": 0.5207, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.243476867675781, + "rewards/margins": 1.459341049194336, + "rewards/rejected": -12.702817916870117, + "step": 1565 + }, + { + "epoch": 1.080731412799724, + "grad_norm": 0.37127140164375305, + "learning_rate": 3.0011498658489844e-06, + "logits/chosen": 3.556589365005493, + "logits/rejected": 3.7080891132354736, + "logps/chosen": -137.21994018554688, + "logps/rejected": -151.17172241210938, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.153966903686523, + "rewards/margins": 1.3377645015716553, + "rewards/rejected": -10.491731643676758, + "step": 1566 + }, + { + "epoch": 1.0814214248749354, + "grad_norm": 0.3595629632472992, + "learning_rate": 3.003066308930625e-06, + "logits/chosen": 3.7082912921905518, + "logits/rejected": 3.7241287231445312, + "logps/chosen": -160.322021484375, + "logps/rejected": -171.28050231933594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.429784774780273, + "rewards/margins": 1.197925329208374, + "rewards/rejected": -12.627708435058594, + "step": 1567 + }, + { + "epoch": 1.0821114369501466, + "grad_norm": 0.28408801555633545, + "learning_rate": 3.0049827520122656e-06, + "logits/chosen": 3.659060001373291, + "logits/rejected": 3.659060001373291, + "logps/chosen": -171.79278564453125, + "logps/rejected": -171.79278564453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.41312313079834, + "rewards/margins": -4.172325134277344e-07, + "rewards/rejected": -12.413122177124023, + "step": 1568 + }, + { + "epoch": 1.082801449025358, + "grad_norm": 1.4827344417572021, + "learning_rate": 3.006899195093906e-06, + "logits/chosen": 3.6556501388549805, + "logits/rejected": 3.83968448638916, + "logps/chosen": -150.45556640625, + "logps/rejected": -170.24380493164062, + "loss": 0.438, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.279937744140625, + "rewards/margins": 2.041884183883667, + "rewards/rejected": -12.321822166442871, + "step": 1569 + }, + { + "epoch": 1.0834914611005693, + "grad_norm": 0.39107999205589294, + "learning_rate": 3.0088156381755464e-06, + "logits/chosen": 4.045101642608643, + "logits/rejected": 4.045101642608643, + "logps/chosen": -181.7005157470703, + "logps/rejected": -181.7005157470703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.264238357543945, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.264238357543945, + "step": 1570 + }, + { + "epoch": 1.0841814731757806, + "grad_norm": 1.0513001680374146, + "learning_rate": 3.0107320812571868e-06, + "logits/chosen": 3.719451427459717, + "logits/rejected": 3.72501277923584, + "logps/chosen": -173.02987670898438, + "logps/rejected": -176.34580993652344, + "loss": 0.6139, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.523300170898438, + "rewards/margins": 0.34953176975250244, + "rewards/rejected": -12.872831344604492, + "step": 1571 + }, + { + "epoch": 1.084871485250992, + "grad_norm": 0.2554548382759094, + "learning_rate": 3.0126485243388276e-06, + "logits/chosen": 3.648470401763916, + "logits/rejected": 3.927076816558838, + "logps/chosen": -152.9944305419922, + "logps/rejected": -188.71661376953125, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.490579605102539, + "rewards/margins": 3.5680203437805176, + "rewards/rejected": -14.058599472045898, + "step": 1572 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 0.34348270297050476, + "learning_rate": 3.014564967420468e-06, + "logits/chosen": 3.989030361175537, + "logits/rejected": 3.989030361175537, + "logps/chosen": -170.5446319580078, + "logps/rejected": -170.5446319580078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.0986328125, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.098631858825684, + "step": 1573 + }, + { + "epoch": 1.0862515094014145, + "grad_norm": 0.40429428219795227, + "learning_rate": 3.0164814105021084e-06, + "logits/chosen": 4.027584552764893, + "logits/rejected": 4.053144931793213, + "logps/chosen": -163.1742706298828, + "logps/rejected": -171.58248901367188, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.66853141784668, + "rewards/margins": 0.9295037984848022, + "rewards/rejected": -12.598033905029297, + "step": 1574 + }, + { + "epoch": 1.0869415214766258, + "grad_norm": 2.038672685623169, + "learning_rate": 3.0183978535837488e-06, + "logits/chosen": 3.750908613204956, + "logits/rejected": 4.046693801879883, + "logps/chosen": -179.70858764648438, + "logps/rejected": -187.76722717285156, + "loss": 0.5385, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.071561813354492, + "rewards/margins": 0.7767698168754578, + "rewards/rejected": -13.848331451416016, + "step": 1575 + }, + { + "epoch": 1.0876315335518372, + "grad_norm": 0.31819644570350647, + "learning_rate": 3.020314296665389e-06, + "logits/chosen": 4.041497230529785, + "logits/rejected": 4.076034069061279, + "logps/chosen": -190.45855712890625, + "logps/rejected": -196.95513916015625, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.191756248474121, + "rewards/margins": 0.6398898959159851, + "rewards/rejected": -14.831645965576172, + "step": 1576 + }, + { + "epoch": 1.0883215456270485, + "grad_norm": 0.26070114970207214, + "learning_rate": 3.0222307397470295e-06, + "logits/chosen": 3.6876611709594727, + "logits/rejected": 3.9628517627716064, + "logps/chosen": -156.73391723632812, + "logps/rejected": -180.96243286132812, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.945465087890625, + "rewards/margins": 2.4979069232940674, + "rewards/rejected": -13.443370819091797, + "step": 1577 + }, + { + "epoch": 1.0890115577022599, + "grad_norm": 0.29351192712783813, + "learning_rate": 3.02414718282867e-06, + "logits/chosen": 3.91304874420166, + "logits/rejected": 3.9835100173950195, + "logps/chosen": -171.00819396972656, + "logps/rejected": -181.65025329589844, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.369668006896973, + "rewards/margins": 1.0958600044250488, + "rewards/rejected": -13.465527534484863, + "step": 1578 + }, + { + "epoch": 1.089701569777471, + "grad_norm": 0.4208095371723175, + "learning_rate": 3.0260636259103103e-06, + "logits/chosen": 3.5303537845611572, + "logits/rejected": 3.7304515838623047, + "logps/chosen": -167.0387725830078, + "logps/rejected": -188.26947021484375, + "loss": 0.4361, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.93229866027832, + "rewards/margins": 2.0664680004119873, + "rewards/rejected": -13.998766899108887, + "step": 1579 + }, + { + "epoch": 1.0903915818526824, + "grad_norm": 0.3763618767261505, + "learning_rate": 3.0279800689919516e-06, + "logits/chosen": 4.052966594696045, + "logits/rejected": 4.052966594696045, + "logps/chosen": -179.4216766357422, + "logps/rejected": -179.4216766357422, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.056458473205566, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.056458473205566, + "step": 1580 + }, + { + "epoch": 1.0910815939278937, + "grad_norm": 0.32986336946487427, + "learning_rate": 3.029896512073592e-06, + "logits/chosen": 3.965850830078125, + "logits/rejected": 3.965850830078125, + "logps/chosen": -188.57315063476562, + "logps/rejected": -188.57315063476562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.099684715270996, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.099684715270996, + "step": 1581 + }, + { + "epoch": 1.091771606003105, + "grad_norm": 0.27676230669021606, + "learning_rate": 3.0318129551552324e-06, + "logits/chosen": 3.542023181915283, + "logits/rejected": 3.650517463684082, + "logps/chosen": -151.69894409179688, + "logps/rejected": -174.46229553222656, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.324698448181152, + "rewards/margins": 2.3525078296661377, + "rewards/rejected": -12.677206993103027, + "step": 1582 + }, + { + "epoch": 1.0924616180783164, + "grad_norm": 0.281720370054245, + "learning_rate": 3.0337293982368727e-06, + "logits/chosen": 4.263405799865723, + "logits/rejected": 4.263405799865723, + "logps/chosen": -190.5476837158203, + "logps/rejected": -190.5476837158203, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.281707763671875, + "rewards/margins": 0.0, + "rewards/rejected": -14.281707763671875, + "step": 1583 + }, + { + "epoch": 1.0931516301535278, + "grad_norm": 0.9595568776130676, + "learning_rate": 3.035645841318513e-06, + "logits/chosen": 4.475303649902344, + "logits/rejected": 4.621659278869629, + "logps/chosen": -186.22967529296875, + "logps/rejected": -189.2398223876953, + "loss": 0.6218, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.773995399475098, + "rewards/margins": 0.2550421357154846, + "rewards/rejected": -14.029037475585938, + "step": 1584 + }, + { + "epoch": 1.093841642228739, + "grad_norm": 0.3114936947822571, + "learning_rate": 3.0375622844001535e-06, + "logits/chosen": 4.279565811157227, + "logits/rejected": 4.482354164123535, + "logps/chosen": -171.033203125, + "logps/rejected": -188.97921752929688, + "loss": 0.5206, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.281403541564941, + "rewards/margins": 1.7416143417358398, + "rewards/rejected": -14.023017883300781, + "step": 1585 + }, + { + "epoch": 1.0945316543039503, + "grad_norm": 0.3725404441356659, + "learning_rate": 3.039478727481794e-06, + "logits/chosen": 3.950076103210449, + "logits/rejected": 4.10648250579834, + "logps/chosen": -182.75592041015625, + "logps/rejected": -191.91726684570312, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.515984535217285, + "rewards/margins": 0.9296302199363708, + "rewards/rejected": -14.4456148147583, + "step": 1586 + }, + { + "epoch": 1.0952216663791616, + "grad_norm": 0.335989385843277, + "learning_rate": 3.0413951705634343e-06, + "logits/chosen": 4.182088851928711, + "logits/rejected": 4.261163711547852, + "logps/chosen": -172.8805389404297, + "logps/rejected": -189.5545654296875, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.614657402038574, + "rewards/margins": 1.6482473611831665, + "rewards/rejected": -14.26290512084961, + "step": 1587 + }, + { + "epoch": 1.095911678454373, + "grad_norm": 0.3477801978588104, + "learning_rate": 3.043311613645075e-06, + "logits/chosen": 3.6288087368011475, + "logits/rejected": 3.6842052936553955, + "logps/chosen": -167.64801025390625, + "logps/rejected": -185.00709533691406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.876773834228516, + "rewards/margins": 1.671514868736267, + "rewards/rejected": -13.548288345336914, + "step": 1588 + }, + { + "epoch": 1.0966016905295843, + "grad_norm": 1.5194305181503296, + "learning_rate": 3.0452280567267155e-06, + "logits/chosen": 4.197990417480469, + "logits/rejected": 4.255621910095215, + "logps/chosen": -180.23422241210938, + "logps/rejected": -183.0714111328125, + "loss": 0.6143, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.21053695678711, + "rewards/margins": 0.3437255620956421, + "rewards/rejected": -13.554262161254883, + "step": 1589 + }, + { + "epoch": 1.0972917026047955, + "grad_norm": 10.935893058776855, + "learning_rate": 3.047144499808356e-06, + "logits/chosen": 4.037179946899414, + "logits/rejected": 4.04380464553833, + "logps/chosen": -180.44998168945312, + "logps/rejected": -176.30795288085938, + "loss": 1.444, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.294403076171875, + "rewards/margins": -0.4300868511199951, + "rewards/rejected": -12.864315032958984, + "step": 1590 + }, + { + "epoch": 1.0979817146800068, + "grad_norm": 0.39952757954597473, + "learning_rate": 3.0490609428899963e-06, + "logits/chosen": 3.9979405403137207, + "logits/rejected": 3.9979405403137207, + "logps/chosen": -187.16366577148438, + "logps/rejected": -187.16366577148438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.06141471862793, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.06141471862793, + "step": 1591 + }, + { + "epoch": 1.0986717267552182, + "grad_norm": 19.712120056152344, + "learning_rate": 3.0509773859716367e-06, + "logits/chosen": 4.192006587982178, + "logits/rejected": 4.142881870269775, + "logps/chosen": -185.401611328125, + "logps/rejected": -188.9765625, + "loss": 0.8341, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.639481544494629, + "rewards/margins": 0.4317988157272339, + "rewards/rejected": -14.071279525756836, + "step": 1592 + }, + { + "epoch": 1.0993617388304295, + "grad_norm": 0.33169025182724, + "learning_rate": 3.052893829053277e-06, + "logits/chosen": 4.082162380218506, + "logits/rejected": 4.082162380218506, + "logps/chosen": -180.30337524414062, + "logps/rejected": -180.30337524414062, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.364975929260254, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -13.364975929260254, + "step": 1593 + }, + { + "epoch": 1.100051750905641, + "grad_norm": 0.3932049572467804, + "learning_rate": 3.0548102721349175e-06, + "logits/chosen": 4.174001693725586, + "logits/rejected": 4.174001693725586, + "logps/chosen": -188.652587890625, + "logps/rejected": -188.652587890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.124967575073242, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.124967575073242, + "step": 1594 + }, + { + "epoch": 1.1007417629808522, + "grad_norm": 0.282196968793869, + "learning_rate": 3.056726715216558e-06, + "logits/chosen": 3.804581642150879, + "logits/rejected": 3.8465046882629395, + "logps/chosen": -180.14703369140625, + "logps/rejected": -190.09811401367188, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.11735725402832, + "rewards/margins": 1.0088533163070679, + "rewards/rejected": -14.126211166381836, + "step": 1595 + }, + { + "epoch": 1.1014317750560634, + "grad_norm": 0.34509822726249695, + "learning_rate": 3.058643158298199e-06, + "logits/chosen": 3.9383931159973145, + "logits/rejected": 3.9383931159973145, + "logps/chosen": -184.7066192626953, + "logps/rejected": -184.7066192626953, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.965126037597656, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.965126037597656, + "step": 1596 + }, + { + "epoch": 1.1021217871312747, + "grad_norm": 0.32719260454177856, + "learning_rate": 3.0605596013798395e-06, + "logits/chosen": 4.361108779907227, + "logits/rejected": 4.361108779907227, + "logps/chosen": -181.2658233642578, + "logps/rejected": -181.2658233642578, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.595694541931152, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -13.595694541931152, + "step": 1597 + }, + { + "epoch": 1.102811799206486, + "grad_norm": 0.3257448375225067, + "learning_rate": 3.06247604446148e-06, + "logits/chosen": 4.166661739349365, + "logits/rejected": 4.166661739349365, + "logps/chosen": -191.88536071777344, + "logps/rejected": -191.88536071777344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.311629295349121, + "rewards/margins": 0.0, + "rewards/rejected": -14.311629295349121, + "step": 1598 + }, + { + "epoch": 1.1035018112816974, + "grad_norm": 0.337519109249115, + "learning_rate": 3.0643924875431203e-06, + "logits/chosen": 3.7902235984802246, + "logits/rejected": 3.983461380004883, + "logps/chosen": -172.16253662109375, + "logps/rejected": -193.56277465820312, + "loss": 0.521, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.476089477539062, + "rewards/margins": 2.11425518989563, + "rewards/rejected": -14.59034538269043, + "step": 1599 + }, + { + "epoch": 1.1041918233569088, + "grad_norm": 0.35294073820114136, + "learning_rate": 3.0663089306247607e-06, + "logits/chosen": 3.927304267883301, + "logits/rejected": 4.173756122589111, + "logps/chosen": -177.02223205566406, + "logps/rejected": -199.1164093017578, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.751579284667969, + "rewards/margins": 2.2413363456726074, + "rewards/rejected": -14.992916107177734, + "step": 1600 + }, + { + "epoch": 1.1048818354321202, + "grad_norm": 0.3697686195373535, + "learning_rate": 3.068225373706401e-06, + "logits/chosen": 4.2419562339782715, + "logits/rejected": 4.2419562339782715, + "logps/chosen": -187.6674346923828, + "logps/rejected": -187.66741943359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.82467269897461, + "rewards/margins": 0.0, + "rewards/rejected": -13.824671745300293, + "step": 1601 + }, + { + "epoch": 1.1055718475073313, + "grad_norm": 0.3212621510028839, + "learning_rate": 3.0701418167880415e-06, + "logits/chosen": 4.032137393951416, + "logits/rejected": 4.032137393951416, + "logps/chosen": -182.92552185058594, + "logps/rejected": -182.92552185058594, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.397411346435547, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.397411346435547, + "step": 1602 + }, + { + "epoch": 1.1062618595825426, + "grad_norm": 0.2639116644859314, + "learning_rate": 3.072058259869682e-06, + "logits/chosen": 3.957303524017334, + "logits/rejected": 4.208233833312988, + "logps/chosen": -173.10009765625, + "logps/rejected": -202.38812255859375, + "loss": 0.4339, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.384742736816406, + "rewards/margins": 2.9789748191833496, + "rewards/rejected": -15.363718032836914, + "step": 1603 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 0.3242131471633911, + "learning_rate": 3.0739747029513227e-06, + "logits/chosen": 4.008781909942627, + "logits/rejected": 4.231396675109863, + "logps/chosen": -163.26400756835938, + "logps/rejected": -199.70022583007812, + "loss": 0.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.641347885131836, + "rewards/margins": 3.5411994457244873, + "rewards/rejected": -15.18254566192627, + "step": 1604 + }, + { + "epoch": 1.1076418837329653, + "grad_norm": 23.678081512451172, + "learning_rate": 3.075891146032963e-06, + "logits/chosen": 4.131167411804199, + "logits/rejected": 4.200063705444336, + "logps/chosen": -168.80410766601562, + "logps/rejected": -185.31207275390625, + "loss": 0.5881, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.130765914916992, + "rewards/margins": 1.694730520248413, + "rewards/rejected": -13.8254976272583, + "step": 1605 + }, + { + "epoch": 1.1083318958081767, + "grad_norm": 0.31979507207870483, + "learning_rate": 3.0778075891146034e-06, + "logits/chosen": 4.329150676727295, + "logits/rejected": 4.329150676727295, + "logps/chosen": -177.40196228027344, + "logps/rejected": -177.40196228027344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.928976058959961, + "rewards/margins": 0.0, + "rewards/rejected": -12.928976058959961, + "step": 1606 + }, + { + "epoch": 1.109021907883388, + "grad_norm": 0.6531662940979004, + "learning_rate": 3.0797240321962443e-06, + "logits/chosen": 4.253058433532715, + "logits/rejected": 4.162666320800781, + "logps/chosen": -166.70298767089844, + "logps/rejected": -178.0294189453125, + "loss": 0.5254, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.859112739562988, + "rewards/margins": 1.0646626949310303, + "rewards/rejected": -12.923774719238281, + "step": 1607 + }, + { + "epoch": 1.1097119199585992, + "grad_norm": 0.30071306228637695, + "learning_rate": 3.0816404752778847e-06, + "logits/chosen": 4.138416290283203, + "logits/rejected": 4.189868450164795, + "logps/chosen": -169.1463623046875, + "logps/rejected": -176.8323516845703, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.042386054992676, + "rewards/margins": 0.8280116319656372, + "rewards/rejected": -12.870397567749023, + "step": 1608 + }, + { + "epoch": 1.1104019320338105, + "grad_norm": 14.330015182495117, + "learning_rate": 3.083556918359525e-06, + "logits/chosen": 4.123563766479492, + "logits/rejected": 4.2666015625, + "logps/chosen": -168.55084228515625, + "logps/rejected": -173.66464233398438, + "loss": 0.9698, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.874517440795898, + "rewards/margins": 0.6161371469497681, + "rewards/rejected": -12.490653991699219, + "step": 1609 + }, + { + "epoch": 1.111091944109022, + "grad_norm": 0.824303150177002, + "learning_rate": 3.0854733614411654e-06, + "logits/chosen": 4.160418510437012, + "logits/rejected": 4.208432197570801, + "logps/chosen": -190.05654907226562, + "logps/rejected": -194.21218872070312, + "loss": 0.6127, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.456300735473633, + "rewards/margins": 0.3715698719024658, + "rewards/rejected": -14.82787036895752, + "step": 1610 + }, + { + "epoch": 1.1117819561842333, + "grad_norm": 0.3646199405193329, + "learning_rate": 3.087389804522806e-06, + "logits/chosen": 3.9898922443389893, + "logits/rejected": 3.9898922443389893, + "logps/chosen": -172.99124145507812, + "logps/rejected": -172.99124145507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.200420379638672, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.200420379638672, + "step": 1611 + }, + { + "epoch": 1.1124719682594446, + "grad_norm": 0.34353840351104736, + "learning_rate": 3.0893062476044466e-06, + "logits/chosen": 3.8868601322174072, + "logits/rejected": 3.8868601322174072, + "logps/chosen": -189.94168090820312, + "logps/rejected": -189.94168090820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.341323852539062, + "rewards/margins": 0.0, + "rewards/rejected": -14.341323852539062, + "step": 1612 + }, + { + "epoch": 1.113161980334656, + "grad_norm": 0.27399909496307373, + "learning_rate": 3.091222690686087e-06, + "logits/chosen": 3.7489750385284424, + "logits/rejected": 3.968510389328003, + "logps/chosen": -159.13711547851562, + "logps/rejected": -177.92312622070312, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.147634506225586, + "rewards/margins": 1.7923040390014648, + "rewards/rejected": -12.939937591552734, + "step": 1613 + }, + { + "epoch": 1.113851992409867, + "grad_norm": 0.2962978184223175, + "learning_rate": 3.0931391337677274e-06, + "logits/chosen": 3.740978717803955, + "logits/rejected": 3.8571293354034424, + "logps/chosen": -174.0345001220703, + "logps/rejected": -193.03773498535156, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.525321960449219, + "rewards/margins": 2.02880597114563, + "rewards/rejected": -14.55412769317627, + "step": 1614 + }, + { + "epoch": 1.1145420044850785, + "grad_norm": 4.681168556213379, + "learning_rate": 3.095055576849368e-06, + "logits/chosen": 4.026059150695801, + "logits/rejected": 3.997084617614746, + "logps/chosen": -179.10360717773438, + "logps/rejected": -188.8280792236328, + "loss": 0.553, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.97756290435791, + "rewards/margins": 1.018559455871582, + "rewards/rejected": -13.996122360229492, + "step": 1615 + }, + { + "epoch": 1.1152320165602898, + "grad_norm": 0.3322427570819855, + "learning_rate": 3.096972019931008e-06, + "logits/chosen": 3.8168129920959473, + "logits/rejected": 3.8168129920959473, + "logps/chosen": -181.78170776367188, + "logps/rejected": -181.78170776367188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.28807258605957, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -13.28807258605957, + "step": 1616 + }, + { + "epoch": 1.1159220286355012, + "grad_norm": 0.3115568161010742, + "learning_rate": 3.0988884630126486e-06, + "logits/chosen": 3.7091846466064453, + "logits/rejected": 3.7091846466064453, + "logps/chosen": -185.44927978515625, + "logps/rejected": -185.44927978515625, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.614234924316406, + "rewards/margins": 6.556510925292969e-07, + "rewards/rejected": -13.614234924316406, + "step": 1617 + }, + { + "epoch": 1.1166120407107125, + "grad_norm": 0.3000492751598358, + "learning_rate": 3.100804906094289e-06, + "logits/chosen": 3.996086597442627, + "logits/rejected": 3.996086597442627, + "logps/chosen": -191.715087890625, + "logps/rejected": -191.715087890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.517130851745605, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.517129898071289, + "step": 1618 + }, + { + "epoch": 1.1173020527859236, + "grad_norm": 0.3567725718021393, + "learning_rate": 3.1027213491759294e-06, + "logits/chosen": 3.727780818939209, + "logits/rejected": 3.9007794857025146, + "logps/chosen": -172.02447509765625, + "logps/rejected": -181.94677734375, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.283150672912598, + "rewards/margins": 1.0082679986953735, + "rewards/rejected": -13.291418075561523, + "step": 1619 + }, + { + "epoch": 1.117992064861135, + "grad_norm": 0.2980089485645294, + "learning_rate": 3.1046377922575706e-06, + "logits/chosen": 3.4315760135650635, + "logits/rejected": 3.4315760135650635, + "logps/chosen": -171.07106018066406, + "logps/rejected": -171.07106018066406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.360374450683594, + "rewards/margins": 0.0, + "rewards/rejected": -12.360373497009277, + "step": 1620 + }, + { + "epoch": 1.1186820769363464, + "grad_norm": 0.34803682565689087, + "learning_rate": 3.106554235339211e-06, + "logits/chosen": 3.819153070449829, + "logits/rejected": 3.819153070449829, + "logps/chosen": -175.41043090820312, + "logps/rejected": -175.41043090820312, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.669292449951172, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -12.669292449951172, + "step": 1621 + }, + { + "epoch": 1.1193720890115577, + "grad_norm": 0.360281378030777, + "learning_rate": 3.1084706784208514e-06, + "logits/chosen": 3.518561601638794, + "logits/rejected": 3.7110633850097656, + "logps/chosen": -162.61679077148438, + "logps/rejected": -173.94183349609375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.537881851196289, + "rewards/margins": 1.1769087314605713, + "rewards/rejected": -12.714791297912598, + "step": 1622 + }, + { + "epoch": 1.120062101086769, + "grad_norm": 0.3886793255805969, + "learning_rate": 3.110387121502492e-06, + "logits/chosen": 3.5153493881225586, + "logits/rejected": 3.5215086936950684, + "logps/chosen": -157.01046752929688, + "logps/rejected": -162.7594451904297, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.787044525146484, + "rewards/margins": 0.6001617908477783, + "rewards/rejected": -11.38720703125, + "step": 1623 + }, + { + "epoch": 1.1207521131619804, + "grad_norm": 0.33110466599464417, + "learning_rate": 3.112303564584132e-06, + "logits/chosen": 3.8560256958007812, + "logits/rejected": 3.8560256958007812, + "logps/chosen": -171.2685546875, + "logps/rejected": -171.26856994628906, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.348480224609375, + "rewards/margins": 5.364418029785156e-07, + "rewards/rejected": -12.348481178283691, + "step": 1624 + }, + { + "epoch": 1.1214421252371916, + "grad_norm": 27.806970596313477, + "learning_rate": 3.1142200076657726e-06, + "logits/chosen": 3.7353193759918213, + "logits/rejected": 3.705418348312378, + "logps/chosen": -177.73080444335938, + "logps/rejected": -175.38983154296875, + "loss": 0.8788, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.14805793762207, + "rewards/margins": -0.2572214603424072, + "rewards/rejected": -12.890836715698242, + "step": 1625 + }, + { + "epoch": 1.122132137312403, + "grad_norm": 0.2908390760421753, + "learning_rate": 3.116136450747413e-06, + "logits/chosen": 3.7101759910583496, + "logits/rejected": 3.7249796390533447, + "logps/chosen": -166.050537109375, + "logps/rejected": -181.6045379638672, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.785333633422852, + "rewards/margins": 1.564788579940796, + "rewards/rejected": -13.350122451782227, + "step": 1626 + }, + { + "epoch": 1.1228221493876143, + "grad_norm": 0.30175745487213135, + "learning_rate": 3.1180528938290534e-06, + "logits/chosen": 3.1810100078582764, + "logits/rejected": 3.184523105621338, + "logps/chosen": -144.4119873046875, + "logps/rejected": -155.09234619140625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.613430976867676, + "rewards/margins": 1.079477310180664, + "rewards/rejected": -10.69290828704834, + "step": 1627 + }, + { + "epoch": 1.1235121614628256, + "grad_norm": 0.2725284695625305, + "learning_rate": 3.119969336910694e-06, + "logits/chosen": 3.5976438522338867, + "logits/rejected": 3.7875900268554688, + "logps/chosen": -142.30764770507812, + "logps/rejected": -161.3811492919922, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.407754898071289, + "rewards/margins": 1.9021015167236328, + "rewards/rejected": -11.309856414794922, + "step": 1628 + }, + { + "epoch": 1.124202173538037, + "grad_norm": 0.29027289152145386, + "learning_rate": 3.1218857799923346e-06, + "logits/chosen": 3.6209535598754883, + "logits/rejected": 3.6209535598754883, + "logps/chosen": -172.42410278320312, + "logps/rejected": -172.42410278320312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.470845222473145, + "rewards/margins": 0.0, + "rewards/rejected": -12.470845222473145, + "step": 1629 + }, + { + "epoch": 1.1248921856132483, + "grad_norm": 0.29984062910079956, + "learning_rate": 3.123802223073975e-06, + "logits/chosen": 3.953918218612671, + "logits/rejected": 3.9997916221618652, + "logps/chosen": -181.76004028320312, + "logps/rejected": -194.0142822265625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.306949615478516, + "rewards/margins": 1.24383544921875, + "rewards/rejected": -14.550785064697266, + "step": 1630 + }, + { + "epoch": 1.1255821976884595, + "grad_norm": 0.33344972133636475, + "learning_rate": 3.1257186661556153e-06, + "logits/chosen": 3.4757964611053467, + "logits/rejected": 3.481945037841797, + "logps/chosen": -170.8289794921875, + "logps/rejected": -178.955078125, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.256961822509766, + "rewards/margins": 0.8941104412078857, + "rewards/rejected": -13.151073455810547, + "step": 1631 + }, + { + "epoch": 1.1262722097636708, + "grad_norm": 0.2742486000061035, + "learning_rate": 3.1276351092372557e-06, + "logits/chosen": 3.5073208808898926, + "logits/rejected": 3.5073208808898926, + "logps/chosen": -166.1883087158203, + "logps/rejected": -166.1883087158203, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.927925109863281, + "rewards/margins": 4.172325134277344e-07, + "rewards/rejected": -11.927925109863281, + "step": 1632 + }, + { + "epoch": 1.1269622218388822, + "grad_norm": 0.33177706599235535, + "learning_rate": 3.129551552318896e-06, + "logits/chosen": 3.7590222358703613, + "logits/rejected": 3.7590222358703613, + "logps/chosen": -177.09710693359375, + "logps/rejected": -177.09710693359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.897087097167969, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.897087097167969, + "step": 1633 + }, + { + "epoch": 1.1276522339140935, + "grad_norm": 0.36396899819374084, + "learning_rate": 3.1314679954005365e-06, + "logits/chosen": 3.8199336528778076, + "logits/rejected": 3.820796489715576, + "logps/chosen": -173.6473388671875, + "logps/rejected": -182.74240112304688, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.62062931060791, + "rewards/margins": 0.9228699207305908, + "rewards/rejected": -13.543498992919922, + "step": 1634 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 0.3800429105758667, + "learning_rate": 3.133384438482177e-06, + "logits/chosen": 3.5678775310516357, + "logits/rejected": 3.5678775310516357, + "logps/chosen": -170.91481018066406, + "logps/rejected": -170.91482543945312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.298452377319336, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.29845142364502, + "step": 1635 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.3299192488193512, + "learning_rate": 3.135300881563818e-06, + "logits/chosen": 3.842587471008301, + "logits/rejected": 3.842587471008301, + "logps/chosen": -181.0232391357422, + "logps/rejected": -181.0232391357422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.298727989196777, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.298727989196777, + "step": 1636 + }, + { + "epoch": 1.1297222701397274, + "grad_norm": 0.30755966901779175, + "learning_rate": 3.1372173246454585e-06, + "logits/chosen": 3.5162267684936523, + "logits/rejected": 3.5254907608032227, + "logps/chosen": -173.79107666015625, + "logps/rejected": -179.6033477783203, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.435333251953125, + "rewards/margins": 0.6361199021339417, + "rewards/rejected": -13.071453094482422, + "step": 1637 + }, + { + "epoch": 1.1304122822149387, + "grad_norm": 0.26213666796684265, + "learning_rate": 3.139133767727099e-06, + "logits/chosen": 3.7749061584472656, + "logits/rejected": 3.8900961875915527, + "logps/chosen": -153.7355194091797, + "logps/rejected": -162.6539764404297, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.38314437866211, + "rewards/margins": 0.9128046631813049, + "rewards/rejected": -11.295949935913086, + "step": 1638 + }, + { + "epoch": 1.13110229429015, + "grad_norm": 0.3806220591068268, + "learning_rate": 3.1410502108087393e-06, + "logits/chosen": 3.6178221702575684, + "logits/rejected": 3.7410807609558105, + "logps/chosen": -173.5869140625, + "logps/rejected": -182.97630310058594, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.489355087280273, + "rewards/margins": 0.9780044555664062, + "rewards/rejected": -13.46735954284668, + "step": 1639 + }, + { + "epoch": 1.1317923063653614, + "grad_norm": 0.3403241038322449, + "learning_rate": 3.1429666538903797e-06, + "logits/chosen": 3.572606325149536, + "logits/rejected": 3.6878137588500977, + "logps/chosen": -175.46096801757812, + "logps/rejected": -182.57012939453125, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.866683006286621, + "rewards/margins": 0.7597333192825317, + "rewards/rejected": -13.626416206359863, + "step": 1640 + }, + { + "epoch": 1.1324823184405728, + "grad_norm": 0.35804483294487, + "learning_rate": 3.14488309697202e-06, + "logits/chosen": 3.339918613433838, + "logits/rejected": 3.908662796020508, + "logps/chosen": -150.90933227539062, + "logps/rejected": -188.66510009765625, + "loss": 0.3486, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.286656379699707, + "rewards/margins": 3.835561513900757, + "rewards/rejected": -14.122217178344727, + "step": 1641 + }, + { + "epoch": 1.1331723305157841, + "grad_norm": 0.31152981519699097, + "learning_rate": 3.1467995400536605e-06, + "logits/chosen": 3.4638633728027344, + "logits/rejected": 3.461047649383545, + "logps/chosen": -168.91897583007812, + "logps/rejected": -175.71780395507812, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.254454612731934, + "rewards/margins": 0.6335898637771606, + "rewards/rejected": -12.888044357299805, + "step": 1642 + }, + { + "epoch": 1.1338623425909953, + "grad_norm": 0.4050772786140442, + "learning_rate": 3.148715983135301e-06, + "logits/chosen": 3.663647413253784, + "logits/rejected": 3.6660354137420654, + "logps/chosen": -170.37484741210938, + "logps/rejected": -177.72848510742188, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.100931167602539, + "rewards/margins": 0.7503464818000793, + "rewards/rejected": -12.851278305053711, + "step": 1643 + }, + { + "epoch": 1.1345523546662066, + "grad_norm": 1.4983857870101929, + "learning_rate": 3.1506324262169417e-06, + "logits/chosen": 3.404751777648926, + "logits/rejected": 3.5797853469848633, + "logps/chosen": -155.88746643066406, + "logps/rejected": -171.1622314453125, + "loss": 0.5369, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.000967979431152, + "rewards/margins": 1.5358892679214478, + "rewards/rejected": -12.536856651306152, + "step": 1644 + }, + { + "epoch": 1.135242366741418, + "grad_norm": 0.2927800118923187, + "learning_rate": 3.152548869298582e-06, + "logits/chosen": 3.682126522064209, + "logits/rejected": 3.7934341430664062, + "logps/chosen": -173.5657958984375, + "logps/rejected": -181.21531677246094, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.610189437866211, + "rewards/margins": 0.7828592658042908, + "rewards/rejected": -13.393048286437988, + "step": 1645 + }, + { + "epoch": 1.1359323788166293, + "grad_norm": 0.4061196446418762, + "learning_rate": 3.1544653123802225e-06, + "logits/chosen": 3.3779163360595703, + "logits/rejected": 3.4184038639068604, + "logps/chosen": -174.48788452148438, + "logps/rejected": -184.5728759765625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.748346328735352, + "rewards/margins": 1.0464507341384888, + "rewards/rejected": -13.794795989990234, + "step": 1646 + }, + { + "epoch": 1.1366223908918407, + "grad_norm": 0.28756821155548096, + "learning_rate": 3.156381755461863e-06, + "logits/chosen": 3.7169246673583984, + "logits/rejected": 3.7169246673583984, + "logps/chosen": -187.47494506835938, + "logps/rejected": -187.47494506835938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.971399307250977, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -13.971399307250977, + "step": 1647 + }, + { + "epoch": 1.1373124029670518, + "grad_norm": 0.2715747654438019, + "learning_rate": 3.1582981985435037e-06, + "logits/chosen": 3.667959213256836, + "logits/rejected": 3.667959213256836, + "logps/chosen": -174.06993103027344, + "logps/rejected": -174.06993103027344, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.556069374084473, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -12.556068420410156, + "step": 1648 + }, + { + "epoch": 1.1380024150422632, + "grad_norm": 1.1509050130844116, + "learning_rate": 3.160214641625144e-06, + "logits/chosen": 3.7924978733062744, + "logits/rejected": 3.7684402465820312, + "logps/chosen": -191.5929412841797, + "logps/rejected": -195.2596893310547, + "loss": 0.6141, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.47402572631836, + "rewards/margins": 0.3460971713066101, + "rewards/rejected": -14.820121765136719, + "step": 1649 + }, + { + "epoch": 1.1386924271174745, + "grad_norm": 0.9720893502235413, + "learning_rate": 3.1621310847067845e-06, + "logits/chosen": 3.8354897499084473, + "logits/rejected": 3.8206686973571777, + "logps/chosen": -168.88052368164062, + "logps/rejected": -173.27664184570312, + "loss": 0.6101, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.178807258605957, + "rewards/margins": 0.44279128313064575, + "rewards/rejected": -12.621599197387695, + "step": 1650 + }, + { + "epoch": 1.139382439192686, + "grad_norm": 6.770792007446289, + "learning_rate": 3.164047527788425e-06, + "logits/chosen": 3.2311899662017822, + "logits/rejected": 3.2342872619628906, + "logps/chosen": -154.64230346679688, + "logps/rejected": -170.23593139648438, + "loss": 0.5463, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.784626007080078, + "rewards/margins": 1.4563742876052856, + "rewards/rejected": -12.240999221801758, + "step": 1651 + }, + { + "epoch": 1.1400724512678972, + "grad_norm": 0.2640346586704254, + "learning_rate": 3.1659639708700657e-06, + "logits/chosen": 3.7552671432495117, + "logits/rejected": 3.821394920349121, + "logps/chosen": -163.76568603515625, + "logps/rejected": -175.74679565429688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.753753662109375, + "rewards/margins": 1.2133140563964844, + "rewards/rejected": -12.96706771850586, + "step": 1652 + }, + { + "epoch": 1.1407624633431086, + "grad_norm": 0.24601951241493225, + "learning_rate": 3.167880413951706e-06, + "logits/chosen": 3.6937007904052734, + "logits/rejected": 3.7718849182128906, + "logps/chosen": -192.37045288085938, + "logps/rejected": -201.89511108398438, + "loss": 0.6066, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.326871871948242, + "rewards/margins": 0.9496331214904785, + "rewards/rejected": -15.276504516601562, + "step": 1653 + }, + { + "epoch": 1.1414524754183197, + "grad_norm": 0.9455645680427551, + "learning_rate": 3.1697968570333465e-06, + "logits/chosen": 3.701632022857666, + "logits/rejected": 3.971815586090088, + "logps/chosen": -180.51425170898438, + "logps/rejected": -195.46124267578125, + "loss": 0.5254, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.210679054260254, + "rewards/margins": 1.4981602430343628, + "rewards/rejected": -14.708839416503906, + "step": 1654 + }, + { + "epoch": 1.142142487493531, + "grad_norm": 0.24803948402404785, + "learning_rate": 3.171713300114987e-06, + "logits/chosen": 3.9748728275299072, + "logits/rejected": 3.996713161468506, + "logps/chosen": -182.7438201904297, + "logps/rejected": -190.1535186767578, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.449090003967285, + "rewards/margins": 0.7473286390304565, + "rewards/rejected": -14.196418762207031, + "step": 1655 + }, + { + "epoch": 1.1428324995687424, + "grad_norm": 0.32370465993881226, + "learning_rate": 3.1736297431966273e-06, + "logits/chosen": 3.576660633087158, + "logits/rejected": 3.6884193420410156, + "logps/chosen": -186.33395385742188, + "logps/rejected": -194.6097412109375, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.88769817352295, + "rewards/margins": 0.8121219873428345, + "rewards/rejected": -14.699820518493652, + "step": 1656 + }, + { + "epoch": 1.1435225116439538, + "grad_norm": 0.2541882395744324, + "learning_rate": 3.1755461862782676e-06, + "logits/chosen": 3.9252195358276367, + "logits/rejected": 4.022891521453857, + "logps/chosen": -175.3685760498047, + "logps/rejected": -186.41055297851562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.735888481140137, + "rewards/margins": 1.1299359798431396, + "rewards/rejected": -13.865824699401855, + "step": 1657 + }, + { + "epoch": 1.1442125237191652, + "grad_norm": 0.3522034287452698, + "learning_rate": 3.177462629359908e-06, + "logits/chosen": 3.8207359313964844, + "logits/rejected": 3.9114274978637695, + "logps/chosen": -162.43936157226562, + "logps/rejected": -168.45167541503906, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.748151779174805, + "rewards/margins": 0.605792760848999, + "rewards/rejected": -12.35394287109375, + "step": 1658 + }, + { + "epoch": 1.1449025357943765, + "grad_norm": 0.27543166279792786, + "learning_rate": 3.1793790724415484e-06, + "logits/chosen": 3.964674711227417, + "logits/rejected": 3.964674711227417, + "logps/chosen": -188.4503173828125, + "logps/rejected": -188.4503173828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.139581680297852, + "rewards/margins": 0.0, + "rewards/rejected": -14.139581680297852, + "step": 1659 + }, + { + "epoch": 1.1455925478695876, + "grad_norm": 0.3028824031352997, + "learning_rate": 3.1812955155231897e-06, + "logits/chosen": 3.711714506149292, + "logits/rejected": 3.792144536972046, + "logps/chosen": -175.76087951660156, + "logps/rejected": -183.40780639648438, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.870643615722656, + "rewards/margins": 0.7901859879493713, + "rewards/rejected": -13.660829544067383, + "step": 1660 + }, + { + "epoch": 1.146282559944799, + "grad_norm": 0.23389393091201782, + "learning_rate": 3.18321195860483e-06, + "logits/chosen": 3.9344217777252197, + "logits/rejected": 4.15937614440918, + "logps/chosen": -179.5731658935547, + "logps/rejected": -196.1940155029297, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.061059951782227, + "rewards/margins": 1.7681207656860352, + "rewards/rejected": -14.829179763793945, + "step": 1661 + }, + { + "epoch": 1.1469725720200104, + "grad_norm": 0.38551291823387146, + "learning_rate": 3.1851284016864705e-06, + "logits/chosen": 3.4957048892974854, + "logits/rejected": 3.6989777088165283, + "logps/chosen": -147.34088134765625, + "logps/rejected": -166.8770751953125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.259000778198242, + "rewards/margins": 1.7728983163833618, + "rewards/rejected": -12.031899452209473, + "step": 1662 + }, + { + "epoch": 1.1476625840952217, + "grad_norm": 15.888476371765137, + "learning_rate": 3.187044844768111e-06, + "logits/chosen": 3.775669574737549, + "logits/rejected": 3.7464399337768555, + "logps/chosen": -188.93499755859375, + "logps/rejected": -187.07447814941406, + "loss": 0.8197, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.099493026733398, + "rewards/margins": -0.18812429904937744, + "rewards/rejected": -13.911369323730469, + "step": 1663 + }, + { + "epoch": 1.148352596170433, + "grad_norm": 0.38579246401786804, + "learning_rate": 3.1889612878497512e-06, + "logits/chosen": 3.638350486755371, + "logits/rejected": 3.805992603302002, + "logps/chosen": -184.34732055664062, + "logps/rejected": -197.01922607421875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.472185134887695, + "rewards/margins": 1.285606861114502, + "rewards/rejected": -14.757791519165039, + "step": 1664 + }, + { + "epoch": 1.1490426082456442, + "grad_norm": 0.46733543276786804, + "learning_rate": 3.1908777309313916e-06, + "logits/chosen": 3.501159429550171, + "logits/rejected": 3.920978307723999, + "logps/chosen": -167.98780822753906, + "logps/rejected": -196.06134033203125, + "loss": 0.4344, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.990888595581055, + "rewards/margins": 2.7275757789611816, + "rewards/rejected": -14.718463897705078, + "step": 1665 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 0.360145628452301, + "learning_rate": 3.192794174013032e-06, + "logits/chosen": 4.008514881134033, + "logits/rejected": 4.13601541519165, + "logps/chosen": -187.5045166015625, + "logps/rejected": -194.78042602539062, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.081121444702148, + "rewards/margins": 0.7001224756240845, + "rewards/rejected": -14.781244277954102, + "step": 1666 + }, + { + "epoch": 1.150422632396067, + "grad_norm": 0.3170110583305359, + "learning_rate": 3.1947106170946724e-06, + "logits/chosen": 3.8786351680755615, + "logits/rejected": 4.081386566162109, + "logps/chosen": -176.22518920898438, + "logps/rejected": -186.10293579101562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.769021987915039, + "rewards/margins": 1.0022798776626587, + "rewards/rejected": -13.77130126953125, + "step": 1667 + }, + { + "epoch": 1.1511126444712783, + "grad_norm": 0.2970692217350006, + "learning_rate": 3.1966270601763132e-06, + "logits/chosen": 3.914940118789673, + "logits/rejected": 3.9806275367736816, + "logps/chosen": -178.66387939453125, + "logps/rejected": -198.43817138671875, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.979145050048828, + "rewards/margins": 2.0559628009796143, + "rewards/rejected": -15.03510856628418, + "step": 1668 + }, + { + "epoch": 1.1518026565464896, + "grad_norm": 0.445956289768219, + "learning_rate": 3.1985435032579536e-06, + "logits/chosen": 3.6563720703125, + "logits/rejected": 3.6563720703125, + "logps/chosen": -171.15670776367188, + "logps/rejected": -171.15670776367188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.470712661743164, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.470712661743164, + "step": 1669 + }, + { + "epoch": 1.152492668621701, + "grad_norm": 16.236982345581055, + "learning_rate": 3.200459946339594e-06, + "logits/chosen": 3.78410005569458, + "logits/rejected": 3.724585771560669, + "logps/chosen": -184.09617614746094, + "logps/rejected": -183.5349884033203, + "loss": 0.7302, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.738249778747559, + "rewards/margins": -0.06558680534362793, + "rewards/rejected": -13.672663688659668, + "step": 1670 + }, + { + "epoch": 1.153182680696912, + "grad_norm": 0.34850719571113586, + "learning_rate": 3.2023763894212344e-06, + "logits/chosen": 3.7156314849853516, + "logits/rejected": 3.7621779441833496, + "logps/chosen": -187.81475830078125, + "logps/rejected": -196.65823364257812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.072683334350586, + "rewards/margins": 0.9047176837921143, + "rewards/rejected": -14.977399826049805, + "step": 1671 + }, + { + "epoch": 1.1538726927721235, + "grad_norm": 0.2733750641345978, + "learning_rate": 3.204292832502875e-06, + "logits/chosen": 3.5443220138549805, + "logits/rejected": 3.7267584800720215, + "logps/chosen": -154.14822387695312, + "logps/rejected": -168.99026489257812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.463359832763672, + "rewards/margins": 1.483367681503296, + "rewards/rejected": -11.946727752685547, + "step": 1672 + }, + { + "epoch": 1.1545627048473348, + "grad_norm": 0.2674844264984131, + "learning_rate": 3.206209275584515e-06, + "logits/chosen": 3.7868542671203613, + "logits/rejected": 3.8176751136779785, + "logps/chosen": -177.15403747558594, + "logps/rejected": -188.3201904296875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.181254386901855, + "rewards/margins": 1.0457100868225098, + "rewards/rejected": -14.226963996887207, + "step": 1673 + }, + { + "epoch": 1.1552527169225462, + "grad_norm": 0.38247719407081604, + "learning_rate": 3.2081257186661556e-06, + "logits/chosen": 4.047327995300293, + "logits/rejected": 4.123558044433594, + "logps/chosen": -172.74688720703125, + "logps/rejected": -186.63821411132812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.374298095703125, + "rewards/margins": 1.416057825088501, + "rewards/rejected": -13.790355682373047, + "step": 1674 + }, + { + "epoch": 1.1559427289977575, + "grad_norm": 0.2938136160373688, + "learning_rate": 3.210042161747796e-06, + "logits/chosen": 3.8504934310913086, + "logits/rejected": 3.8504934310913086, + "logps/chosen": -208.24542236328125, + "logps/rejected": -208.24542236328125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -16.0046329498291, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -16.0046329498291, + "step": 1675 + }, + { + "epoch": 1.1566327410729689, + "grad_norm": 0.2661469876766205, + "learning_rate": 3.2119586048294364e-06, + "logits/chosen": 4.308551788330078, + "logits/rejected": 4.3681440353393555, + "logps/chosen": -185.2782440185547, + "logps/rejected": -191.19058227539062, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.739007949829102, + "rewards/margins": 0.6095881462097168, + "rewards/rejected": -14.348596572875977, + "step": 1676 + }, + { + "epoch": 1.15732275314818, + "grad_norm": 0.4272559881210327, + "learning_rate": 3.2138750479110776e-06, + "logits/chosen": 3.840623378753662, + "logits/rejected": 3.9534173011779785, + "logps/chosen": -156.25331115722656, + "logps/rejected": -162.6232452392578, + "loss": 0.6071, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.773689270019531, + "rewards/margins": 0.6762494444847107, + "rewards/rejected": -11.449938774108887, + "step": 1677 + }, + { + "epoch": 1.1580127652233914, + "grad_norm": 0.4297662377357483, + "learning_rate": 3.215791490992718e-06, + "logits/chosen": 3.7899651527404785, + "logits/rejected": 3.7899651527404785, + "logps/chosen": -178.52206420898438, + "logps/rejected": -178.52206420898438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.046271324157715, + "rewards/margins": 0.0, + "rewards/rejected": -13.046271324157715, + "step": 1678 + }, + { + "epoch": 1.1587027772986027, + "grad_norm": 16.537935256958008, + "learning_rate": 3.2177079340743584e-06, + "logits/chosen": 4.10004186630249, + "logits/rejected": 3.957601547241211, + "logps/chosen": -180.5404052734375, + "logps/rejected": -176.88232421875, + "loss": 0.9403, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.380040168762207, + "rewards/margins": -0.32486867904663086, + "rewards/rejected": -13.055171966552734, + "step": 1679 + }, + { + "epoch": 1.159392789373814, + "grad_norm": 0.3081153929233551, + "learning_rate": 3.2196243771559988e-06, + "logits/chosen": 3.9713099002838135, + "logits/rejected": 3.9713099002838135, + "logps/chosen": -181.80960083007812, + "logps/rejected": -181.80960083007812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.56765365600586, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.567655563354492, + "step": 1680 + }, + { + "epoch": 1.1600828014490254, + "grad_norm": 5.341567039489746, + "learning_rate": 3.221540820237639e-06, + "logits/chosen": 3.8205673694610596, + "logits/rejected": 3.8354673385620117, + "logps/chosen": -181.8578338623047, + "logps/rejected": -184.074951171875, + "loss": 0.6332, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.42580509185791, + "rewards/margins": 0.1795285940170288, + "rewards/rejected": -13.605332374572754, + "step": 1681 + }, + { + "epoch": 1.1607728135242366, + "grad_norm": 0.3472662568092346, + "learning_rate": 3.2234572633192796e-06, + "logits/chosen": 3.817591667175293, + "logits/rejected": 4.112277507781982, + "logps/chosen": -183.84913635253906, + "logps/rejected": -201.54441833496094, + "loss": 0.521, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.524906158447266, + "rewards/margins": 1.7805010080337524, + "rewards/rejected": -15.30540657043457, + "step": 1682 + }, + { + "epoch": 1.161462825599448, + "grad_norm": 0.31913894414901733, + "learning_rate": 3.22537370640092e-06, + "logits/chosen": 4.065720558166504, + "logits/rejected": 4.065720558166504, + "logps/chosen": -190.30384826660156, + "logps/rejected": -190.3038330078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.319974899291992, + "rewards/margins": -8.344650268554688e-07, + "rewards/rejected": -14.319974899291992, + "step": 1683 + }, + { + "epoch": 1.1621528376746593, + "grad_norm": 0.4114392399787903, + "learning_rate": 3.2272901494825603e-06, + "logits/chosen": 4.163091659545898, + "logits/rejected": 4.163091659545898, + "logps/chosen": -177.95974731445312, + "logps/rejected": -177.95974731445312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.078237533569336, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.078238487243652, + "step": 1684 + }, + { + "epoch": 1.1628428497498706, + "grad_norm": 23.272310256958008, + "learning_rate": 3.229206592564201e-06, + "logits/chosen": 3.9569344520568848, + "logits/rejected": 3.9548420906066895, + "logps/chosen": -203.25570678710938, + "logps/rejected": -198.15420532226562, + "loss": 1.1493, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.408745765686035, + "rewards/margins": -0.5411430597305298, + "rewards/rejected": -14.86760139465332, + "step": 1685 + }, + { + "epoch": 1.163532861825082, + "grad_norm": 0.7466979622840881, + "learning_rate": 3.2311230356458415e-06, + "logits/chosen": 3.9606997966766357, + "logits/rejected": 4.017059326171875, + "logps/chosen": -177.12930297851562, + "logps/rejected": -181.09246826171875, + "loss": 0.6104, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.042470932006836, + "rewards/margins": 0.43171441555023193, + "rewards/rejected": -13.474185943603516, + "step": 1686 + }, + { + "epoch": 1.1642228739002933, + "grad_norm": 0.2893518805503845, + "learning_rate": 3.233039478727482e-06, + "logits/chosen": 3.910907745361328, + "logits/rejected": 4.041815757751465, + "logps/chosen": -165.33502197265625, + "logps/rejected": -190.52757263183594, + "loss": 0.521, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.77615737915039, + "rewards/margins": 2.316679000854492, + "rewards/rejected": -14.092836380004883, + "step": 1687 + }, + { + "epoch": 1.1649128859755047, + "grad_norm": 0.9860350489616394, + "learning_rate": 3.2349559218091227e-06, + "logits/chosen": 4.100948810577393, + "logits/rejected": 4.255289077758789, + "logps/chosen": -182.20083618164062, + "logps/rejected": -191.58860778808594, + "loss": 0.5278, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.335275650024414, + "rewards/margins": 0.8870856761932373, + "rewards/rejected": -14.22236156463623, + "step": 1688 + }, + { + "epoch": 1.1656028980507158, + "grad_norm": 0.3502384126186371, + "learning_rate": 3.236872364890763e-06, + "logits/chosen": 4.232682228088379, + "logits/rejected": 4.232682228088379, + "logps/chosen": -181.9710235595703, + "logps/rejected": -181.9710235595703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.169866561889648, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.169867515563965, + "step": 1689 + }, + { + "epoch": 1.1662929101259272, + "grad_norm": 0.30516839027404785, + "learning_rate": 3.2387888079724035e-06, + "logits/chosen": 4.148836612701416, + "logits/rejected": 4.148836612701416, + "logps/chosen": -187.5570526123047, + "logps/rejected": -187.5570526123047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.96894645690918, + "rewards/margins": 0.0, + "rewards/rejected": -13.96894645690918, + "step": 1690 + }, + { + "epoch": 1.1669829222011385, + "grad_norm": 0.359101802110672, + "learning_rate": 3.240705251054044e-06, + "logits/chosen": 3.986006498336792, + "logits/rejected": 4.1922454833984375, + "logps/chosen": -168.14317321777344, + "logps/rejected": -188.40786743164062, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.172039031982422, + "rewards/margins": 1.9816603660583496, + "rewards/rejected": -14.15369987487793, + "step": 1691 + }, + { + "epoch": 1.1676729342763499, + "grad_norm": 0.22219660878181458, + "learning_rate": 3.2426216941356843e-06, + "logits/chosen": 3.7249722480773926, + "logits/rejected": 3.8584885597229004, + "logps/chosen": -190.56011962890625, + "logps/rejected": -212.42376708984375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.36489486694336, + "rewards/margins": 2.1327567100524902, + "rewards/rejected": -16.497652053833008, + "step": 1692 + }, + { + "epoch": 1.1683629463515612, + "grad_norm": 3.843141555786133, + "learning_rate": 3.244538137217325e-06, + "logits/chosen": 4.285458564758301, + "logits/rejected": 4.250269889831543, + "logps/chosen": -179.97296142578125, + "logps/rejected": -182.34112548828125, + "loss": 0.6249, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.313928604125977, + "rewards/margins": 0.23017406463623047, + "rewards/rejected": -13.544102668762207, + "step": 1693 + }, + { + "epoch": 1.1690529584267724, + "grad_norm": 0.36351972818374634, + "learning_rate": 3.2464545802989655e-06, + "logits/chosen": 4.312601089477539, + "logits/rejected": 4.312601089477539, + "logps/chosen": -196.79981994628906, + "logps/rejected": -196.79981994628906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.814220428466797, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.814220428466797, + "step": 1694 + }, + { + "epoch": 1.1697429705019837, + "grad_norm": 0.344031423330307, + "learning_rate": 3.248371023380606e-06, + "logits/chosen": 3.9274861812591553, + "logits/rejected": 4.063054084777832, + "logps/chosen": -166.4697723388672, + "logps/rejected": -177.11924743652344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.702016830444336, + "rewards/margins": 1.0754433870315552, + "rewards/rejected": -12.777460098266602, + "step": 1695 + }, + { + "epoch": 1.170432982577195, + "grad_norm": 0.3578411340713501, + "learning_rate": 3.2502874664622463e-06, + "logits/chosen": 3.9532272815704346, + "logits/rejected": 4.023545265197754, + "logps/chosen": -187.7896728515625, + "logps/rejected": -195.48614501953125, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.01201057434082, + "rewards/margins": 0.8025956153869629, + "rewards/rejected": -14.814605712890625, + "step": 1696 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 13.868762016296387, + "learning_rate": 3.2522039095438867e-06, + "logits/chosen": 3.6445059776306152, + "logits/rejected": 3.6334633827209473, + "logps/chosen": -190.868896484375, + "logps/rejected": -193.9485321044922, + "loss": 0.6671, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.378393173217773, + "rewards/margins": 0.3653559684753418, + "rewards/rejected": -14.743749618530273, + "step": 1697 + }, + { + "epoch": 1.1718130067276178, + "grad_norm": 0.3237113654613495, + "learning_rate": 3.254120352625527e-06, + "logits/chosen": 3.750480890274048, + "logits/rejected": 3.750480890274048, + "logps/chosen": -174.48184204101562, + "logps/rejected": -174.48184204101562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.52862548828125, + "rewards/margins": 0.0, + "rewards/rejected": -12.52862548828125, + "step": 1698 + }, + { + "epoch": 1.1725030188028291, + "grad_norm": 0.3269021809101105, + "learning_rate": 3.2560367957071675e-06, + "logits/chosen": 3.8633787631988525, + "logits/rejected": 3.8782103061676025, + "logps/chosen": -158.548095703125, + "logps/rejected": -167.75526428222656, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.335551261901855, + "rewards/margins": 0.8556898236274719, + "rewards/rejected": -12.191241264343262, + "step": 1699 + }, + { + "epoch": 1.1731930308780403, + "grad_norm": 0.2978931665420532, + "learning_rate": 3.257953238788808e-06, + "logits/chosen": 4.1274847984313965, + "logits/rejected": 4.209475040435791, + "logps/chosen": -197.36026000976562, + "logps/rejected": -207.84652709960938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.91263198852539, + "rewards/margins": 1.0510400533676147, + "rewards/rejected": -15.96367073059082, + "step": 1700 + }, + { + "epoch": 1.1738830429532516, + "grad_norm": 1.9465017318725586, + "learning_rate": 3.259869681870449e-06, + "logits/chosen": 3.9399189949035645, + "logits/rejected": 4.0318217277526855, + "logps/chosen": -165.24002075195312, + "logps/rejected": -181.38534545898438, + "loss": 0.5386, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.010358810424805, + "rewards/margins": 1.650500774383545, + "rewards/rejected": -13.660860061645508, + "step": 1701 + }, + { + "epoch": 1.174573055028463, + "grad_norm": 0.2607688903808594, + "learning_rate": 3.2617861249520895e-06, + "logits/chosen": 4.099494934082031, + "logits/rejected": 4.239988327026367, + "logps/chosen": -175.16688537597656, + "logps/rejected": -194.1872100830078, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.756467819213867, + "rewards/margins": 1.909987449645996, + "rewards/rejected": -14.666454315185547, + "step": 1702 + }, + { + "epoch": 1.1752630671036743, + "grad_norm": 6.133636474609375, + "learning_rate": 3.26370256803373e-06, + "logits/chosen": 4.302578449249268, + "logits/rejected": 4.349145889282227, + "logps/chosen": -188.02008056640625, + "logps/rejected": -188.67437744140625, + "loss": 0.657, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.125726699829102, + "rewards/margins": 0.08711707592010498, + "rewards/rejected": -14.212843894958496, + "step": 1703 + }, + { + "epoch": 1.1759530791788857, + "grad_norm": 0.27576744556427, + "learning_rate": 3.2656190111153703e-06, + "logits/chosen": 4.003963947296143, + "logits/rejected": 4.316556453704834, + "logps/chosen": -168.1961669921875, + "logps/rejected": -188.58689880371094, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.058480262756348, + "rewards/margins": 1.9823837280273438, + "rewards/rejected": -14.040863990783691, + "step": 1704 + }, + { + "epoch": 1.176643091254097, + "grad_norm": 0.36391639709472656, + "learning_rate": 3.2675354541970107e-06, + "logits/chosen": 3.573145866394043, + "logits/rejected": 3.674570083618164, + "logps/chosen": -162.42864990234375, + "logps/rejected": -174.93629455566406, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.473970413208008, + "rewards/margins": 1.2108650207519531, + "rewards/rejected": -12.684835433959961, + "step": 1705 + }, + { + "epoch": 1.1773331033293082, + "grad_norm": 0.4159295856952667, + "learning_rate": 3.269451897278651e-06, + "logits/chosen": 4.198203086853027, + "logits/rejected": 4.352711200714111, + "logps/chosen": -163.1079864501953, + "logps/rejected": -172.939453125, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.393272399902344, + "rewards/margins": 0.9715898633003235, + "rewards/rejected": -12.364862442016602, + "step": 1706 + }, + { + "epoch": 1.1780231154045195, + "grad_norm": 0.39535197615623474, + "learning_rate": 3.2713683403602915e-06, + "logits/chosen": 4.029165267944336, + "logits/rejected": 4.029165267944336, + "logps/chosen": -173.2200164794922, + "logps/rejected": -173.2200164794922, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.408218383789062, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.408218383789062, + "step": 1707 + }, + { + "epoch": 1.178713127479731, + "grad_norm": 0.33800947666168213, + "learning_rate": 3.273284783441932e-06, + "logits/chosen": 4.191252708435059, + "logits/rejected": 4.191252708435059, + "logps/chosen": -195.83578491210938, + "logps/rejected": -195.83578491210938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.985843658447266, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.985843658447266, + "step": 1708 + }, + { + "epoch": 1.1794031395549422, + "grad_norm": 0.27863943576812744, + "learning_rate": 3.2752012265235727e-06, + "logits/chosen": 3.923455238342285, + "logits/rejected": 3.958883762359619, + "logps/chosen": -196.52322387695312, + "logps/rejected": -207.52554321289062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.875955581665039, + "rewards/margins": 1.1030666828155518, + "rewards/rejected": -15.979022979736328, + "step": 1709 + }, + { + "epoch": 1.1800931516301536, + "grad_norm": 0.4320293962955475, + "learning_rate": 3.277117669605213e-06, + "logits/chosen": 3.9308953285217285, + "logits/rejected": 4.018181800842285, + "logps/chosen": -178.36727905273438, + "logps/rejected": -189.61643981933594, + "loss": 0.5224, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.996418952941895, + "rewards/margins": 1.1832771301269531, + "rewards/rejected": -14.179695129394531, + "step": 1710 + }, + { + "epoch": 1.1807831637053647, + "grad_norm": 0.27321481704711914, + "learning_rate": 3.2790341126868534e-06, + "logits/chosen": 4.052063465118408, + "logits/rejected": 4.137639999389648, + "logps/chosen": -182.40805053710938, + "logps/rejected": -196.2810821533203, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.28481674194336, + "rewards/margins": 1.4183226823806763, + "rewards/rejected": -14.703140258789062, + "step": 1711 + }, + { + "epoch": 1.181473175780576, + "grad_norm": 12.286066055297852, + "learning_rate": 3.280950555768494e-06, + "logits/chosen": 3.341599225997925, + "logits/rejected": 3.5418965816497803, + "logps/chosen": -162.09031677246094, + "logps/rejected": -183.21621704101562, + "loss": 0.6351, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.230724334716797, + "rewards/margins": 2.105494976043701, + "rewards/rejected": -13.33621883392334, + "step": 1712 + }, + { + "epoch": 1.1821631878557874, + "grad_norm": 0.30900824069976807, + "learning_rate": 3.2828669988501342e-06, + "logits/chosen": 4.153498649597168, + "logits/rejected": 4.249885559082031, + "logps/chosen": -188.375, + "logps/rejected": -195.16549682617188, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.918956756591797, + "rewards/margins": 0.7005045413970947, + "rewards/rejected": -14.619462013244629, + "step": 1713 + }, + { + "epoch": 1.1828531999309988, + "grad_norm": 7.22489070892334, + "learning_rate": 3.2847834419317746e-06, + "logits/chosen": 3.661442518234253, + "logits/rejected": 3.708291530609131, + "logps/chosen": -162.30276489257812, + "logps/rejected": -173.60462951660156, + "loss": 0.6628, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.619535446166992, + "rewards/margins": 1.1936594247817993, + "rewards/rejected": -12.813194274902344, + "step": 1714 + }, + { + "epoch": 1.1835432120062102, + "grad_norm": 16.06458282470703, + "learning_rate": 3.286699885013415e-06, + "logits/chosen": 3.7801132202148438, + "logits/rejected": 3.8298327922821045, + "logps/chosen": -169.38775634765625, + "logps/rejected": -182.30784606933594, + "loss": 0.5589, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.08673095703125, + "rewards/margins": 1.3277450799942017, + "rewards/rejected": -13.41447639465332, + "step": 1715 + }, + { + "epoch": 1.1842332240814215, + "grad_norm": 0.31490427255630493, + "learning_rate": 3.2886163280950554e-06, + "logits/chosen": 3.7571630477905273, + "logits/rejected": 4.025175094604492, + "logps/chosen": -160.79617309570312, + "logps/rejected": -180.38706970214844, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.21786117553711, + "rewards/margins": 2.0319628715515137, + "rewards/rejected": -13.249824523925781, + "step": 1716 + }, + { + "epoch": 1.1849232361566329, + "grad_norm": 23.17823600769043, + "learning_rate": 3.2905327711766966e-06, + "logits/chosen": 3.83088755607605, + "logits/rejected": 3.806990623474121, + "logps/chosen": -185.01821899414062, + "logps/rejected": -176.81747436523438, + "loss": 1.4516, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.781786918640137, + "rewards/margins": -0.8449660539627075, + "rewards/rejected": -12.936820983886719, + "step": 1717 + }, + { + "epoch": 1.185613248231844, + "grad_norm": 10.928438186645508, + "learning_rate": 3.292449214258337e-06, + "logits/chosen": 4.157465934753418, + "logits/rejected": 4.203827857971191, + "logps/chosen": -176.75521850585938, + "logps/rejected": -180.37252807617188, + "loss": 0.6016, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.035675048828125, + "rewards/margins": 0.35935354232788086, + "rewards/rejected": -13.395029067993164, + "step": 1718 + }, + { + "epoch": 1.1863032603070554, + "grad_norm": 0.7734194993972778, + "learning_rate": 3.2943656573399774e-06, + "logits/chosen": 3.890821933746338, + "logits/rejected": 3.8574752807617188, + "logps/chosen": -179.9658203125, + "logps/rejected": -185.1796875, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.179740905761719, + "rewards/margins": 0.5597838163375854, + "rewards/rejected": -13.739524841308594, + "step": 1719 + }, + { + "epoch": 1.1869932723822667, + "grad_norm": 14.389386177062988, + "learning_rate": 3.296282100421618e-06, + "logits/chosen": 3.642448663711548, + "logits/rejected": 3.591071844100952, + "logps/chosen": -168.20120239257812, + "logps/rejected": -166.54299926757812, + "loss": 0.7899, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.051387786865234, + "rewards/margins": -0.15064465999603271, + "rewards/rejected": -11.90074348449707, + "step": 1720 + }, + { + "epoch": 1.187683284457478, + "grad_norm": 0.27657297253608704, + "learning_rate": 3.298198543503258e-06, + "logits/chosen": 3.9343128204345703, + "logits/rejected": 4.00862455368042, + "logps/chosen": -177.19046020507812, + "logps/rejected": -184.0867919921875, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.964897155761719, + "rewards/margins": 0.6738817691802979, + "rewards/rejected": -13.638778686523438, + "step": 1721 + }, + { + "epoch": 1.1883732965326894, + "grad_norm": 15.748836517333984, + "learning_rate": 3.3001149865848986e-06, + "logits/chosen": 4.137081146240234, + "logits/rejected": 4.127511978149414, + "logps/chosen": -173.30209350585938, + "logps/rejected": -172.50625610351562, + "loss": 0.7761, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.534740447998047, + "rewards/margins": -0.1324063539505005, + "rewards/rejected": -12.402334213256836, + "step": 1722 + }, + { + "epoch": 1.1890633086079005, + "grad_norm": 0.2872304916381836, + "learning_rate": 3.302031429666539e-06, + "logits/chosen": 3.5995466709136963, + "logits/rejected": 3.6846864223480225, + "logps/chosen": -193.2250518798828, + "logps/rejected": -205.40451049804688, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.489799499511719, + "rewards/margins": 1.2243229150772095, + "rewards/rejected": -15.714122772216797, + "step": 1723 + }, + { + "epoch": 1.189753320683112, + "grad_norm": 2.4427013397216797, + "learning_rate": 3.3039478727481794e-06, + "logits/chosen": 3.5952746868133545, + "logits/rejected": 3.603841781616211, + "logps/chosen": -177.39491271972656, + "logps/rejected": -183.16726684570312, + "loss": 0.5533, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.780694961547852, + "rewards/margins": 0.570955753326416, + "rewards/rejected": -13.35165023803711, + "step": 1724 + }, + { + "epoch": 1.1904433327583233, + "grad_norm": 0.4306758940219879, + "learning_rate": 3.30586431582982e-06, + "logits/chosen": 4.033606052398682, + "logits/rejected": 4.105587482452393, + "logps/chosen": -172.18185424804688, + "logps/rejected": -185.90628051757812, + "loss": 0.5219, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.48991584777832, + "rewards/margins": 1.3051036596298218, + "rewards/rejected": -13.79502010345459, + "step": 1725 + }, + { + "epoch": 1.1911333448335346, + "grad_norm": 0.31572389602661133, + "learning_rate": 3.3077807589114606e-06, + "logits/chosen": 3.998871326446533, + "logits/rejected": 3.998871326446533, + "logps/chosen": -182.14674377441406, + "logps/rejected": -182.146728515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.262120246887207, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -13.262120246887207, + "step": 1726 + }, + { + "epoch": 1.191823356908746, + "grad_norm": 0.2812690734863281, + "learning_rate": 3.309697201993101e-06, + "logits/chosen": 3.881944179534912, + "logits/rejected": 4.012680530548096, + "logps/chosen": -172.8923797607422, + "logps/rejected": -179.6859893798828, + "loss": 0.607, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.632012367248535, + "rewards/margins": 0.7028442025184631, + "rewards/rejected": -13.334856033325195, + "step": 1727 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 0.2795259356498718, + "learning_rate": 3.311613645074742e-06, + "logits/chosen": 4.087262153625488, + "logits/rejected": 4.087262153625488, + "logps/chosen": -192.34552001953125, + "logps/rejected": -192.34552001953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.386621475219727, + "rewards/margins": 0.0, + "rewards/rejected": -14.386621475219727, + "step": 1728 + }, + { + "epoch": 1.1932033810591685, + "grad_norm": 0.35428768396377563, + "learning_rate": 3.313530088156382e-06, + "logits/chosen": 3.8494482040405273, + "logits/rejected": 3.8494482040405273, + "logps/chosen": -191.581298828125, + "logps/rejected": -191.581298828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.448034286499023, + "rewards/margins": 0.0, + "rewards/rejected": -14.448034286499023, + "step": 1729 + }, + { + "epoch": 1.1938933931343798, + "grad_norm": 0.29338401556015015, + "learning_rate": 3.3154465312380226e-06, + "logits/chosen": 4.108126640319824, + "logits/rejected": 4.3462371826171875, + "logps/chosen": -156.15176391601562, + "logps/rejected": -183.26229858398438, + "loss": 0.4342, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.925679206848145, + "rewards/margins": 2.668919324874878, + "rewards/rejected": -13.594598770141602, + "step": 1730 + }, + { + "epoch": 1.1945834052095912, + "grad_norm": 19.082460403442383, + "learning_rate": 3.317362974319663e-06, + "logits/chosen": 3.7957606315612793, + "logits/rejected": 3.893662929534912, + "logps/chosen": -184.82350158691406, + "logps/rejected": -181.78794860839844, + "loss": 0.9569, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.943761825561523, + "rewards/margins": -0.3425699472427368, + "rewards/rejected": -13.601192474365234, + "step": 1731 + }, + { + "epoch": 1.1952734172848025, + "grad_norm": 0.27364203333854675, + "learning_rate": 3.3192794174013034e-06, + "logits/chosen": 3.660290241241455, + "logits/rejected": 3.90797758102417, + "logps/chosen": -167.35777282714844, + "logps/rejected": -174.19215393066406, + "loss": 0.607, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.994773864746094, + "rewards/margins": 0.683319091796875, + "rewards/rejected": -12.678092956542969, + "step": 1732 + }, + { + "epoch": 1.1959634293600139, + "grad_norm": 23.596776962280273, + "learning_rate": 3.321195860482944e-06, + "logits/chosen": 3.939830780029297, + "logits/rejected": 4.067914009094238, + "logps/chosen": -186.05490112304688, + "logps/rejected": -201.24032592773438, + "loss": 0.7494, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.644773483276367, + "rewards/margins": 1.5832616090774536, + "rewards/rejected": -15.228034973144531, + "step": 1733 + }, + { + "epoch": 1.1966534414352252, + "grad_norm": 0.2871934473514557, + "learning_rate": 3.3231123035645846e-06, + "logits/chosen": 4.067122936248779, + "logits/rejected": 4.067122936248779, + "logps/chosen": -187.33334350585938, + "logps/rejected": -187.33334350585938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.993868827819824, + "rewards/margins": 0.0, + "rewards/rejected": -13.993868827819824, + "step": 1734 + }, + { + "epoch": 1.1973434535104364, + "grad_norm": 0.3094560205936432, + "learning_rate": 3.325028746646225e-06, + "logits/chosen": 3.722355842590332, + "logits/rejected": 3.866248846054077, + "logps/chosen": -163.24017333984375, + "logps/rejected": -194.51080322265625, + "loss": 0.4343, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.609844207763672, + "rewards/margins": 3.0407567024230957, + "rewards/rejected": -14.650601387023926, + "step": 1735 + }, + { + "epoch": 1.1980334655856477, + "grad_norm": 0.2790687084197998, + "learning_rate": 3.3269451897278654e-06, + "logits/chosen": 4.041675567626953, + "logits/rejected": 4.0957465171813965, + "logps/chosen": -188.32589721679688, + "logps/rejected": -197.74554443359375, + "loss": 0.6066, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.14163589477539, + "rewards/margins": 0.9618836641311646, + "rewards/rejected": -15.103519439697266, + "step": 1736 + }, + { + "epoch": 1.198723477660859, + "grad_norm": 0.3329545259475708, + "learning_rate": 3.3288616328095057e-06, + "logits/chosen": 4.101997375488281, + "logits/rejected": 4.101997375488281, + "logps/chosen": -181.10870361328125, + "logps/rejected": -181.10870361328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.329025268554688, + "rewards/margins": 0.0, + "rewards/rejected": -13.329025268554688, + "step": 1737 + }, + { + "epoch": 1.1994134897360704, + "grad_norm": 0.29661500453948975, + "learning_rate": 3.330778075891146e-06, + "logits/chosen": 3.9935483932495117, + "logits/rejected": 4.260335922241211, + "logps/chosen": -184.75442504882812, + "logps/rejected": -200.59735107421875, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.774490356445312, + "rewards/margins": 1.568930745124817, + "rewards/rejected": -15.343420028686523, + "step": 1738 + }, + { + "epoch": 1.2001035018112818, + "grad_norm": 0.27450045943260193, + "learning_rate": 3.3326945189727865e-06, + "logits/chosen": 3.8373842239379883, + "logits/rejected": 3.9372334480285645, + "logps/chosen": -164.92227172851562, + "logps/rejected": -177.1547088623047, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.580852508544922, + "rewards/margins": 1.2452775239944458, + "rewards/rejected": -12.826129913330078, + "step": 1739 + }, + { + "epoch": 1.200793513886493, + "grad_norm": 8.26707649230957, + "learning_rate": 3.334610962054427e-06, + "logits/chosen": 3.6977853775024414, + "logits/rejected": 3.792417287826538, + "logps/chosen": -178.84396362304688, + "logps/rejected": -185.27040100097656, + "loss": 0.6462, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.182435989379883, + "rewards/margins": 0.6533550024032593, + "rewards/rejected": -13.835790634155273, + "step": 1740 + }, + { + "epoch": 1.2014835259617043, + "grad_norm": 4.712945461273193, + "learning_rate": 3.336527405136068e-06, + "logits/chosen": 4.2458343505859375, + "logits/rejected": 4.417603492736816, + "logps/chosen": -181.5904541015625, + "logps/rejected": -198.84007263183594, + "loss": 0.5436, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.560562133789062, + "rewards/margins": 1.6254427433013916, + "rewards/rejected": -15.186004638671875, + "step": 1741 + }, + { + "epoch": 1.2021735380369156, + "grad_norm": 0.342671275138855, + "learning_rate": 3.3384438482177085e-06, + "logits/chosen": 4.002241134643555, + "logits/rejected": 4.136109352111816, + "logps/chosen": -171.45486450195312, + "logps/rejected": -182.87890625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.412561416625977, + "rewards/margins": 1.2039141654968262, + "rewards/rejected": -13.616476058959961, + "step": 1742 + }, + { + "epoch": 1.202863550112127, + "grad_norm": 0.3017103672027588, + "learning_rate": 3.340360291299349e-06, + "logits/chosen": 4.347155570983887, + "logits/rejected": 4.347155570983887, + "logps/chosen": -175.7678985595703, + "logps/rejected": -175.7678985595703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.820549011230469, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.820549011230469, + "step": 1743 + }, + { + "epoch": 1.2035535621873383, + "grad_norm": 0.32198193669319153, + "learning_rate": 3.3422767343809893e-06, + "logits/chosen": 3.651050090789795, + "logits/rejected": 3.830794095993042, + "logps/chosen": -168.69378662109375, + "logps/rejected": -182.68896484375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.194716453552246, + "rewards/margins": 1.3255000114440918, + "rewards/rejected": -13.52021598815918, + "step": 1744 + }, + { + "epoch": 1.2042435742625497, + "grad_norm": 0.42367151379585266, + "learning_rate": 3.3441931774626297e-06, + "logits/chosen": 3.78251314163208, + "logits/rejected": 3.831562042236328, + "logps/chosen": -167.01971435546875, + "logps/rejected": -177.9024200439453, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.699407577514648, + "rewards/margins": 1.112785816192627, + "rewards/rejected": -12.812192916870117, + "step": 1745 + }, + { + "epoch": 1.2049335863377608, + "grad_norm": 0.35614272952079773, + "learning_rate": 3.34610962054427e-06, + "logits/chosen": 4.066868305206299, + "logits/rejected": 4.205929756164551, + "logps/chosen": -156.4988250732422, + "logps/rejected": -166.93032836914062, + "loss": 0.5228, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.747843742370605, + "rewards/margins": 1.105973243713379, + "rewards/rejected": -11.853816986083984, + "step": 1746 + }, + { + "epoch": 1.2056235984129722, + "grad_norm": 0.39343202114105225, + "learning_rate": 3.3480260636259105e-06, + "logits/chosen": 4.216921329498291, + "logits/rejected": 4.196913719177246, + "logps/chosen": -171.36000061035156, + "logps/rejected": -187.05908203125, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.454113960266113, + "rewards/margins": 1.5317747592926025, + "rewards/rejected": -13.985889434814453, + "step": 1747 + }, + { + "epoch": 1.2063136104881835, + "grad_norm": 0.3540613651275635, + "learning_rate": 3.349942506707551e-06, + "logits/chosen": 4.109726905822754, + "logits/rejected": 4.109726905822754, + "logps/chosen": -187.69821166992188, + "logps/rejected": -187.69821166992188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.790291786193848, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.790291786193848, + "step": 1748 + }, + { + "epoch": 1.2070036225633949, + "grad_norm": 0.29554682970046997, + "learning_rate": 3.3518589497891917e-06, + "logits/chosen": 4.294281959533691, + "logits/rejected": 4.39447546005249, + "logps/chosen": -181.9377899169922, + "logps/rejected": -191.0757293701172, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.14022159576416, + "rewards/margins": 0.9216014742851257, + "rewards/rejected": -14.061822891235352, + "step": 1749 + }, + { + "epoch": 1.2076936346386062, + "grad_norm": 28.08357810974121, + "learning_rate": 3.353775392870832e-06, + "logits/chosen": 4.1829681396484375, + "logits/rejected": 4.027464866638184, + "logps/chosen": -174.15310668945312, + "logps/rejected": -176.9921875, + "loss": 1.1182, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.588830947875977, + "rewards/margins": 0.29201555252075195, + "rewards/rejected": -12.880846977233887, + "step": 1750 + }, + { + "epoch": 1.2083836467138176, + "grad_norm": 0.3772869408130646, + "learning_rate": 3.3556918359524725e-06, + "logits/chosen": 4.006382942199707, + "logits/rejected": 4.006382942199707, + "logps/chosen": -186.98919677734375, + "logps/rejected": -186.98919677734375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.854239463806152, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.854239463806152, + "step": 1751 + }, + { + "epoch": 1.2090736587890287, + "grad_norm": 0.30404046177864075, + "learning_rate": 3.357608279034113e-06, + "logits/chosen": 4.247402191162109, + "logits/rejected": 4.349969387054443, + "logps/chosen": -174.91946411132812, + "logps/rejected": -190.42660522460938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.604592323303223, + "rewards/margins": 1.4625351428985596, + "rewards/rejected": -14.06712818145752, + "step": 1752 + }, + { + "epoch": 1.20976367086424, + "grad_norm": 8.90007209777832, + "learning_rate": 3.3595247221157533e-06, + "logits/chosen": 4.179483413696289, + "logits/rejected": 4.284023761749268, + "logps/chosen": -163.93484497070312, + "logps/rejected": -174.5173797607422, + "loss": 1.1831, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.548946380615234, + "rewards/margins": 1.0234260559082031, + "rewards/rejected": -12.572372436523438, + "step": 1753 + }, + { + "epoch": 1.2104536829394514, + "grad_norm": 0.3350512683391571, + "learning_rate": 3.3614411651973937e-06, + "logits/chosen": 4.009915828704834, + "logits/rejected": 4.109039306640625, + "logps/chosen": -180.46817016601562, + "logps/rejected": -195.51138305664062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.30446720123291, + "rewards/margins": 1.4959791898727417, + "rewards/rejected": -14.800445556640625, + "step": 1754 + }, + { + "epoch": 1.2111436950146628, + "grad_norm": 0.40394845604896545, + "learning_rate": 3.363357608279034e-06, + "logits/chosen": 3.9927124977111816, + "logits/rejected": 4.082268714904785, + "logps/chosen": -185.526611328125, + "logps/rejected": -198.66815185546875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.681486129760742, + "rewards/margins": 1.3512282371520996, + "rewards/rejected": -15.03271484375, + "step": 1755 + }, + { + "epoch": 1.2118337070898741, + "grad_norm": 0.3538399040699005, + "learning_rate": 3.3652740513606745e-06, + "logits/chosen": 4.311191082000732, + "logits/rejected": 4.311191082000732, + "logps/chosen": -183.51461791992188, + "logps/rejected": -183.51461791992188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.6935396194458, + "rewards/margins": 0.0, + "rewards/rejected": -13.6935396194458, + "step": 1756 + }, + { + "epoch": 1.2125237191650853, + "grad_norm": 0.2984163165092468, + "learning_rate": 3.3671904944423157e-06, + "logits/chosen": 4.205150127410889, + "logits/rejected": 4.193953990936279, + "logps/chosen": -193.12049865722656, + "logps/rejected": -199.42591857910156, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.531720161437988, + "rewards/margins": 0.641445517539978, + "rewards/rejected": -15.173166275024414, + "step": 1757 + }, + { + "epoch": 1.2132137312402966, + "grad_norm": 0.3074456453323364, + "learning_rate": 3.369106937523956e-06, + "logits/chosen": 3.9499425888061523, + "logits/rejected": 4.029358863830566, + "logps/chosen": -181.3764190673828, + "logps/rejected": -189.72071838378906, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.398605346679688, + "rewards/margins": 0.8418028354644775, + "rewards/rejected": -14.240407943725586, + "step": 1758 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 0.45943695306777954, + "learning_rate": 3.3710233806055965e-06, + "logits/chosen": 4.395138740539551, + "logits/rejected": 4.3081793785095215, + "logps/chosen": -177.47824096679688, + "logps/rejected": -186.53607177734375, + "loss": 0.5267, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.901228904724121, + "rewards/margins": 0.9218689203262329, + "rewards/rejected": -13.823098182678223, + "step": 1759 + }, + { + "epoch": 1.2145937553907193, + "grad_norm": 23.942588806152344, + "learning_rate": 3.372939823687237e-06, + "logits/chosen": 3.959122896194458, + "logits/rejected": 4.007088661193848, + "logps/chosen": -163.9589080810547, + "logps/rejected": -175.74935913085938, + "loss": 1.0913, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.52103042602539, + "rewards/margins": 1.1206814050674438, + "rewards/rejected": -12.641712188720703, + "step": 1760 + }, + { + "epoch": 1.2152837674659307, + "grad_norm": 0.19810929894447327, + "learning_rate": 3.3748562667688773e-06, + "logits/chosen": 3.9359922409057617, + "logits/rejected": 4.2711591720581055, + "logps/chosen": -153.88014221191406, + "logps/rejected": -201.67745971679688, + "loss": 0.3468, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.438212394714355, + "rewards/margins": 4.754870414733887, + "rewards/rejected": -15.193081855773926, + "step": 1761 + }, + { + "epoch": 1.215973779541142, + "grad_norm": 0.26727092266082764, + "learning_rate": 3.3767727098505176e-06, + "logits/chosen": 3.881502866744995, + "logits/rejected": 4.014524459838867, + "logps/chosen": -175.79849243164062, + "logps/rejected": -197.1316375732422, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.83163070678711, + "rewards/margins": 2.251492500305176, + "rewards/rejected": -15.083123207092285, + "step": 1762 + }, + { + "epoch": 1.2166637916163534, + "grad_norm": 0.2447321116924286, + "learning_rate": 3.378689152932158e-06, + "logits/chosen": 4.192849636077881, + "logits/rejected": 4.397733688354492, + "logps/chosen": -169.28372192382812, + "logps/rejected": -195.85362243652344, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.235498428344727, + "rewards/margins": 2.634652614593506, + "rewards/rejected": -14.87015151977539, + "step": 1763 + }, + { + "epoch": 1.2173538036915645, + "grad_norm": 1.7038480043411255, + "learning_rate": 3.3806055960137984e-06, + "logits/chosen": 4.061907768249512, + "logits/rejected": 4.256575584411621, + "logps/chosen": -170.75064086914062, + "logps/rejected": -184.92906188964844, + "loss": 0.5278, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.25973129272461, + "rewards/margins": 1.3758431673049927, + "rewards/rejected": -13.635574340820312, + "step": 1764 + }, + { + "epoch": 1.218043815766776, + "grad_norm": 0.2729928195476532, + "learning_rate": 3.3825220390954392e-06, + "logits/chosen": 4.279350280761719, + "logits/rejected": 4.335238456726074, + "logps/chosen": -179.77755737304688, + "logps/rejected": -191.56805419921875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.992653846740723, + "rewards/margins": 1.218700885772705, + "rewards/rejected": -14.21135425567627, + "step": 1765 + }, + { + "epoch": 1.2187338278419872, + "grad_norm": 0.27408456802368164, + "learning_rate": 3.3844384821770796e-06, + "logits/chosen": 3.841365337371826, + "logits/rejected": 4.066164493560791, + "logps/chosen": -168.893798828125, + "logps/rejected": -210.25527954101562, + "loss": 0.3472, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.248085975646973, + "rewards/margins": 4.122159957885742, + "rewards/rejected": -16.37024688720703, + "step": 1766 + }, + { + "epoch": 1.2194238399171986, + "grad_norm": 5.129932880401611, + "learning_rate": 3.38635492525872e-06, + "logits/chosen": 4.271793842315674, + "logits/rejected": 4.3572001457214355, + "logps/chosen": -173.0970916748047, + "logps/rejected": -187.81040954589844, + "loss": 0.482, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.547101974487305, + "rewards/margins": 1.5327452421188354, + "rewards/rejected": -14.07984733581543, + "step": 1767 + }, + { + "epoch": 1.22011385199241, + "grad_norm": 0.32017236948013306, + "learning_rate": 3.388271368340361e-06, + "logits/chosen": 4.488972187042236, + "logits/rejected": 4.564533710479736, + "logps/chosen": -177.37559509277344, + "logps/rejected": -189.79014587402344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.877165794372559, + "rewards/margins": 1.2743033170700073, + "rewards/rejected": -14.151470184326172, + "step": 1768 + }, + { + "epoch": 1.220803864067621, + "grad_norm": 0.32260510325431824, + "learning_rate": 3.3901878114220012e-06, + "logits/chosen": 4.009678840637207, + "logits/rejected": 4.009678840637207, + "logps/chosen": -192.21612548828125, + "logps/rejected": -192.21612548828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.418228149414062, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.418228149414062, + "step": 1769 + }, + { + "epoch": 1.2214938761428324, + "grad_norm": 0.38715603947639465, + "learning_rate": 3.3921042545036416e-06, + "logits/chosen": 4.155310153961182, + "logits/rejected": 4.155310153961182, + "logps/chosen": -177.70315551757812, + "logps/rejected": -177.70315551757812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.971319198608398, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.971319198608398, + "step": 1770 + }, + { + "epoch": 1.2221838882180438, + "grad_norm": 0.24617113173007965, + "learning_rate": 3.394020697585282e-06, + "logits/chosen": 3.8929920196533203, + "logits/rejected": 3.9801955223083496, + "logps/chosen": -172.507080078125, + "logps/rejected": -192.7274932861328, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.39341926574707, + "rewards/margins": 2.0511746406555176, + "rewards/rejected": -14.444594383239746, + "step": 1771 + }, + { + "epoch": 1.2228739002932552, + "grad_norm": 3.22243332862854, + "learning_rate": 3.3959371406669224e-06, + "logits/chosen": 4.116267681121826, + "logits/rejected": 4.145637512207031, + "logps/chosen": -174.0602569580078, + "logps/rejected": -175.3415069580078, + "loss": 0.6395, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.717233657836914, + "rewards/margins": 0.14980173110961914, + "rewards/rejected": -12.867035865783691, + "step": 1772 + }, + { + "epoch": 1.2235639123684665, + "grad_norm": 0.6777714490890503, + "learning_rate": 3.3978535837485632e-06, + "logits/chosen": 3.7146005630493164, + "logits/rejected": 3.8185553550720215, + "logps/chosen": -172.62713623046875, + "logps/rejected": -200.532470703125, + "loss": 0.4382, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.507999420166016, + "rewards/margins": 2.8859195709228516, + "rewards/rejected": -15.393918991088867, + "step": 1773 + }, + { + "epoch": 1.2242539244436776, + "grad_norm": 0.2617388069629669, + "learning_rate": 3.3997700268302036e-06, + "logits/chosen": 3.839905023574829, + "logits/rejected": 4.0446271896362305, + "logps/chosen": -187.7976531982422, + "logps/rejected": -194.58786010742188, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.021190643310547, + "rewards/margins": 0.6128736734390259, + "rewards/rejected": -14.634064674377441, + "step": 1774 + }, + { + "epoch": 1.224943936518889, + "grad_norm": 0.29112640023231506, + "learning_rate": 3.401686469911844e-06, + "logits/chosen": 4.047145843505859, + "logits/rejected": 4.230730056762695, + "logps/chosen": -165.3402862548828, + "logps/rejected": -190.25494384765625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.784238815307617, + "rewards/margins": 2.4432358741760254, + "rewards/rejected": -14.227474212646484, + "step": 1775 + }, + { + "epoch": 1.2256339485941004, + "grad_norm": 0.30679500102996826, + "learning_rate": 3.4036029129934844e-06, + "logits/chosen": 3.6965839862823486, + "logits/rejected": 3.6965839862823486, + "logps/chosen": -172.72402954101562, + "logps/rejected": -172.72402954101562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.24247932434082, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.24247932434082, + "step": 1776 + }, + { + "epoch": 1.2263239606693117, + "grad_norm": 0.29347434639930725, + "learning_rate": 3.405519356075125e-06, + "logits/chosen": 3.979041576385498, + "logits/rejected": 3.979041576385498, + "logps/chosen": -160.37576293945312, + "logps/rejected": -160.37576293945312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.189802169799805, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -11.189801216125488, + "step": 1777 + }, + { + "epoch": 1.227013972744523, + "grad_norm": 0.292664498090744, + "learning_rate": 3.407435799156765e-06, + "logits/chosen": 3.9676623344421387, + "logits/rejected": 3.9622607231140137, + "logps/chosen": -167.22616577148438, + "logps/rejected": -179.33139038085938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.891178131103516, + "rewards/margins": 1.2523301839828491, + "rewards/rejected": -13.143508911132812, + "step": 1778 + }, + { + "epoch": 1.2277039848197344, + "grad_norm": 0.38615670800209045, + "learning_rate": 3.4093522422384056e-06, + "logits/chosen": 4.248291015625, + "logits/rejected": 4.307526588439941, + "logps/chosen": -162.93060302734375, + "logps/rejected": -185.2127227783203, + "loss": 0.5211, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.566333770751953, + "rewards/margins": 2.1401901245117188, + "rewards/rejected": -13.706523895263672, + "step": 1779 + }, + { + "epoch": 1.2283939968949458, + "grad_norm": 8.170368194580078, + "learning_rate": 3.411268685320046e-06, + "logits/chosen": 4.405108451843262, + "logits/rejected": 4.502110481262207, + "logps/chosen": -183.46029663085938, + "logps/rejected": -196.55355834960938, + "loss": 0.5998, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.643976211547852, + "rewards/margins": 1.2994563579559326, + "rewards/rejected": -14.94343376159668, + "step": 1780 + }, + { + "epoch": 1.229084008970157, + "grad_norm": 0.3106750547885895, + "learning_rate": 3.4131851284016864e-06, + "logits/chosen": 4.314695358276367, + "logits/rejected": 4.314695358276367, + "logps/chosen": -181.96652221679688, + "logps/rejected": -181.96652221679688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.399612426757812, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.399613380432129, + "step": 1781 + }, + { + "epoch": 1.2297740210453683, + "grad_norm": 0.45414671301841736, + "learning_rate": 3.4151015714833276e-06, + "logits/chosen": 4.256650447845459, + "logits/rejected": 4.185462951660156, + "logps/chosen": -178.7359619140625, + "logps/rejected": -183.91775512695312, + "loss": 0.6083, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.132913589477539, + "rewards/margins": 0.529238760471344, + "rewards/rejected": -13.662151336669922, + "step": 1782 + }, + { + "epoch": 1.2304640331205796, + "grad_norm": 0.41452109813690186, + "learning_rate": 3.417018014564968e-06, + "logits/chosen": 4.361949920654297, + "logits/rejected": 4.361949920654297, + "logps/chosen": -186.80792236328125, + "logps/rejected": -186.80792236328125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.818979263305664, + "rewards/margins": 0.0, + "rewards/rejected": -13.818979263305664, + "step": 1783 + }, + { + "epoch": 1.231154045195791, + "grad_norm": 0.3118138015270233, + "learning_rate": 3.4189344576466084e-06, + "logits/chosen": 4.19906759262085, + "logits/rejected": 4.19906759262085, + "logps/chosen": -185.31173706054688, + "logps/rejected": -185.31173706054688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.868167877197266, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.868167877197266, + "step": 1784 + }, + { + "epoch": 1.2318440572710023, + "grad_norm": 0.3030991852283478, + "learning_rate": 3.4208509007282488e-06, + "logits/chosen": 4.060645580291748, + "logits/rejected": 4.060645580291748, + "logps/chosen": -169.22296142578125, + "logps/rejected": -169.22296142578125, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.238409042358398, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -12.238409042358398, + "step": 1785 + }, + { + "epoch": 1.2325340693462135, + "grad_norm": 0.6337924003601074, + "learning_rate": 3.422767343809889e-06, + "logits/chosen": 4.435661315917969, + "logits/rejected": 4.463829517364502, + "logps/chosen": -162.0928955078125, + "logps/rejected": -166.62155151367188, + "loss": 0.6091, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.623453140258789, + "rewards/margins": 0.4818253517150879, + "rewards/rejected": -12.105278015136719, + "step": 1786 + }, + { + "epoch": 1.2332240814214248, + "grad_norm": 12.002593994140625, + "learning_rate": 3.4246837868915296e-06, + "logits/chosen": 4.014657974243164, + "logits/rejected": 4.084079265594482, + "logps/chosen": -182.12405395507812, + "logps/rejected": -180.21669006347656, + "loss": 0.8436, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.292596817016602, + "rewards/margins": -0.21677881479263306, + "rewards/rejected": -13.075818061828613, + "step": 1787 + }, + { + "epoch": 1.2339140934966362, + "grad_norm": 0.3361181318759918, + "learning_rate": 3.42660022997317e-06, + "logits/chosen": 4.208474159240723, + "logits/rejected": 4.208474159240723, + "logps/chosen": -183.79351806640625, + "logps/rejected": -183.79351806640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.437480926513672, + "rewards/margins": 0.0, + "rewards/rejected": -13.437480926513672, + "step": 1788 + }, + { + "epoch": 1.2346041055718475, + "grad_norm": 0.32577040791511536, + "learning_rate": 3.4285166730548103e-06, + "logits/chosen": 4.2041473388671875, + "logits/rejected": 4.325019359588623, + "logps/chosen": -171.7764892578125, + "logps/rejected": -179.87147521972656, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.439230918884277, + "rewards/margins": 0.8373291492462158, + "rewards/rejected": -13.276559829711914, + "step": 1789 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 2.9363515377044678, + "learning_rate": 3.430433116136451e-06, + "logits/chosen": 4.03164005279541, + "logits/rejected": 4.15841007232666, + "logps/chosen": -166.91879272460938, + "logps/rejected": -188.94564819335938, + "loss": 0.4864, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.035709381103516, + "rewards/margins": 1.9237899780273438, + "rewards/rejected": -13.95949935913086, + "step": 1790 + }, + { + "epoch": 1.2359841297222702, + "grad_norm": 0.29297032952308655, + "learning_rate": 3.4323495592180915e-06, + "logits/chosen": 3.939937114715576, + "logits/rejected": 4.0002031326293945, + "logps/chosen": -181.20098876953125, + "logps/rejected": -191.84359741210938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.329355239868164, + "rewards/margins": 1.0466902256011963, + "rewards/rejected": -14.376045227050781, + "step": 1791 + }, + { + "epoch": 1.2366741417974814, + "grad_norm": 0.35609591007232666, + "learning_rate": 3.434266002299732e-06, + "logits/chosen": 4.062306880950928, + "logits/rejected": 4.105045318603516, + "logps/chosen": -178.6389617919922, + "logps/rejected": -185.11807250976562, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.057280540466309, + "rewards/margins": 0.6451311111450195, + "rewards/rejected": -13.702411651611328, + "step": 1792 + }, + { + "epoch": 1.2373641538726927, + "grad_norm": 1.3483812808990479, + "learning_rate": 3.4361824453813723e-06, + "logits/chosen": 3.9919824600219727, + "logits/rejected": 4.088190078735352, + "logps/chosen": -173.86618041992188, + "logps/rejected": -184.93814086914062, + "loss": 0.5275, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.506168365478516, + "rewards/margins": 1.0414167642593384, + "rewards/rejected": -13.547586441040039, + "step": 1793 + }, + { + "epoch": 1.238054165947904, + "grad_norm": 0.3440602421760559, + "learning_rate": 3.4380988884630127e-06, + "logits/chosen": 4.094634056091309, + "logits/rejected": 4.318488597869873, + "logps/chosen": -169.8606414794922, + "logps/rejected": -181.80255126953125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.18132495880127, + "rewards/margins": 1.1926605701446533, + "rewards/rejected": -13.373984336853027, + "step": 1794 + }, + { + "epoch": 1.2387441780231154, + "grad_norm": 0.30656698346138, + "learning_rate": 3.440015331544653e-06, + "logits/chosen": 3.9349279403686523, + "logits/rejected": 4.101962089538574, + "logps/chosen": -165.33969116210938, + "logps/rejected": -189.5794219970703, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.608260154724121, + "rewards/margins": 2.456531286239624, + "rewards/rejected": -14.064790725708008, + "step": 1795 + }, + { + "epoch": 1.2394341900983268, + "grad_norm": 1.2538623809814453, + "learning_rate": 3.4419317746262935e-06, + "logits/chosen": 4.2069525718688965, + "logits/rejected": 4.214191436767578, + "logps/chosen": -174.39306640625, + "logps/rejected": -178.2779541015625, + "loss": 0.6174, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.866581916809082, + "rewards/margins": 0.29895949363708496, + "rewards/rejected": -13.16554069519043, + "step": 1796 + }, + { + "epoch": 1.2401242021735381, + "grad_norm": 0.27425768971443176, + "learning_rate": 3.443848217707934e-06, + "logits/chosen": 4.611006259918213, + "logits/rejected": 4.611006259918213, + "logps/chosen": -179.7328643798828, + "logps/rejected": -179.73284912109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.216907501220703, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.216907501220703, + "step": 1797 + }, + { + "epoch": 1.2408142142487493, + "grad_norm": 0.3461928963661194, + "learning_rate": 3.445764660789575e-06, + "logits/chosen": 4.573531150817871, + "logits/rejected": 4.573531150817871, + "logps/chosen": -191.83285522460938, + "logps/rejected": -191.83285522460938, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.22564697265625, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -14.225648880004883, + "step": 1798 + }, + { + "epoch": 1.2415042263239606, + "grad_norm": 0.2513272762298584, + "learning_rate": 3.4476811038712155e-06, + "logits/chosen": 3.929018974304199, + "logits/rejected": 4.100683212280273, + "logps/chosen": -141.64413452148438, + "logps/rejected": -165.26132202148438, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.27607250213623, + "rewards/margins": 2.4049363136291504, + "rewards/rejected": -11.681008338928223, + "step": 1799 + }, + { + "epoch": 1.242194238399172, + "grad_norm": 0.9757115840911865, + "learning_rate": 3.449597546952856e-06, + "logits/chosen": 4.337708473205566, + "logits/rejected": 4.381105899810791, + "logps/chosen": -184.68551635742188, + "logps/rejected": -188.40069580078125, + "loss": 0.6134, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.9052734375, + "rewards/margins": 0.35913002490997314, + "rewards/rejected": -14.264402389526367, + "step": 1800 + }, + { + "epoch": 1.2428842504743833, + "grad_norm": 0.29805347323417664, + "learning_rate": 3.4515139900344963e-06, + "logits/chosen": 3.969207286834717, + "logits/rejected": 4.031528472900391, + "logps/chosen": -171.4871368408203, + "logps/rejected": -194.10354614257812, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.329029083251953, + "rewards/margins": 2.2375707626342773, + "rewards/rejected": -14.56659984588623, + "step": 1801 + }, + { + "epoch": 1.2435742625495947, + "grad_norm": 1.6450281143188477, + "learning_rate": 3.4534304331161367e-06, + "logits/chosen": 4.222562313079834, + "logits/rejected": 4.24492883682251, + "logps/chosen": -170.49447631835938, + "logps/rejected": -174.2600860595703, + "loss": 0.6147, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.245352745056152, + "rewards/margins": 0.33660268783569336, + "rewards/rejected": -12.581954956054688, + "step": 1802 + }, + { + "epoch": 1.2442642746248058, + "grad_norm": 0.30467522144317627, + "learning_rate": 3.455346876197777e-06, + "logits/chosen": 4.352588176727295, + "logits/rejected": 4.352588176727295, + "logps/chosen": -176.97390747070312, + "logps/rejected": -176.97390747070312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.944341659545898, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.944341659545898, + "step": 1803 + }, + { + "epoch": 1.2449542867000172, + "grad_norm": 0.2918197214603424, + "learning_rate": 3.4572633192794175e-06, + "logits/chosen": 3.8177075386047363, + "logits/rejected": 4.218260765075684, + "logps/chosen": -161.81600952148438, + "logps/rejected": -189.30325317382812, + "loss": 0.4335, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.274943351745605, + "rewards/margins": 2.7889139652252197, + "rewards/rejected": -14.063857078552246, + "step": 1804 + }, + { + "epoch": 1.2456442987752285, + "grad_norm": 0.2986694276332855, + "learning_rate": 3.459179762361058e-06, + "logits/chosen": 4.255229949951172, + "logits/rejected": 4.255229949951172, + "logps/chosen": -205.56787109375, + "logps/rejected": -205.56785583496094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -15.613914489746094, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -15.613914489746094, + "step": 1805 + }, + { + "epoch": 1.2463343108504399, + "grad_norm": 0.36078473925590515, + "learning_rate": 3.4610962054426987e-06, + "logits/chosen": 4.282121181488037, + "logits/rejected": 4.282121181488037, + "logps/chosen": -184.9735870361328, + "logps/rejected": -184.9735870361328, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.791098594665527, + "rewards/margins": 0.0, + "rewards/rejected": -13.791098594665527, + "step": 1806 + }, + { + "epoch": 1.2470243229256512, + "grad_norm": 0.3362979292869568, + "learning_rate": 3.463012648524339e-06, + "logits/chosen": 4.309996604919434, + "logits/rejected": 4.309996604919434, + "logps/chosen": -175.7896270751953, + "logps/rejected": -175.7896270751953, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.800878524780273, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.800878524780273, + "step": 1807 + }, + { + "epoch": 1.2477143350008626, + "grad_norm": 0.6558471322059631, + "learning_rate": 3.4649290916059795e-06, + "logits/chosen": 4.309338092803955, + "logits/rejected": 4.324255466461182, + "logps/chosen": -166.35899353027344, + "logps/rejected": -185.97438049316406, + "loss": 0.4393, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.8518648147583, + "rewards/margins": 1.9561946392059326, + "rewards/rejected": -13.808059692382812, + "step": 1808 + }, + { + "epoch": 1.248404347076074, + "grad_norm": 11.222615242004395, + "learning_rate": 3.4668455346876203e-06, + "logits/chosen": 4.393183708190918, + "logits/rejected": 4.392265796661377, + "logps/chosen": -173.18115234375, + "logps/rejected": -181.4408416748047, + "loss": 0.6116, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.534039497375488, + "rewards/margins": 0.8336576223373413, + "rewards/rejected": -13.367697715759277, + "step": 1809 + }, + { + "epoch": 1.249094359151285, + "grad_norm": 0.378642737865448, + "learning_rate": 3.4687619777692607e-06, + "logits/chosen": 3.999401092529297, + "logits/rejected": 3.999401092529297, + "logps/chosen": -180.07223510742188, + "logps/rejected": -180.07223510742188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.246509552001953, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.246509552001953, + "step": 1810 + }, + { + "epoch": 1.2497843712264964, + "grad_norm": 0.4219064712524414, + "learning_rate": 3.470678420850901e-06, + "logits/chosen": 4.407527923583984, + "logits/rejected": 4.407527923583984, + "logps/chosen": -180.92266845703125, + "logps/rejected": -180.92266845703125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.271018981933594, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.271018981933594, + "step": 1811 + }, + { + "epoch": 1.2504743833017078, + "grad_norm": 0.2654617130756378, + "learning_rate": 3.4725948639325415e-06, + "logits/chosen": 4.154045104980469, + "logits/rejected": 4.3500823974609375, + "logps/chosen": -178.3434295654297, + "logps/rejected": -185.79205322265625, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.968164443969727, + "rewards/margins": 0.7683937549591064, + "rewards/rejected": -13.73655891418457, + "step": 1812 + }, + { + "epoch": 1.2511643953769191, + "grad_norm": 0.31171515583992004, + "learning_rate": 3.474511307014182e-06, + "logits/chosen": 3.979607343673706, + "logits/rejected": 4.107302665710449, + "logps/chosen": -171.95733642578125, + "logps/rejected": -191.43209838867188, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.383993148803711, + "rewards/margins": 2.020716667175293, + "rewards/rejected": -14.40471076965332, + "step": 1813 + }, + { + "epoch": 1.2518544074521305, + "grad_norm": 0.3643907606601715, + "learning_rate": 3.4764277500958227e-06, + "logits/chosen": 4.063708782196045, + "logits/rejected": 4.063708782196045, + "logps/chosen": -180.70867919921875, + "logps/rejected": -180.70867919921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.495896339416504, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.49589729309082, + "step": 1814 + }, + { + "epoch": 1.2525444195273416, + "grad_norm": 0.30167922377586365, + "learning_rate": 3.478344193177463e-06, + "logits/chosen": 4.233283042907715, + "logits/rejected": 4.233283042907715, + "logps/chosen": -171.70330810546875, + "logps/rejected": -171.70330810546875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.302865028381348, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.302865028381348, + "step": 1815 + }, + { + "epoch": 1.253234431602553, + "grad_norm": 0.8639237284660339, + "learning_rate": 3.4802606362591034e-06, + "logits/chosen": 4.0441083908081055, + "logits/rejected": 4.042003631591797, + "logps/chosen": -177.87115478515625, + "logps/rejected": -181.31491088867188, + "loss": 0.6116, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.039985656738281, + "rewards/margins": 0.39698636531829834, + "rewards/rejected": -13.436971664428711, + "step": 1816 + }, + { + "epoch": 1.2539244436777643, + "grad_norm": 0.30656003952026367, + "learning_rate": 3.482177079340744e-06, + "logits/chosen": 3.916926383972168, + "logits/rejected": 4.014863014221191, + "logps/chosen": -176.7698974609375, + "logps/rejected": -196.3563995361328, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.034751892089844, + "rewards/margins": 1.8627289533615112, + "rewards/rejected": -14.897480010986328, + "step": 1817 + }, + { + "epoch": 1.2546144557529757, + "grad_norm": 11.941266059875488, + "learning_rate": 3.4840935224223842e-06, + "logits/chosen": 4.013296604156494, + "logits/rejected": 4.022730827331543, + "logps/chosen": -177.7023162841797, + "logps/rejected": -177.5663299560547, + "loss": 0.7177, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.010932922363281, + "rewards/margins": -0.045149922370910645, + "rewards/rejected": -12.965784072875977, + "step": 1818 + }, + { + "epoch": 1.255304467828187, + "grad_norm": 0.4356548488140106, + "learning_rate": 3.4860099655040246e-06, + "logits/chosen": 4.432125568389893, + "logits/rejected": 4.503536701202393, + "logps/chosen": -166.3892364501953, + "logps/rejected": -172.02328491210938, + "loss": 0.6081, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.753229141235352, + "rewards/margins": 0.5482046604156494, + "rewards/rejected": -12.301435470581055, + "step": 1819 + }, + { + "epoch": 1.2559944799033982, + "grad_norm": 0.41677334904670715, + "learning_rate": 3.487926408585665e-06, + "logits/chosen": 4.097264766693115, + "logits/rejected": 4.144359588623047, + "logps/chosen": -187.54251098632812, + "logps/rejected": -193.54933166503906, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.041181564331055, + "rewards/margins": 0.5911664962768555, + "rewards/rejected": -14.632349014282227, + "step": 1820 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 0.31329503655433655, + "learning_rate": 3.4898428516673054e-06, + "logits/chosen": 3.5782670974731445, + "logits/rejected": 3.5782670974731445, + "logps/chosen": -171.54006958007812, + "logps/rejected": -171.54006958007812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.552772521972656, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -12.552772521972656, + "step": 1821 + }, + { + "epoch": 1.257374504053821, + "grad_norm": 0.29479071497917175, + "learning_rate": 3.4917592947489466e-06, + "logits/chosen": 3.7712197303771973, + "logits/rejected": 3.9025206565856934, + "logps/chosen": -167.92752075195312, + "logps/rejected": -176.129638671875, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.939541816711426, + "rewards/margins": 0.845208466053009, + "rewards/rejected": -12.784749984741211, + "step": 1822 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.33294135332107544, + "learning_rate": 3.493675737830587e-06, + "logits/chosen": 4.160220146179199, + "logits/rejected": 4.242918014526367, + "logps/chosen": -163.5254669189453, + "logps/rejected": -169.90188598632812, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.806720733642578, + "rewards/margins": 0.6488152742385864, + "rewards/rejected": -12.455535888671875, + "step": 1823 + }, + { + "epoch": 1.2587545282042436, + "grad_norm": 0.3025960922241211, + "learning_rate": 3.4955921809122274e-06, + "logits/chosen": 4.072578430175781, + "logits/rejected": 4.072578430175781, + "logps/chosen": -174.48960876464844, + "logps/rejected": -174.48960876464844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.815689086914062, + "rewards/margins": 0.0, + "rewards/rejected": -12.815689086914062, + "step": 1824 + }, + { + "epoch": 1.259444540279455, + "grad_norm": 0.6985302567481995, + "learning_rate": 3.497508623993868e-06, + "logits/chosen": 4.312713623046875, + "logits/rejected": 4.277777671813965, + "logps/chosen": -178.62445068359375, + "logps/rejected": -183.28768920898438, + "loss": 0.6091, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.113834381103516, + "rewards/margins": 0.4845426082611084, + "rewards/rejected": -13.598377227783203, + "step": 1825 + }, + { + "epoch": 1.2601345523546663, + "grad_norm": 0.4045969247817993, + "learning_rate": 3.499425067075508e-06, + "logits/chosen": 3.918578624725342, + "logits/rejected": 4.025628566741943, + "logps/chosen": -158.7078857421875, + "logps/rejected": -179.15838623046875, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.976239204406738, + "rewards/margins": 2.056375503540039, + "rewards/rejected": -13.032615661621094, + "step": 1826 + }, + { + "epoch": 1.2608245644298774, + "grad_norm": 0.3356713652610779, + "learning_rate": 3.5013415101571486e-06, + "logits/chosen": 4.266184329986572, + "logits/rejected": 4.409284591674805, + "logps/chosen": -163.13204956054688, + "logps/rejected": -181.7030029296875, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.549956321716309, + "rewards/margins": 1.8286947011947632, + "rewards/rejected": -13.37865161895752, + "step": 1827 + }, + { + "epoch": 1.2615145765050888, + "grad_norm": 0.3412630259990692, + "learning_rate": 3.503257953238789e-06, + "logits/chosen": 4.2196550369262695, + "logits/rejected": 4.304304122924805, + "logps/chosen": -167.3744659423828, + "logps/rejected": -175.23922729492188, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.844758033752441, + "rewards/margins": 0.7754783630371094, + "rewards/rejected": -12.62023639678955, + "step": 1828 + }, + { + "epoch": 1.2622045885803002, + "grad_norm": 0.3341160714626312, + "learning_rate": 3.5051743963204294e-06, + "logits/chosen": 4.304479122161865, + "logits/rejected": 4.304479122161865, + "logps/chosen": -188.94561767578125, + "logps/rejected": -188.94561767578125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.003576278686523, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.003576278686523, + "step": 1829 + }, + { + "epoch": 1.2628946006555115, + "grad_norm": 0.2964378893375397, + "learning_rate": 3.50709083940207e-06, + "logits/chosen": 4.437122821807861, + "logits/rejected": 4.484058380126953, + "logps/chosen": -193.14801025390625, + "logps/rejected": -200.6092987060547, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.336429595947266, + "rewards/margins": 0.7529264688491821, + "rewards/rejected": -15.089356422424316, + "step": 1830 + }, + { + "epoch": 1.2635846127307229, + "grad_norm": 0.32040154933929443, + "learning_rate": 3.5090072824837106e-06, + "logits/chosen": 4.195883274078369, + "logits/rejected": 4.195883274078369, + "logps/chosen": -179.7950439453125, + "logps/rejected": -179.7950439453125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.207724571228027, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.207725524902344, + "step": 1831 + }, + { + "epoch": 1.264274624805934, + "grad_norm": 0.3278096616268158, + "learning_rate": 3.510923725565351e-06, + "logits/chosen": 4.041449069976807, + "logits/rejected": 4.061744213104248, + "logps/chosen": -151.62191772460938, + "logps/rejected": -162.57269287109375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.54043960571289, + "rewards/margins": 1.1084918975830078, + "rewards/rejected": -11.648932456970215, + "step": 1832 + }, + { + "epoch": 1.2649646368811454, + "grad_norm": 0.4078809916973114, + "learning_rate": 3.5128401686469914e-06, + "logits/chosen": 3.943512439727783, + "logits/rejected": 4.070388317108154, + "logps/chosen": -176.86831665039062, + "logps/rejected": -186.07510375976562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.92752456665039, + "rewards/margins": 0.9344078302383423, + "rewards/rejected": -13.861932754516602, + "step": 1833 + }, + { + "epoch": 1.2656546489563567, + "grad_norm": 0.38007593154907227, + "learning_rate": 3.5147566117286318e-06, + "logits/chosen": 4.092801094055176, + "logits/rejected": 4.247892379760742, + "logps/chosen": -157.98782348632812, + "logps/rejected": -167.3106689453125, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.063810348510742, + "rewards/margins": 0.9285882711410522, + "rewards/rejected": -11.992399215698242, + "step": 1834 + }, + { + "epoch": 1.266344661031568, + "grad_norm": 0.28626543283462524, + "learning_rate": 3.516673054810272e-06, + "logits/chosen": 3.9860551357269287, + "logits/rejected": 4.030580520629883, + "logps/chosen": -164.47659301757812, + "logps/rejected": -174.8052978515625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.867992401123047, + "rewards/margins": 1.0494019985198975, + "rewards/rejected": -12.917394638061523, + "step": 1835 + }, + { + "epoch": 1.2670346731067794, + "grad_norm": 14.163237571716309, + "learning_rate": 3.5185894978919125e-06, + "logits/chosen": 4.269408226013184, + "logits/rejected": 4.312960624694824, + "logps/chosen": -170.25579833984375, + "logps/rejected": -171.40956115722656, + "loss": 1.1914, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.361933708190918, + "rewards/margins": 0.06378704309463501, + "rewards/rejected": -12.42572021484375, + "step": 1836 + }, + { + "epoch": 1.2677246851819906, + "grad_norm": 0.28170767426490784, + "learning_rate": 3.520505940973553e-06, + "logits/chosen": 4.248772621154785, + "logits/rejected": 4.336453437805176, + "logps/chosen": -187.2693328857422, + "logps/rejected": -197.33749389648438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.940577507019043, + "rewards/margins": 1.0022683143615723, + "rewards/rejected": -14.94284439086914, + "step": 1837 + }, + { + "epoch": 1.2684146972572021, + "grad_norm": 0.3721306324005127, + "learning_rate": 3.522422384055194e-06, + "logits/chosen": 3.8645405769348145, + "logits/rejected": 3.8645405769348145, + "logps/chosen": -162.61390686035156, + "logps/rejected": -162.61390686035156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.649354934692383, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.64935302734375, + "step": 1838 + }, + { + "epoch": 1.2691047093324133, + "grad_norm": 0.3650352656841278, + "learning_rate": 3.5243388271368346e-06, + "logits/chosen": 4.414923191070557, + "logits/rejected": 4.414923191070557, + "logps/chosen": -176.5980987548828, + "logps/rejected": -176.5980987548828, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.946438789367676, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.94643783569336, + "step": 1839 + }, + { + "epoch": 1.2697947214076246, + "grad_norm": 0.40957847237586975, + "learning_rate": 3.526255270218475e-06, + "logits/chosen": 4.119388580322266, + "logits/rejected": 4.236598968505859, + "logps/chosen": -170.5784454345703, + "logps/rejected": -185.6527099609375, + "loss": 0.5221, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.313650131225586, + "rewards/margins": 1.4381797313690186, + "rewards/rejected": -13.7518310546875, + "step": 1840 + }, + { + "epoch": 1.270484733482836, + "grad_norm": 0.33221811056137085, + "learning_rate": 3.5281717133001154e-06, + "logits/chosen": 4.529476165771484, + "logits/rejected": 4.5389533042907715, + "logps/chosen": -195.9189910888672, + "logps/rejected": -202.86366271972656, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.655858993530273, + "rewards/margins": 0.7347859144210815, + "rewards/rejected": -15.390645027160645, + "step": 1841 + }, + { + "epoch": 1.2711747455580473, + "grad_norm": 0.28085681796073914, + "learning_rate": 3.5300881563817557e-06, + "logits/chosen": 4.26706600189209, + "logits/rejected": 4.374635696411133, + "logps/chosen": -167.43133544921875, + "logps/rejected": -190.2669677734375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.765579223632812, + "rewards/margins": 2.420787811279297, + "rewards/rejected": -14.18636703491211, + "step": 1842 + }, + { + "epoch": 1.2718647576332587, + "grad_norm": 0.3602791130542755, + "learning_rate": 3.532004599463396e-06, + "logits/chosen": 4.436900615692139, + "logits/rejected": 4.436900615692139, + "logps/chosen": -194.095703125, + "logps/rejected": -194.095703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.57612419128418, + "rewards/margins": 0.0, + "rewards/rejected": -14.57612419128418, + "step": 1843 + }, + { + "epoch": 1.2725547697084698, + "grad_norm": 0.31200501322746277, + "learning_rate": 3.5339210425450365e-06, + "logits/chosen": 4.238563537597656, + "logits/rejected": 4.238563537597656, + "logps/chosen": -192.10800170898438, + "logps/rejected": -192.1079864501953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.504467010498047, + "rewards/margins": -8.344650268554688e-07, + "rewards/rejected": -14.50446605682373, + "step": 1844 + }, + { + "epoch": 1.2732447817836812, + "grad_norm": 0.33224159479141235, + "learning_rate": 3.535837485626677e-06, + "logits/chosen": 4.245329856872559, + "logits/rejected": 4.240787506103516, + "logps/chosen": -180.74374389648438, + "logps/rejected": -190.5955810546875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.227447509765625, + "rewards/margins": 1.00412118434906, + "rewards/rejected": -14.231569290161133, + "step": 1845 + }, + { + "epoch": 1.2739347938588925, + "grad_norm": 0.403289794921875, + "learning_rate": 3.5377539287083177e-06, + "logits/chosen": 4.191767692565918, + "logits/rejected": 4.142736911773682, + "logps/chosen": -167.99169921875, + "logps/rejected": -174.821044921875, + "loss": 0.6073, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.179862976074219, + "rewards/margins": 0.6385079622268677, + "rewards/rejected": -12.818371772766113, + "step": 1846 + }, + { + "epoch": 1.2746248059341039, + "grad_norm": 0.37203001976013184, + "learning_rate": 3.539670371789958e-06, + "logits/chosen": 4.2244648933410645, + "logits/rejected": 4.236477375030518, + "logps/chosen": -188.57666015625, + "logps/rejected": -198.19326782226562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.142182350158691, + "rewards/margins": 1.0026265382766724, + "rewards/rejected": -15.144808769226074, + "step": 1847 + }, + { + "epoch": 1.2753148180093152, + "grad_norm": 0.29138895869255066, + "learning_rate": 3.5415868148715985e-06, + "logits/chosen": 3.9122374057769775, + "logits/rejected": 4.162769794464111, + "logps/chosen": -149.99624633789062, + "logps/rejected": -180.66665649414062, + "loss": 0.3481, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.269234657287598, + "rewards/margins": 3.069061756134033, + "rewards/rejected": -13.338294982910156, + "step": 1848 + }, + { + "epoch": 1.2760048300845264, + "grad_norm": 0.35245561599731445, + "learning_rate": 3.5435032579532393e-06, + "logits/chosen": 4.500369071960449, + "logits/rejected": 4.500369071960449, + "logps/chosen": -166.60275268554688, + "logps/rejected": -166.60275268554688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.803791046142578, + "rewards/margins": 0.0, + "rewards/rejected": -11.803791046142578, + "step": 1849 + }, + { + "epoch": 1.2766948421597377, + "grad_norm": 0.2810501754283905, + "learning_rate": 3.5454197010348797e-06, + "logits/chosen": 4.206335067749023, + "logits/rejected": 4.326493263244629, + "logps/chosen": -168.88404846191406, + "logps/rejected": -195.36285400390625, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.943363189697266, + "rewards/margins": 2.696275234222412, + "rewards/rejected": -14.639638900756836, + "step": 1850 + }, + { + "epoch": 1.277384854234949, + "grad_norm": 0.36804863810539246, + "learning_rate": 3.54733614411652e-06, + "logits/chosen": 3.995542287826538, + "logits/rejected": 4.4631147384643555, + "logps/chosen": -153.04649353027344, + "logps/rejected": -184.30068969726562, + "loss": 0.4334, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.626119613647461, + "rewards/margins": 3.0383830070495605, + "rewards/rejected": -13.66450309753418, + "step": 1851 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 0.3793262541294098, + "learning_rate": 3.5492525871981605e-06, + "logits/chosen": 4.082161903381348, + "logits/rejected": 4.082161903381348, + "logps/chosen": -171.26824951171875, + "logps/rejected": -171.26828002929688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.315546035766602, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.315546035766602, + "step": 1852 + }, + { + "epoch": 1.2787648783853718, + "grad_norm": 1.3838214874267578, + "learning_rate": 3.551169030279801e-06, + "logits/chosen": 4.416932106018066, + "logits/rejected": 4.473663806915283, + "logps/chosen": -174.61581420898438, + "logps/rejected": -182.60870361328125, + "loss": 0.5322, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.669696807861328, + "rewards/margins": 0.8261429071426392, + "rewards/rejected": -13.495840072631836, + "step": 1853 + }, + { + "epoch": 1.2794548904605831, + "grad_norm": 0.348906010389328, + "learning_rate": 3.5530854733614417e-06, + "logits/chosen": 4.659269332885742, + "logits/rejected": 4.659269332885742, + "logps/chosen": -195.74609375, + "logps/rejected": -195.74609375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.936078071594238, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.936078071594238, + "step": 1854 + }, + { + "epoch": 1.2801449025357945, + "grad_norm": 0.3816656768321991, + "learning_rate": 3.555001916443082e-06, + "logits/chosen": 4.091513156890869, + "logits/rejected": 4.216911315917969, + "logps/chosen": -178.13966369628906, + "logps/rejected": -199.6290283203125, + "loss": 0.4348, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.070879936218262, + "rewards/margins": 2.1507229804992676, + "rewards/rejected": -15.221603393554688, + "step": 1855 + }, + { + "epoch": 1.2808349146110056, + "grad_norm": 0.45346787571907043, + "learning_rate": 3.5569183595247225e-06, + "logits/chosen": 4.70352840423584, + "logits/rejected": 4.755237579345703, + "logps/chosen": -176.05262756347656, + "logps/rejected": -186.49624633789062, + "loss": 0.5234, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.744291305541992, + "rewards/margins": 1.1276531219482422, + "rewards/rejected": -13.871944427490234, + "step": 1856 + }, + { + "epoch": 1.281524926686217, + "grad_norm": 0.32933926582336426, + "learning_rate": 3.558834802606363e-06, + "logits/chosen": 4.339469909667969, + "logits/rejected": 4.328652381896973, + "logps/chosen": -182.1263427734375, + "logps/rejected": -192.90176391601562, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.333906173706055, + "rewards/margins": 1.1383062601089478, + "rewards/rejected": -14.472211837768555, + "step": 1857 + }, + { + "epoch": 1.2822149387614283, + "grad_norm": 0.31387558579444885, + "learning_rate": 3.5607512456880033e-06, + "logits/chosen": 4.3025031089782715, + "logits/rejected": 4.3025031089782715, + "logps/chosen": -176.728759765625, + "logps/rejected": -176.728759765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.880963325500488, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.880962371826172, + "step": 1858 + }, + { + "epoch": 1.2829049508366397, + "grad_norm": 0.505490779876709, + "learning_rate": 3.5626676887696437e-06, + "logits/chosen": 4.169312000274658, + "logits/rejected": 4.1959638595581055, + "logps/chosen": -171.61163330078125, + "logps/rejected": -185.808349609375, + "loss": 0.5227, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.44958782196045, + "rewards/margins": 1.429908037185669, + "rewards/rejected": -13.879495620727539, + "step": 1859 + }, + { + "epoch": 1.283594962911851, + "grad_norm": 0.34964174032211304, + "learning_rate": 3.564584131851284e-06, + "logits/chosen": 4.117170810699463, + "logits/rejected": 4.140529632568359, + "logps/chosen": -170.58447265625, + "logps/rejected": -178.55360412597656, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.136178970336914, + "rewards/margins": 0.8301352262496948, + "rewards/rejected": -12.966314315795898, + "step": 1860 + }, + { + "epoch": 1.2842849749870622, + "grad_norm": 9.122137069702148, + "learning_rate": 3.5665005749329245e-06, + "logits/chosen": 4.292396545410156, + "logits/rejected": 4.286134243011475, + "logps/chosen": -177.66864013671875, + "logps/rejected": -176.78482055664062, + "loss": 0.7658, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.907583236694336, + "rewards/margins": -0.11834269762039185, + "rewards/rejected": -12.789239883422852, + "step": 1861 + }, + { + "epoch": 1.2849749870622735, + "grad_norm": 0.3334592580795288, + "learning_rate": 3.5684170180145657e-06, + "logits/chosen": 4.096762657165527, + "logits/rejected": 4.099510192871094, + "logps/chosen": -173.6639862060547, + "logps/rejected": -184.19256591796875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.911408424377441, + "rewards/margins": 1.0514081716537476, + "rewards/rejected": -13.96281623840332, + "step": 1862 + }, + { + "epoch": 1.2856649991374849, + "grad_norm": 0.35210439562797546, + "learning_rate": 3.570333461096206e-06, + "logits/chosen": 4.002971649169922, + "logits/rejected": 4.002971649169922, + "logps/chosen": -172.0965118408203, + "logps/rejected": -172.0965118408203, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.713675498962402, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.713675498962402, + "step": 1863 + }, + { + "epoch": 1.2863550112126962, + "grad_norm": 0.374491810798645, + "learning_rate": 3.5722499041778465e-06, + "logits/chosen": 3.7504124641418457, + "logits/rejected": 3.7504124641418457, + "logps/chosen": -159.28932189941406, + "logps/rejected": -159.289306640625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.26346206665039, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.26346206665039, + "step": 1864 + }, + { + "epoch": 1.2870450232879076, + "grad_norm": 0.36072349548339844, + "learning_rate": 3.574166347259487e-06, + "logits/chosen": 4.202032566070557, + "logits/rejected": 4.233292102813721, + "logps/chosen": -174.23516845703125, + "logps/rejected": -180.64456176757812, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.61706829071045, + "rewards/margins": 0.6478426456451416, + "rewards/rejected": -13.264910697937012, + "step": 1865 + }, + { + "epoch": 1.2877350353631187, + "grad_norm": 5.650665283203125, + "learning_rate": 3.5760827903411273e-06, + "logits/chosen": 4.3407206535339355, + "logits/rejected": 4.344182014465332, + "logps/chosen": -177.83607482910156, + "logps/rejected": -178.33721923828125, + "loss": 0.6666, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.918194770812988, + "rewards/margins": 0.06019389629364014, + "rewards/rejected": -12.978388786315918, + "step": 1866 + }, + { + "epoch": 1.2884250474383303, + "grad_norm": 0.2527635097503662, + "learning_rate": 3.5779992334227677e-06, + "logits/chosen": 3.8761942386627197, + "logits/rejected": 4.097087860107422, + "logps/chosen": -155.76541137695312, + "logps/rejected": -185.38772583007812, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.7186279296875, + "rewards/margins": 3.0078139305114746, + "rewards/rejected": -13.726442337036133, + "step": 1867 + }, + { + "epoch": 1.2891150595135414, + "grad_norm": 0.28390592336654663, + "learning_rate": 3.579915676504408e-06, + "logits/chosen": 3.998406410217285, + "logits/rejected": 4.086825847625732, + "logps/chosen": -173.12417602539062, + "logps/rejected": -183.848388671875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.330774307250977, + "rewards/margins": 1.0929148197174072, + "rewards/rejected": -13.423688888549805, + "step": 1868 + }, + { + "epoch": 1.2898050715887528, + "grad_norm": 0.34624168276786804, + "learning_rate": 3.5818321195860484e-06, + "logits/chosen": 4.508293151855469, + "logits/rejected": 4.508293151855469, + "logps/chosen": -178.8085174560547, + "logps/rejected": -178.8085174560547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.989432334899902, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.989432334899902, + "step": 1869 + }, + { + "epoch": 1.2904950836639641, + "grad_norm": 0.6613948941230774, + "learning_rate": 3.5837485626676892e-06, + "logits/chosen": 4.0966715812683105, + "logits/rejected": 4.116164207458496, + "logps/chosen": -189.6299591064453, + "logps/rejected": -193.86013793945312, + "loss": 0.6114, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.261624336242676, + "rewards/margins": 0.40333080291748047, + "rewards/rejected": -14.664955139160156, + "step": 1870 + }, + { + "epoch": 1.2911850957391755, + "grad_norm": 6.9484381675720215, + "learning_rate": 3.5856650057493296e-06, + "logits/chosen": 4.076606750488281, + "logits/rejected": 4.142425537109375, + "logps/chosen": -187.99264526367188, + "logps/rejected": -191.60781860351562, + "loss": 0.6382, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.078243255615234, + "rewards/margins": 0.3552132844924927, + "rewards/rejected": -14.433456420898438, + "step": 1871 + }, + { + "epoch": 1.2918751078143869, + "grad_norm": 0.36737918853759766, + "learning_rate": 3.58758144883097e-06, + "logits/chosen": 4.397580146789551, + "logits/rejected": 4.447007656097412, + "logps/chosen": -182.65921020507812, + "logps/rejected": -188.4849853515625, + "loss": 0.6081, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.490324020385742, + "rewards/margins": 0.5448397397994995, + "rewards/rejected": -14.035163879394531, + "step": 1872 + }, + { + "epoch": 1.292565119889598, + "grad_norm": 0.3905442953109741, + "learning_rate": 3.5894978919126104e-06, + "logits/chosen": 3.9134395122528076, + "logits/rejected": 3.9134395122528076, + "logps/chosen": -176.3345947265625, + "logps/rejected": -176.3345947265625, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.830568313598633, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.830568313598633, + "step": 1873 + }, + { + "epoch": 1.2932551319648093, + "grad_norm": 1.160035490989685, + "learning_rate": 3.591414334994251e-06, + "logits/chosen": 4.208694934844971, + "logits/rejected": 4.20731258392334, + "logps/chosen": -167.11981201171875, + "logps/rejected": -169.9462127685547, + "loss": 0.6161, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.885117530822754, + "rewards/margins": 0.31632041931152344, + "rewards/rejected": -12.201436996459961, + "step": 1874 + }, + { + "epoch": 1.2939451440400207, + "grad_norm": 0.33775627613067627, + "learning_rate": 3.593330778075891e-06, + "logits/chosen": 4.446374893188477, + "logits/rejected": 4.517077922821045, + "logps/chosen": -182.24667358398438, + "logps/rejected": -198.69757080078125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.492176055908203, + "rewards/margins": 1.5426583290100098, + "rewards/rejected": -15.034833908081055, + "step": 1875 + }, + { + "epoch": 1.294635156115232, + "grad_norm": 0.383590430021286, + "learning_rate": 3.5952472211575316e-06, + "logits/chosen": 4.07703971862793, + "logits/rejected": 4.217133522033691, + "logps/chosen": -154.26097106933594, + "logps/rejected": -181.47015380859375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.827523231506348, + "rewards/margins": 2.6440553665161133, + "rewards/rejected": -13.471578598022461, + "step": 1876 + }, + { + "epoch": 1.2953251681904434, + "grad_norm": 0.32511207461357117, + "learning_rate": 3.597163664239172e-06, + "logits/chosen": 4.108731269836426, + "logits/rejected": 4.173222541809082, + "logps/chosen": -156.192138671875, + "logps/rejected": -189.6239776611328, + "loss": 0.4345, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.740941047668457, + "rewards/margins": 3.2946155071258545, + "rewards/rejected": -14.03555679321289, + "step": 1877 + }, + { + "epoch": 1.2960151802656545, + "grad_norm": 0.917522132396698, + "learning_rate": 3.5990801073208132e-06, + "logits/chosen": 4.11113977432251, + "logits/rejected": 4.158397197723389, + "logps/chosen": -173.47543334960938, + "logps/rejected": -177.34266662597656, + "loss": 0.6148, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.657638549804688, + "rewards/margins": 0.3354341983795166, + "rewards/rejected": -12.993072509765625, + "step": 1878 + }, + { + "epoch": 1.296705192340866, + "grad_norm": 0.26311108469963074, + "learning_rate": 3.6009965504024536e-06, + "logits/chosen": 4.000591278076172, + "logits/rejected": 4.145511627197266, + "logps/chosen": -167.9515838623047, + "logps/rejected": -195.94931030273438, + "loss": 0.4335, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.955053329467773, + "rewards/margins": 2.8355650901794434, + "rewards/rejected": -14.790617942810059, + "step": 1879 + }, + { + "epoch": 1.2973952044160773, + "grad_norm": 0.35137563943862915, + "learning_rate": 3.602912993484094e-06, + "logits/chosen": 3.918455123901367, + "logits/rejected": 4.110045433044434, + "logps/chosen": -163.53201293945312, + "logps/rejected": -184.35891723632812, + "loss": 0.5208, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.653016090393066, + "rewards/margins": 2.0620455741882324, + "rewards/rejected": -13.715062141418457, + "step": 1880 + }, + { + "epoch": 1.2980852164912886, + "grad_norm": 0.38436052203178406, + "learning_rate": 3.6048294365657344e-06, + "logits/chosen": 3.9280920028686523, + "logits/rejected": 4.089461326599121, + "logps/chosen": -167.88754272460938, + "logps/rejected": -186.38006591796875, + "loss": 0.5207, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.95821475982666, + "rewards/margins": 1.830581545829773, + "rewards/rejected": -13.788796424865723, + "step": 1881 + }, + { + "epoch": 1.2987752285665, + "grad_norm": 23.124759674072266, + "learning_rate": 3.606745879647375e-06, + "logits/chosen": 3.8452725410461426, + "logits/rejected": 3.760542392730713, + "logps/chosen": -164.8763427734375, + "logps/rejected": -176.55679321289062, + "loss": 1.1055, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.760417938232422, + "rewards/margins": 1.0021830797195435, + "rewards/rejected": -12.762601852416992, + "step": 1882 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 0.3158876895904541, + "learning_rate": 3.608662322729015e-06, + "logits/chosen": 3.779120445251465, + "logits/rejected": 3.8732733726501465, + "logps/chosen": -162.4273223876953, + "logps/rejected": -186.1063690185547, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.663942337036133, + "rewards/margins": 2.3476295471191406, + "rewards/rejected": -14.011571884155273, + "step": 1883 + }, + { + "epoch": 1.3001552527169227, + "grad_norm": 0.3044103980064392, + "learning_rate": 3.6105787658106556e-06, + "logits/chosen": 4.308753967285156, + "logits/rejected": 4.390594482421875, + "logps/chosen": -181.47366333007812, + "logps/rejected": -194.16285705566406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.075292587280273, + "rewards/margins": 1.2821218967437744, + "rewards/rejected": -14.357415199279785, + "step": 1884 + }, + { + "epoch": 1.3008452647921338, + "grad_norm": 0.40430253744125366, + "learning_rate": 3.612495208892296e-06, + "logits/chosen": 4.381460189819336, + "logits/rejected": 4.315304279327393, + "logps/chosen": -181.56393432617188, + "logps/rejected": -187.51483154296875, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.2600679397583, + "rewards/margins": 0.6144899129867554, + "rewards/rejected": -13.874557495117188, + "step": 1885 + }, + { + "epoch": 1.3015352768673452, + "grad_norm": 0.43449175357818604, + "learning_rate": 3.6144116519739364e-06, + "logits/chosen": 4.117905139923096, + "logits/rejected": 4.117905139923096, + "logps/chosen": -178.8995361328125, + "logps/rejected": -178.8995361328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.09033489227295, + "rewards/margins": 0.0, + "rewards/rejected": -13.09033489227295, + "step": 1886 + }, + { + "epoch": 1.3022252889425565, + "grad_norm": 0.30929794907569885, + "learning_rate": 3.616328095055577e-06, + "logits/chosen": 3.912975788116455, + "logits/rejected": 4.092317581176758, + "logps/chosen": -150.2493438720703, + "logps/rejected": -172.21102905273438, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.248939514160156, + "rewards/margins": 2.208766460418701, + "rewards/rejected": -12.457706451416016, + "step": 1887 + }, + { + "epoch": 1.3029153010177679, + "grad_norm": 0.34083423018455505, + "learning_rate": 3.6182445381372176e-06, + "logits/chosen": 3.9618916511535645, + "logits/rejected": 4.051022529602051, + "logps/chosen": -172.62786865234375, + "logps/rejected": -180.21917724609375, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.54057788848877, + "rewards/margins": 0.7342736124992371, + "rewards/rejected": -13.27485179901123, + "step": 1888 + }, + { + "epoch": 1.3036053130929792, + "grad_norm": 0.3620082139968872, + "learning_rate": 3.6201609812188584e-06, + "logits/chosen": 4.056910037994385, + "logits/rejected": 4.089649200439453, + "logps/chosen": -172.41009521484375, + "logps/rejected": -185.76051330566406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.544511795043945, + "rewards/margins": 1.3463683128356934, + "rewards/rejected": -13.890880584716797, + "step": 1889 + }, + { + "epoch": 1.3042953251681904, + "grad_norm": 0.32188570499420166, + "learning_rate": 3.6220774243004988e-06, + "logits/chosen": 3.9802536964416504, + "logits/rejected": 4.0990705490112305, + "logps/chosen": -158.76300048828125, + "logps/rejected": -174.38845825195312, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.021038055419922, + "rewards/margins": 1.6049360036849976, + "rewards/rejected": -12.625975608825684, + "step": 1890 + }, + { + "epoch": 1.3049853372434017, + "grad_norm": 0.2965603768825531, + "learning_rate": 3.623993867382139e-06, + "logits/chosen": 3.842432737350464, + "logits/rejected": 3.9623284339904785, + "logps/chosen": -170.29962158203125, + "logps/rejected": -188.32611083984375, + "loss": 0.5203, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.294960975646973, + "rewards/margins": 1.7976160049438477, + "rewards/rejected": -14.09257698059082, + "step": 1891 + }, + { + "epoch": 1.305675349318613, + "grad_norm": 0.31272074580192566, + "learning_rate": 3.6259103104637796e-06, + "logits/chosen": 4.4273457527160645, + "logits/rejected": 4.553697109222412, + "logps/chosen": -179.67620849609375, + "logps/rejected": -187.44235229492188, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.170449256896973, + "rewards/margins": 0.7594587802886963, + "rewards/rejected": -13.92990779876709, + "step": 1892 + }, + { + "epoch": 1.3063653613938244, + "grad_norm": 0.43060964345932007, + "learning_rate": 3.62782675354542e-06, + "logits/chosen": 3.5288429260253906, + "logits/rejected": 3.594784736633301, + "logps/chosen": -149.65684509277344, + "logps/rejected": -168.6435546875, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.38083267211914, + "rewards/margins": 1.7971289157867432, + "rewards/rejected": -12.177961349487305, + "step": 1893 + }, + { + "epoch": 1.3070553734690358, + "grad_norm": 0.40385228395462036, + "learning_rate": 3.6297431966270603e-06, + "logits/chosen": 3.5572566986083984, + "logits/rejected": 3.796443462371826, + "logps/chosen": -156.6230926513672, + "logps/rejected": -183.9583740234375, + "loss": 0.4357, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.982622146606445, + "rewards/margins": 2.7225685119628906, + "rewards/rejected": -13.705190658569336, + "step": 1894 + }, + { + "epoch": 1.307745385544247, + "grad_norm": 0.28953444957733154, + "learning_rate": 3.631659639708701e-06, + "logits/chosen": 4.150862216949463, + "logits/rejected": 4.308133125305176, + "logps/chosen": -166.2510528564453, + "logps/rejected": -181.04298400878906, + "loss": 0.521, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.740278244018555, + "rewards/margins": 1.3883857727050781, + "rewards/rejected": -13.128664016723633, + "step": 1895 + }, + { + "epoch": 1.3084353976194583, + "grad_norm": 0.3305630385875702, + "learning_rate": 3.6335760827903415e-06, + "logits/chosen": 3.9194717407226562, + "logits/rejected": 3.9593453407287598, + "logps/chosen": -146.6236114501953, + "logps/rejected": -152.14889526367188, + "loss": 0.6077, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.882831573486328, + "rewards/margins": 0.5852757096290588, + "rewards/rejected": -10.468107223510742, + "step": 1896 + }, + { + "epoch": 1.3091254096946696, + "grad_norm": 14.834731101989746, + "learning_rate": 3.635492525871982e-06, + "logits/chosen": 4.083254337310791, + "logits/rejected": 3.7730534076690674, + "logps/chosen": -166.64085388183594, + "logps/rejected": -164.1557159423828, + "loss": 1.1913, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.843607902526855, + "rewards/margins": -0.1820368766784668, + "rewards/rejected": -11.66157054901123, + "step": 1897 + }, + { + "epoch": 1.309815421769881, + "grad_norm": 1.6668461561203003, + "learning_rate": 3.6374089689536223e-06, + "logits/chosen": 4.01802921295166, + "logits/rejected": 4.025486946105957, + "logps/chosen": -171.2010498046875, + "logps/rejected": -185.35812377929688, + "loss": 0.5319, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.530323028564453, + "rewards/margins": 1.4312925338745117, + "rewards/rejected": -13.961615562438965, + "step": 1898 + }, + { + "epoch": 1.3105054338450923, + "grad_norm": 4.81764030456543, + "learning_rate": 3.6393254120352627e-06, + "logits/chosen": 4.160757064819336, + "logits/rejected": 4.152729511260986, + "logps/chosen": -164.48423767089844, + "logps/rejected": -166.13502502441406, + "loss": 0.6303, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.75909423828125, + "rewards/margins": 0.19518119096755981, + "rewards/rejected": -11.954275131225586, + "step": 1899 + }, + { + "epoch": 1.3111954459203037, + "grad_norm": 0.40250709652900696, + "learning_rate": 3.641241855116903e-06, + "logits/chosen": 3.9253013134002686, + "logits/rejected": 4.069329261779785, + "logps/chosen": -168.56576538085938, + "logps/rejected": -175.37205505371094, + "loss": 0.6072, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.323325157165527, + "rewards/margins": 0.6456371545791626, + "rewards/rejected": -12.968962669372559, + "step": 1900 + }, + { + "epoch": 1.311885457995515, + "grad_norm": 0.3514649569988251, + "learning_rate": 3.6431582981985435e-06, + "logits/chosen": 4.213316917419434, + "logits/rejected": 4.21976375579834, + "logps/chosen": -184.58474731445312, + "logps/rejected": -190.341552734375, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.736459732055664, + "rewards/margins": 0.6186045408248901, + "rewards/rejected": -14.355064392089844, + "step": 1901 + }, + { + "epoch": 1.3125754700707262, + "grad_norm": 0.4017721712589264, + "learning_rate": 3.645074741280184e-06, + "logits/chosen": 3.8456547260284424, + "logits/rejected": 3.8418335914611816, + "logps/chosen": -182.53610229492188, + "logps/rejected": -186.5619659423828, + "loss": 0.6102, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.519393920898438, + "rewards/margins": 0.4366728663444519, + "rewards/rejected": -13.95606803894043, + "step": 1902 + }, + { + "epoch": 1.3132654821459375, + "grad_norm": 0.36408960819244385, + "learning_rate": 3.646991184361825e-06, + "logits/chosen": 4.035886764526367, + "logits/rejected": 4.035886764526367, + "logps/chosen": -175.69723510742188, + "logps/rejected": -175.69723510742188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.08815860748291, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -13.08815860748291, + "step": 1903 + }, + { + "epoch": 1.3139554942211489, + "grad_norm": 26.267986297607422, + "learning_rate": 3.6489076274434655e-06, + "logits/chosen": 4.146018981933594, + "logits/rejected": 4.015388011932373, + "logps/chosen": -185.31060791015625, + "logps/rejected": -180.0538787841797, + "loss": 1.1183, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.639261245727539, + "rewards/margins": -0.5096699595451355, + "rewards/rejected": -13.12959098815918, + "step": 1904 + }, + { + "epoch": 1.3146455062963602, + "grad_norm": 0.2896048426628113, + "learning_rate": 3.650824070525106e-06, + "logits/chosen": 3.991544723510742, + "logits/rejected": 4.1800537109375, + "logps/chosen": -169.4715118408203, + "logps/rejected": -193.05812072753906, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.33358383178711, + "rewards/margins": 2.353972911834717, + "rewards/rejected": -14.687555313110352, + "step": 1905 + }, + { + "epoch": 1.3153355183715716, + "grad_norm": 0.3254663646221161, + "learning_rate": 3.6527405136067463e-06, + "logits/chosen": 4.000153541564941, + "logits/rejected": 4.03814172744751, + "logps/chosen": -165.0201416015625, + "logps/rejected": -180.05007934570312, + "loss": 0.5207, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.725215911865234, + "rewards/margins": 1.5478718280792236, + "rewards/rejected": -13.273088455200195, + "step": 1906 + }, + { + "epoch": 1.3160255304467827, + "grad_norm": 0.31339412927627563, + "learning_rate": 3.6546569566883867e-06, + "logits/chosen": 4.086852073669434, + "logits/rejected": 4.166065216064453, + "logps/chosen": -181.6501922607422, + "logps/rejected": -188.84742736816406, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.473731994628906, + "rewards/margins": 0.7253514528274536, + "rewards/rejected": -14.19908332824707, + "step": 1907 + }, + { + "epoch": 1.316715542521994, + "grad_norm": 1.9122529029846191, + "learning_rate": 3.656573399770027e-06, + "logits/chosen": 3.603689670562744, + "logits/rejected": 3.930095672607422, + "logps/chosen": -155.69967651367188, + "logps/rejected": -186.9814453125, + "loss": 0.3591, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.93521499633789, + "rewards/margins": 3.164273738861084, + "rewards/rejected": -14.099489212036133, + "step": 1908 + }, + { + "epoch": 1.3174055545972054, + "grad_norm": 0.3929947316646576, + "learning_rate": 3.6584898428516675e-06, + "logits/chosen": 3.7773876190185547, + "logits/rejected": 3.7773876190185547, + "logps/chosen": -175.6180877685547, + "logps/rejected": -175.61807250976562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.778104782104492, + "rewards/margins": 0.0, + "rewards/rejected": -12.778104782104492, + "step": 1909 + }, + { + "epoch": 1.3180955666724168, + "grad_norm": 0.2713806629180908, + "learning_rate": 3.660406285933308e-06, + "logits/chosen": 3.6441311836242676, + "logits/rejected": 3.9131979942321777, + "logps/chosen": -173.09896850585938, + "logps/rejected": -194.7361297607422, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.808928489685059, + "rewards/margins": 2.236868143081665, + "rewards/rejected": -15.045797348022461, + "step": 1910 + }, + { + "epoch": 1.3187855787476281, + "grad_norm": 0.5451551079750061, + "learning_rate": 3.6623227290149487e-06, + "logits/chosen": 3.5445847511291504, + "logits/rejected": 3.6655101776123047, + "logps/chosen": -178.6132354736328, + "logps/rejected": -190.64463806152344, + "loss": 0.5226, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.871755599975586, + "rewards/margins": 1.2002813816070557, + "rewards/rejected": -14.072035789489746, + "step": 1911 + }, + { + "epoch": 1.3194755908228393, + "grad_norm": 0.3209916949272156, + "learning_rate": 3.664239172096589e-06, + "logits/chosen": 4.106164932250977, + "logits/rejected": 4.136404991149902, + "logps/chosen": -175.3703155517578, + "logps/rejected": -188.6412353515625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.932514190673828, + "rewards/margins": 1.0995467901229858, + "rewards/rejected": -14.032060623168945, + "step": 1912 + }, + { + "epoch": 1.3201656028980508, + "grad_norm": 0.4241950213909149, + "learning_rate": 3.6661556151782295e-06, + "logits/chosen": 3.9761476516723633, + "logits/rejected": 3.9761476516723633, + "logps/chosen": -178.83551025390625, + "logps/rejected": -178.83551025390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.168191909790039, + "rewards/margins": 0.0, + "rewards/rejected": -13.168191909790039, + "step": 1913 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 3.9038233757019043, + "learning_rate": 3.66807205825987e-06, + "logits/chosen": 3.7883448600769043, + "logits/rejected": 3.8792366981506348, + "logps/chosen": -166.88494873046875, + "logps/rejected": -190.5453338623047, + "loss": 0.4527, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.890426635742188, + "rewards/margins": 2.3384995460510254, + "rewards/rejected": -14.228925704956055, + "step": 1914 + }, + { + "epoch": 1.3215456270484733, + "grad_norm": 16.085678100585938, + "learning_rate": 3.6699885013415103e-06, + "logits/chosen": 3.697730302810669, + "logits/rejected": 3.6989991664886475, + "logps/chosen": -162.24745178222656, + "logps/rejected": -168.67428588867188, + "loss": 0.7302, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.46859073638916, + "rewards/margins": 0.6664783954620361, + "rewards/rejected": -12.135068893432617, + "step": 1915 + }, + { + "epoch": 1.3222356391236847, + "grad_norm": 0.985511839389801, + "learning_rate": 3.6719049444231506e-06, + "logits/chosen": 3.9211416244506836, + "logits/rejected": 3.9428563117980957, + "logps/chosen": -183.09226989746094, + "logps/rejected": -186.9569854736328, + "loss": 0.6123, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.542064666748047, + "rewards/margins": 0.38149142265319824, + "rewards/rejected": -13.92355728149414, + "step": 1916 + }, + { + "epoch": 1.322925651198896, + "grad_norm": 4.817612171173096, + "learning_rate": 3.673821387504791e-06, + "logits/chosen": 4.025568962097168, + "logits/rejected": 4.031796932220459, + "logps/chosen": -197.94692993164062, + "logps/rejected": -199.16665649414062, + "loss": 0.6384, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.875532150268555, + "rewards/margins": 0.1542881727218628, + "rewards/rejected": -15.02981948852539, + "step": 1917 + }, + { + "epoch": 1.3236156632741074, + "grad_norm": 0.37342241406440735, + "learning_rate": 3.6757378305864314e-06, + "logits/chosen": 4.156486988067627, + "logits/rejected": 4.156486988067627, + "logps/chosen": -195.1505126953125, + "logps/rejected": -195.1505126953125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.833964347839355, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -14.833965301513672, + "step": 1918 + }, + { + "epoch": 1.3243056753493185, + "grad_norm": 0.34313276410102844, + "learning_rate": 3.6776542736680727e-06, + "logits/chosen": 4.1134490966796875, + "logits/rejected": 4.158655166625977, + "logps/chosen": -182.2567138671875, + "logps/rejected": -192.11465454101562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.342470169067383, + "rewards/margins": 0.9910897612571716, + "rewards/rejected": -14.333560943603516, + "step": 1919 + }, + { + "epoch": 1.3249956874245299, + "grad_norm": 0.5101824998855591, + "learning_rate": 3.679570716749713e-06, + "logits/chosen": 3.7455763816833496, + "logits/rejected": 3.9995932579040527, + "logps/chosen": -173.60858154296875, + "logps/rejected": -186.4144287109375, + "loss": 0.5243, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.607338905334473, + "rewards/margins": 1.2450003623962402, + "rewards/rejected": -13.852339744567871, + "step": 1920 + }, + { + "epoch": 1.3256856994997412, + "grad_norm": 4.628432750701904, + "learning_rate": 3.6814871598313535e-06, + "logits/chosen": 3.6464123725891113, + "logits/rejected": 3.7050209045410156, + "logps/chosen": -172.61053466796875, + "logps/rejected": -175.15054321289062, + "loss": 0.6232, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.560565948486328, + "rewards/margins": 0.2433009147644043, + "rewards/rejected": -12.803865432739258, + "step": 1921 + }, + { + "epoch": 1.3263757115749526, + "grad_norm": 0.3528761863708496, + "learning_rate": 3.683403602912994e-06, + "logits/chosen": 3.8931522369384766, + "logits/rejected": 3.8931522369384766, + "logps/chosen": -164.35833740234375, + "logps/rejected": -164.35833740234375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.719900131225586, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -11.719900131225586, + "step": 1922 + }, + { + "epoch": 1.327065723650164, + "grad_norm": 0.3064393699169159, + "learning_rate": 3.6853200459946342e-06, + "logits/chosen": 3.772282600402832, + "logits/rejected": 3.8146204948425293, + "logps/chosen": -150.12889099121094, + "logps/rejected": -184.88601684570312, + "loss": 0.4337, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.499261856079102, + "rewards/margins": 3.2852563858032227, + "rewards/rejected": -13.784517288208008, + "step": 1923 + }, + { + "epoch": 1.327755735725375, + "grad_norm": 7.699094295501709, + "learning_rate": 3.6872364890762746e-06, + "logits/chosen": 3.9046287536621094, + "logits/rejected": 3.99691104888916, + "logps/chosen": -161.49803161621094, + "logps/rejected": -169.35903930664062, + "loss": 0.5743, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.29511833190918, + "rewards/margins": 0.849628746509552, + "rewards/rejected": -12.144745826721191, + "step": 1924 + }, + { + "epoch": 1.3284457478005864, + "grad_norm": 0.34324678778648376, + "learning_rate": 3.689152932157915e-06, + "logits/chosen": 3.890350580215454, + "logits/rejected": 3.890350580215454, + "logps/chosen": -182.6446990966797, + "logps/rejected": -182.6446990966797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.298311233520508, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.298311233520508, + "step": 1925 + }, + { + "epoch": 1.3291357598757978, + "grad_norm": 5.5250725746154785, + "learning_rate": 3.6910693752395554e-06, + "logits/chosen": 3.480978488922119, + "logits/rejected": 3.516819953918457, + "logps/chosen": -167.14820861816406, + "logps/rejected": -173.51670837402344, + "loss": 0.5558, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.168846130371094, + "rewards/margins": 0.5085536241531372, + "rewards/rejected": -12.677399635314941, + "step": 1926 + }, + { + "epoch": 1.3298257719510092, + "grad_norm": 0.2986268103122711, + "learning_rate": 3.6929858183211962e-06, + "logits/chosen": 3.4586052894592285, + "logits/rejected": 3.672549247741699, + "logps/chosen": -170.55596923828125, + "logps/rejected": -183.06863403320312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.21107292175293, + "rewards/margins": 1.2858868837356567, + "rewards/rejected": -13.496959686279297, + "step": 1927 + }, + { + "epoch": 1.3305157840262205, + "grad_norm": 0.2992565631866455, + "learning_rate": 3.6949022614028366e-06, + "logits/chosen": 3.5622363090515137, + "logits/rejected": 3.6435327529907227, + "logps/chosen": -171.2256622314453, + "logps/rejected": -180.1864776611328, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.347627639770508, + "rewards/margins": 0.8719204068183899, + "rewards/rejected": -13.219547271728516, + "step": 1928 + }, + { + "epoch": 1.3312057961014316, + "grad_norm": 0.3028072714805603, + "learning_rate": 3.696818704484477e-06, + "logits/chosen": 3.676753520965576, + "logits/rejected": 3.6808390617370605, + "logps/chosen": -177.528564453125, + "logps/rejected": -192.05816650390625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.209511756896973, + "rewards/margins": 1.4533735513687134, + "rewards/rejected": -14.662884712219238, + "step": 1929 + }, + { + "epoch": 1.3318958081766432, + "grad_norm": 0.3865108788013458, + "learning_rate": 3.698735147566118e-06, + "logits/chosen": 3.9460866451263428, + "logits/rejected": 3.9460866451263428, + "logps/chosen": -178.23953247070312, + "logps/rejected": -178.23953247070312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.159549713134766, + "rewards/margins": 0.0, + "rewards/rejected": -13.159549713134766, + "step": 1930 + }, + { + "epoch": 1.3325858202518543, + "grad_norm": 0.293791800737381, + "learning_rate": 3.7006515906477582e-06, + "logits/chosen": 3.6038527488708496, + "logits/rejected": 3.6038527488708496, + "logps/chosen": -167.90667724609375, + "logps/rejected": -167.90667724609375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.958047866821289, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.958047866821289, + "step": 1931 + }, + { + "epoch": 1.3332758323270657, + "grad_norm": 0.3340955078601837, + "learning_rate": 3.7025680337293986e-06, + "logits/chosen": 3.7290337085723877, + "logits/rejected": 3.7290337085723877, + "logps/chosen": -195.87197875976562, + "logps/rejected": -195.87196350097656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.851515769958496, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.85151481628418, + "step": 1932 + }, + { + "epoch": 1.333965844402277, + "grad_norm": 0.3904297649860382, + "learning_rate": 3.704484476811039e-06, + "logits/chosen": 3.7163963317871094, + "logits/rejected": 3.7163963317871094, + "logps/chosen": -164.07159423828125, + "logps/rejected": -164.07159423828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.844511032104492, + "rewards/margins": 0.0, + "rewards/rejected": -11.844511032104492, + "step": 1933 + }, + { + "epoch": 1.3346558564774884, + "grad_norm": 0.419694185256958, + "learning_rate": 3.7064009198926794e-06, + "logits/chosen": 3.8210134506225586, + "logits/rejected": 3.8210134506225586, + "logps/chosen": -168.3978729248047, + "logps/rejected": -168.3978729248047, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.15402603149414, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -12.15402603149414, + "step": 1934 + }, + { + "epoch": 1.3353458685526998, + "grad_norm": 0.3069513142108917, + "learning_rate": 3.70831736297432e-06, + "logits/chosen": 3.725001811981201, + "logits/rejected": 3.9006526470184326, + "logps/chosen": -165.63909912109375, + "logps/rejected": -177.18252563476562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.833887100219727, + "rewards/margins": 1.1846073865890503, + "rewards/rejected": -13.018494606018066, + "step": 1935 + }, + { + "epoch": 1.336035880627911, + "grad_norm": 0.34768155217170715, + "learning_rate": 3.7102338060559606e-06, + "logits/chosen": 3.9108572006225586, + "logits/rejected": 3.9582581520080566, + "logps/chosen": -155.85638427734375, + "logps/rejected": -162.60781860351562, + "loss": 0.6073, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.939943313598633, + "rewards/margins": 0.6274389624595642, + "rewards/rejected": -11.5673828125, + "step": 1936 + }, + { + "epoch": 1.3367258927031223, + "grad_norm": 35.40803909301758, + "learning_rate": 3.712150249137601e-06, + "logits/chosen": 3.783487319946289, + "logits/rejected": 3.7834415435791016, + "logps/chosen": -172.15447998046875, + "logps/rejected": -170.63333129882812, + "loss": 0.8084, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.249153137207031, + "rewards/margins": -0.1741442084312439, + "rewards/rejected": -12.075010299682617, + "step": 1937 + }, + { + "epoch": 1.3374159047783336, + "grad_norm": 0.3599790036678314, + "learning_rate": 3.7140666922192414e-06, + "logits/chosen": 3.7305359840393066, + "logits/rejected": 3.8644440174102783, + "logps/chosen": -186.7412567138672, + "logps/rejected": -195.4546661376953, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.862181663513184, + "rewards/margins": 0.857882559299469, + "rewards/rejected": -14.720064163208008, + "step": 1938 + }, + { + "epoch": 1.338105916853545, + "grad_norm": 0.37184590101242065, + "learning_rate": 3.7159831353008818e-06, + "logits/chosen": 3.8374409675598145, + "logits/rejected": 3.8374409675598145, + "logps/chosen": -183.98367309570312, + "logps/rejected": -183.98365783691406, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.459571838378906, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.459571838378906, + "step": 1939 + }, + { + "epoch": 1.3387959289287563, + "grad_norm": 0.334707647562027, + "learning_rate": 3.717899578382522e-06, + "logits/chosen": 3.6054129600524902, + "logits/rejected": 3.7468879222869873, + "logps/chosen": -167.25637817382812, + "logps/rejected": -181.59332275390625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.11564826965332, + "rewards/margins": 1.4042216539382935, + "rewards/rejected": -13.51987075805664, + "step": 1940 + }, + { + "epoch": 1.3394859410039675, + "grad_norm": 0.8807584047317505, + "learning_rate": 3.7198160214641626e-06, + "logits/chosen": 3.455965995788574, + "logits/rejected": 3.542886972427368, + "logps/chosen": -156.40476989746094, + "logps/rejected": -171.3755645751953, + "loss": 0.5279, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.939849853515625, + "rewards/margins": 1.5138026475906372, + "rewards/rejected": -12.453652381896973, + "step": 1941 + }, + { + "epoch": 1.3401759530791788, + "grad_norm": 0.360027551651001, + "learning_rate": 3.721732464545803e-06, + "logits/chosen": 3.897463798522949, + "logits/rejected": 3.897463798522949, + "logps/chosen": -182.48834228515625, + "logps/rejected": -182.48834228515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.503251075744629, + "rewards/margins": 0.0, + "rewards/rejected": -13.503251075744629, + "step": 1942 + }, + { + "epoch": 1.3408659651543902, + "grad_norm": 0.33666789531707764, + "learning_rate": 3.723648907627444e-06, + "logits/chosen": 3.37186598777771, + "logits/rejected": 3.37186598777771, + "logps/chosen": -164.682373046875, + "logps/rejected": -164.682373046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.767290115356445, + "rewards/margins": 0.0, + "rewards/rejected": -11.767290115356445, + "step": 1943 + }, + { + "epoch": 1.3415559772296015, + "grad_norm": 0.44522032141685486, + "learning_rate": 3.7255653507090846e-06, + "logits/chosen": 3.906970500946045, + "logits/rejected": 3.906970500946045, + "logps/chosen": -161.47607421875, + "logps/rejected": -161.47607421875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.317800521850586, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.317800521850586, + "step": 1944 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 0.3915863335132599, + "learning_rate": 3.727481793790725e-06, + "logits/chosen": 3.7543649673461914, + "logits/rejected": 3.7543649673461914, + "logps/chosen": -183.3671417236328, + "logps/rejected": -183.3671417236328, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.427206993103027, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.427207946777344, + "step": 1945 + }, + { + "epoch": 1.3429360013800242, + "grad_norm": 0.6110818982124329, + "learning_rate": 3.7293982368723654e-06, + "logits/chosen": 3.313991069793701, + "logits/rejected": 3.414052963256836, + "logps/chosen": -165.33973693847656, + "logps/rejected": -170.00457763671875, + "loss": 0.6099, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.810391426086426, + "rewards/margins": 0.4497586488723755, + "rewards/rejected": -12.260150909423828, + "step": 1946 + }, + { + "epoch": 1.3436260134552356, + "grad_norm": 13.37125015258789, + "learning_rate": 3.7313146799540057e-06, + "logits/chosen": 3.7261743545532227, + "logits/rejected": 3.807939052581787, + "logps/chosen": -169.67486572265625, + "logps/rejected": -177.60174560546875, + "loss": 0.6014, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.006025314331055, + "rewards/margins": 0.7724817395210266, + "rewards/rejected": -12.778507232666016, + "step": 1947 + }, + { + "epoch": 1.3443160255304467, + "grad_norm": 0.46032965183258057, + "learning_rate": 3.733231123035646e-06, + "logits/chosen": 3.9315237998962402, + "logits/rejected": 4.013547420501709, + "logps/chosen": -174.46359252929688, + "logps/rejected": -179.76141357421875, + "loss": 0.6078, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.475581169128418, + "rewards/margins": 0.5733466148376465, + "rewards/rejected": -13.048928260803223, + "step": 1948 + }, + { + "epoch": 1.345006037605658, + "grad_norm": 0.38643085956573486, + "learning_rate": 3.7351475661172865e-06, + "logits/chosen": 3.833036422729492, + "logits/rejected": 3.833036422729492, + "logps/chosen": -178.10926818847656, + "logps/rejected": -178.10926818847656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.963708877563477, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.963708877563477, + "step": 1949 + }, + { + "epoch": 1.3456960496808694, + "grad_norm": 0.33730608224868774, + "learning_rate": 3.737064009198927e-06, + "logits/chosen": 4.097619533538818, + "logits/rejected": 4.097619533538818, + "logps/chosen": -174.17498779296875, + "logps/rejected": -174.17498779296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.785879135131836, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.785879135131836, + "step": 1950 + }, + { + "epoch": 1.3463860617560808, + "grad_norm": 0.9129610657691956, + "learning_rate": 3.7389804522805677e-06, + "logits/chosen": 3.4733808040618896, + "logits/rejected": 3.6732025146484375, + "logps/chosen": -151.177734375, + "logps/rejected": -170.88304138183594, + "loss": 0.4368, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.40450668334961, + "rewards/margins": 2.0168724060058594, + "rewards/rejected": -12.421379089355469, + "step": 1951 + }, + { + "epoch": 1.3470760738312921, + "grad_norm": 0.4080633223056793, + "learning_rate": 3.740896895362208e-06, + "logits/chosen": 3.4782838821411133, + "logits/rejected": 3.584108829498291, + "logps/chosen": -171.8129119873047, + "logps/rejected": -185.74258422851562, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.404263496398926, + "rewards/margins": 1.3664356470108032, + "rewards/rejected": -13.770699501037598, + "step": 1952 + }, + { + "epoch": 1.3477660859065033, + "grad_norm": 0.3770826756954193, + "learning_rate": 3.7428133384438485e-06, + "logits/chosen": 3.720099687576294, + "logits/rejected": 3.7510271072387695, + "logps/chosen": -167.69583129882812, + "logps/rejected": -180.0432891845703, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.920852661132812, + "rewards/margins": 1.2733197212219238, + "rewards/rejected": -13.194171905517578, + "step": 1953 + }, + { + "epoch": 1.3484560979817146, + "grad_norm": 0.26323118805885315, + "learning_rate": 3.744729781525489e-06, + "logits/chosen": 3.166022539138794, + "logits/rejected": 3.5295932292938232, + "logps/chosen": -136.9070587158203, + "logps/rejected": -168.5140838623047, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.963785171508789, + "rewards/margins": 3.1108884811401367, + "rewards/rejected": -12.074673652648926, + "step": 1954 + }, + { + "epoch": 1.349146110056926, + "grad_norm": 2.8382678031921387, + "learning_rate": 3.7466462246071293e-06, + "logits/chosen": 3.174043893814087, + "logits/rejected": 3.2789649963378906, + "logps/chosen": -159.63754272460938, + "logps/rejected": -159.6199951171875, + "loss": 0.6534, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.34585189819336, + "rewards/margins": 0.09840011596679688, + "rewards/rejected": -11.444252014160156, + "step": 1955 + }, + { + "epoch": 1.3498361221321373, + "grad_norm": 25.795726776123047, + "learning_rate": 3.7485626676887697e-06, + "logits/chosen": 3.377748489379883, + "logits/rejected": 3.373806953430176, + "logps/chosen": -177.79261779785156, + "logps/rejected": -174.64512634277344, + "loss": 0.8982, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.051597595214844, + "rewards/margins": -0.27894288301467896, + "rewards/rejected": -12.772655487060547, + "step": 1956 + }, + { + "epoch": 1.3505261342073487, + "grad_norm": 0.42905157804489136, + "learning_rate": 3.75047911077041e-06, + "logits/chosen": 3.4290924072265625, + "logits/rejected": 3.492269515991211, + "logps/chosen": -151.02203369140625, + "logps/rejected": -160.37734985351562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.306711196899414, + "rewards/margins": 0.9085640907287598, + "rewards/rejected": -11.215274810791016, + "step": 1957 + }, + { + "epoch": 1.3512161462825598, + "grad_norm": 0.34628432989120483, + "learning_rate": 3.7523955538520505e-06, + "logits/chosen": 3.3950531482696533, + "logits/rejected": 3.6037561893463135, + "logps/chosen": -149.74301147460938, + "logps/rejected": -169.31387329101562, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.05268383026123, + "rewards/margins": 1.951533555984497, + "rewards/rejected": -12.004217147827148, + "step": 1958 + }, + { + "epoch": 1.3519061583577714, + "grad_norm": 0.3684414029121399, + "learning_rate": 3.7543119969336917e-06, + "logits/chosen": 3.8674893379211426, + "logits/rejected": 3.8674893379211426, + "logps/chosen": -184.4306182861328, + "logps/rejected": -184.4306182861328, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.833309173583984, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.833309173583984, + "step": 1959 + }, + { + "epoch": 1.3525961704329825, + "grad_norm": 0.3793933391571045, + "learning_rate": 3.756228440015332e-06, + "logits/chosen": 3.828015089035034, + "logits/rejected": 3.828015089035034, + "logps/chosen": -178.10650634765625, + "logps/rejected": -178.10650634765625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.953545570373535, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.953545570373535, + "step": 1960 + }, + { + "epoch": 1.3532861825081939, + "grad_norm": 21.610212326049805, + "learning_rate": 3.7581448830969725e-06, + "logits/chosen": 3.3470449447631836, + "logits/rejected": 3.318474769592285, + "logps/chosen": -163.22335815429688, + "logps/rejected": -161.71926879882812, + "loss": 0.798, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.65096378326416, + "rewards/margins": -0.1611011028289795, + "rewards/rejected": -11.489862442016602, + "step": 1961 + }, + { + "epoch": 1.3539761945834052, + "grad_norm": 0.4253763258457184, + "learning_rate": 3.760061326178613e-06, + "logits/chosen": 3.6374666690826416, + "logits/rejected": 3.6374666690826416, + "logps/chosen": -181.52432250976562, + "logps/rejected": -181.52432250976562, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.465713500976562, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -13.465713500976562, + "step": 1962 + }, + { + "epoch": 1.3546662066586166, + "grad_norm": 0.40974587202072144, + "learning_rate": 3.7619777692602533e-06, + "logits/chosen": 3.6831750869750977, + "logits/rejected": 3.6831750869750977, + "logps/chosen": -189.41644287109375, + "logps/rejected": -189.4164581298828, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.082237243652344, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.082237243652344, + "step": 1963 + }, + { + "epoch": 1.355356218733828, + "grad_norm": 0.3963150382041931, + "learning_rate": 3.7638942123418937e-06, + "logits/chosen": 3.1799659729003906, + "logits/rejected": 3.338735580444336, + "logps/chosen": -149.29458618164062, + "logps/rejected": -173.29551696777344, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.2178955078125, + "rewards/margins": 2.3666834831237793, + "rewards/rejected": -12.584579467773438, + "step": 1964 + }, + { + "epoch": 1.356046230809039, + "grad_norm": 0.41333043575286865, + "learning_rate": 3.765810655423534e-06, + "logits/chosen": 3.417736530303955, + "logits/rejected": 3.5087437629699707, + "logps/chosen": -159.4210662841797, + "logps/rejected": -164.71266174316406, + "loss": 0.608, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.10206413269043, + "rewards/margins": 0.5495678186416626, + "rewards/rejected": -11.651632308959961, + "step": 1965 + }, + { + "epoch": 1.3567362428842504, + "grad_norm": 0.3195410370826721, + "learning_rate": 3.7677270985051745e-06, + "logits/chosen": 3.5295417308807373, + "logits/rejected": 3.653639554977417, + "logps/chosen": -172.12527465820312, + "logps/rejected": -179.5708465576172, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.465555191040039, + "rewards/margins": 0.777740478515625, + "rewards/rejected": -13.243295669555664, + "step": 1966 + }, + { + "epoch": 1.3574262549594618, + "grad_norm": 0.3354514539241791, + "learning_rate": 3.7696435415868153e-06, + "logits/chosen": 3.5676870346069336, + "logits/rejected": 3.5676870346069336, + "logps/chosen": -162.77572631835938, + "logps/rejected": -162.77572631835938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.528905868530273, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.528905868530273, + "step": 1967 + }, + { + "epoch": 1.3581162670346731, + "grad_norm": 0.27954527735710144, + "learning_rate": 3.7715599846684557e-06, + "logits/chosen": 3.3912301063537598, + "logits/rejected": 3.495286464691162, + "logps/chosen": -141.61233520507812, + "logps/rejected": -167.9200897216797, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.33698558807373, + "rewards/margins": 2.643773078918457, + "rewards/rejected": -11.980759620666504, + "step": 1968 + }, + { + "epoch": 1.3588062791098845, + "grad_norm": 0.33695611357688904, + "learning_rate": 3.773476427750096e-06, + "logits/chosen": 3.215975522994995, + "logits/rejected": 3.297621011734009, + "logps/chosen": -160.98922729492188, + "logps/rejected": -171.80552673339844, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.211664199829102, + "rewards/margins": 1.0784459114074707, + "rewards/rejected": -12.290111541748047, + "step": 1969 + }, + { + "epoch": 1.3594962911850956, + "grad_norm": 0.3304298222064972, + "learning_rate": 3.775392870831737e-06, + "logits/chosen": 3.139997959136963, + "logits/rejected": 3.4349615573883057, + "logps/chosen": -144.77224731445312, + "logps/rejected": -175.8734130859375, + "loss": 0.4343, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.783939361572266, + "rewards/margins": 3.1162819862365723, + "rewards/rejected": -12.900221824645996, + "step": 1970 + }, + { + "epoch": 1.360186303260307, + "grad_norm": 0.8026936650276184, + "learning_rate": 3.7773093139133773e-06, + "logits/chosen": 3.295750141143799, + "logits/rejected": 3.357140064239502, + "logps/chosen": -166.78750610351562, + "logps/rejected": -181.85231018066406, + "loss": 0.5235, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.812708854675293, + "rewards/margins": 1.5596518516540527, + "rewards/rejected": -13.372360229492188, + "step": 1971 + }, + { + "epoch": 1.3608763153355183, + "grad_norm": 0.28721851110458374, + "learning_rate": 3.7792257569950177e-06, + "logits/chosen": 3.536912679672241, + "logits/rejected": 3.5555152893066406, + "logps/chosen": -185.5545654296875, + "logps/rejected": -198.86407470703125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.911931991577148, + "rewards/margins": 1.3574182987213135, + "rewards/rejected": -15.269350051879883, + "step": 1972 + }, + { + "epoch": 1.3615663274107297, + "grad_norm": 0.4105600416660309, + "learning_rate": 3.781142200076658e-06, + "logits/chosen": 3.2477383613586426, + "logits/rejected": 3.2477383613586426, + "logps/chosen": -184.15138244628906, + "logps/rejected": -184.15138244628906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.477914810180664, + "rewards/margins": 0.0, + "rewards/rejected": -13.477914810180664, + "step": 1973 + }, + { + "epoch": 1.362256339485941, + "grad_norm": 0.3909050524234772, + "learning_rate": 3.7830586431582984e-06, + "logits/chosen": 3.2648487091064453, + "logits/rejected": 3.431438446044922, + "logps/chosen": -171.1657257080078, + "logps/rejected": -183.58160400390625, + "loss": 0.5214, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.426342010498047, + "rewards/margins": 1.2861087322235107, + "rewards/rejected": -13.71245002746582, + "step": 1974 + }, + { + "epoch": 1.3629463515611524, + "grad_norm": 0.44063735008239746, + "learning_rate": 3.7849750862399393e-06, + "logits/chosen": 3.4367032051086426, + "logits/rejected": 3.4367032051086426, + "logps/chosen": -174.58843994140625, + "logps/rejected": -174.5884552001953, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.804224014282227, + "rewards/margins": 5.364418029785156e-07, + "rewards/rejected": -12.804224014282227, + "step": 1975 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.8155558109283447, + "learning_rate": 3.7868915293215796e-06, + "logits/chosen": 3.014359951019287, + "logits/rejected": 3.0568106174468994, + "logps/chosen": -144.93154907226562, + "logps/rejected": -148.05947875976562, + "loss": 0.6187, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.616774559020996, + "rewards/margins": 0.285144567489624, + "rewards/rejected": -9.9019193649292, + "step": 1976 + }, + { + "epoch": 1.364326375711575, + "grad_norm": 5.780656814575195, + "learning_rate": 3.78880797240322e-06, + "logits/chosen": 3.0947391986846924, + "logits/rejected": 3.033684253692627, + "logps/chosen": -159.30470275878906, + "logps/rejected": -161.5919189453125, + "loss": 0.6355, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.398423194885254, + "rewards/margins": 0.16763657331466675, + "rewards/rejected": -11.566060066223145, + "step": 1977 + }, + { + "epoch": 1.3650163877867862, + "grad_norm": 0.2968987822532654, + "learning_rate": 3.7907244154848604e-06, + "logits/chosen": 3.181910514831543, + "logits/rejected": 3.534069299697876, + "logps/chosen": -136.36285400390625, + "logps/rejected": -168.64773559570312, + "loss": 0.4336, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.20663833618164, + "rewards/margins": 3.318216323852539, + "rewards/rejected": -12.52485466003418, + "step": 1978 + }, + { + "epoch": 1.3657063998619976, + "grad_norm": 0.34202346205711365, + "learning_rate": 3.792640858566501e-06, + "logits/chosen": 3.450674295425415, + "logits/rejected": 3.450674295425415, + "logps/chosen": -167.22433471679688, + "logps/rejected": -167.22433471679688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.078070640563965, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.078070640563965, + "step": 1979 + }, + { + "epoch": 1.366396411937209, + "grad_norm": 0.3724638521671295, + "learning_rate": 3.794557301648141e-06, + "logits/chosen": 3.1401753425598145, + "logits/rejected": 3.404633045196533, + "logps/chosen": -158.97232055664062, + "logps/rejected": -176.62503051757812, + "loss": 0.5201, + "rewards/accuracies": 0.75, + "rewards/chosen": -11.075394630432129, + "rewards/margins": 1.796201229095459, + "rewards/rejected": -12.87159538269043, + "step": 1980 + }, + { + "epoch": 1.3670864240124203, + "grad_norm": 0.3201121687889099, + "learning_rate": 3.7964737447297816e-06, + "logits/chosen": 3.1739799976348877, + "logits/rejected": 3.3939177989959717, + "logps/chosen": -149.8370819091797, + "logps/rejected": -162.19154357910156, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.24260425567627, + "rewards/margins": 1.251776933670044, + "rewards/rejected": -11.49438190460205, + "step": 1981 + }, + { + "epoch": 1.3677764360876314, + "grad_norm": 0.34845495223999023, + "learning_rate": 3.798390187811422e-06, + "logits/chosen": 3.2268853187561035, + "logits/rejected": 3.3245410919189453, + "logps/chosen": -170.57981872558594, + "logps/rejected": -178.45848083496094, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.161619186401367, + "rewards/margins": 0.7833349704742432, + "rewards/rejected": -12.944953918457031, + "step": 1982 + }, + { + "epoch": 1.3684664481628428, + "grad_norm": 14.770488739013672, + "learning_rate": 3.8003066308930624e-06, + "logits/chosen": 3.148390293121338, + "logits/rejected": 3.150240659713745, + "logps/chosen": -164.923583984375, + "logps/rejected": -163.77072143554688, + "loss": 0.7231, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.703359603881836, + "rewards/margins": -0.05413532257080078, + "rewards/rejected": -11.649225234985352, + "step": 1983 + }, + { + "epoch": 1.3691564602380542, + "grad_norm": 0.32799017429351807, + "learning_rate": 3.8022230739747036e-06, + "logits/chosen": 3.7455759048461914, + "logits/rejected": 3.7455759048461914, + "logps/chosen": -168.79055786132812, + "logps/rejected": -168.79055786132812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.188464164733887, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -12.188464164733887, + "step": 1984 + }, + { + "epoch": 1.3698464723132655, + "grad_norm": 0.39195841550827026, + "learning_rate": 3.804139517056344e-06, + "logits/chosen": 3.160860538482666, + "logits/rejected": 3.410935401916504, + "logps/chosen": -154.67950439453125, + "logps/rejected": -168.34857177734375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.900439262390137, + "rewards/margins": 1.3282477855682373, + "rewards/rejected": -12.228687286376953, + "step": 1985 + }, + { + "epoch": 1.3705364843884769, + "grad_norm": 0.3986700475215912, + "learning_rate": 3.8060559601379844e-06, + "logits/chosen": 3.3707988262176514, + "logits/rejected": 3.3707988262176514, + "logps/chosen": -176.27951049804688, + "logps/rejected": -176.27951049804688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.793815612792969, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.793815612792969, + "step": 1986 + }, + { + "epoch": 1.371226496463688, + "grad_norm": 0.3583231568336487, + "learning_rate": 3.807972403219625e-06, + "logits/chosen": 3.5009608268737793, + "logits/rejected": 3.56546688079834, + "logps/chosen": -183.6042022705078, + "logps/rejected": -197.43515014648438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.332651138305664, + "rewards/margins": 1.4329313039779663, + "rewards/rejected": -14.765582084655762, + "step": 1987 + }, + { + "epoch": 1.3719165085388993, + "grad_norm": 0.45139557123184204, + "learning_rate": 3.809888846301265e-06, + "logits/chosen": 3.1508383750915527, + "logits/rejected": 3.1508383750915527, + "logps/chosen": -180.11129760742188, + "logps/rejected": -180.11129760742188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.12419605255127, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -13.12419605255127, + "step": 1988 + }, + { + "epoch": 1.3726065206141107, + "grad_norm": 0.31489139795303345, + "learning_rate": 3.8118052893829056e-06, + "logits/chosen": 3.1238319873809814, + "logits/rejected": 3.1238319873809814, + "logps/chosen": -166.85641479492188, + "logps/rejected": -166.85641479492188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.697749137878418, + "rewards/margins": 0.0, + "rewards/rejected": -11.697749137878418, + "step": 1989 + }, + { + "epoch": 1.373296532689322, + "grad_norm": 0.34296900033950806, + "learning_rate": 3.813721732464546e-06, + "logits/chosen": 3.474297046661377, + "logits/rejected": 3.474297046661377, + "logps/chosen": -182.8309783935547, + "logps/rejected": -182.8309783935547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.814760208129883, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.814760208129883, + "step": 1990 + }, + { + "epoch": 1.3739865447645334, + "grad_norm": 0.3905571401119232, + "learning_rate": 3.815638175546186e-06, + "logits/chosen": 3.553734302520752, + "logits/rejected": 3.553734302520752, + "logps/chosen": -184.01449584960938, + "logps/rejected": -184.01449584960938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.720382690429688, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.720382690429688, + "step": 1991 + }, + { + "epoch": 1.3746765568397448, + "grad_norm": 0.3625430464744568, + "learning_rate": 3.817554618627828e-06, + "logits/chosen": 3.6451125144958496, + "logits/rejected": 3.6451125144958496, + "logps/chosen": -175.81173706054688, + "logps/rejected": -175.81173706054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.848685264587402, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.848685264587402, + "step": 1992 + }, + { + "epoch": 1.3753665689149561, + "grad_norm": 0.3430789113044739, + "learning_rate": 3.8194710617094676e-06, + "logits/chosen": 3.118358850479126, + "logits/rejected": 3.3356196880340576, + "logps/chosen": -169.9256134033203, + "logps/rejected": -190.21934509277344, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.088117599487305, + "rewards/margins": 2.021876811981201, + "rewards/rejected": -14.109994888305664, + "step": 1993 + }, + { + "epoch": 1.3760565809901673, + "grad_norm": 0.35014861822128296, + "learning_rate": 3.821387504791108e-06, + "logits/chosen": 3.4424960613250732, + "logits/rejected": 3.4424960613250732, + "logps/chosen": -173.09117126464844, + "logps/rejected": -173.09117126464844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.664424896240234, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.664424896240234, + "step": 1994 + }, + { + "epoch": 1.3767465930653786, + "grad_norm": 0.35080912709236145, + "learning_rate": 3.823303947872748e-06, + "logits/chosen": 3.5211997032165527, + "logits/rejected": 3.5211997032165527, + "logps/chosen": -185.7314910888672, + "logps/rejected": -185.7314910888672, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.886619567871094, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.886619567871094, + "step": 1995 + }, + { + "epoch": 1.37743660514059, + "grad_norm": 0.3054949641227722, + "learning_rate": 3.825220390954389e-06, + "logits/chosen": 3.4918696880340576, + "logits/rejected": 3.565324306488037, + "logps/chosen": -163.31118774414062, + "logps/rejected": -177.8484649658203, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.69046401977539, + "rewards/margins": 1.3946863412857056, + "rewards/rejected": -13.085151672363281, + "step": 1996 + }, + { + "epoch": 1.3781266172158013, + "grad_norm": 0.3104743957519531, + "learning_rate": 3.827136834036029e-06, + "logits/chosen": 2.860409736633301, + "logits/rejected": 3.100778579711914, + "logps/chosen": -157.263916015625, + "logps/rejected": -181.93179321289062, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.119199752807617, + "rewards/margins": 2.493011236190796, + "rewards/rejected": -13.612211227416992, + "step": 1997 + }, + { + "epoch": 1.3788166292910127, + "grad_norm": 0.34707728028297424, + "learning_rate": 3.82905327711767e-06, + "logits/chosen": 3.3443775177001953, + "logits/rejected": 3.4366040229797363, + "logps/chosen": -153.3140869140625, + "logps/rejected": -167.06382751464844, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.458267211914062, + "rewards/margins": 1.384575605392456, + "rewards/rejected": -11.842843055725098, + "step": 1998 + }, + { + "epoch": 1.3795066413662238, + "grad_norm": 0.33197659254074097, + "learning_rate": 3.83096972019931e-06, + "logits/chosen": 3.343052387237549, + "logits/rejected": 3.547008514404297, + "logps/chosen": -181.09471130371094, + "logps/rejected": -196.99124145507812, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.361038208007812, + "rewards/margins": 1.5878254175186157, + "rewards/rejected": -14.94886302947998, + "step": 1999 + }, + { + "epoch": 1.3801966534414352, + "grad_norm": 0.34665820002555847, + "learning_rate": 3.832886163280951e-06, + "logits/chosen": 2.991048574447632, + "logits/rejected": 2.991048574447632, + "logps/chosen": -175.65672302246094, + "logps/rejected": -175.65672302246094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.756118774414062, + "rewards/margins": 0.0, + "rewards/rejected": -12.756118774414062, + "step": 2000 + }, + { + "epoch": 1.3808866655166465, + "grad_norm": 0.3034103512763977, + "learning_rate": 3.8348026063625915e-06, + "logits/chosen": 3.403484344482422, + "logits/rejected": 3.513136863708496, + "logps/chosen": -170.13748168945312, + "logps/rejected": -182.56402587890625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.212121963500977, + "rewards/margins": 1.2519341707229614, + "rewards/rejected": -13.464056015014648, + "step": 2001 + }, + { + "epoch": 1.3815766775918579, + "grad_norm": 0.37426167726516724, + "learning_rate": 3.8367190494442315e-06, + "logits/chosen": 3.286062479019165, + "logits/rejected": 3.286062479019165, + "logps/chosen": -174.70372009277344, + "logps/rejected": -174.70372009277344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.699552536010742, + "rewards/margins": 0.0, + "rewards/rejected": -12.699552536010742, + "step": 2002 + }, + { + "epoch": 1.3822666896670692, + "grad_norm": 0.332753986120224, + "learning_rate": 3.838635492525872e-06, + "logits/chosen": 3.204484224319458, + "logits/rejected": 3.204484224319458, + "logps/chosen": -166.4932861328125, + "logps/rejected": -166.4932861328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.677811622619629, + "rewards/margins": 0.0, + "rewards/rejected": -11.677811622619629, + "step": 2003 + }, + { + "epoch": 1.3829567017422804, + "grad_norm": 0.32353681325912476, + "learning_rate": 3.840551935607512e-06, + "logits/chosen": 3.4034345149993896, + "logits/rejected": 3.6567513942718506, + "logps/chosen": -159.98086547851562, + "logps/rejected": -187.80052185058594, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.155036926269531, + "rewards/margins": 2.7253057956695557, + "rewards/rejected": -13.880343437194824, + "step": 2004 + }, + { + "epoch": 1.383646713817492, + "grad_norm": 0.5028050541877747, + "learning_rate": 3.842468378689153e-06, + "logits/chosen": 3.272768259048462, + "logits/rejected": 3.41911244392395, + "logps/chosen": -161.94271850585938, + "logps/rejected": -172.3140869140625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.245615005493164, + "rewards/margins": 1.1254703998565674, + "rewards/rejected": -12.371086120605469, + "step": 2005 + }, + { + "epoch": 1.384336725892703, + "grad_norm": 6.355066299438477, + "learning_rate": 3.844384821770794e-06, + "logits/chosen": 2.9807872772216797, + "logits/rejected": 3.159247875213623, + "logps/chosen": -136.69419860839844, + "logps/rejected": -149.16403198242188, + "loss": 0.4933, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.917396545410156, + "rewards/margins": 1.301539421081543, + "rewards/rejected": -10.2189359664917, + "step": 2006 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 0.36729174852371216, + "learning_rate": 3.846301264852434e-06, + "logits/chosen": 3.0536913871765137, + "logits/rejected": 3.0536913871765137, + "logps/chosen": -153.62835693359375, + "logps/rejected": -153.62835693359375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.496480941772461, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -10.496480941772461, + "step": 2007 + }, + { + "epoch": 1.3857167500431258, + "grad_norm": 0.3076905906200409, + "learning_rate": 3.848217707934075e-06, + "logits/chosen": 3.2412402629852295, + "logits/rejected": 3.2412402629852295, + "logps/chosen": -171.04469299316406, + "logps/rejected": -171.04469299316406, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.285813331604004, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.28581428527832, + "step": 2008 + }, + { + "epoch": 1.3864067621183371, + "grad_norm": 0.3872759938240051, + "learning_rate": 3.8501341510157155e-06, + "logits/chosen": 3.0928311347961426, + "logits/rejected": 3.166813373565674, + "logps/chosen": -164.31324768066406, + "logps/rejected": -171.20848083496094, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.751739501953125, + "rewards/margins": 0.7163152694702148, + "rewards/rejected": -12.46805477142334, + "step": 2009 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.38151100277900696, + "learning_rate": 3.8520505940973555e-06, + "logits/chosen": 3.3540778160095215, + "logits/rejected": 3.3540778160095215, + "logps/chosen": -171.05218505859375, + "logps/rejected": -171.05218505859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.366874694824219, + "rewards/margins": 0.0, + "rewards/rejected": -12.366874694824219, + "step": 2010 + }, + { + "epoch": 1.3877867862687596, + "grad_norm": 1.9903204441070557, + "learning_rate": 3.853967037178996e-06, + "logits/chosen": 2.7765321731567383, + "logits/rejected": 2.847520112991333, + "logps/chosen": -132.67111206054688, + "logps/rejected": -150.88922119140625, + "loss": 0.4658, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.581188201904297, + "rewards/margins": 1.8324252367019653, + "rewards/rejected": -10.413613319396973, + "step": 2011 + }, + { + "epoch": 1.388476798343971, + "grad_norm": 0.37082797288894653, + "learning_rate": 3.855883480260636e-06, + "logits/chosen": 2.981229782104492, + "logits/rejected": 2.981229782104492, + "logps/chosen": -162.13150024414062, + "logps/rejected": -162.1314697265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.475690841674805, + "rewards/margins": -7.748603820800781e-07, + "rewards/rejected": -11.475690841674805, + "step": 2012 + }, + { + "epoch": 1.3891668104191823, + "grad_norm": 0.3732815682888031, + "learning_rate": 3.857799923342277e-06, + "logits/chosen": 3.292996883392334, + "logits/rejected": 3.292996883392334, + "logps/chosen": -174.2392120361328, + "logps/rejected": -174.23919677734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.603994369506836, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.603994369506836, + "step": 2013 + }, + { + "epoch": 1.3898568224943937, + "grad_norm": 5.338415622711182, + "learning_rate": 3.859716366423917e-06, + "logits/chosen": 2.787726879119873, + "logits/rejected": 2.7946853637695312, + "logps/chosen": -160.889892578125, + "logps/rejected": -162.12954711914062, + "loss": 0.65, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.364874839782715, + "rewards/margins": 0.10944080352783203, + "rewards/rejected": -11.474315643310547, + "step": 2014 + }, + { + "epoch": 1.390546834569605, + "grad_norm": 20.034608840942383, + "learning_rate": 3.861632809505558e-06, + "logits/chosen": 2.9462740421295166, + "logits/rejected": 2.978332042694092, + "logps/chosen": -168.99978637695312, + "logps/rejected": -167.96282958984375, + "loss": 0.78, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.092317581176758, + "rewards/margins": -0.13757306337356567, + "rewards/rejected": -11.954743385314941, + "step": 2015 + }, + { + "epoch": 1.3912368466448162, + "grad_norm": 0.3659023940563202, + "learning_rate": 3.863549252587199e-06, + "logits/chosen": 3.0747921466827393, + "logits/rejected": 3.1757664680480957, + "logps/chosen": -145.5045166015625, + "logps/rejected": -175.1124725341797, + "loss": 0.434, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.774580001831055, + "rewards/margins": 3.0180177688598633, + "rewards/rejected": -12.792597770690918, + "step": 2016 + }, + { + "epoch": 1.3919268587200275, + "grad_norm": 0.44826778769493103, + "learning_rate": 3.8654656956688395e-06, + "logits/chosen": 2.9195396900177, + "logits/rejected": 3.059389591217041, + "logps/chosen": -134.15731811523438, + "logps/rejected": -155.60574340820312, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.461629867553711, + "rewards/margins": 2.2076706886291504, + "rewards/rejected": -10.669300079345703, + "step": 2017 + }, + { + "epoch": 1.3926168707952389, + "grad_norm": 16.465362548828125, + "learning_rate": 3.8673821387504795e-06, + "logits/chosen": 3.4310450553894043, + "logits/rejected": 3.322266101837158, + "logps/chosen": -177.20245361328125, + "logps/rejected": -168.1521759033203, + "loss": 1.5319, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.944829940795898, + "rewards/margins": -0.9253424406051636, + "rewards/rejected": -12.019487380981445, + "step": 2018 + }, + { + "epoch": 1.3933068828704502, + "grad_norm": 0.4905080497264862, + "learning_rate": 3.86929858183212e-06, + "logits/chosen": 2.8437418937683105, + "logits/rejected": 2.8437418937683105, + "logps/chosen": -152.76739501953125, + "logps/rejected": -152.76739501953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.436872482299805, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.436872482299805, + "step": 2019 + }, + { + "epoch": 1.3939968949456616, + "grad_norm": 0.43622082471847534, + "learning_rate": 3.87121502491376e-06, + "logits/chosen": 3.5429677963256836, + "logits/rejected": 3.5429677963256836, + "logps/chosen": -178.75344848632812, + "logps/rejected": -178.75343322753906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.878070831298828, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.878070831298828, + "step": 2020 + }, + { + "epoch": 1.394686907020873, + "grad_norm": 0.598558783531189, + "learning_rate": 3.873131467995401e-06, + "logits/chosen": 2.7984418869018555, + "logits/rejected": 2.968428611755371, + "logps/chosen": -155.54507446289062, + "logps/rejected": -161.2169189453125, + "loss": 0.6084, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.815886497497559, + "rewards/margins": 0.5251433849334717, + "rewards/rejected": -11.34103012084961, + "step": 2021 + }, + { + "epoch": 1.3953769190960843, + "grad_norm": 0.33806511759757996, + "learning_rate": 3.875047911077041e-06, + "logits/chosen": 3.0342190265655518, + "logits/rejected": 3.1839420795440674, + "logps/chosen": -166.7901611328125, + "logps/rejected": -188.68601989746094, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.992647171020508, + "rewards/margins": 2.2404866218566895, + "rewards/rejected": -14.233133316040039, + "step": 2022 + }, + { + "epoch": 1.3960669311712954, + "grad_norm": 0.3848150670528412, + "learning_rate": 3.876964354158682e-06, + "logits/chosen": 3.217477560043335, + "logits/rejected": 3.1431331634521484, + "logps/chosen": -161.34579467773438, + "logps/rejected": -169.0909881591797, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.425956726074219, + "rewards/margins": 0.7908363342285156, + "rewards/rejected": -12.21679401397705, + "step": 2023 + }, + { + "epoch": 1.3967569432465068, + "grad_norm": 0.38894182443618774, + "learning_rate": 3.878880797240323e-06, + "logits/chosen": 3.0194272994995117, + "logits/rejected": 3.1180429458618164, + "logps/chosen": -157.2906494140625, + "logps/rejected": -163.2703857421875, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.867210388183594, + "rewards/margins": 0.6526361703872681, + "rewards/rejected": -11.519845962524414, + "step": 2024 + }, + { + "epoch": 1.3974469553217181, + "grad_norm": 0.5234600305557251, + "learning_rate": 3.880797240321963e-06, + "logits/chosen": 3.3306894302368164, + "logits/rejected": 3.3306894302368164, + "logps/chosen": -175.83013916015625, + "logps/rejected": -175.83013916015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.670392990112305, + "rewards/margins": 0.0, + "rewards/rejected": -12.670392990112305, + "step": 2025 + }, + { + "epoch": 1.3981369673969295, + "grad_norm": 0.2834252715110779, + "learning_rate": 3.8827136834036035e-06, + "logits/chosen": 3.0048582553863525, + "logits/rejected": 3.0476787090301514, + "logps/chosen": -145.31100463867188, + "logps/rejected": -155.51669311523438, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.820877075195312, + "rewards/margins": 0.9740442633628845, + "rewards/rejected": -10.794921875, + "step": 2026 + }, + { + "epoch": 1.3988269794721409, + "grad_norm": 0.3598932921886444, + "learning_rate": 3.884630126485243e-06, + "logits/chosen": 3.214446783065796, + "logits/rejected": 3.214446783065796, + "logps/chosen": -162.10011291503906, + "logps/rejected": -162.10012817382812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.322568893432617, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.322568893432617, + "step": 2027 + }, + { + "epoch": 1.399516991547352, + "grad_norm": 0.3714958131313324, + "learning_rate": 3.886546569566884e-06, + "logits/chosen": 3.290949821472168, + "logits/rejected": 3.287966728210449, + "logps/chosen": -137.56044006347656, + "logps/rejected": -148.21292114257812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.129523277282715, + "rewards/margins": 1.0328365564346313, + "rewards/rejected": -10.162360191345215, + "step": 2028 + }, + { + "epoch": 1.4002070036225633, + "grad_norm": 0.3214585781097412, + "learning_rate": 3.888463012648524e-06, + "logits/chosen": 3.010300874710083, + "logits/rejected": 3.196028709411621, + "logps/chosen": -157.54434204101562, + "logps/rejected": -179.53457641601562, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.074769020080566, + "rewards/margins": 2.1614038944244385, + "rewards/rejected": -13.236173629760742, + "step": 2029 + }, + { + "epoch": 1.4008970156977747, + "grad_norm": 14.839472770690918, + "learning_rate": 3.890379455730165e-06, + "logits/chosen": 3.0365235805511475, + "logits/rejected": 3.197336196899414, + "logps/chosen": -151.82460021972656, + "logps/rejected": -162.59071350097656, + "loss": 0.776, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.519535064697266, + "rewards/margins": 0.9475952386856079, + "rewards/rejected": -11.467129707336426, + "step": 2030 + }, + { + "epoch": 1.401587027772986, + "grad_norm": 0.36381766200065613, + "learning_rate": 3.892295898811805e-06, + "logits/chosen": 3.6372809410095215, + "logits/rejected": 3.9010746479034424, + "logps/chosen": -153.9365692138672, + "logps/rejected": -174.31272888183594, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.61335563659668, + "rewards/margins": 2.054489850997925, + "rewards/rejected": -12.667845726013184, + "step": 2031 + }, + { + "epoch": 1.4022770398481974, + "grad_norm": 0.4887586534023285, + "learning_rate": 3.894212341893447e-06, + "logits/chosen": 3.2036280632019043, + "logits/rejected": 3.349856376647949, + "logps/chosen": -154.9952392578125, + "logps/rejected": -168.9771270751953, + "loss": 0.5232, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.748886108398438, + "rewards/margins": 1.4281015396118164, + "rewards/rejected": -12.17698860168457, + "step": 2032 + }, + { + "epoch": 1.4029670519234085, + "grad_norm": 1.1690038442611694, + "learning_rate": 3.896128784975087e-06, + "logits/chosen": 3.3445053100585938, + "logits/rejected": 3.445213794708252, + "logps/chosen": -155.18292236328125, + "logps/rejected": -175.9466552734375, + "loss": 0.5233, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.692521095275879, + "rewards/margins": 2.059856414794922, + "rewards/rejected": -12.7523775100708, + "step": 2033 + }, + { + "epoch": 1.4036570639986201, + "grad_norm": 0.3039429783821106, + "learning_rate": 3.8980452280567274e-06, + "logits/chosen": 3.0116779804229736, + "logits/rejected": 3.0511410236358643, + "logps/chosen": -154.27706909179688, + "logps/rejected": -169.828369140625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.66706657409668, + "rewards/margins": 1.5462437868118286, + "rewards/rejected": -12.213310241699219, + "step": 2034 + }, + { + "epoch": 1.4043470760738312, + "grad_norm": 0.32392627000808716, + "learning_rate": 3.899961671138367e-06, + "logits/chosen": 3.0840892791748047, + "logits/rejected": 3.360291004180908, + "logps/chosen": -148.79075622558594, + "logps/rejected": -173.8635711669922, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.186981201171875, + "rewards/margins": 2.435168981552124, + "rewards/rejected": -12.622150421142578, + "step": 2035 + }, + { + "epoch": 1.4050370881490426, + "grad_norm": 0.37178289890289307, + "learning_rate": 3.901878114220008e-06, + "logits/chosen": 3.083801746368408, + "logits/rejected": 3.083801746368408, + "logps/chosen": -165.87466430664062, + "logps/rejected": -165.87466430664062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.776058197021484, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.776057243347168, + "step": 2036 + }, + { + "epoch": 1.405727100224254, + "grad_norm": 0.3453398048877716, + "learning_rate": 3.903794557301648e-06, + "logits/chosen": 2.7846906185150146, + "logits/rejected": 3.1457438468933105, + "logps/chosen": -146.15296936035156, + "logps/rejected": -171.13674926757812, + "loss": 0.4357, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.869640350341797, + "rewards/margins": 2.5428953170776367, + "rewards/rejected": -12.412535667419434, + "step": 2037 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 0.392739474773407, + "learning_rate": 3.905711000383289e-06, + "logits/chosen": 3.2385549545288086, + "logits/rejected": 3.2385549545288086, + "logps/chosen": -176.88153076171875, + "logps/rejected": -176.88153076171875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.985747337341309, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.985747337341309, + "step": 2038 + }, + { + "epoch": 1.4071071243746767, + "grad_norm": 0.3251315653324127, + "learning_rate": 3.907627443464929e-06, + "logits/chosen": 3.162393093109131, + "logits/rejected": 3.318967580795288, + "logps/chosen": -182.176025390625, + "logps/rejected": -189.3719940185547, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.380674362182617, + "rewards/margins": 0.7174859046936035, + "rewards/rejected": -14.098159790039062, + "step": 2039 + }, + { + "epoch": 1.4077971364498878, + "grad_norm": 28.708251953125, + "learning_rate": 3.90954388654657e-06, + "logits/chosen": 3.2415993213653564, + "logits/rejected": 3.170335531234741, + "logps/chosen": -153.42991638183594, + "logps/rejected": -158.15673828125, + "loss": 1.478, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.505168914794922, + "rewards/margins": 0.4415017366409302, + "rewards/rejected": -10.946669578552246, + "step": 2040 + }, + { + "epoch": 1.4084871485250992, + "grad_norm": 0.3650699853897095, + "learning_rate": 3.911460329628211e-06, + "logits/chosen": 2.940722942352295, + "logits/rejected": 2.940722942352295, + "logps/chosen": -167.12686157226562, + "logps/rejected": -167.12686157226562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.074451446533203, + "rewards/margins": 0.0, + "rewards/rejected": -12.074451446533203, + "step": 2041 + }, + { + "epoch": 1.4091771606003105, + "grad_norm": 8.43862533569336, + "learning_rate": 3.9133767727098506e-06, + "logits/chosen": 3.077683448791504, + "logits/rejected": 3.088609457015991, + "logps/chosen": -151.07626342773438, + "logps/rejected": -150.76231384277344, + "loss": 0.7262, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.306961059570312, + "rewards/margins": -0.059118449687957764, + "rewards/rejected": -10.247842788696289, + "step": 2042 + }, + { + "epoch": 1.4098671726755219, + "grad_norm": 0.440818727016449, + "learning_rate": 3.915293215791491e-06, + "logits/chosen": 3.30391788482666, + "logits/rejected": 3.3273377418518066, + "logps/chosen": -168.14901733398438, + "logps/rejected": -174.91217041015625, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.115934371948242, + "rewards/margins": 0.7110780477523804, + "rewards/rejected": -12.827012062072754, + "step": 2043 + }, + { + "epoch": 1.4105571847507332, + "grad_norm": 0.9498341679573059, + "learning_rate": 3.917209658873131e-06, + "logits/chosen": 3.145686149597168, + "logits/rejected": 3.3237011432647705, + "logps/chosen": -171.63427734375, + "logps/rejected": -176.26300048828125, + "loss": 0.6105, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.439056396484375, + "rewards/margins": 0.4286121129989624, + "rewards/rejected": -12.867670059204102, + "step": 2044 + }, + { + "epoch": 1.4112471968259443, + "grad_norm": 0.3800041973590851, + "learning_rate": 3.919126101954772e-06, + "logits/chosen": 3.613738775253296, + "logits/rejected": 3.613738775253296, + "logps/chosen": -173.9049072265625, + "logps/rejected": -173.9049072265625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.659141540527344, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.659141540527344, + "step": 2045 + }, + { + "epoch": 1.4119372089011557, + "grad_norm": 0.32847487926483154, + "learning_rate": 3.921042545036413e-06, + "logits/chosen": 3.3148610591888428, + "logits/rejected": 3.371644973754883, + "logps/chosen": -165.07008361816406, + "logps/rejected": -173.22633361816406, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.872748374938965, + "rewards/margins": 0.8641347885131836, + "rewards/rejected": -12.736883163452148, + "step": 2046 + }, + { + "epoch": 1.412627220976367, + "grad_norm": 0.3436621427536011, + "learning_rate": 3.922958988118053e-06, + "logits/chosen": 2.875898599624634, + "logits/rejected": 2.875898599624634, + "logps/chosen": -150.51303100585938, + "logps/rejected": -150.51303100585938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.268348693847656, + "rewards/margins": 0.0, + "rewards/rejected": -10.268348693847656, + "step": 2047 + }, + { + "epoch": 1.4133172330515784, + "grad_norm": 0.32147547602653503, + "learning_rate": 3.924875431199694e-06, + "logits/chosen": 2.8973793983459473, + "logits/rejected": 2.8973793983459473, + "logps/chosen": -181.2476806640625, + "logps/rejected": -181.2476806640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.257529258728027, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.257529258728027, + "step": 2048 + }, + { + "epoch": 1.4140072451267898, + "grad_norm": 0.3260158598423004, + "learning_rate": 3.9267918742813346e-06, + "logits/chosen": 3.1467130184173584, + "logits/rejected": 3.1467130184173584, + "logps/chosen": -183.76254272460938, + "logps/rejected": -183.76254272460938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.638666152954102, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.638666152954102, + "step": 2049 + }, + { + "epoch": 1.414697257202001, + "grad_norm": 3.607851266860962, + "learning_rate": 3.9287083173629745e-06, + "logits/chosen": 2.903261423110962, + "logits/rejected": 2.8775081634521484, + "logps/chosen": -177.71533203125, + "logps/rejected": -179.9697265625, + "loss": 0.6294, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.040703773498535, + "rewards/margins": 0.20044994354248047, + "rewards/rejected": -13.241153717041016, + "step": 2050 + }, + { + "epoch": 1.4153872692772125, + "grad_norm": 8.616954803466797, + "learning_rate": 3.930624760444615e-06, + "logits/chosen": 3.229429244995117, + "logits/rejected": 3.2281291484832764, + "logps/chosen": -175.1265869140625, + "logps/rejected": -174.07745361328125, + "loss": 0.709, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.730070114135742, + "rewards/margins": -0.029914140701293945, + "rewards/rejected": -12.700156211853027, + "step": 2051 + }, + { + "epoch": 1.4160772813524236, + "grad_norm": 0.3399277627468109, + "learning_rate": 3.932541203526255e-06, + "logits/chosen": 3.2867648601531982, + "logits/rejected": 3.2867648601531982, + "logps/chosen": -166.5102081298828, + "logps/rejected": -166.5102081298828, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.740480422973633, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.740482330322266, + "step": 2052 + }, + { + "epoch": 1.416767293427635, + "grad_norm": 0.3235706090927124, + "learning_rate": 3.934457646607896e-06, + "logits/chosen": 3.547337770462036, + "logits/rejected": 3.547337770462036, + "logps/chosen": -187.525634765625, + "logps/rejected": -187.525634765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.902944564819336, + "rewards/margins": 0.0, + "rewards/rejected": -13.902944564819336, + "step": 2053 + }, + { + "epoch": 1.4174573055028463, + "grad_norm": 0.3595898747444153, + "learning_rate": 3.936374089689536e-06, + "logits/chosen": 3.501225233078003, + "logits/rejected": 3.501225233078003, + "logps/chosen": -172.412353515625, + "logps/rejected": -172.412353515625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.249996185302734, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.249996185302734, + "step": 2054 + }, + { + "epoch": 1.4181473175780577, + "grad_norm": 0.3611392676830292, + "learning_rate": 3.938290532771177e-06, + "logits/chosen": 3.1556613445281982, + "logits/rejected": 3.196951389312744, + "logps/chosen": -165.59327697753906, + "logps/rejected": -176.43661499023438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.763646125793457, + "rewards/margins": 1.1137146949768066, + "rewards/rejected": -12.877360343933105, + "step": 2055 + }, + { + "epoch": 1.418837329653269, + "grad_norm": 0.37057799100875854, + "learning_rate": 3.940206975852818e-06, + "logits/chosen": 3.465057849884033, + "logits/rejected": 3.465057849884033, + "logps/chosen": -184.40748596191406, + "logps/rejected": -184.40748596191406, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.596675872802734, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.596675872802734, + "step": 2056 + }, + { + "epoch": 1.4195273417284802, + "grad_norm": 0.3179783225059509, + "learning_rate": 3.9421234189344586e-06, + "logits/chosen": 3.122114419937134, + "logits/rejected": 3.133096694946289, + "logps/chosen": -166.87559509277344, + "logps/rejected": -178.24119567871094, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.969524383544922, + "rewards/margins": 1.1562137603759766, + "rewards/rejected": -13.125738143920898, + "step": 2057 + }, + { + "epoch": 1.4202173538036915, + "grad_norm": 0.3796985149383545, + "learning_rate": 3.9440398620160985e-06, + "logits/chosen": 3.0917553901672363, + "logits/rejected": 3.0917553901672363, + "logps/chosen": -186.38595581054688, + "logps/rejected": -186.38595581054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.817047119140625, + "rewards/margins": 0.0, + "rewards/rejected": -13.817047119140625, + "step": 2058 + }, + { + "epoch": 1.4209073658789029, + "grad_norm": 0.33074188232421875, + "learning_rate": 3.945956305097739e-06, + "logits/chosen": 3.4018068313598633, + "logits/rejected": 3.5344185829162598, + "logps/chosen": -148.42237854003906, + "logps/rejected": -163.6737518310547, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.076701164245605, + "rewards/margins": 1.4624773263931274, + "rewards/rejected": -11.539178848266602, + "step": 2059 + }, + { + "epoch": 1.4215973779541142, + "grad_norm": 0.35324007272720337, + "learning_rate": 3.947872748179379e-06, + "logits/chosen": 3.2346370220184326, + "logits/rejected": 3.3484740257263184, + "logps/chosen": -166.69406127929688, + "logps/rejected": -187.84466552734375, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.732905387878418, + "rewards/margins": 2.086669683456421, + "rewards/rejected": -13.819575309753418, + "step": 2060 + }, + { + "epoch": 1.4222873900293256, + "grad_norm": 0.48387932777404785, + "learning_rate": 3.94978919126102e-06, + "logits/chosen": 3.024714469909668, + "logits/rejected": 3.1926562786102295, + "logps/chosen": -146.4327392578125, + "logps/rejected": -165.29061889648438, + "loss": 0.5214, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.941001892089844, + "rewards/margins": 1.8317642211914062, + "rewards/rejected": -11.77276611328125, + "step": 2061 + }, + { + "epoch": 1.4229774021045367, + "grad_norm": 0.4577234089374542, + "learning_rate": 3.95170563434266e-06, + "logits/chosen": 2.8676412105560303, + "logits/rejected": 3.1391758918762207, + "logps/chosen": -147.84613037109375, + "logps/rejected": -174.21807861328125, + "loss": 0.4351, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.043330192565918, + "rewards/margins": 2.621647596359253, + "rewards/rejected": -12.66497802734375, + "step": 2062 + }, + { + "epoch": 1.423667414179748, + "grad_norm": 0.34963101148605347, + "learning_rate": 3.953622077424301e-06, + "logits/chosen": 3.1953606605529785, + "logits/rejected": 3.316133737564087, + "logps/chosen": -150.46682739257812, + "logps/rejected": -164.37478637695312, + "loss": 0.5207, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.274089813232422, + "rewards/margins": 1.4424293041229248, + "rewards/rejected": -11.716520309448242, + "step": 2063 + }, + { + "epoch": 1.4243574262549594, + "grad_norm": 0.3528752028942108, + "learning_rate": 3.955538520505942e-06, + "logits/chosen": 3.298135757446289, + "logits/rejected": 3.3155505657196045, + "logps/chosen": -172.097900390625, + "logps/rejected": -177.3116455078125, + "loss": 0.6085, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.431987762451172, + "rewards/margins": 0.5192402601242065, + "rewards/rejected": -12.951227188110352, + "step": 2064 + }, + { + "epoch": 1.4250474383301708, + "grad_norm": 0.39779627323150635, + "learning_rate": 3.957454963587582e-06, + "logits/chosen": 3.52948260307312, + "logits/rejected": 3.52948260307312, + "logps/chosen": -177.42041015625, + "logps/rejected": -177.42041015625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.960262298583984, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.960262298583984, + "step": 2065 + }, + { + "epoch": 1.4257374504053821, + "grad_norm": 0.603924572467804, + "learning_rate": 3.9593714066692225e-06, + "logits/chosen": 3.0697765350341797, + "logits/rejected": 3.267247438430786, + "logps/chosen": -159.73483276367188, + "logps/rejected": -171.33013916015625, + "loss": 0.5238, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.098701477050781, + "rewards/margins": 1.1498736143112183, + "rewards/rejected": -12.248575210571289, + "step": 2066 + }, + { + "epoch": 1.4264274624805935, + "grad_norm": 0.4969075322151184, + "learning_rate": 3.9612878497508625e-06, + "logits/chosen": 3.2353618144989014, + "logits/rejected": 3.2353618144989014, + "logps/chosen": -176.66592407226562, + "logps/rejected": -176.66592407226562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.919559478759766, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.91955852508545, + "step": 2067 + }, + { + "epoch": 1.4271174745558048, + "grad_norm": 0.4518648386001587, + "learning_rate": 3.963204292832503e-06, + "logits/chosen": 3.191007614135742, + "logits/rejected": 3.191007614135742, + "logps/chosen": -167.98193359375, + "logps/rejected": -167.98193359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.002641677856445, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.002643585205078, + "step": 2068 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 0.42363008856773376, + "learning_rate": 3.965120735914143e-06, + "logits/chosen": 3.007842540740967, + "logits/rejected": 3.125973701477051, + "logps/chosen": -163.72616577148438, + "logps/rejected": -175.08799743652344, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.63805866241455, + "rewards/margins": 1.0657211542129517, + "rewards/rejected": -12.703779220581055, + "step": 2069 + }, + { + "epoch": 1.4284974987062273, + "grad_norm": 0.5157999992370605, + "learning_rate": 3.967037178995784e-06, + "logits/chosen": 3.3029258251190186, + "logits/rejected": 3.3029258251190186, + "logps/chosen": -174.63101196289062, + "logps/rejected": -174.63101196289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.5130615234375, + "rewards/margins": 0.0, + "rewards/rejected": -12.5130615234375, + "step": 2070 + }, + { + "epoch": 1.4291875107814387, + "grad_norm": 0.3305678367614746, + "learning_rate": 3.968953622077424e-06, + "logits/chosen": 3.064656972885132, + "logits/rejected": 3.283613920211792, + "logps/chosen": -165.9596405029297, + "logps/rejected": -178.8798065185547, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.92773151397705, + "rewards/margins": 1.3055418729782104, + "rewards/rejected": -13.23327350616455, + "step": 2071 + }, + { + "epoch": 1.42987752285665, + "grad_norm": 0.43530774116516113, + "learning_rate": 3.970870065159066e-06, + "logits/chosen": 2.9890098571777344, + "logits/rejected": 2.9860177040100098, + "logps/chosen": -155.19827270507812, + "logps/rejected": -175.0364532470703, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.59170150756836, + "rewards/margins": 1.9889013767242432, + "rewards/rejected": -12.580602645874023, + "step": 2072 + }, + { + "epoch": 1.4305675349318614, + "grad_norm": 0.3748556971549988, + "learning_rate": 3.972786508240706e-06, + "logits/chosen": 3.1457252502441406, + "logits/rejected": 3.1457252502441406, + "logps/chosen": -152.80340576171875, + "logps/rejected": -152.80340576171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.698293685913086, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -10.698293685913086, + "step": 2073 + }, + { + "epoch": 1.4312575470070725, + "grad_norm": 0.34768909215927124, + "learning_rate": 3.9747029513223465e-06, + "logits/chosen": 3.3931241035461426, + "logits/rejected": 3.469111442565918, + "logps/chosen": -168.385498046875, + "logps/rejected": -181.7283935546875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.872112274169922, + "rewards/margins": 1.3587177991867065, + "rewards/rejected": -13.230830192565918, + "step": 2074 + }, + { + "epoch": 1.4319475590822839, + "grad_norm": 0.41496020555496216, + "learning_rate": 3.9766193944039864e-06, + "logits/chosen": 3.47239351272583, + "logits/rejected": 3.47239351272583, + "logps/chosen": -183.5298309326172, + "logps/rejected": -183.52981567382812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.566984176635742, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.566984176635742, + "step": 2075 + }, + { + "epoch": 1.4326375711574952, + "grad_norm": 0.939953088760376, + "learning_rate": 3.978535837485627e-06, + "logits/chosen": 3.221818685531616, + "logits/rejected": 3.217233657836914, + "logps/chosen": -175.7229766845703, + "logps/rejected": -179.24688720703125, + "loss": 0.6128, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.802000045776367, + "rewards/margins": 0.36999350786209106, + "rewards/rejected": -13.171993255615234, + "step": 2076 + }, + { + "epoch": 1.4333275832327066, + "grad_norm": 0.5151866674423218, + "learning_rate": 3.980452280567267e-06, + "logits/chosen": 3.151970863342285, + "logits/rejected": 3.151970863342285, + "logps/chosen": -166.9438934326172, + "logps/rejected": -166.9438934326172, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.776820182800293, + "rewards/margins": 0.0, + "rewards/rejected": -11.776820182800293, + "step": 2077 + }, + { + "epoch": 1.434017595307918, + "grad_norm": 0.3887135088443756, + "learning_rate": 3.982368723648908e-06, + "logits/chosen": 3.3457493782043457, + "logits/rejected": 3.4823098182678223, + "logps/chosen": -173.81663513183594, + "logps/rejected": -180.28213500976562, + "loss": 0.6071, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.50834846496582, + "rewards/margins": 0.677501916885376, + "rewards/rejected": -13.185850143432617, + "step": 2078 + }, + { + "epoch": 1.434707607383129, + "grad_norm": 0.3465990722179413, + "learning_rate": 3.984285166730548e-06, + "logits/chosen": 3.2061312198638916, + "logits/rejected": 3.2674458026885986, + "logps/chosen": -176.03457641601562, + "logps/rejected": -185.292724609375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.611581802368164, + "rewards/margins": 0.974121630191803, + "rewards/rejected": -13.58570384979248, + "step": 2079 + }, + { + "epoch": 1.4353976194583407, + "grad_norm": 2.232572555541992, + "learning_rate": 3.986201609812189e-06, + "logits/chosen": 3.0117599964141846, + "logits/rejected": 3.0553689002990723, + "logps/chosen": -169.77545166015625, + "logps/rejected": -173.65359497070312, + "loss": 0.6124, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.211764335632324, + "rewards/margins": 0.3792276382446289, + "rewards/rejected": -12.59099292755127, + "step": 2080 + }, + { + "epoch": 1.4360876315335518, + "grad_norm": 0.3668507933616638, + "learning_rate": 3.98811805289383e-06, + "logits/chosen": 2.8343796730041504, + "logits/rejected": 2.877688407897949, + "logps/chosen": -162.41952514648438, + "logps/rejected": -173.21217346191406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.401567459106445, + "rewards/margins": 1.1556905508041382, + "rewards/rejected": -12.557257652282715, + "step": 2081 + }, + { + "epoch": 1.4367776436087631, + "grad_norm": 0.944035530090332, + "learning_rate": 3.99003449597547e-06, + "logits/chosen": 3.161442279815674, + "logits/rejected": 3.164943218231201, + "logps/chosen": -138.80255126953125, + "logps/rejected": -158.44285583496094, + "loss": 0.5282, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.263071060180664, + "rewards/margins": 1.9100165367126465, + "rewards/rejected": -11.173088073730469, + "step": 2082 + }, + { + "epoch": 1.4374676556839745, + "grad_norm": 0.4664451479911804, + "learning_rate": 3.9919509390571104e-06, + "logits/chosen": 3.3842239379882812, + "logits/rejected": 3.4136481285095215, + "logps/chosen": -175.25869750976562, + "logps/rejected": -184.3466796875, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.767343521118164, + "rewards/margins": 0.9051694869995117, + "rewards/rejected": -13.672513008117676, + "step": 2083 + }, + { + "epoch": 1.4381576677591859, + "grad_norm": 1.3137751817703247, + "learning_rate": 3.99386738213875e-06, + "logits/chosen": 3.0377211570739746, + "logits/rejected": 3.0726351737976074, + "logps/chosen": -172.94473266601562, + "logps/rejected": -175.3096923828125, + "loss": 0.6244, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.50742244720459, + "rewards/margins": 0.23408746719360352, + "rewards/rejected": -12.741510391235352, + "step": 2084 + }, + { + "epoch": 1.4388476798343972, + "grad_norm": 0.5443946123123169, + "learning_rate": 3.995783825220391e-06, + "logits/chosen": 3.0543808937072754, + "logits/rejected": 3.2735466957092285, + "logps/chosen": -159.37747192382812, + "logps/rejected": -178.4559326171875, + "loss": 0.5236, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.242027282714844, + "rewards/margins": 1.8815078735351562, + "rewards/rejected": -13.123534202575684, + "step": 2085 + }, + { + "epoch": 1.4395376919096083, + "grad_norm": 0.3669467866420746, + "learning_rate": 3.997700268302032e-06, + "logits/chosen": 3.6145071983337402, + "logits/rejected": 3.6145071983337402, + "logps/chosen": -172.8314971923828, + "logps/rejected": -172.83151245117188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.480024337768555, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.480024337768555, + "step": 2086 + }, + { + "epoch": 1.4402277039848197, + "grad_norm": 0.4526999592781067, + "learning_rate": 3.999616711383672e-06, + "logits/chosen": 3.5662879943847656, + "logits/rejected": 3.5662879943847656, + "logps/chosen": -171.53970336914062, + "logps/rejected": -171.53970336914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.38121223449707, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.38121223449707, + "step": 2087 + }, + { + "epoch": 1.440917716060031, + "grad_norm": 0.3386550843715668, + "learning_rate": 4.001533154465313e-06, + "logits/chosen": 3.5498855113983154, + "logits/rejected": 3.5498855113983154, + "logps/chosen": -181.9897918701172, + "logps/rejected": -181.98980712890625, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.225946426391602, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -13.225946426391602, + "step": 2088 + }, + { + "epoch": 1.4416077281352424, + "grad_norm": 0.4333031177520752, + "learning_rate": 4.003449597546954e-06, + "logits/chosen": 2.7515861988067627, + "logits/rejected": 2.7515861988067627, + "logps/chosen": -169.30068969726562, + "logps/rejected": -169.30068969726562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.24652099609375, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.24652099609375, + "step": 2089 + }, + { + "epoch": 1.4422977402104538, + "grad_norm": 0.3853919506072998, + "learning_rate": 4.005366040628594e-06, + "logits/chosen": 4.005201816558838, + "logits/rejected": 4.005201816558838, + "logps/chosen": -178.15127563476562, + "logps/rejected": -178.15127563476562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.067878723144531, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.067878723144531, + "step": 2090 + }, + { + "epoch": 1.442987752285665, + "grad_norm": 0.38890525698661804, + "learning_rate": 4.007282483710234e-06, + "logits/chosen": 3.368767023086548, + "logits/rejected": 3.65397310256958, + "logps/chosen": -162.04159545898438, + "logps/rejected": -181.95440673828125, + "loss": 0.4375, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.623600006103516, + "rewards/margins": 1.909034013748169, + "rewards/rejected": -13.532633781433105, + "step": 2091 + }, + { + "epoch": 1.4436777643608762, + "grad_norm": 25.03886604309082, + "learning_rate": 4.009198926791874e-06, + "logits/chosen": 3.3525376319885254, + "logits/rejected": 3.407125949859619, + "logps/chosen": -178.43130493164062, + "logps/rejected": -184.25479125976562, + "loss": 0.809, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.094696044921875, + "rewards/margins": 0.5778881907463074, + "rewards/rejected": -13.672584533691406, + "step": 2092 + }, + { + "epoch": 1.4443677764360876, + "grad_norm": 0.3099344074726105, + "learning_rate": 4.011115369873515e-06, + "logits/chosen": 3.6227073669433594, + "logits/rejected": 3.8226897716522217, + "logps/chosen": -177.04916381835938, + "logps/rejected": -193.2340850830078, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.63127613067627, + "rewards/margins": 1.6913046836853027, + "rewards/rejected": -14.32258129119873, + "step": 2093 + }, + { + "epoch": 1.445057788511299, + "grad_norm": 0.40512412786483765, + "learning_rate": 4.013031812955155e-06, + "logits/chosen": 3.207850694656372, + "logits/rejected": 3.207850694656372, + "logps/chosen": -173.09542846679688, + "logps/rejected": -173.09542846679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.557442665100098, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.557442665100098, + "step": 2094 + }, + { + "epoch": 1.4457478005865103, + "grad_norm": 7.513484477996826, + "learning_rate": 4.014948256036796e-06, + "logits/chosen": 3.3793962001800537, + "logits/rejected": 3.451192617416382, + "logps/chosen": -169.47386169433594, + "logps/rejected": -170.4508819580078, + "loss": 0.6654, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.205032348632812, + "rewards/margins": 0.06338083744049072, + "rewards/rejected": -12.268412590026855, + "step": 2095 + }, + { + "epoch": 1.4464378126617214, + "grad_norm": 0.29156097769737244, + "learning_rate": 4.016864699118436e-06, + "logits/chosen": 3.4810476303100586, + "logits/rejected": 3.5563573837280273, + "logps/chosen": -196.19960021972656, + "logps/rejected": -205.91372680664062, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.789472579956055, + "rewards/margins": 0.9558877944946289, + "rewards/rejected": -15.745359420776367, + "step": 2096 + }, + { + "epoch": 1.447127824736933, + "grad_norm": 0.33780691027641296, + "learning_rate": 4.018781142200078e-06, + "logits/chosen": 3.479006290435791, + "logits/rejected": 3.6174161434173584, + "logps/chosen": -159.24578857421875, + "logps/rejected": -175.6941680908203, + "loss": 0.5206, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.080438613891602, + "rewards/margins": 1.5769122838974, + "rewards/rejected": -12.657350540161133, + "step": 2097 + }, + { + "epoch": 1.4478178368121442, + "grad_norm": 1.4597872495651245, + "learning_rate": 4.0206975852817176e-06, + "logits/chosen": 3.7023494243621826, + "logits/rejected": 3.705595016479492, + "logps/chosen": -163.1105194091797, + "logps/rejected": -166.49818420410156, + "loss": 0.6202, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.766740798950195, + "rewards/margins": 0.26963210105895996, + "rewards/rejected": -12.036373138427734, + "step": 2098 + }, + { + "epoch": 1.4485078488873555, + "grad_norm": 0.4715753197669983, + "learning_rate": 4.022614028363358e-06, + "logits/chosen": 3.5214662551879883, + "logits/rejected": 3.5214662551879883, + "logps/chosen": -166.9427490234375, + "logps/rejected": -166.9427490234375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.856237411499023, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.856237411499023, + "step": 2099 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 0.34153226017951965, + "learning_rate": 4.024530471444998e-06, + "logits/chosen": 3.466522455215454, + "logits/rejected": 3.6014010906219482, + "logps/chosen": -166.01681518554688, + "logps/rejected": -178.09329223632812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.873440742492676, + "rewards/margins": 1.2216135263442993, + "rewards/rejected": -13.095053672790527, + "step": 2100 + }, + { + "epoch": 1.4498878730377782, + "grad_norm": 0.3345738351345062, + "learning_rate": 4.026446914526639e-06, + "logits/chosen": 3.3246426582336426, + "logits/rejected": 3.3246426582336426, + "logps/chosen": -194.13543701171875, + "logps/rejected": -194.13543701171875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.735859870910645, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -14.735859870910645, + "step": 2101 + }, + { + "epoch": 1.4505778851129896, + "grad_norm": 0.5025829672813416, + "learning_rate": 4.028363357608279e-06, + "logits/chosen": 3.5811805725097656, + "logits/rejected": 3.680576801300049, + "logps/chosen": -172.7948760986328, + "logps/rejected": -177.7233428955078, + "loss": 0.6083, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.702184677124023, + "rewards/margins": 0.5269807577133179, + "rewards/rejected": -13.229166030883789, + "step": 2102 + }, + { + "epoch": 1.4512678971882007, + "grad_norm": 25.270240783691406, + "learning_rate": 4.03027980068992e-06, + "logits/chosen": 3.4801406860351562, + "logits/rejected": 3.455871820449829, + "logps/chosen": -195.4361114501953, + "logps/rejected": -191.58749389648438, + "loss": 0.9891, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.744600296020508, + "rewards/margins": -0.3765498399734497, + "rewards/rejected": -14.368051528930664, + "step": 2103 + }, + { + "epoch": 1.451957909263412, + "grad_norm": 0.3671827018260956, + "learning_rate": 4.03219624377156e-06, + "logits/chosen": 3.90054988861084, + "logits/rejected": 3.90054988861084, + "logps/chosen": -199.23033142089844, + "logps/rejected": -199.23031616210938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -15.083730697631836, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -15.083730697631836, + "step": 2104 + }, + { + "epoch": 1.4526479213386234, + "grad_norm": 0.4600220024585724, + "learning_rate": 4.034112686853201e-06, + "logits/chosen": 3.107910633087158, + "logits/rejected": 3.246729612350464, + "logps/chosen": -171.7279510498047, + "logps/rejected": -177.66505432128906, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.43862533569336, + "rewards/margins": 0.5941253304481506, + "rewards/rejected": -13.032751083374023, + "step": 2105 + }, + { + "epoch": 1.4533379334138348, + "grad_norm": 0.3084344267845154, + "learning_rate": 4.0360291299348416e-06, + "logits/chosen": 3.4847991466522217, + "logits/rejected": 3.552516460418701, + "logps/chosen": -168.44833374023438, + "logps/rejected": -190.51470947265625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.003418922424316, + "rewards/margins": 2.2663869857788086, + "rewards/rejected": -14.269805908203125, + "step": 2106 + }, + { + "epoch": 1.4540279454890461, + "grad_norm": 0.2755538523197174, + "learning_rate": 4.0379455730164815e-06, + "logits/chosen": 3.206395149230957, + "logits/rejected": 3.254859447479248, + "logps/chosen": -159.90438842773438, + "logps/rejected": -170.93817138671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.210933685302734, + "rewards/margins": 1.131035327911377, + "rewards/rejected": -12.341968536376953, + "step": 2107 + }, + { + "epoch": 1.4547179575642573, + "grad_norm": 0.7836423516273499, + "learning_rate": 4.039862016098122e-06, + "logits/chosen": 3.5281379222869873, + "logits/rejected": 3.816896677017212, + "logps/chosen": -172.43035888671875, + "logps/rejected": -189.56600952148438, + "loss": 0.5231, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.583333015441895, + "rewards/margins": 1.72823166847229, + "rewards/rejected": -14.311564445495605, + "step": 2108 + }, + { + "epoch": 1.4554079696394686, + "grad_norm": 0.5183457732200623, + "learning_rate": 4.041778459179762e-06, + "logits/chosen": 3.5841405391693115, + "logits/rejected": 3.692147731781006, + "logps/chosen": -179.63414001464844, + "logps/rejected": -186.57289123535156, + "loss": 0.6071, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.113594055175781, + "rewards/margins": 0.6685949563980103, + "rewards/rejected": -13.782190322875977, + "step": 2109 + }, + { + "epoch": 1.45609798171468, + "grad_norm": 3.582540988922119, + "learning_rate": 4.043694902261403e-06, + "logits/chosen": 3.3215441703796387, + "logits/rejected": 3.595759868621826, + "logps/chosen": -169.41990661621094, + "logps/rejected": -181.67030334472656, + "loss": 0.5398, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.997320175170898, + "rewards/margins": 1.2140319347381592, + "rewards/rejected": -13.21135139465332, + "step": 2110 + }, + { + "epoch": 1.4567879937898913, + "grad_norm": 0.35450178384780884, + "learning_rate": 4.045611345343043e-06, + "logits/chosen": 3.073428153991699, + "logits/rejected": 3.1589245796203613, + "logps/chosen": -173.77426147460938, + "logps/rejected": -184.63201904296875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.604230880737305, + "rewards/margins": 1.0969016551971436, + "rewards/rejected": -13.701131820678711, + "step": 2111 + }, + { + "epoch": 1.4574780058651027, + "grad_norm": 0.8808621168136597, + "learning_rate": 4.047527788424684e-06, + "logits/chosen": 3.090484857559204, + "logits/rejected": 3.3823697566986084, + "logps/chosen": -162.06503295898438, + "logps/rejected": -191.1405029296875, + "loss": 0.4382, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.60402774810791, + "rewards/margins": 2.9532392024993896, + "rewards/rejected": -14.557268142700195, + "step": 2112 + }, + { + "epoch": 1.458168017940314, + "grad_norm": 0.36217716336250305, + "learning_rate": 4.049444231506325e-06, + "logits/chosen": 3.7385926246643066, + "logits/rejected": 3.828500509262085, + "logps/chosen": -179.89535522460938, + "logps/rejected": -188.4446563720703, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.096834182739258, + "rewards/margins": 0.8560317754745483, + "rewards/rejected": -13.952866554260254, + "step": 2113 + }, + { + "epoch": 1.4588580300155254, + "grad_norm": 26.203292846679688, + "learning_rate": 4.0513606745879655e-06, + "logits/chosen": 3.551557779312134, + "logits/rejected": 3.633075714111328, + "logps/chosen": -169.95635986328125, + "logps/rejected": -176.1865234375, + "loss": 1.0295, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.985285758972168, + "rewards/margins": 0.6557607054710388, + "rewards/rejected": -12.641046524047852, + "step": 2114 + }, + { + "epoch": 1.4595480420907365, + "grad_norm": 0.5565980672836304, + "learning_rate": 4.0532771176696055e-06, + "logits/chosen": 3.1214911937713623, + "logits/rejected": 3.5281262397766113, + "logps/chosen": -158.24099731445312, + "logps/rejected": -184.6559600830078, + "loss": 0.4355, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.901132583618164, + "rewards/margins": 2.6583828926086426, + "rewards/rejected": -13.559514999389648, + "step": 2115 + }, + { + "epoch": 1.4602380541659479, + "grad_norm": 0.49169662594795227, + "learning_rate": 4.055193560751246e-06, + "logits/chosen": 3.568448781967163, + "logits/rejected": 3.672114372253418, + "logps/chosen": -165.51693725585938, + "logps/rejected": -182.34414672851562, + "loss": 0.5232, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.87993049621582, + "rewards/margins": 1.6527929306030273, + "rewards/rejected": -13.532724380493164, + "step": 2116 + }, + { + "epoch": 1.4609280662411592, + "grad_norm": 0.3749200105667114, + "learning_rate": 4.057110003832886e-06, + "logits/chosen": 3.4597885608673096, + "logits/rejected": 3.5688111782073975, + "logps/chosen": -172.7905731201172, + "logps/rejected": -179.37515258789062, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.373613357543945, + "rewards/margins": 0.6445720195770264, + "rewards/rejected": -13.018186569213867, + "step": 2117 + }, + { + "epoch": 1.4616180783163706, + "grad_norm": 3.9453203678131104, + "learning_rate": 4.059026446914527e-06, + "logits/chosen": 3.304070472717285, + "logits/rejected": 3.390346050262451, + "logps/chosen": -186.9647216796875, + "logps/rejected": -189.07452392578125, + "loss": 0.6239, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.969879150390625, + "rewards/margins": 0.2374894618988037, + "rewards/rejected": -14.207368850708008, + "step": 2118 + }, + { + "epoch": 1.462308090391582, + "grad_norm": 0.3373858332633972, + "learning_rate": 4.060942889996167e-06, + "logits/chosen": 3.7395811080932617, + "logits/rejected": 3.7661499977111816, + "logps/chosen": -183.886962890625, + "logps/rejected": -198.20889282226562, + "loss": 0.522, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.718015670776367, + "rewards/margins": 1.4502215385437012, + "rewards/rejected": -15.168237686157227, + "step": 2119 + }, + { + "epoch": 1.462998102466793, + "grad_norm": 0.435689240694046, + "learning_rate": 4.062859333077808e-06, + "logits/chosen": 3.519036293029785, + "logits/rejected": 3.519036293029785, + "logps/chosen": -173.35369873046875, + "logps/rejected": -173.35369873046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.450933456420898, + "rewards/margins": 0.0, + "rewards/rejected": -12.450933456420898, + "step": 2120 + }, + { + "epoch": 1.4636881145420044, + "grad_norm": 0.29023975133895874, + "learning_rate": 4.064775776159449e-06, + "logits/chosen": 3.8672876358032227, + "logits/rejected": 3.9372382164001465, + "logps/chosen": -189.4478759765625, + "logps/rejected": -201.77874755859375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.065149307250977, + "rewards/margins": 1.2412015199661255, + "rewards/rejected": -15.306350708007812, + "step": 2121 + }, + { + "epoch": 1.4643781266172158, + "grad_norm": 17.037057876586914, + "learning_rate": 4.066692219241089e-06, + "logits/chosen": 3.401797294616699, + "logits/rejected": 3.506273031234741, + "logps/chosen": -168.3794708251953, + "logps/rejected": -167.56137084960938, + "loss": 1.5476, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.079323768615723, + "rewards/margins": -0.16762810945510864, + "rewards/rejected": -11.91169548034668, + "step": 2122 + }, + { + "epoch": 1.4650681386924271, + "grad_norm": 0.7717711329460144, + "learning_rate": 4.0686086623227295e-06, + "logits/chosen": 3.4010448455810547, + "logits/rejected": 3.486538887023926, + "logps/chosen": -141.47023010253906, + "logps/rejected": -146.11676025390625, + "loss": 0.6093, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.376699447631836, + "rewards/margins": 0.4722193479537964, + "rewards/rejected": -9.848918914794922, + "step": 2123 + }, + { + "epoch": 1.4657581507676385, + "grad_norm": 2.532599449157715, + "learning_rate": 4.0705251054043694e-06, + "logits/chosen": 3.6198654174804688, + "logits/rejected": 3.662494421005249, + "logps/chosen": -173.98040771484375, + "logps/rejected": -175.8699493408203, + "loss": 0.6255, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.628625869750977, + "rewards/margins": 0.22614413499832153, + "rewards/rejected": -12.85477066040039, + "step": 2124 + }, + { + "epoch": 1.4664481628428496, + "grad_norm": 0.24499128758907318, + "learning_rate": 4.07244154848601e-06, + "logits/chosen": 3.3102917671203613, + "logits/rejected": 3.693641185760498, + "logps/chosen": -174.74330139160156, + "logps/rejected": -209.68344116210938, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.68780517578125, + "rewards/margins": 3.4772286415100098, + "rewards/rejected": -16.1650333404541, + "step": 2125 + }, + { + "epoch": 1.4671381749180612, + "grad_norm": 0.2579612731933594, + "learning_rate": 4.074357991567651e-06, + "logits/chosen": 3.427490472793579, + "logits/rejected": 3.596177577972412, + "logps/chosen": -156.43655395507812, + "logps/rejected": -173.88052368164062, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.637929916381836, + "rewards/margins": 1.7197184562683105, + "rewards/rejected": -12.357648849487305, + "step": 2126 + }, + { + "epoch": 1.4678281869932723, + "grad_norm": 0.3694801330566406, + "learning_rate": 4.076274434649291e-06, + "logits/chosen": 3.4521729946136475, + "logits/rejected": 3.4521729946136475, + "logps/chosen": -184.61810302734375, + "logps/rejected": -184.61810302734375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.565418243408203, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.565420150756836, + "step": 2127 + }, + { + "epoch": 1.4685181990684837, + "grad_norm": 23.26336097717285, + "learning_rate": 4.078190877730932e-06, + "logits/chosen": 3.5624778270721436, + "logits/rejected": 3.613210678100586, + "logps/chosen": -165.39610290527344, + "logps/rejected": -167.16392517089844, + "loss": 1.1093, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.650662422180176, + "rewards/margins": 0.20394659042358398, + "rewards/rejected": -11.854609489440918, + "step": 2128 + }, + { + "epoch": 1.469208211143695, + "grad_norm": 0.30212125182151794, + "learning_rate": 4.080107320812573e-06, + "logits/chosen": 3.715477228164673, + "logits/rejected": 3.9164352416992188, + "logps/chosen": -162.8319854736328, + "logps/rejected": -175.4969482421875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.453609466552734, + "rewards/margins": 1.2192630767822266, + "rewards/rejected": -12.672872543334961, + "step": 2129 + }, + { + "epoch": 1.4698982232189064, + "grad_norm": 45.12718200683594, + "learning_rate": 4.082023763894213e-06, + "logits/chosen": 3.7476935386657715, + "logits/rejected": 3.7832179069519043, + "logps/chosen": -150.0489959716797, + "logps/rejected": -171.7808837890625, + "loss": 0.8921, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.493011474609375, + "rewards/margins": 2.159499168395996, + "rewards/rejected": -12.652511596679688, + "step": 2130 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.36581534147262573, + "learning_rate": 4.0839402069758535e-06, + "logits/chosen": 3.7523975372314453, + "logits/rejected": 3.7523975372314453, + "logps/chosen": -174.1345672607422, + "logps/rejected": -174.1345672607422, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.617471694946289, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.617472648620605, + "step": 2131 + }, + { + "epoch": 1.4712782473693289, + "grad_norm": 4.235677719116211, + "learning_rate": 4.0858566500574934e-06, + "logits/chosen": 3.708569049835205, + "logits/rejected": 3.871858596801758, + "logps/chosen": -185.2009735107422, + "logps/rejected": -186.19537353515625, + "loss": 0.6636, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.593770980834961, + "rewards/margins": 0.06820857524871826, + "rewards/rejected": -13.661979675292969, + "step": 2132 + }, + { + "epoch": 1.4719682594445402, + "grad_norm": 0.296286404132843, + "learning_rate": 4.087773093139134e-06, + "logits/chosen": 3.786515712738037, + "logits/rejected": 3.8749706745147705, + "logps/chosen": -176.56649780273438, + "logps/rejected": -184.8248291015625, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.916437149047852, + "rewards/margins": 0.8187234401702881, + "rewards/rejected": -13.735160827636719, + "step": 2133 + }, + { + "epoch": 1.4726582715197516, + "grad_norm": 0.40691184997558594, + "learning_rate": 4.089689536220774e-06, + "logits/chosen": 3.5913195610046387, + "logits/rejected": 3.742873191833496, + "logps/chosen": -167.68148803710938, + "logps/rejected": -190.0992431640625, + "loss": 0.4358, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.980713844299316, + "rewards/margins": 2.2869820594787598, + "rewards/rejected": -14.267695426940918, + "step": 2134 + }, + { + "epoch": 1.473348283594963, + "grad_norm": 0.281213641166687, + "learning_rate": 4.091605979302415e-06, + "logits/chosen": 3.879793882369995, + "logits/rejected": 3.886805295944214, + "logps/chosen": -166.9326934814453, + "logps/rejected": -175.61610412597656, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.785634994506836, + "rewards/margins": 0.8839715719223022, + "rewards/rejected": -12.669605255126953, + "step": 2135 + }, + { + "epoch": 1.4740382956701743, + "grad_norm": 0.26691389083862305, + "learning_rate": 4.093522422384055e-06, + "logits/chosen": 3.9770092964172363, + "logits/rejected": 4.029852867126465, + "logps/chosen": -187.6033172607422, + "logps/rejected": -195.57894897460938, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.148358345031738, + "rewards/margins": 0.8259798288345337, + "rewards/rejected": -14.974337577819824, + "step": 2136 + }, + { + "epoch": 1.4747283077453854, + "grad_norm": 0.2988731265068054, + "learning_rate": 4.095438865465697e-06, + "logits/chosen": 3.9359169006347656, + "logits/rejected": 4.028782844543457, + "logps/chosen": -181.77586364746094, + "logps/rejected": -193.27655029296875, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.414068222045898, + "rewards/margins": 1.1714431047439575, + "rewards/rejected": -14.58551025390625, + "step": 2137 + }, + { + "epoch": 1.4754183198205968, + "grad_norm": 1.707740068435669, + "learning_rate": 4.097355308547337e-06, + "logits/chosen": 3.8212594985961914, + "logits/rejected": 3.905426263809204, + "logps/chosen": -176.966064453125, + "logps/rejected": -180.28646850585938, + "loss": 0.6166, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.889076232910156, + "rewards/margins": 0.3096276521682739, + "rewards/rejected": -13.19870376586914, + "step": 2138 + }, + { + "epoch": 1.4761083318958081, + "grad_norm": 0.24908043444156647, + "learning_rate": 4.0992717516289774e-06, + "logits/chosen": 3.4022574424743652, + "logits/rejected": 3.5973503589630127, + "logps/chosen": -148.91671752929688, + "logps/rejected": -171.27059936523438, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.039839744567871, + "rewards/margins": 2.228166103363037, + "rewards/rejected": -12.26800537109375, + "step": 2139 + }, + { + "epoch": 1.4767983439710195, + "grad_norm": 0.3272104859352112, + "learning_rate": 4.101188194710617e-06, + "logits/chosen": 3.8696024417877197, + "logits/rejected": 3.8696024417877197, + "logps/chosen": -173.0429229736328, + "logps/rejected": -173.04293823242188, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.684137344360352, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.684137344360352, + "step": 2140 + }, + { + "epoch": 1.4774883560462309, + "grad_norm": 0.3016718327999115, + "learning_rate": 4.103104637792258e-06, + "logits/chosen": 3.843804359436035, + "logits/rejected": 3.843804359436035, + "logps/chosen": -189.97216796875, + "logps/rejected": -189.97216796875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.310379981994629, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.310379981994629, + "step": 2141 + }, + { + "epoch": 1.478178368121442, + "grad_norm": 0.30298978090286255, + "learning_rate": 4.105021080873898e-06, + "logits/chosen": 3.8795692920684814, + "logits/rejected": 3.8795692920684814, + "logps/chosen": -190.31040954589844, + "logps/rejected": -190.31039428710938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.2247314453125, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.224729537963867, + "step": 2142 + }, + { + "epoch": 1.4788683801966536, + "grad_norm": 0.2710098624229431, + "learning_rate": 4.106937523955539e-06, + "logits/chosen": 3.6570725440979004, + "logits/rejected": 3.9412920475006104, + "logps/chosen": -172.7760772705078, + "logps/rejected": -191.36367797851562, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.470513343811035, + "rewards/margins": 1.8709313869476318, + "rewards/rejected": -14.341445922851562, + "step": 2143 + }, + { + "epoch": 1.4795583922718647, + "grad_norm": 0.31548452377319336, + "learning_rate": 4.108853967037179e-06, + "logits/chosen": 3.352080821990967, + "logits/rejected": 3.672313690185547, + "logps/chosen": -171.30026245117188, + "logps/rejected": -189.85667419433594, + "loss": 0.5205, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.19271183013916, + "rewards/margins": 1.9497593641281128, + "rewards/rejected": -14.142471313476562, + "step": 2144 + }, + { + "epoch": 1.480248404347076, + "grad_norm": 0.9830532670021057, + "learning_rate": 4.11077041011882e-06, + "logits/chosen": 3.610314130783081, + "logits/rejected": 3.6817262172698975, + "logps/chosen": -180.21066284179688, + "logps/rejected": -191.64674377441406, + "loss": 0.5269, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.049647331237793, + "rewards/margins": 1.2513396739959717, + "rewards/rejected": -14.300987243652344, + "step": 2145 + }, + { + "epoch": 1.4809384164222874, + "grad_norm": 0.2748686671257019, + "learning_rate": 4.112686853200461e-06, + "logits/chosen": 3.6808252334594727, + "logits/rejected": 3.7541351318359375, + "logps/chosen": -159.95797729492188, + "logps/rejected": -185.76681518554688, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.396062850952148, + "rewards/margins": 2.443039894104004, + "rewards/rejected": -13.839103698730469, + "step": 2146 + }, + { + "epoch": 1.4816284284974988, + "grad_norm": 1.6944983005523682, + "learning_rate": 4.1146032962821006e-06, + "logits/chosen": 3.2605857849121094, + "logits/rejected": 3.4069137573242188, + "logps/chosen": -153.8083038330078, + "logps/rejected": -167.81381225585938, + "loss": 0.5279, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.66423511505127, + "rewards/margins": 1.3774317502975464, + "rewards/rejected": -12.041666984558105, + "step": 2147 + }, + { + "epoch": 1.4823184405727101, + "grad_norm": 21.689491271972656, + "learning_rate": 4.116519739363741e-06, + "logits/chosen": 3.6784942150115967, + "logits/rejected": 3.5712711811065674, + "logps/chosen": -182.9860076904297, + "logps/rejected": -178.7395477294922, + "loss": 1.0667, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.677600860595703, + "rewards/margins": -0.45700502395629883, + "rewards/rejected": -13.22059440612793, + "step": 2148 + }, + { + "epoch": 1.4830084526479212, + "grad_norm": 0.417684406042099, + "learning_rate": 4.118436182445381e-06, + "logits/chosen": 3.446120500564575, + "logits/rejected": 3.446120500564575, + "logps/chosen": -167.23573303222656, + "logps/rejected": -167.23573303222656, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.046295166015625, + "rewards/margins": 0.0, + "rewards/rejected": -12.046296119689941, + "step": 2149 + }, + { + "epoch": 1.4836984647231326, + "grad_norm": 13.266326904296875, + "learning_rate": 4.120352625527022e-06, + "logits/chosen": 3.6140971183776855, + "logits/rejected": 3.5667738914489746, + "logps/chosen": -167.96209716796875, + "logps/rejected": -176.9725341796875, + "loss": 0.6649, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.138480186462402, + "rewards/margins": 0.8728561401367188, + "rewards/rejected": -13.011335372924805, + "step": 2150 + }, + { + "epoch": 1.484388476798344, + "grad_norm": 16.715063095092773, + "learning_rate": 4.122269068608662e-06, + "logits/chosen": 3.485757350921631, + "logits/rejected": 3.598313093185425, + "logps/chosen": -172.460205078125, + "logps/rejected": -185.2328643798828, + "loss": 0.6052, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.47530746459961, + "rewards/margins": 1.2363590002059937, + "rewards/rejected": -13.711665153503418, + "step": 2151 + }, + { + "epoch": 1.4850784888735553, + "grad_norm": 0.3366192877292633, + "learning_rate": 4.124185511690303e-06, + "logits/chosen": 3.5802998542785645, + "logits/rejected": 3.5802998542785645, + "logps/chosen": -182.450927734375, + "logps/rejected": -182.450927734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.583637237548828, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.583636283874512, + "step": 2152 + }, + { + "epoch": 1.4857685009487667, + "grad_norm": 0.3889119029045105, + "learning_rate": 4.126101954771944e-06, + "logits/chosen": 3.6222689151763916, + "logits/rejected": 3.6032657623291016, + "logps/chosen": -174.27565002441406, + "logps/rejected": -187.37100219726562, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.735666275024414, + "rewards/margins": 1.3103008270263672, + "rewards/rejected": -14.045967102050781, + "step": 2153 + }, + { + "epoch": 1.4864585130239778, + "grad_norm": 3.5948808193206787, + "learning_rate": 4.128018397853585e-06, + "logits/chosen": 3.6303741931915283, + "logits/rejected": 3.901233196258545, + "logps/chosen": -173.769775390625, + "logps/rejected": -181.49508666992188, + "loss": 0.5675, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.76698112487793, + "rewards/margins": 0.721194863319397, + "rewards/rejected": -13.488174438476562, + "step": 2154 + }, + { + "epoch": 1.4871485250991892, + "grad_norm": 0.31043756008148193, + "learning_rate": 4.1299348409352245e-06, + "logits/chosen": 3.3747687339782715, + "logits/rejected": 3.564182758331299, + "logps/chosen": -165.06124877929688, + "logps/rejected": -177.43765258789062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.940147399902344, + "rewards/margins": 1.2072619199752808, + "rewards/rejected": -13.147409439086914, + "step": 2155 + }, + { + "epoch": 1.4878385371744005, + "grad_norm": 0.37068188190460205, + "learning_rate": 4.131851284016865e-06, + "logits/chosen": 3.790527105331421, + "logits/rejected": 3.790527105331421, + "logps/chosen": -177.27589416503906, + "logps/rejected": -177.27589416503906, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.989351272583008, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.989351272583008, + "step": 2156 + }, + { + "epoch": 1.4885285492496119, + "grad_norm": 2.7598986625671387, + "learning_rate": 4.133767727098505e-06, + "logits/chosen": 3.7390711307525635, + "logits/rejected": 3.7105624675750732, + "logps/chosen": -164.70594787597656, + "logps/rejected": -170.18405151367188, + "loss": 0.5491, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.771219253540039, + "rewards/margins": 0.5302364826202393, + "rewards/rejected": -12.3014554977417, + "step": 2157 + }, + { + "epoch": 1.4892185613248232, + "grad_norm": 0.329786479473114, + "learning_rate": 4.135684170180146e-06, + "logits/chosen": 3.7295618057250977, + "logits/rejected": 3.7295618057250977, + "logps/chosen": -181.77606201171875, + "logps/rejected": -181.77606201171875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.47478199005127, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -13.474782943725586, + "step": 2158 + }, + { + "epoch": 1.4899085734000346, + "grad_norm": 0.29964131116867065, + "learning_rate": 4.137600613261786e-06, + "logits/chosen": 3.7232391834259033, + "logits/rejected": 3.7232391834259033, + "logps/chosen": -193.1326904296875, + "logps/rejected": -193.1326904296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.440003395080566, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.440003395080566, + "step": 2159 + }, + { + "epoch": 1.490598585475246, + "grad_norm": 0.2736095190048218, + "learning_rate": 4.139517056343427e-06, + "logits/chosen": 3.5665669441223145, + "logits/rejected": 3.6309080123901367, + "logps/chosen": -184.14138793945312, + "logps/rejected": -196.80145263671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.646702766418457, + "rewards/margins": 1.271375298500061, + "rewards/rejected": -14.918078422546387, + "step": 2160 + }, + { + "epoch": 1.491288597550457, + "grad_norm": 0.7366228103637695, + "learning_rate": 4.141433499425068e-06, + "logits/chosen": 3.4549999237060547, + "logits/rejected": 3.508347272872925, + "logps/chosen": -168.17930603027344, + "logps/rejected": -191.66561889648438, + "loss": 0.4367, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.974287033081055, + "rewards/margins": 2.433877468109131, + "rewards/rejected": -14.408164978027344, + "step": 2161 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 0.3339824676513672, + "learning_rate": 4.143349942506708e-06, + "logits/chosen": 3.5105533599853516, + "logits/rejected": 3.5105533599853516, + "logps/chosen": -185.20150756835938, + "logps/rejected": -185.20150756835938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.674495697021484, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.674495697021484, + "step": 2162 + }, + { + "epoch": 1.4926686217008798, + "grad_norm": 3.5520334243774414, + "learning_rate": 4.1452663855883485e-06, + "logits/chosen": 3.5046308040618896, + "logits/rejected": 3.6439547538757324, + "logps/chosen": -173.4192352294922, + "logps/rejected": -179.68385314941406, + "loss": 0.5443, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.630278587341309, + "rewards/margins": 0.6968866586685181, + "rewards/rejected": -13.327165603637695, + "step": 2163 + }, + { + "epoch": 1.4933586337760911, + "grad_norm": 0.2748431861400604, + "learning_rate": 4.1471828286699885e-06, + "logits/chosen": 3.5727763175964355, + "logits/rejected": 3.6384549140930176, + "logps/chosen": -175.8013916015625, + "logps/rejected": -192.21743774414062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.677166938781738, + "rewards/margins": 1.5816715955734253, + "rewards/rejected": -14.258837699890137, + "step": 2164 + }, + { + "epoch": 1.4940486458513025, + "grad_norm": 26.625337600708008, + "learning_rate": 4.149099271751629e-06, + "logits/chosen": 3.7429609298706055, + "logits/rejected": 3.6194214820861816, + "logps/chosen": -179.52159118652344, + "logps/rejected": -178.15231323242188, + "loss": 1.0948, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.188535690307617, + "rewards/margins": -0.17159926891326904, + "rewards/rejected": -13.016935348510742, + "step": 2165 + }, + { + "epoch": 1.4947386579265136, + "grad_norm": 0.266810804605484, + "learning_rate": 4.15101571483327e-06, + "logits/chosen": 3.6869521141052246, + "logits/rejected": 3.7678415775299072, + "logps/chosen": -158.26605224609375, + "logps/rejected": -189.97262573242188, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.12415885925293, + "rewards/margins": 3.119058847427368, + "rewards/rejected": -14.243217468261719, + "step": 2166 + }, + { + "epoch": 1.495428670001725, + "grad_norm": 0.9319338798522949, + "learning_rate": 4.15293215791491e-06, + "logits/chosen": 3.5534772872924805, + "logits/rejected": 3.64463472366333, + "logps/chosen": -165.71075439453125, + "logps/rejected": -181.79638671875, + "loss": 0.5234, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.758604049682617, + "rewards/margins": 1.5825214385986328, + "rewards/rejected": -13.34112548828125, + "step": 2167 + }, + { + "epoch": 1.4961186820769363, + "grad_norm": 0.3269230127334595, + "learning_rate": 4.154848600996551e-06, + "logits/chosen": 3.3841874599456787, + "logits/rejected": 3.4793753623962402, + "logps/chosen": -151.02066040039062, + "logps/rejected": -176.45718383789062, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.6057767868042, + "rewards/margins": 2.384918212890625, + "rewards/rejected": -12.990694999694824, + "step": 2168 + }, + { + "epoch": 1.4968086941521477, + "grad_norm": 0.35421136021614075, + "learning_rate": 4.156765044078192e-06, + "logits/chosen": 3.5077619552612305, + "logits/rejected": 3.5077619552612305, + "logps/chosen": -178.41307067871094, + "logps/rejected": -178.41305541992188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.114205360412598, + "rewards/margins": 0.0, + "rewards/rejected": -13.114205360412598, + "step": 2169 + }, + { + "epoch": 1.497498706227359, + "grad_norm": 0.2388615608215332, + "learning_rate": 4.158681487159832e-06, + "logits/chosen": 3.2323341369628906, + "logits/rejected": 3.2644145488739014, + "logps/chosen": -166.44052124023438, + "logps/rejected": -189.73155212402344, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.835311889648438, + "rewards/margins": 2.3887500762939453, + "rewards/rejected": -14.224061965942383, + "step": 2170 + }, + { + "epoch": 1.4981887183025702, + "grad_norm": 0.34914109110832214, + "learning_rate": 4.1605979302414725e-06, + "logits/chosen": 3.343684673309326, + "logits/rejected": 3.343684673309326, + "logps/chosen": -171.9861297607422, + "logps/rejected": -171.9861297607422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.406939506530762, + "rewards/margins": 0.0, + "rewards/rejected": -12.406939506530762, + "step": 2171 + }, + { + "epoch": 1.4988787303777817, + "grad_norm": 0.28303173184394836, + "learning_rate": 4.1625143733231125e-06, + "logits/chosen": 3.400345802307129, + "logits/rejected": 3.3582441806793213, + "logps/chosen": -168.78237915039062, + "logps/rejected": -174.92919921875, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.032724380493164, + "rewards/margins": 0.5684230327606201, + "rewards/rejected": -12.601146697998047, + "step": 2172 + }, + { + "epoch": 1.4995687424529929, + "grad_norm": 0.39703431725502014, + "learning_rate": 4.164430816404753e-06, + "logits/chosen": 3.264425754547119, + "logits/rejected": 3.264425754547119, + "logps/chosen": -176.358642578125, + "logps/rejected": -176.358642578125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.804023742675781, + "rewards/margins": 0.0, + "rewards/rejected": -12.804023742675781, + "step": 2173 + }, + { + "epoch": 1.5002587545282042, + "grad_norm": 0.44062289595603943, + "learning_rate": 4.166347259486393e-06, + "logits/chosen": 3.168104410171509, + "logits/rejected": 3.1863207817077637, + "logps/chosen": -157.86361694335938, + "logps/rejected": -163.51974487304688, + "loss": 0.608, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.984619140625, + "rewards/margins": 0.5546048879623413, + "rewards/rejected": -11.539223670959473, + "step": 2174 + }, + { + "epoch": 1.5009487666034156, + "grad_norm": 0.26849648356437683, + "learning_rate": 4.168263702568034e-06, + "logits/chosen": 3.37310791015625, + "logits/rejected": 3.4552974700927734, + "logps/chosen": -184.19049072265625, + "logps/rejected": -193.38555908203125, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.56846809387207, + "rewards/margins": 0.9311606884002686, + "rewards/rejected": -14.499629974365234, + "step": 2175 + }, + { + "epoch": 1.5016387786786267, + "grad_norm": 0.33025553822517395, + "learning_rate": 4.170180145649674e-06, + "logits/chosen": 3.0642642974853516, + "logits/rejected": 3.2533278465270996, + "logps/chosen": -128.72686767578125, + "logps/rejected": -166.79074096679688, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.222478866577148, + "rewards/margins": 3.467012405395508, + "rewards/rejected": -11.689491271972656, + "step": 2176 + }, + { + "epoch": 1.5023287907538383, + "grad_norm": 0.2875601351261139, + "learning_rate": 4.172096588731316e-06, + "logits/chosen": 3.607133388519287, + "logits/rejected": 3.6511294841766357, + "logps/chosen": -173.89915466308594, + "logps/rejected": -180.41758728027344, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.612837791442871, + "rewards/margins": 0.6385212540626526, + "rewards/rejected": -13.251358985900879, + "step": 2177 + }, + { + "epoch": 1.5030188028290494, + "grad_norm": 0.30070650577545166, + "learning_rate": 4.174013031812956e-06, + "logits/chosen": 2.980229616165161, + "logits/rejected": 3.080423355102539, + "logps/chosen": -172.51539611816406, + "logps/rejected": -185.9473876953125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.292536735534668, + "rewards/margins": 1.3645133972167969, + "rewards/rejected": -13.657050132751465, + "step": 2178 + }, + { + "epoch": 1.5037088149042608, + "grad_norm": 1.0885717868804932, + "learning_rate": 4.1759294748945965e-06, + "logits/chosen": 3.1438848972320557, + "logits/rejected": 3.475834608078003, + "logps/chosen": -157.15304565429688, + "logps/rejected": -186.96405029296875, + "loss": 0.4384, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.846099853515625, + "rewards/margins": 2.9556336402893066, + "rewards/rejected": -13.80173397064209, + "step": 2179 + }, + { + "epoch": 1.5043988269794721, + "grad_norm": 0.26011553406715393, + "learning_rate": 4.1778459179762365e-06, + "logits/chosen": 3.4060556888580322, + "logits/rejected": 3.547849178314209, + "logps/chosen": -166.35865783691406, + "logps/rejected": -178.91946411132812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.896421432495117, + "rewards/margins": 1.174218773841858, + "rewards/rejected": -13.070640563964844, + "step": 2180 + }, + { + "epoch": 1.5050888390546835, + "grad_norm": 0.3795939087867737, + "learning_rate": 4.179762361057877e-06, + "logits/chosen": 3.185844898223877, + "logits/rejected": 3.2109620571136475, + "logps/chosen": -162.27503967285156, + "logps/rejected": -172.706298828125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.422497749328613, + "rewards/margins": 1.029800295829773, + "rewards/rejected": -12.452298164367676, + "step": 2181 + }, + { + "epoch": 1.5057788511298948, + "grad_norm": 0.3024214804172516, + "learning_rate": 4.181678804139517e-06, + "logits/chosen": 3.124612808227539, + "logits/rejected": 3.139347553253174, + "logps/chosen": -167.147216796875, + "logps/rejected": -176.28695678710938, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.023366928100586, + "rewards/margins": 0.9051355123519897, + "rewards/rejected": -12.928503036499023, + "step": 2182 + }, + { + "epoch": 1.506468863205106, + "grad_norm": 0.3723054528236389, + "learning_rate": 4.183595247221158e-06, + "logits/chosen": 2.875511646270752, + "logits/rejected": 2.8935394287109375, + "logps/chosen": -158.5877685546875, + "logps/rejected": -165.65806579589844, + "loss": 0.607, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.107165336608887, + "rewards/margins": 0.6875944137573242, + "rewards/rejected": -11.794759750366211, + "step": 2183 + }, + { + "epoch": 1.5071588752803176, + "grad_norm": 9.44344425201416, + "learning_rate": 4.185511690302798e-06, + "logits/chosen": 3.165010929107666, + "logits/rejected": 3.3802146911621094, + "logps/chosen": -169.81692504882812, + "logps/rejected": -173.42721557617188, + "loss": 0.7439, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.210441589355469, + "rewards/margins": 0.3620350956916809, + "rewards/rejected": -12.572477340698242, + "step": 2184 + }, + { + "epoch": 1.5078488873555287, + "grad_norm": 0.3289755880832672, + "learning_rate": 4.187428133384439e-06, + "logits/chosen": 3.319085121154785, + "logits/rejected": 3.5670325756073, + "logps/chosen": -166.27978515625, + "logps/rejected": -183.18716430664062, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.899246215820312, + "rewards/margins": 1.7284613847732544, + "rewards/rejected": -13.627708435058594, + "step": 2185 + }, + { + "epoch": 1.50853889943074, + "grad_norm": 16.75476837158203, + "learning_rate": 4.18934457646608e-06, + "logits/chosen": 3.45558500289917, + "logits/rejected": 3.319784164428711, + "logps/chosen": -160.76536560058594, + "logps/rejected": -161.51229858398438, + "loss": 1.0934, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.323440551757812, + "rewards/margins": 0.1846851110458374, + "rewards/rejected": -11.508125305175781, + "step": 2186 + }, + { + "epoch": 1.5092289115059514, + "grad_norm": 0.3001616299152374, + "learning_rate": 4.19126101954772e-06, + "logits/chosen": 3.365652322769165, + "logits/rejected": 3.5315845012664795, + "logps/chosen": -178.36959838867188, + "logps/rejected": -186.35655212402344, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.968034744262695, + "rewards/margins": 0.7897021174430847, + "rewards/rejected": -13.757736206054688, + "step": 2187 + }, + { + "epoch": 1.5099189235811625, + "grad_norm": 0.45741012692451477, + "learning_rate": 4.1931774626293604e-06, + "logits/chosen": 3.090836763381958, + "logits/rejected": 3.1301774978637695, + "logps/chosen": -157.28570556640625, + "logps/rejected": -173.73831176757812, + "loss": 0.5225, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.134343147277832, + "rewards/margins": 1.7013736963272095, + "rewards/rejected": -12.83571720123291, + "step": 2188 + }, + { + "epoch": 1.510608935656374, + "grad_norm": 3.225334405899048, + "learning_rate": 4.195093905711e-06, + "logits/chosen": 3.1349499225616455, + "logits/rejected": 3.1176652908325195, + "logps/chosen": -162.08274841308594, + "logps/rejected": -169.33453369140625, + "loss": 0.5474, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.47430419921875, + "rewards/margins": 0.7467369437217712, + "rewards/rejected": -12.221041679382324, + "step": 2189 + }, + { + "epoch": 1.5112989477315852, + "grad_norm": 0.3108592927455902, + "learning_rate": 4.197010348792641e-06, + "logits/chosen": 3.3778038024902344, + "logits/rejected": 3.364915370941162, + "logps/chosen": -174.19175720214844, + "logps/rejected": -182.6567840576172, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.583404541015625, + "rewards/margins": 0.898004412651062, + "rewards/rejected": -13.481409072875977, + "step": 2190 + }, + { + "epoch": 1.5119889598067966, + "grad_norm": 0.29034820199012756, + "learning_rate": 4.198926791874281e-06, + "logits/chosen": 2.706547260284424, + "logits/rejected": 2.7384634017944336, + "logps/chosen": -145.93490600585938, + "logps/rejected": -172.76797485351562, + "loss": 0.4338, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.793373107910156, + "rewards/margins": 2.7152953147888184, + "rewards/rejected": -12.508668899536133, + "step": 2191 + }, + { + "epoch": 1.512678971882008, + "grad_norm": 0.3695196807384491, + "learning_rate": 4.200843234955922e-06, + "logits/chosen": 3.030376672744751, + "logits/rejected": 3.030376672744751, + "logps/chosen": -170.89602661132812, + "logps/rejected": -170.8960418701172, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.484151840209961, + "rewards/margins": 0.0, + "rewards/rejected": -12.484151840209961, + "step": 2192 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 2.0722057819366455, + "learning_rate": 4.202759678037562e-06, + "logits/chosen": 3.024719715118408, + "logits/rejected": 3.211308479309082, + "logps/chosen": -160.5187530517578, + "logps/rejected": -165.7595672607422, + "loss": 0.5603, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.299164772033691, + "rewards/margins": 0.5048701763153076, + "rewards/rejected": -11.804035186767578, + "step": 2193 + }, + { + "epoch": 1.5140589960324307, + "grad_norm": 13.56889820098877, + "learning_rate": 4.204676121119204e-06, + "logits/chosen": 3.0275697708129883, + "logits/rejected": 3.14267635345459, + "logps/chosen": -151.81170654296875, + "logps/rejected": -161.1808319091797, + "loss": 0.6705, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.499937057495117, + "rewards/margins": 0.9102806448936462, + "rewards/rejected": -11.41021728515625, + "step": 2194 + }, + { + "epoch": 1.5147490081076418, + "grad_norm": 0.2826042175292969, + "learning_rate": 4.206592564200844e-06, + "logits/chosen": 3.025559663772583, + "logits/rejected": 3.0797276496887207, + "logps/chosen": -169.48304748535156, + "logps/rejected": -177.81060791015625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.357358932495117, + "rewards/margins": 0.8633274435997009, + "rewards/rejected": -13.220686912536621, + "step": 2195 + }, + { + "epoch": 1.5154390201828531, + "grad_norm": 0.27674224972724915, + "learning_rate": 4.208509007282484e-06, + "logits/chosen": 3.2145392894744873, + "logits/rejected": 3.2289388179779053, + "logps/chosen": -150.57440185546875, + "logps/rejected": -162.44747924804688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.564373016357422, + "rewards/margins": 1.1970272064208984, + "rewards/rejected": -11.76140022277832, + "step": 2196 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 0.33702877163887024, + "learning_rate": 4.210425450364124e-06, + "logits/chosen": 3.1311190128326416, + "logits/rejected": 3.1311190128326416, + "logps/chosen": -166.86378479003906, + "logps/rejected": -166.86378479003906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.038806915283203, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.038806915283203, + "step": 2197 + }, + { + "epoch": 1.5168190443332759, + "grad_norm": 0.33290985226631165, + "learning_rate": 4.212341893445765e-06, + "logits/chosen": 3.3290789127349854, + "logits/rejected": 3.3290789127349854, + "logps/chosen": -151.6527862548828, + "logps/rejected": -151.6527862548828, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.546988487243652, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -10.546988487243652, + "step": 2198 + }, + { + "epoch": 1.5175090564084872, + "grad_norm": 0.5973294973373413, + "learning_rate": 4.214258336527405e-06, + "logits/chosen": 2.8948535919189453, + "logits/rejected": 3.222121238708496, + "logps/chosen": -169.42869567871094, + "logps/rejected": -182.9761505126953, + "loss": 0.5231, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.133471488952637, + "rewards/margins": 1.3139702081680298, + "rewards/rejected": -13.447442054748535, + "step": 2199 + }, + { + "epoch": 1.5181990684836983, + "grad_norm": 0.3672776520252228, + "learning_rate": 4.216174779609046e-06, + "logits/chosen": 3.234194278717041, + "logits/rejected": 3.446004867553711, + "logps/chosen": -176.45315551757812, + "logps/rejected": -185.5065460205078, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.846202850341797, + "rewards/margins": 0.894220232963562, + "rewards/rejected": -13.740422248840332, + "step": 2200 + }, + { + "epoch": 1.51888908055891, + "grad_norm": 0.8200278282165527, + "learning_rate": 4.218091222690686e-06, + "logits/chosen": 3.285524368286133, + "logits/rejected": 3.230689764022827, + "logps/chosen": -176.5765380859375, + "logps/rejected": -185.8722381591797, + "loss": 0.5278, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.916852951049805, + "rewards/margins": 0.896553635597229, + "rewards/rejected": -13.813405990600586, + "step": 2201 + }, + { + "epoch": 1.519579092634121, + "grad_norm": 0.288999080657959, + "learning_rate": 4.220007665772327e-06, + "logits/chosen": 3.081655740737915, + "logits/rejected": 3.0378215312957764, + "logps/chosen": -174.5514373779297, + "logps/rejected": -190.22547912597656, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.678499221801758, + "rewards/margins": 1.552986741065979, + "rewards/rejected": -14.231485366821289, + "step": 2202 + }, + { + "epoch": 1.5202691047093324, + "grad_norm": 0.2971605062484741, + "learning_rate": 4.2219241088539676e-06, + "logits/chosen": 3.44627046585083, + "logits/rejected": 3.531334638595581, + "logps/chosen": -176.9608612060547, + "logps/rejected": -182.8019561767578, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.973570823669434, + "rewards/margins": 0.6238569021224976, + "rewards/rejected": -13.597427368164062, + "step": 2203 + }, + { + "epoch": 1.5209591167845438, + "grad_norm": 0.3083021938800812, + "learning_rate": 4.2238405519356075e-06, + "logits/chosen": 3.1616227626800537, + "logits/rejected": 3.3293752670288086, + "logps/chosen": -143.23072814941406, + "logps/rejected": -150.6831512451172, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.628554344177246, + "rewards/margins": 0.7694791555404663, + "rewards/rejected": -10.398033142089844, + "step": 2204 + }, + { + "epoch": 1.521649128859755, + "grad_norm": 9.573701858520508, + "learning_rate": 4.225756995017248e-06, + "logits/chosen": 3.1684203147888184, + "logits/rejected": 3.5004849433898926, + "logps/chosen": -143.536376953125, + "logps/rejected": -169.5185546875, + "loss": 0.4411, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.728666305541992, + "rewards/margins": 2.5798959732055664, + "rewards/rejected": -12.308561325073242, + "step": 2205 + }, + { + "epoch": 1.5223391409349665, + "grad_norm": 0.3273712694644928, + "learning_rate": 4.227673438098889e-06, + "logits/chosen": 3.4634339809417725, + "logits/rejected": 3.4634339809417725, + "logps/chosen": -170.10972595214844, + "logps/rejected": -170.10972595214844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.072211265563965, + "rewards/margins": 0.0, + "rewards/rejected": -12.072211265563965, + "step": 2206 + }, + { + "epoch": 1.5230291530101776, + "grad_norm": 0.8497674465179443, + "learning_rate": 4.229589881180529e-06, + "logits/chosen": 2.9798154830932617, + "logits/rejected": 2.9701502323150635, + "logps/chosen": -182.37225341796875, + "logps/rejected": -186.05990600585938, + "loss": 0.614, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.41860294342041, + "rewards/margins": 0.34810584783554077, + "rewards/rejected": -13.766708374023438, + "step": 2207 + }, + { + "epoch": 1.523719165085389, + "grad_norm": 0.37741750478744507, + "learning_rate": 4.23150632426217e-06, + "logits/chosen": 3.0951600074768066, + "logits/rejected": 3.0951600074768066, + "logps/chosen": -159.777587890625, + "logps/rejected": -159.777587890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.195290565490723, + "rewards/margins": 0.0, + "rewards/rejected": -11.195290565490723, + "step": 2208 + }, + { + "epoch": 1.5244091771606003, + "grad_norm": 0.3278605043888092, + "learning_rate": 4.23342276734381e-06, + "logits/chosen": 3.378911018371582, + "logits/rejected": 3.378911018371582, + "logps/chosen": -190.88986206054688, + "logps/rejected": -190.88986206054688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.325942993164062, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.325942993164062, + "step": 2209 + }, + { + "epoch": 1.5250991892358117, + "grad_norm": 0.3032088875770569, + "learning_rate": 4.235339210425451e-06, + "logits/chosen": 3.5187671184539795, + "logits/rejected": 3.5187671184539795, + "logps/chosen": -188.75128173828125, + "logps/rejected": -188.75128173828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.943056106567383, + "rewards/margins": 0.0, + "rewards/rejected": -13.943056106567383, + "step": 2210 + }, + { + "epoch": 1.525789201311023, + "grad_norm": 0.30896174907684326, + "learning_rate": 4.2372556535070916e-06, + "logits/chosen": 3.4429564476013184, + "logits/rejected": 3.5326547622680664, + "logps/chosen": -162.3414306640625, + "logps/rejected": -172.30523681640625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.430329322814941, + "rewards/margins": 0.990920901298523, + "rewards/rejected": -12.421249389648438, + "step": 2211 + }, + { + "epoch": 1.5264792133862342, + "grad_norm": 0.32002413272857666, + "learning_rate": 4.2391720965887315e-06, + "logits/chosen": 2.971309185028076, + "logits/rejected": 3.023329734802246, + "logps/chosen": -155.40818786621094, + "logps/rejected": -172.38766479492188, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.761359214782715, + "rewards/margins": 1.6306931972503662, + "rewards/rejected": -12.39205265045166, + "step": 2212 + }, + { + "epoch": 1.5271692254614457, + "grad_norm": 0.33912307024002075, + "learning_rate": 4.241088539670372e-06, + "logits/chosen": 3.427353858947754, + "logits/rejected": 3.4510021209716797, + "logps/chosen": -166.92953491210938, + "logps/rejected": -174.19229125976562, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.95705795288086, + "rewards/margins": 0.720862090587616, + "rewards/rejected": -12.677919387817383, + "step": 2213 + }, + { + "epoch": 1.5278592375366569, + "grad_norm": 0.33465173840522766, + "learning_rate": 4.243004982752012e-06, + "logits/chosen": 3.228618621826172, + "logits/rejected": 3.228618621826172, + "logps/chosen": -170.3184051513672, + "logps/rejected": -170.3184051513672, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.297985076904297, + "rewards/margins": 0.0, + "rewards/rejected": -12.297985076904297, + "step": 2214 + }, + { + "epoch": 1.5285492496118682, + "grad_norm": 0.41804051399230957, + "learning_rate": 4.244921425833653e-06, + "logits/chosen": 2.9981307983398438, + "logits/rejected": 3.1285581588745117, + "logps/chosen": -154.04994201660156, + "logps/rejected": -160.98500061035156, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.773319244384766, + "rewards/margins": 0.7568129301071167, + "rewards/rejected": -11.530132293701172, + "step": 2215 + }, + { + "epoch": 1.5292392616870796, + "grad_norm": 6.152493953704834, + "learning_rate": 4.246837868915293e-06, + "logits/chosen": 3.386420488357544, + "logits/rejected": 3.448380708694458, + "logps/chosen": -181.527099609375, + "logps/rejected": -183.02874755859375, + "loss": 0.654, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.258596420288086, + "rewards/margins": 0.09647870063781738, + "rewards/rejected": -13.35507583618164, + "step": 2216 + }, + { + "epoch": 1.5299292737622907, + "grad_norm": 0.33938518166542053, + "learning_rate": 4.248754311996934e-06, + "logits/chosen": 3.1191582679748535, + "logits/rejected": 3.1191582679748535, + "logps/chosen": -170.84918212890625, + "logps/rejected": -170.84918212890625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.130990982055664, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.130990982055664, + "step": 2217 + }, + { + "epoch": 1.5306192858375023, + "grad_norm": 0.39658480882644653, + "learning_rate": 4.250670755078575e-06, + "logits/chosen": 2.834865093231201, + "logits/rejected": 2.9457836151123047, + "logps/chosen": -148.82302856445312, + "logps/rejected": -171.091552734375, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.08738899230957, + "rewards/margins": 2.1578242778778076, + "rewards/rejected": -12.245213508605957, + "step": 2218 + }, + { + "epoch": 1.5313092979127134, + "grad_norm": 0.38339751958847046, + "learning_rate": 4.2525871981602155e-06, + "logits/chosen": 2.9736275672912598, + "logits/rejected": 2.9736275672912598, + "logps/chosen": -183.90933227539062, + "logps/rejected": -183.90933227539062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.74477767944336, + "rewards/margins": 0.0, + "rewards/rejected": -13.74477767944336, + "step": 2219 + }, + { + "epoch": 1.5319993099879248, + "grad_norm": 0.2626432180404663, + "learning_rate": 4.2545036412418555e-06, + "logits/chosen": 3.2386865615844727, + "logits/rejected": 3.2386865615844727, + "logps/chosen": -164.5136260986328, + "logps/rejected": -164.51364135742188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.645576477050781, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -11.645576477050781, + "step": 2220 + }, + { + "epoch": 1.5326893220631361, + "grad_norm": 0.24271589517593384, + "learning_rate": 4.256420084323496e-06, + "logits/chosen": 2.78885555267334, + "logits/rejected": 2.983588695526123, + "logps/chosen": -168.386962890625, + "logps/rejected": -190.39468383789062, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.115828514099121, + "rewards/margins": 2.1550087928771973, + "rewards/rejected": -14.270837783813477, + "step": 2221 + }, + { + "epoch": 1.5333793341383473, + "grad_norm": 4.31848669052124, + "learning_rate": 4.258336527405136e-06, + "logits/chosen": 2.762202739715576, + "logits/rejected": 2.912837028503418, + "logps/chosen": -161.98895263671875, + "logps/rejected": -171.3153076171875, + "loss": 0.5481, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.307954788208008, + "rewards/margins": 1.0013957023620605, + "rewards/rejected": -12.309350967407227, + "step": 2222 + }, + { + "epoch": 1.5340693462135588, + "grad_norm": 0.3613838851451874, + "learning_rate": 4.260252970486777e-06, + "logits/chosen": 3.109245538711548, + "logits/rejected": 3.109245538711548, + "logps/chosen": -180.71820068359375, + "logps/rejected": -180.71820068359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.162031173706055, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.162031173706055, + "step": 2223 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 20.004453659057617, + "learning_rate": 4.262169413568417e-06, + "logits/chosen": 3.25901460647583, + "logits/rejected": 3.420753002166748, + "logps/chosen": -170.23330688476562, + "logps/rejected": -177.51028442382812, + "loss": 1.0388, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.326141357421875, + "rewards/margins": 0.7625851631164551, + "rewards/rejected": -13.088726043701172, + "step": 2224 + }, + { + "epoch": 1.5354493703639813, + "grad_norm": 12.171160697937012, + "learning_rate": 4.264085856650058e-06, + "logits/chosen": 3.273725986480713, + "logits/rejected": 3.1344733238220215, + "logps/chosen": -184.1901397705078, + "logps/rejected": -179.81784057617188, + "loss": 1.0433, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.596758842468262, + "rewards/margins": -0.43289482593536377, + "rewards/rejected": -13.163864135742188, + "step": 2225 + }, + { + "epoch": 1.5361393824391927, + "grad_norm": 0.333462119102478, + "learning_rate": 4.266002299731699e-06, + "logits/chosen": 3.2838363647460938, + "logits/rejected": 3.2838363647460938, + "logps/chosen": -174.86532592773438, + "logps/rejected": -174.86534118652344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.76605224609375, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.76605224609375, + "step": 2226 + }, + { + "epoch": 1.536829394514404, + "grad_norm": 0.336820125579834, + "learning_rate": 4.267918742813339e-06, + "logits/chosen": 3.284019947052002, + "logits/rejected": 3.284019947052002, + "logps/chosen": -172.58697509765625, + "logps/rejected": -172.58697509765625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.440130233764648, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.440130233764648, + "step": 2227 + }, + { + "epoch": 1.5375194065896154, + "grad_norm": 0.320450097322464, + "learning_rate": 4.2698351858949795e-06, + "logits/chosen": 3.4911556243896484, + "logits/rejected": 3.4911556243896484, + "logps/chosen": -188.67987060546875, + "logps/rejected": -188.67987060546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.02668571472168, + "rewards/margins": 0.0, + "rewards/rejected": -14.02668571472168, + "step": 2228 + }, + { + "epoch": 1.5382094186648265, + "grad_norm": 0.28688716888427734, + "learning_rate": 4.2717516289766194e-06, + "logits/chosen": 2.9523940086364746, + "logits/rejected": 3.033158302307129, + "logps/chosen": -148.52752685546875, + "logps/rejected": -154.78602600097656, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.133819580078125, + "rewards/margins": 0.6910008788108826, + "rewards/rejected": -10.824819564819336, + "step": 2229 + }, + { + "epoch": 1.538899430740038, + "grad_norm": 0.3559672236442566, + "learning_rate": 4.27366807205826e-06, + "logits/chosen": 3.3993782997131348, + "logits/rejected": 3.3993782997131348, + "logps/chosen": -162.6578826904297, + "logps/rejected": -162.6578826904297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.771419525146484, + "rewards/margins": 0.0, + "rewards/rejected": -11.771419525146484, + "step": 2230 + }, + { + "epoch": 1.5395894428152492, + "grad_norm": 14.847982406616211, + "learning_rate": 4.2755845151399e-06, + "logits/chosen": 3.3377442359924316, + "logits/rejected": 3.3821802139282227, + "logps/chosen": -171.8288116455078, + "logps/rejected": -170.274658203125, + "loss": 1.1478, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.547945976257324, + "rewards/margins": -0.23949432373046875, + "rewards/rejected": -12.308450698852539, + "step": 2231 + }, + { + "epoch": 1.5402794548904606, + "grad_norm": 0.7488226294517517, + "learning_rate": 4.277500958221541e-06, + "logits/chosen": 3.02016282081604, + "logits/rejected": 3.1190757751464844, + "logps/chosen": -175.22137451171875, + "logps/rejected": -189.7440643310547, + "loss": 0.5265, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.574960708618164, + "rewards/margins": 1.4696574211120605, + "rewards/rejected": -14.044618606567383, + "step": 2232 + }, + { + "epoch": 1.540969466965672, + "grad_norm": 22.462491989135742, + "learning_rate": 4.279417401303181e-06, + "logits/chosen": 2.8290047645568848, + "logits/rejected": 2.778177499771118, + "logps/chosen": -175.34271240234375, + "logps/rejected": -168.48953247070312, + "loss": 1.3242, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.604410171508789, + "rewards/margins": -0.7172476649284363, + "rewards/rejected": -11.887162208557129, + "step": 2233 + }, + { + "epoch": 1.541659479040883, + "grad_norm": 0.32511258125305176, + "learning_rate": 4.281333844384823e-06, + "logits/chosen": 3.219268560409546, + "logits/rejected": 3.315030813217163, + "logps/chosen": -172.95123291015625, + "logps/rejected": -185.418701171875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.309090614318848, + "rewards/margins": 1.2337749004364014, + "rewards/rejected": -13.542864799499512, + "step": 2234 + }, + { + "epoch": 1.5423494911160947, + "grad_norm": 0.38064342737197876, + "learning_rate": 4.283250287466463e-06, + "logits/chosen": 2.989443778991699, + "logits/rejected": 3.038503885269165, + "logps/chosen": -173.38414001464844, + "logps/rejected": -179.60205078125, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.386651992797852, + "rewards/margins": 0.6408495306968689, + "rewards/rejected": -13.027502059936523, + "step": 2235 + }, + { + "epoch": 1.5430395031913058, + "grad_norm": 0.18322408199310303, + "learning_rate": 4.2851667305481035e-06, + "logits/chosen": 3.134343385696411, + "logits/rejected": 3.40576434135437, + "logps/chosen": -151.54310607910156, + "logps/rejected": -191.0620574951172, + "loss": 0.3471, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.345951080322266, + "rewards/margins": 3.968637466430664, + "rewards/rejected": -14.31458854675293, + "step": 2236 + }, + { + "epoch": 1.5437295152665171, + "grad_norm": 0.316455602645874, + "learning_rate": 4.2870831736297434e-06, + "logits/chosen": 3.459536552429199, + "logits/rejected": 3.459536552429199, + "logps/chosen": -177.87481689453125, + "logps/rejected": -177.87481689453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.061772346496582, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.061773300170898, + "step": 2237 + }, + { + "epoch": 1.5444195273417285, + "grad_norm": 16.786027908325195, + "learning_rate": 4.288999616711384e-06, + "logits/chosen": 3.3566882610321045, + "logits/rejected": 3.311115264892578, + "logps/chosen": -168.96310424804688, + "logps/rejected": -166.99871826171875, + "loss": 1.4584, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.052695274353027, + "rewards/margins": -0.22089511156082153, + "rewards/rejected": -11.831799507141113, + "step": 2238 + }, + { + "epoch": 1.5451095394169398, + "grad_norm": 0.3941822648048401, + "learning_rate": 4.290916059793024e-06, + "logits/chosen": 3.0279242992401123, + "logits/rejected": 3.0279242992401123, + "logps/chosen": -168.168212890625, + "logps/rejected": -168.168212890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.981767654418945, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.981767654418945, + "step": 2239 + }, + { + "epoch": 1.5457995514921512, + "grad_norm": 0.4116455316543579, + "learning_rate": 4.292832502874665e-06, + "logits/chosen": 3.185985803604126, + "logits/rejected": 3.2031891345977783, + "logps/chosen": -181.59877014160156, + "logps/rejected": -194.7814178466797, + "loss": 0.522, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.527002334594727, + "rewards/margins": 1.2708909511566162, + "rewards/rejected": -14.797893524169922, + "step": 2240 + }, + { + "epoch": 1.5464895635673623, + "grad_norm": 0.520110011100769, + "learning_rate": 4.294748945956305e-06, + "logits/chosen": 3.383448600769043, + "logits/rejected": 3.383448600769043, + "logps/chosen": -174.48593139648438, + "logps/rejected": -174.48593139648438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.636606216430664, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.636606216430664, + "step": 2241 + }, + { + "epoch": 1.5471795756425737, + "grad_norm": 0.2629760503768921, + "learning_rate": 4.296665389037946e-06, + "logits/chosen": 3.4267847537994385, + "logits/rejected": 3.5434563159942627, + "logps/chosen": -166.90432739257812, + "logps/rejected": -185.66513061523438, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.813694953918457, + "rewards/margins": 1.8664605617523193, + "rewards/rejected": -13.680155754089355, + "step": 2242 + }, + { + "epoch": 1.547869587717785, + "grad_norm": 0.3267155885696411, + "learning_rate": 4.298581832119587e-06, + "logits/chosen": 3.3806662559509277, + "logits/rejected": 3.3806662559509277, + "logps/chosen": -175.07525634765625, + "logps/rejected": -175.07525634765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.511237144470215, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -12.511237144470215, + "step": 2243 + }, + { + "epoch": 1.5485595997929964, + "grad_norm": 0.39079007506370544, + "learning_rate": 4.300498275201227e-06, + "logits/chosen": 3.3597514629364014, + "logits/rejected": 3.3988358974456787, + "logps/chosen": -154.4061279296875, + "logps/rejected": -163.99331665039062, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.813679695129395, + "rewards/margins": 0.9680119752883911, + "rewards/rejected": -11.781691551208496, + "step": 2244 + }, + { + "epoch": 1.5492496118682078, + "grad_norm": 11.524188041687012, + "learning_rate": 4.302414718282867e-06, + "logits/chosen": 3.4334118366241455, + "logits/rejected": 3.5060617923736572, + "logps/chosen": -163.1918487548828, + "logps/rejected": -175.12576293945312, + "loss": 0.6723, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.456727981567383, + "rewards/margins": 1.1735304594039917, + "rewards/rejected": -12.630258560180664, + "step": 2245 + }, + { + "epoch": 1.5499396239434189, + "grad_norm": 25.22956085205078, + "learning_rate": 4.304331161364507e-06, + "logits/chosen": 3.220020055770874, + "logits/rejected": 3.228647232055664, + "logps/chosen": -171.18524169921875, + "logps/rejected": -181.42025756835938, + "loss": 0.7324, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.317917823791504, + "rewards/margins": 1.1256824731826782, + "rewards/rejected": -13.443598747253418, + "step": 2246 + }, + { + "epoch": 1.5506296360186305, + "grad_norm": 0.30638301372528076, + "learning_rate": 4.306247604446148e-06, + "logits/chosen": 3.2099437713623047, + "logits/rejected": 3.262821674346924, + "logps/chosen": -155.29412841796875, + "logps/rejected": -164.31973266601562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.73159122467041, + "rewards/margins": 0.8962843418121338, + "rewards/rejected": -11.627875328063965, + "step": 2247 + }, + { + "epoch": 1.5513196480938416, + "grad_norm": 0.2849993109703064, + "learning_rate": 4.308164047527789e-06, + "logits/chosen": 3.238107442855835, + "logits/rejected": 3.2866547107696533, + "logps/chosen": -177.65151977539062, + "logps/rejected": -188.3468017578125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.996891021728516, + "rewards/margins": 1.094800353050232, + "rewards/rejected": -14.091691970825195, + "step": 2248 + }, + { + "epoch": 1.552009660169053, + "grad_norm": 0.9495073556900024, + "learning_rate": 4.310080490609429e-06, + "logits/chosen": 3.2094013690948486, + "logits/rejected": 3.424109935760498, + "logps/chosen": -177.8001708984375, + "logps/rejected": -192.06011962890625, + "loss": 0.5282, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.039073944091797, + "rewards/margins": 1.5078374147415161, + "rewards/rejected": -14.54691219329834, + "step": 2249 + }, + { + "epoch": 1.5526996722442643, + "grad_norm": 0.2796775698661804, + "learning_rate": 4.31199693369107e-06, + "logits/chosen": 3.2381820678710938, + "logits/rejected": 3.2381820678710938, + "logps/chosen": -178.832763671875, + "logps/rejected": -178.832763671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.027656555175781, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.027656555175781, + "step": 2250 + }, + { + "epoch": 1.5533896843194754, + "grad_norm": 3.622352361679077, + "learning_rate": 4.313913376772711e-06, + "logits/chosen": 3.253232717514038, + "logits/rejected": 3.226027011871338, + "logps/chosen": -156.8349609375, + "logps/rejected": -163.36883544921875, + "loss": 0.5526, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.927139282226562, + "rewards/margins": 0.7170883417129517, + "rewards/rejected": -11.644227027893066, + "step": 2251 + }, + { + "epoch": 1.554079696394687, + "grad_norm": 0.34987226128578186, + "learning_rate": 4.3158298198543506e-06, + "logits/chosen": 3.471400022506714, + "logits/rejected": 3.5181884765625, + "logps/chosen": -171.626953125, + "logps/rejected": -183.2806396484375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.443144798278809, + "rewards/margins": 1.195050597190857, + "rewards/rejected": -13.638195037841797, + "step": 2252 + }, + { + "epoch": 1.5547697084698981, + "grad_norm": 0.2530154287815094, + "learning_rate": 4.317746262935991e-06, + "logits/chosen": 3.021965980529785, + "logits/rejected": 3.1814398765563965, + "logps/chosen": -147.9285125732422, + "logps/rejected": -177.84901428222656, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.093113899230957, + "rewards/margins": 2.872389554977417, + "rewards/rejected": -12.965503692626953, + "step": 2253 + }, + { + "epoch": 1.5554597205451095, + "grad_norm": 1.3540771007537842, + "learning_rate": 4.319662706017631e-06, + "logits/chosen": 3.0174217224121094, + "logits/rejected": 2.9666409492492676, + "logps/chosen": -147.31991577148438, + "logps/rejected": -150.27215576171875, + "loss": 0.6191, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.145215034484863, + "rewards/margins": 0.2800144851207733, + "rewards/rejected": -10.425230026245117, + "step": 2254 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 0.28763920068740845, + "learning_rate": 4.321579149099272e-06, + "logits/chosen": 3.0899317264556885, + "logits/rejected": 3.185307264328003, + "logps/chosen": -172.05770874023438, + "logps/rejected": -179.58065795898438, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.321798324584961, + "rewards/margins": 0.7052167654037476, + "rewards/rejected": -13.027015686035156, + "step": 2255 + }, + { + "epoch": 1.5568397446955322, + "grad_norm": 0.34788718819618225, + "learning_rate": 4.323495592180912e-06, + "logits/chosen": 2.964540481567383, + "logits/rejected": 2.9502570629119873, + "logps/chosen": -166.2185821533203, + "logps/rejected": -195.233154296875, + "loss": 0.4354, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.655415534973145, + "rewards/margins": 3.016207695007324, + "rewards/rejected": -14.671623229980469, + "step": 2256 + }, + { + "epoch": 1.5575297567707436, + "grad_norm": 1.760378360748291, + "learning_rate": 4.325412035262553e-06, + "logits/chosen": 3.0284128189086914, + "logits/rejected": 3.013496160507202, + "logps/chosen": -154.32379150390625, + "logps/rejected": -160.76107788085938, + "loss": 0.5321, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.595515251159668, + "rewards/margins": 0.7598863840103149, + "rewards/rejected": -11.355401992797852, + "step": 2257 + }, + { + "epoch": 1.5582197688459547, + "grad_norm": 4.260509967803955, + "learning_rate": 4.327328478344194e-06, + "logits/chosen": 3.052255392074585, + "logits/rejected": 3.070661783218384, + "logps/chosen": -170.4882049560547, + "logps/rejected": -171.83157348632812, + "loss": 0.6451, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.234769821166992, + "rewards/margins": 0.1270148754119873, + "rewards/rejected": -12.361785888671875, + "step": 2258 + }, + { + "epoch": 1.5589097809211663, + "grad_norm": 0.5541474223136902, + "learning_rate": 4.329244921425835e-06, + "logits/chosen": 2.6785318851470947, + "logits/rejected": 2.6799728870391846, + "logps/chosen": -166.41744995117188, + "logps/rejected": -177.77938842773438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.81209945678711, + "rewards/margins": 1.1681036949157715, + "rewards/rejected": -12.980203628540039, + "step": 2259 + }, + { + "epoch": 1.5595997929963774, + "grad_norm": 0.40019315481185913, + "learning_rate": 4.3311613645074745e-06, + "logits/chosen": 2.9837493896484375, + "logits/rejected": 3.108144760131836, + "logps/chosen": -163.19378662109375, + "logps/rejected": -175.44137573242188, + "loss": 0.522, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.685510635375977, + "rewards/margins": 1.1942732334136963, + "rewards/rejected": -12.879782676696777, + "step": 2260 + }, + { + "epoch": 1.5602898050715888, + "grad_norm": 0.5748686194419861, + "learning_rate": 4.333077807589115e-06, + "logits/chosen": 3.1471571922302246, + "logits/rejected": 3.275451421737671, + "logps/chosen": -165.91436767578125, + "logps/rejected": -170.281982421875, + "loss": 0.6104, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.966367721557617, + "rewards/margins": 0.43075090646743774, + "rewards/rejected": -12.397117614746094, + "step": 2261 + }, + { + "epoch": 1.5609798171468001, + "grad_norm": 0.391056090593338, + "learning_rate": 4.334994250670755e-06, + "logits/chosen": 3.174220323562622, + "logits/rejected": 3.174220323562622, + "logps/chosen": -172.0069580078125, + "logps/rejected": -172.0069580078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.516468048095703, + "rewards/margins": 0.0, + "rewards/rejected": -12.516468048095703, + "step": 2262 + }, + { + "epoch": 1.5616698292220113, + "grad_norm": 22.32866668701172, + "learning_rate": 4.336910693752396e-06, + "logits/chosen": 2.661412477493286, + "logits/rejected": 2.9201104640960693, + "logps/chosen": -164.77626037597656, + "logps/rejected": -175.67431640625, + "loss": 0.7743, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.719624519348145, + "rewards/margins": 1.1470078229904175, + "rewards/rejected": -12.866631507873535, + "step": 2263 + }, + { + "epoch": 1.5623598412972228, + "grad_norm": 23.036766052246094, + "learning_rate": 4.338827136834036e-06, + "logits/chosen": 2.747002601623535, + "logits/rejected": 2.850409984588623, + "logps/chosen": -164.5213623046875, + "logps/rejected": -177.45220947265625, + "loss": 0.7423, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.686235427856445, + "rewards/margins": 1.331760287284851, + "rewards/rejected": -13.017995834350586, + "step": 2264 + }, + { + "epoch": 1.563049853372434, + "grad_norm": 0.3025452792644501, + "learning_rate": 4.340743579915677e-06, + "logits/chosen": 3.28541898727417, + "logits/rejected": 3.28541898727417, + "logps/chosen": -168.04269409179688, + "logps/rejected": -168.04269409179688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.914817810058594, + "rewards/margins": 0.0, + "rewards/rejected": -11.914817810058594, + "step": 2265 + }, + { + "epoch": 1.5637398654476453, + "grad_norm": 0.2752493619918823, + "learning_rate": 4.342660022997318e-06, + "logits/chosen": 3.159069538116455, + "logits/rejected": 3.159069538116455, + "logps/chosen": -178.68504333496094, + "logps/rejected": -178.68504333496094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.222002983093262, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.222002029418945, + "step": 2266 + }, + { + "epoch": 1.5644298775228567, + "grad_norm": 0.3038077652454376, + "learning_rate": 4.344576466078958e-06, + "logits/chosen": 2.987626791000366, + "logits/rejected": 2.987626791000366, + "logps/chosen": -176.76272583007812, + "logps/rejected": -176.76272583007812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.996706008911133, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.996706008911133, + "step": 2267 + }, + { + "epoch": 1.5651198895980678, + "grad_norm": 0.31889399886131287, + "learning_rate": 4.3464929091605985e-06, + "logits/chosen": 3.0478897094726562, + "logits/rejected": 3.1927919387817383, + "logps/chosen": -164.5247802734375, + "logps/rejected": -170.82815551757812, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.589433670043945, + "rewards/margins": 0.6891530752182007, + "rewards/rejected": -12.278587341308594, + "step": 2268 + }, + { + "epoch": 1.5658099016732794, + "grad_norm": 13.779218673706055, + "learning_rate": 4.3484093522422385e-06, + "logits/chosen": 3.3092682361602783, + "logits/rejected": 3.241706371307373, + "logps/chosen": -173.006103515625, + "logps/rejected": -172.01353454589844, + "loss": 0.7539, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.656951904296875, + "rewards/margins": -0.10149276256561279, + "rewards/rejected": -12.555459022521973, + "step": 2269 + }, + { + "epoch": 1.5664999137484905, + "grad_norm": 1.3590408563613892, + "learning_rate": 4.350325795323879e-06, + "logits/chosen": 3.0087814331054688, + "logits/rejected": 3.0885376930236816, + "logps/chosen": -176.123779296875, + "logps/rejected": -179.24905395507812, + "loss": 0.6155, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.622434616088867, + "rewards/margins": 0.3248516917228699, + "rewards/rejected": -12.947285652160645, + "step": 2270 + }, + { + "epoch": 1.5671899258237019, + "grad_norm": 0.3816226124763489, + "learning_rate": 4.352242238405519e-06, + "logits/chosen": 2.9529528617858887, + "logits/rejected": 2.9529528617858887, + "logps/chosen": -173.78726196289062, + "logps/rejected": -173.78726196289062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.404430389404297, + "rewards/margins": 0.0, + "rewards/rejected": -12.404430389404297, + "step": 2271 + }, + { + "epoch": 1.5678799378989132, + "grad_norm": 0.2664181888103485, + "learning_rate": 4.35415868148716e-06, + "logits/chosen": 3.3468809127807617, + "logits/rejected": 3.4066457748413086, + "logps/chosen": -164.29409790039062, + "logps/rejected": -184.11685180664062, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.689339637756348, + "rewards/margins": 2.0297632217407227, + "rewards/rejected": -13.71910285949707, + "step": 2272 + }, + { + "epoch": 1.5685699499741246, + "grad_norm": 0.32614588737487793, + "learning_rate": 4.3560751245688e-06, + "logits/chosen": 3.0534307956695557, + "logits/rejected": 3.0534307956695557, + "logps/chosen": -175.586181640625, + "logps/rejected": -175.586181640625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.599143981933594, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.599143981933594, + "step": 2273 + }, + { + "epoch": 1.569259962049336, + "grad_norm": 0.24093620479106903, + "learning_rate": 4.357991567650442e-06, + "logits/chosen": 2.9732398986816406, + "logits/rejected": 3.0228970050811768, + "logps/chosen": -138.42532348632812, + "logps/rejected": -176.84852600097656, + "loss": 0.3469, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.01082706451416, + "rewards/margins": 3.891843557357788, + "rewards/rejected": -12.902669906616211, + "step": 2274 + }, + { + "epoch": 1.569949974124547, + "grad_norm": 6.028412818908691, + "learning_rate": 4.359908010732082e-06, + "logits/chosen": 2.5421907901763916, + "logits/rejected": 2.6413955688476562, + "logps/chosen": -149.81472778320312, + "logps/rejected": -158.23251342773438, + "loss": 0.6189, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.238947868347168, + "rewards/margins": 0.8391990065574646, + "rewards/rejected": -11.078147888183594, + "step": 2275 + }, + { + "epoch": 1.5706399861997586, + "grad_norm": 0.3512963354587555, + "learning_rate": 4.3618244538137225e-06, + "logits/chosen": 3.346188545227051, + "logits/rejected": 3.346188545227051, + "logps/chosen": -178.42593383789062, + "logps/rejected": -178.42593383789062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.080558776855469, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.080558776855469, + "step": 2276 + }, + { + "epoch": 1.5713299982749698, + "grad_norm": 0.36063358187675476, + "learning_rate": 4.3637408968953625e-06, + "logits/chosen": 2.77970552444458, + "logits/rejected": 2.809640407562256, + "logps/chosen": -157.88365173339844, + "logps/rejected": -163.44834899902344, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.00851821899414, + "rewards/margins": 0.625286340713501, + "rewards/rejected": -11.633804321289062, + "step": 2277 + }, + { + "epoch": 1.5720200103501811, + "grad_norm": 0.3737340271472931, + "learning_rate": 4.365657339977003e-06, + "logits/chosen": 3.150839328765869, + "logits/rejected": 3.187088966369629, + "logps/chosen": -150.74771118164062, + "logps/rejected": -162.66204833984375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.568927764892578, + "rewards/margins": 0.9972097873687744, + "rewards/rejected": -11.566139221191406, + "step": 2278 + }, + { + "epoch": 1.5727100224253925, + "grad_norm": 0.30666452646255493, + "learning_rate": 4.367573783058643e-06, + "logits/chosen": 2.909878969192505, + "logits/rejected": 2.935065269470215, + "logps/chosen": -174.71575927734375, + "logps/rejected": -185.1091766357422, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.640113830566406, + "rewards/margins": 1.0607755184173584, + "rewards/rejected": -13.700889587402344, + "step": 2279 + }, + { + "epoch": 1.5734000345006036, + "grad_norm": 0.29788434505462646, + "learning_rate": 4.369490226140284e-06, + "logits/chosen": 3.0614964962005615, + "logits/rejected": 3.0951058864593506, + "logps/chosen": -173.8048858642578, + "logps/rejected": -183.18234252929688, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.800802230834961, + "rewards/margins": 0.9324216246604919, + "rewards/rejected": -13.733222961425781, + "step": 2280 + }, + { + "epoch": 1.5740900465758152, + "grad_norm": 0.8522149324417114, + "learning_rate": 4.371406669221924e-06, + "logits/chosen": 3.103430986404419, + "logits/rejected": 3.1690094470977783, + "logps/chosen": -171.1954345703125, + "logps/rejected": -186.23574829101562, + "loss": 0.5228, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.149572372436523, + "rewards/margins": 1.573307752609253, + "rewards/rejected": -13.722881317138672, + "step": 2281 + }, + { + "epoch": 1.5747800586510263, + "grad_norm": 0.3620103895664215, + "learning_rate": 4.373323112303565e-06, + "logits/chosen": 3.021080255508423, + "logits/rejected": 3.021080255508423, + "logps/chosen": -164.16737365722656, + "logps/rejected": -164.16737365722656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.634075164794922, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.634075164794922, + "step": 2282 + }, + { + "epoch": 1.5754700707262377, + "grad_norm": 0.3477906882762909, + "learning_rate": 4.375239555385206e-06, + "logits/chosen": 3.129695177078247, + "logits/rejected": 3.129695177078247, + "logps/chosen": -165.70474243164062, + "logps/rejected": -165.70474243164062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.83137321472168, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -11.831372261047363, + "step": 2283 + }, + { + "epoch": 1.576160082801449, + "grad_norm": 0.27042409777641296, + "learning_rate": 4.377155998466846e-06, + "logits/chosen": 3.032773017883301, + "logits/rejected": 3.055353879928589, + "logps/chosen": -168.186279296875, + "logps/rejected": -177.7640380859375, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.025032043457031, + "rewards/margins": 0.9631128311157227, + "rewards/rejected": -12.988143920898438, + "step": 2284 + }, + { + "epoch": 1.5768500948766604, + "grad_norm": 0.31561824679374695, + "learning_rate": 4.3790724415484865e-06, + "logits/chosen": 3.313727855682373, + "logits/rejected": 3.313727855682373, + "logps/chosen": -171.759765625, + "logps/rejected": -171.759765625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.430976867675781, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.430976867675781, + "step": 2285 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 0.2868815064430237, + "learning_rate": 4.380988884630126e-06, + "logits/chosen": 3.3513104915618896, + "logits/rejected": 3.3513104915618896, + "logps/chosen": -167.82740783691406, + "logps/rejected": -167.82740783691406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.959538459777832, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.959538459777832, + "step": 2286 + }, + { + "epoch": 1.5782301190270829, + "grad_norm": 0.23057563602924347, + "learning_rate": 4.382905327711767e-06, + "logits/chosen": 3.0514721870422363, + "logits/rejected": 3.013485908508301, + "logps/chosen": -147.04820251464844, + "logps/rejected": -174.27029418945312, + "loss": 0.4337, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.958433151245117, + "rewards/margins": 2.736581325531006, + "rewards/rejected": -12.695014953613281, + "step": 2287 + }, + { + "epoch": 1.5789201311022945, + "grad_norm": 0.298055499792099, + "learning_rate": 4.384821770793408e-06, + "logits/chosen": 3.323180913925171, + "logits/rejected": 3.338050603866577, + "logps/chosen": -160.72052001953125, + "logps/rejected": -169.58358764648438, + "loss": 0.6066, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.239048957824707, + "rewards/margins": 0.8908668756484985, + "rewards/rejected": -12.129916191101074, + "step": 2288 + }, + { + "epoch": 1.5796101431775056, + "grad_norm": 0.27697551250457764, + "learning_rate": 4.386738213875048e-06, + "logits/chosen": 3.2388112545013428, + "logits/rejected": 3.237096071243286, + "logps/chosen": -152.94674682617188, + "logps/rejected": -163.26394653320312, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.564483642578125, + "rewards/margins": 0.9788637757301331, + "rewards/rejected": -11.543347358703613, + "step": 2289 + }, + { + "epoch": 1.580300155252717, + "grad_norm": 0.3236508369445801, + "learning_rate": 4.388654656956689e-06, + "logits/chosen": 3.0913519859313965, + "logits/rejected": 3.305927276611328, + "logps/chosen": -166.9320068359375, + "logps/rejected": -173.76991271972656, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.878853797912598, + "rewards/margins": 0.70381098985672, + "rewards/rejected": -12.582664489746094, + "step": 2290 + }, + { + "epoch": 1.5809901673279283, + "grad_norm": 0.2607249915599823, + "learning_rate": 4.39057110003833e-06, + "logits/chosen": 3.0593788623809814, + "logits/rejected": 3.2282657623291016, + "logps/chosen": -157.46780395507812, + "logps/rejected": -182.96900939941406, + "loss": 0.434, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.966718673706055, + "rewards/margins": 2.6378090381622314, + "rewards/rejected": -13.60452651977539, + "step": 2291 + }, + { + "epoch": 1.5816801794031394, + "grad_norm": 13.108722686767578, + "learning_rate": 4.39248754311997e-06, + "logits/chosen": 3.033475875854492, + "logits/rejected": 3.2404985427856445, + "logps/chosen": -145.39971923828125, + "logps/rejected": -163.9183807373047, + "loss": 0.5635, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.547622680664062, + "rewards/margins": 1.8761590719223022, + "rewards/rejected": -11.423782348632812, + "step": 2292 + }, + { + "epoch": 1.582370191478351, + "grad_norm": 0.2973935306072235, + "learning_rate": 4.3944039862016104e-06, + "logits/chosen": 3.579496145248413, + "logits/rejected": 3.6785478591918945, + "logps/chosen": -160.1994171142578, + "logps/rejected": -173.49783325195312, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.21534252166748, + "rewards/margins": 1.2777307033538818, + "rewards/rejected": -12.493073463439941, + "step": 2293 + }, + { + "epoch": 1.5830602035535621, + "grad_norm": 1.776166558265686, + "learning_rate": 4.39632042928325e-06, + "logits/chosen": 3.419644355773926, + "logits/rejected": 3.538997173309326, + "logps/chosen": -177.5987548828125, + "logps/rejected": -181.12625122070312, + "loss": 0.6175, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.008991241455078, + "rewards/margins": 0.29814958572387695, + "rewards/rejected": -13.307140350341797, + "step": 2294 + }, + { + "epoch": 1.5837502156287735, + "grad_norm": 0.9972215890884399, + "learning_rate": 4.398236872364891e-06, + "logits/chosen": 3.251692295074463, + "logits/rejected": 3.4814271926879883, + "logps/chosen": -158.13108825683594, + "logps/rejected": -173.11688232421875, + "loss": 0.4408, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.813015937805176, + "rewards/margins": 1.5333046913146973, + "rewards/rejected": -12.346321105957031, + "step": 2295 + }, + { + "epoch": 1.5844402277039848, + "grad_norm": 5.998512268066406, + "learning_rate": 4.400153315446531e-06, + "logits/chosen": 3.1584482192993164, + "logits/rejected": 3.1655702590942383, + "logps/chosen": -168.53448486328125, + "logps/rejected": -169.0377197265625, + "loss": 0.6705, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.166797637939453, + "rewards/margins": 0.05026054382324219, + "rewards/rejected": -12.217059135437012, + "step": 2296 + }, + { + "epoch": 1.585130239779196, + "grad_norm": 0.24052466452121735, + "learning_rate": 4.402069758528172e-06, + "logits/chosen": 3.5308899879455566, + "logits/rejected": 3.5308899879455566, + "logps/chosen": -173.83518981933594, + "logps/rejected": -173.83518981933594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.639059066772461, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.639059066772461, + "step": 2297 + }, + { + "epoch": 1.5858202518544076, + "grad_norm": 0.38857874274253845, + "learning_rate": 4.403986201609812e-06, + "logits/chosen": 3.6038854122161865, + "logits/rejected": 3.516758441925049, + "logps/chosen": -180.89321899414062, + "logps/rejected": -194.46189880371094, + "loss": 0.5225, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.198969841003418, + "rewards/margins": 1.4213213920593262, + "rewards/rejected": -14.620291709899902, + "step": 2298 + }, + { + "epoch": 1.5865102639296187, + "grad_norm": 0.24659758806228638, + "learning_rate": 4.405902644691454e-06, + "logits/chosen": 3.287440538406372, + "logits/rejected": 3.287440538406372, + "logps/chosen": -175.81195068359375, + "logps/rejected": -175.81195068359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.669988632202148, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -12.669987678527832, + "step": 2299 + }, + { + "epoch": 1.58720027600483, + "grad_norm": 0.3308368921279907, + "learning_rate": 4.407819087773094e-06, + "logits/chosen": 3.7262511253356934, + "logits/rejected": 3.7262511253356934, + "logps/chosen": -163.25442504882812, + "logps/rejected": -163.25442504882812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.505069732666016, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.505069732666016, + "step": 2300 + }, + { + "epoch": 1.5878902880800414, + "grad_norm": 0.2849200367927551, + "learning_rate": 4.409735530854734e-06, + "logits/chosen": 3.7419917583465576, + "logits/rejected": 3.7419917583465576, + "logps/chosen": -172.66127014160156, + "logps/rejected": -172.66127014160156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.322723388671875, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.322723388671875, + "step": 2301 + }, + { + "epoch": 1.5885803001552528, + "grad_norm": 0.2953393757343292, + "learning_rate": 4.411651973936374e-06, + "logits/chosen": 3.636388063430786, + "logits/rejected": 3.636388063430786, + "logps/chosen": -182.86526489257812, + "logps/rejected": -182.86526489257812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.26951789855957, + "rewards/margins": 0.0, + "rewards/rejected": -13.26951789855957, + "step": 2302 + }, + { + "epoch": 1.589270312230464, + "grad_norm": 0.27390530705451965, + "learning_rate": 4.413568417018015e-06, + "logits/chosen": 3.3796308040618896, + "logits/rejected": 3.3796308040618896, + "logps/chosen": -170.29217529296875, + "logps/rejected": -170.29217529296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.318747520446777, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.318747520446777, + "step": 2303 + }, + { + "epoch": 1.5899603243056752, + "grad_norm": 0.2110610157251358, + "learning_rate": 4.415484860099655e-06, + "logits/chosen": 3.5766005516052246, + "logits/rejected": 3.9056077003479004, + "logps/chosen": -153.328369140625, + "logps/rejected": -185.3492431640625, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.576211929321289, + "rewards/margins": 3.273362636566162, + "rewards/rejected": -13.84957504272461, + "step": 2304 + }, + { + "epoch": 1.5906503363808868, + "grad_norm": 6.438847541809082, + "learning_rate": 4.417401303181296e-06, + "logits/chosen": 3.471897602081299, + "logits/rejected": 3.551079750061035, + "logps/chosen": -171.470947265625, + "logps/rejected": -182.83828735351562, + "loss": 0.5787, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.333866119384766, + "rewards/margins": 1.169533610343933, + "rewards/rejected": -13.503399848937988, + "step": 2305 + }, + { + "epoch": 1.591340348456098, + "grad_norm": 0.26652488112449646, + "learning_rate": 4.419317746262936e-06, + "logits/chosen": 3.746152400970459, + "logits/rejected": 3.746152400970459, + "logps/chosen": -178.92605590820312, + "logps/rejected": -178.92605590820312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.955161094665527, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.955161094665527, + "step": 2306 + }, + { + "epoch": 1.5920303605313093, + "grad_norm": 0.3070565462112427, + "learning_rate": 4.421234189344577e-06, + "logits/chosen": 3.732107639312744, + "logits/rejected": 3.732107639312744, + "logps/chosen": -167.22015380859375, + "logps/rejected": -167.22015380859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.906885147094727, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.90688419342041, + "step": 2307 + }, + { + "epoch": 1.5927203726065207, + "grad_norm": 0.30973318219184875, + "learning_rate": 4.423150632426218e-06, + "logits/chosen": 3.551255941390991, + "logits/rejected": 3.735416889190674, + "logps/chosen": -158.70899963378906, + "logps/rejected": -180.14410400390625, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.14205551147461, + "rewards/margins": 2.117034912109375, + "rewards/rejected": -13.259090423583984, + "step": 2308 + }, + { + "epoch": 1.5934103846817318, + "grad_norm": 23.394899368286133, + "learning_rate": 4.4250670755078575e-06, + "logits/chosen": 3.3700010776519775, + "logits/rejected": 3.4065210819244385, + "logps/chosen": -170.55455017089844, + "logps/rejected": -164.79151916503906, + "loss": 1.2227, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.259653091430664, + "rewards/margins": -0.6152454614639282, + "rewards/rejected": -11.644407272338867, + "step": 2309 + }, + { + "epoch": 1.5941003967569434, + "grad_norm": 30.963735580444336, + "learning_rate": 4.426983518589498e-06, + "logits/chosen": 3.6839964389801025, + "logits/rejected": 3.8605704307556152, + "logps/chosen": -157.33535766601562, + "logps/rejected": -170.19149780273438, + "loss": 1.088, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.134777069091797, + "rewards/margins": 1.221628189086914, + "rewards/rejected": -12.356405258178711, + "step": 2310 + }, + { + "epoch": 1.5947904088321545, + "grad_norm": 15.381692886352539, + "learning_rate": 4.428899961671138e-06, + "logits/chosen": 3.7981042861938477, + "logits/rejected": 3.8810677528381348, + "logps/chosen": -163.25527954101562, + "logps/rejected": -183.03700256347656, + "loss": 1.0524, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.40222454071045, + "rewards/margins": 1.8153454065322876, + "rewards/rejected": -13.217569351196289, + "step": 2311 + }, + { + "epoch": 1.5954804209073659, + "grad_norm": 0.27344512939453125, + "learning_rate": 4.430816404752779e-06, + "logits/chosen": 3.6502790451049805, + "logits/rejected": 3.7729053497314453, + "logps/chosen": -178.97296142578125, + "logps/rejected": -191.86917114257812, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.011813163757324, + "rewards/margins": 1.3263521194458008, + "rewards/rejected": -14.338165283203125, + "step": 2312 + }, + { + "epoch": 1.5961704329825772, + "grad_norm": 0.515438973903656, + "learning_rate": 4.432732847834419e-06, + "logits/chosen": 3.2724781036376953, + "logits/rejected": 3.4973578453063965, + "logps/chosen": -147.9488525390625, + "logps/rejected": -173.770263671875, + "loss": 0.4366, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.16466236114502, + "rewards/margins": 2.5972139835357666, + "rewards/rejected": -12.761876106262207, + "step": 2313 + }, + { + "epoch": 1.5968604450577883, + "grad_norm": 0.40997397899627686, + "learning_rate": 4.43464929091606e-06, + "logits/chosen": 3.4194507598876953, + "logits/rejected": 3.6103596687316895, + "logps/chosen": -165.06655883789062, + "logps/rejected": -185.15399169921875, + "loss": 0.521, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.612546920776367, + "rewards/margins": 2.0752148628234863, + "rewards/rejected": -13.687763214111328, + "step": 2314 + }, + { + "epoch": 1.597550457133, + "grad_norm": 1.155613899230957, + "learning_rate": 4.436565733997701e-06, + "logits/chosen": 3.5658020973205566, + "logits/rejected": 3.592055559158325, + "logps/chosen": -172.95156860351562, + "logps/rejected": -176.62832641601562, + "loss": 0.6119, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.551366806030273, + "rewards/margins": 0.38965606689453125, + "rewards/rejected": -12.941022872924805, + "step": 2315 + }, + { + "epoch": 1.598240469208211, + "grad_norm": 0.3002442717552185, + "learning_rate": 4.4384821770793416e-06, + "logits/chosen": 3.5861942768096924, + "logits/rejected": 3.6491217613220215, + "logps/chosen": -188.32948303222656, + "logps/rejected": -198.85235595703125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.069326400756836, + "rewards/margins": 1.065578579902649, + "rewards/rejected": -15.134904861450195, + "step": 2316 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 0.27126553654670715, + "learning_rate": 4.4403986201609815e-06, + "logits/chosen": 3.636119842529297, + "logits/rejected": 3.7173197269439697, + "logps/chosen": -182.61862182617188, + "logps/rejected": -194.5785369873047, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.382549285888672, + "rewards/margins": 1.1294089555740356, + "rewards/rejected": -14.511958122253418, + "step": 2317 + }, + { + "epoch": 1.5996204933586338, + "grad_norm": 0.37621524930000305, + "learning_rate": 4.442315063242622e-06, + "logits/chosen": 3.457460880279541, + "logits/rejected": 3.4719314575195312, + "logps/chosen": -174.63487243652344, + "logps/rejected": -181.6136474609375, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.463950157165527, + "rewards/margins": 0.7212616205215454, + "rewards/rejected": -13.185211181640625, + "step": 2318 + }, + { + "epoch": 1.6003105054338451, + "grad_norm": 0.3217298090457916, + "learning_rate": 4.444231506324262e-06, + "logits/chosen": 3.7657251358032227, + "logits/rejected": 3.7463841438293457, + "logps/chosen": -173.0677032470703, + "logps/rejected": -184.8893585205078, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.589280128479004, + "rewards/margins": 1.1946955919265747, + "rewards/rejected": -13.783976554870605, + "step": 2319 + }, + { + "epoch": 1.6010005175090565, + "grad_norm": 0.3205719292163849, + "learning_rate": 4.446147949405903e-06, + "logits/chosen": 3.580453872680664, + "logits/rejected": 3.677614688873291, + "logps/chosen": -182.94012451171875, + "logps/rejected": -195.31805419921875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.302312850952148, + "rewards/margins": 1.223176121711731, + "rewards/rejected": -14.52548885345459, + "step": 2320 + }, + { + "epoch": 1.6016905295842676, + "grad_norm": 1.060638189315796, + "learning_rate": 4.448064392487543e-06, + "logits/chosen": 3.2824206352233887, + "logits/rejected": 3.239478826522827, + "logps/chosen": -159.6519775390625, + "logps/rejected": -181.26296997070312, + "loss": 0.4397, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.095487594604492, + "rewards/margins": 2.2052388191223145, + "rewards/rejected": -13.300726890563965, + "step": 2321 + }, + { + "epoch": 1.6023805416594792, + "grad_norm": 2.9956347942352295, + "learning_rate": 4.449980835569184e-06, + "logits/chosen": 3.581726312637329, + "logits/rejected": 3.539095401763916, + "logps/chosen": -173.3096466064453, + "logps/rejected": -175.47816467285156, + "loss": 0.6294, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.512276649475098, + "rewards/margins": 0.20029926300048828, + "rewards/rejected": -12.712576866149902, + "step": 2322 + }, + { + "epoch": 1.6030705537346903, + "grad_norm": 17.226104736328125, + "learning_rate": 4.451897278650825e-06, + "logits/chosen": 3.7147603034973145, + "logits/rejected": 3.785553455352783, + "logps/chosen": -168.59762573242188, + "logps/rejected": -188.44332885742188, + "loss": 0.6381, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.068436622619629, + "rewards/margins": 1.9461334943771362, + "rewards/rejected": -14.014570236206055, + "step": 2323 + }, + { + "epoch": 1.6037605658099017, + "grad_norm": 22.61965560913086, + "learning_rate": 4.453813721732465e-06, + "logits/chosen": 3.7113966941833496, + "logits/rejected": 3.7227978706359863, + "logps/chosen": -193.49398803710938, + "logps/rejected": -189.0184326171875, + "loss": 1.0529, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.564170837402344, + "rewards/margins": -0.4428595304489136, + "rewards/rejected": -14.12131118774414, + "step": 2324 + }, + { + "epoch": 1.604450577885113, + "grad_norm": 0.2752845287322998, + "learning_rate": 4.4557301648141055e-06, + "logits/chosen": 3.3770861625671387, + "logits/rejected": 3.6154212951660156, + "logps/chosen": -166.66415405273438, + "logps/rejected": -193.8294677734375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.887395858764648, + "rewards/margins": 2.661454200744629, + "rewards/rejected": -14.548850059509277, + "step": 2325 + }, + { + "epoch": 1.6051405899603242, + "grad_norm": 0.2975578010082245, + "learning_rate": 4.4576466078957455e-06, + "logits/chosen": 3.3159618377685547, + "logits/rejected": 3.4529354572296143, + "logps/chosen": -188.77520751953125, + "logps/rejected": -195.61045837402344, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.998067855834961, + "rewards/margins": 0.701406717300415, + "rewards/rejected": -14.699474334716797, + "step": 2326 + }, + { + "epoch": 1.6058306020355357, + "grad_norm": 0.3894140422344208, + "learning_rate": 4.459563050977386e-06, + "logits/chosen": 3.6368441581726074, + "logits/rejected": 3.6368441581726074, + "logps/chosen": -172.3796844482422, + "logps/rejected": -172.3796844482422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.526612281799316, + "rewards/margins": -8.344650268554688e-07, + "rewards/rejected": -12.526611328125, + "step": 2327 + }, + { + "epoch": 1.6065206141107469, + "grad_norm": 0.2958383560180664, + "learning_rate": 4.461479494059027e-06, + "logits/chosen": 3.6102371215820312, + "logits/rejected": 3.6102371215820312, + "logps/chosen": -172.3846893310547, + "logps/rejected": -172.3846893310547, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.666976928710938, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -12.666976928710938, + "step": 2328 + }, + { + "epoch": 1.6072106261859582, + "grad_norm": 5.206296920776367, + "learning_rate": 4.463395937140667e-06, + "logits/chosen": 3.6549856662750244, + "logits/rejected": 3.867863416671753, + "logps/chosen": -174.4427947998047, + "logps/rejected": -186.54469299316406, + "loss": 0.5627, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.450323104858398, + "rewards/margins": 1.2351984977722168, + "rewards/rejected": -13.685521125793457, + "step": 2329 + }, + { + "epoch": 1.6079006382611696, + "grad_norm": 0.2577596604824066, + "learning_rate": 4.465312380222308e-06, + "logits/chosen": 3.958536148071289, + "logits/rejected": 4.03730583190918, + "logps/chosen": -172.9636688232422, + "logps/rejected": -184.5596160888672, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.592544555664062, + "rewards/margins": 1.103963017463684, + "rewards/rejected": -13.696508407592773, + "step": 2330 + }, + { + "epoch": 1.608590650336381, + "grad_norm": 0.27046704292297363, + "learning_rate": 4.467228823303949e-06, + "logits/chosen": 3.273305892944336, + "logits/rejected": 3.4882593154907227, + "logps/chosen": -174.07223510742188, + "logps/rejected": -201.69747924804688, + "loss": 0.4341, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.571126937866211, + "rewards/margins": 2.7760629653930664, + "rewards/rejected": -15.347189903259277, + "step": 2331 + }, + { + "epoch": 1.6092806624115923, + "grad_norm": 0.30240675806999207, + "learning_rate": 4.469145266385589e-06, + "logits/chosen": 3.327975034713745, + "logits/rejected": 3.327975034713745, + "logps/chosen": -168.29832458496094, + "logps/rejected": -168.29832458496094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.352290153503418, + "rewards/margins": 0.0, + "rewards/rejected": -12.352290153503418, + "step": 2332 + }, + { + "epoch": 1.6099706744868034, + "grad_norm": 0.27262213826179504, + "learning_rate": 4.4710617094672295e-06, + "logits/chosen": 3.3143811225891113, + "logits/rejected": 3.395599603652954, + "logps/chosen": -181.43609619140625, + "logps/rejected": -189.47874450683594, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.236926078796387, + "rewards/margins": 0.8319628238677979, + "rewards/rejected": -14.068889617919922, + "step": 2333 + }, + { + "epoch": 1.610660686562015, + "grad_norm": 0.3095638155937195, + "learning_rate": 4.4729781525488695e-06, + "logits/chosen": 3.400214195251465, + "logits/rejected": 3.400214195251465, + "logps/chosen": -186.97293090820312, + "logps/rejected": -186.97293090820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.976879119873047, + "rewards/margins": 0.0, + "rewards/rejected": -13.976879119873047, + "step": 2334 + }, + { + "epoch": 1.6113506986372261, + "grad_norm": 0.32708239555358887, + "learning_rate": 4.47489459563051e-06, + "logits/chosen": 3.488341808319092, + "logits/rejected": 3.488341808319092, + "logps/chosen": -191.17784118652344, + "logps/rejected": -191.17784118652344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.372251510620117, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.372251510620117, + "step": 2335 + }, + { + "epoch": 1.6120407107124375, + "grad_norm": 0.2748529314994812, + "learning_rate": 4.47681103871215e-06, + "logits/chosen": 3.5503499507904053, + "logits/rejected": 3.5503499507904053, + "logps/chosen": -173.2843017578125, + "logps/rejected": -173.28431701660156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.458475112915039, + "rewards/margins": 0.0, + "rewards/rejected": -12.458475112915039, + "step": 2336 + }, + { + "epoch": 1.6127307227876488, + "grad_norm": 0.7020273804664612, + "learning_rate": 4.478727481793791e-06, + "logits/chosen": 3.2996368408203125, + "logits/rejected": 3.386538505554199, + "logps/chosen": -167.70248413085938, + "logps/rejected": -195.05538940429688, + "loss": 0.4368, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.997576713562012, + "rewards/margins": 2.7688961029052734, + "rewards/rejected": -14.766472816467285, + "step": 2337 + }, + { + "epoch": 1.61342073486286, + "grad_norm": 5.945786476135254, + "learning_rate": 4.480643924875431e-06, + "logits/chosen": 3.1788406372070312, + "logits/rejected": 3.4419171810150146, + "logps/chosen": -153.002685546875, + "logps/rejected": -183.23501586914062, + "loss": 0.3589, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.776679039001465, + "rewards/margins": 2.896662712097168, + "rewards/rejected": -13.673341751098633, + "step": 2338 + }, + { + "epoch": 1.6141107469380716, + "grad_norm": 32.9318733215332, + "learning_rate": 4.482560367957073e-06, + "logits/chosen": 3.3347249031066895, + "logits/rejected": 3.305605411529541, + "logps/chosen": -179.173583984375, + "logps/rejected": -173.0611114501953, + "loss": 1.2186, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.168497085571289, + "rewards/margins": -0.611204981803894, + "rewards/rejected": -12.557292938232422, + "step": 2339 + }, + { + "epoch": 1.6148007590132827, + "grad_norm": 0.48029765486717224, + "learning_rate": 4.484476811038713e-06, + "logits/chosen": 3.623016357421875, + "logits/rejected": 3.6693577766418457, + "logps/chosen": -190.21371459960938, + "logps/rejected": -195.11647033691406, + "loss": 0.6098, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.223084449768066, + "rewards/margins": 0.4521886110305786, + "rewards/rejected": -14.675271987915039, + "step": 2340 + }, + { + "epoch": 1.615490771088494, + "grad_norm": 10.794401168823242, + "learning_rate": 4.4863932541203535e-06, + "logits/chosen": 3.2473134994506836, + "logits/rejected": 3.342846155166626, + "logps/chosen": -160.50442504882812, + "logps/rejected": -180.66542053222656, + "loss": 0.6388, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.359793663024902, + "rewards/margins": 1.8366185426712036, + "rewards/rejected": -13.196412086486816, + "step": 2341 + }, + { + "epoch": 1.6161807831637054, + "grad_norm": 3.02537202835083, + "learning_rate": 4.4883096972019934e-06, + "logits/chosen": 3.20156192779541, + "logits/rejected": 3.199413537979126, + "logps/chosen": -168.6204833984375, + "logps/rejected": -169.90362548828125, + "loss": 0.6399, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.115388870239258, + "rewards/margins": 0.1477569341659546, + "rewards/rejected": -12.263145446777344, + "step": 2342 + }, + { + "epoch": 1.6168707952389165, + "grad_norm": 0.3225836157798767, + "learning_rate": 4.490226140283634e-06, + "logits/chosen": 3.2285776138305664, + "logits/rejected": 3.4204838275909424, + "logps/chosen": -150.39859008789062, + "logps/rejected": -178.09024047851562, + "loss": 0.4352, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.47853946685791, + "rewards/margins": 2.7594778537750244, + "rewards/rejected": -13.238016128540039, + "step": 2343 + }, + { + "epoch": 1.617560807314128, + "grad_norm": 5.639990329742432, + "learning_rate": 4.492142583365274e-06, + "logits/chosen": 3.1570348739624023, + "logits/rejected": 3.196277141571045, + "logps/chosen": -170.74267578125, + "logps/rejected": -178.68170166015625, + "loss": 0.5709, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.271150588989258, + "rewards/margins": 0.7927219867706299, + "rewards/rejected": -13.063873291015625, + "step": 2344 + }, + { + "epoch": 1.6182508193893392, + "grad_norm": 0.8285984396934509, + "learning_rate": 4.494059026446915e-06, + "logits/chosen": 3.3608503341674805, + "logits/rejected": 3.328153133392334, + "logps/chosen": -174.53173828125, + "logps/rejected": -186.3551483154297, + "loss": 0.5236, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.811259269714355, + "rewards/margins": 1.2062100172042847, + "rewards/rejected": -14.01746940612793, + "step": 2345 + }, + { + "epoch": 1.6189408314645506, + "grad_norm": 0.32913026213645935, + "learning_rate": 4.495975469528555e-06, + "logits/chosen": 3.2678298950195312, + "logits/rejected": 3.2903151512145996, + "logps/chosen": -171.18411254882812, + "logps/rejected": -181.8672637939453, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.307731628417969, + "rewards/margins": 1.0798182487487793, + "rewards/rejected": -13.38754940032959, + "step": 2346 + }, + { + "epoch": 1.619630843539762, + "grad_norm": 0.44385069608688354, + "learning_rate": 4.497891912610196e-06, + "logits/chosen": 3.061587333679199, + "logits/rejected": 3.061587333679199, + "logps/chosen": -187.31951904296875, + "logps/rejected": -187.3195343017578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.203390121459961, + "rewards/margins": 0.0, + "rewards/rejected": -14.203390121459961, + "step": 2347 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 0.3577226996421814, + "learning_rate": 4.499808355691837e-06, + "logits/chosen": 3.2832398414611816, + "logits/rejected": 3.312793731689453, + "logps/chosen": -181.57223510742188, + "logps/rejected": -191.99072265625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.252835273742676, + "rewards/margins": 1.0438610315322876, + "rewards/rejected": -14.296696662902832, + "step": 2348 + }, + { + "epoch": 1.6210108676901847, + "grad_norm": 18.656051635742188, + "learning_rate": 4.501724798773477e-06, + "logits/chosen": 3.6709229946136475, + "logits/rejected": 3.5403056144714355, + "logps/chosen": -195.04525756835938, + "logps/rejected": -192.6876983642578, + "loss": 1.0588, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.749828338623047, + "rewards/margins": -0.25164902210235596, + "rewards/rejected": -14.498178482055664, + "step": 2349 + }, + { + "epoch": 1.6217008797653958, + "grad_norm": 0.28685224056243896, + "learning_rate": 4.503641241855117e-06, + "logits/chosen": 3.2930524349212646, + "logits/rejected": 3.386854410171509, + "logps/chosen": -183.60353088378906, + "logps/rejected": -191.55490112304688, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.707805633544922, + "rewards/margins": 0.7721871137619019, + "rewards/rejected": -14.47999382019043, + "step": 2350 + }, + { + "epoch": 1.6223908918406074, + "grad_norm": 0.2314804643392563, + "learning_rate": 4.505557684936757e-06, + "logits/chosen": 3.3781464099884033, + "logits/rejected": 3.5646743774414062, + "logps/chosen": -156.70431518554688, + "logps/rejected": -182.15443420410156, + "loss": 0.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.091630935668945, + "rewards/margins": 2.5006511211395264, + "rewards/rejected": -13.592281341552734, + "step": 2351 + }, + { + "epoch": 1.6230809039158185, + "grad_norm": 0.33862850069999695, + "learning_rate": 4.507474128018398e-06, + "logits/chosen": 3.575270414352417, + "logits/rejected": 3.575270414352417, + "logps/chosen": -177.47096252441406, + "logps/rejected": -177.47096252441406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.931073188781738, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.931074142456055, + "step": 2352 + }, + { + "epoch": 1.6237709159910299, + "grad_norm": 0.3759656250476837, + "learning_rate": 4.509390571100038e-06, + "logits/chosen": 3.4449312686920166, + "logits/rejected": 3.448413133621216, + "logps/chosen": -175.2711181640625, + "logps/rejected": -181.678466796875, + "loss": 0.6079, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.757083892822266, + "rewards/margins": 0.5630074739456177, + "rewards/rejected": -13.320091247558594, + "step": 2353 + }, + { + "epoch": 1.6244609280662412, + "grad_norm": 0.25428542494773865, + "learning_rate": 4.511307014181679e-06, + "logits/chosen": 2.8728489875793457, + "logits/rejected": 2.9949190616607666, + "logps/chosen": -164.69097900390625, + "logps/rejected": -188.79034423828125, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.64409351348877, + "rewards/margins": 2.398685932159424, + "rewards/rejected": -14.042779922485352, + "step": 2354 + }, + { + "epoch": 1.6251509401414523, + "grad_norm": 0.3159627318382263, + "learning_rate": 4.51322345726332e-06, + "logits/chosen": 3.3930680751800537, + "logits/rejected": 3.437668800354004, + "logps/chosen": -176.71572875976562, + "logps/rejected": -183.94876098632812, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.909852027893066, + "rewards/margins": 0.7705717086791992, + "rewards/rejected": -13.680424690246582, + "step": 2355 + }, + { + "epoch": 1.625840952216664, + "grad_norm": 12.031167984008789, + "learning_rate": 4.515139900344961e-06, + "logits/chosen": 3.384556770324707, + "logits/rejected": 3.346290111541748, + "logps/chosen": -178.67718505859375, + "logps/rejected": -175.69290161132812, + "loss": 0.8804, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.056500434875488, + "rewards/margins": -0.24831807613372803, + "rewards/rejected": -12.808182716369629, + "step": 2356 + }, + { + "epoch": 1.626530964291875, + "grad_norm": 0.35175466537475586, + "learning_rate": 4.5170563434266006e-06, + "logits/chosen": 3.1213386058807373, + "logits/rejected": 3.2025816440582275, + "logps/chosen": -169.9972686767578, + "logps/rejected": -176.1016387939453, + "loss": 0.6078, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.333480834960938, + "rewards/margins": 0.5715259313583374, + "rewards/rejected": -12.905007362365723, + "step": 2357 + }, + { + "epoch": 1.6272209763670864, + "grad_norm": 0.42077764868736267, + "learning_rate": 4.518972786508241e-06, + "logits/chosen": 2.9372153282165527, + "logits/rejected": 2.980247974395752, + "logps/chosen": -156.780029296875, + "logps/rejected": -179.0203857421875, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.849689483642578, + "rewards/margins": 2.2543511390686035, + "rewards/rejected": -13.104040145874023, + "step": 2358 + }, + { + "epoch": 1.6279109884422978, + "grad_norm": 0.3153223991394043, + "learning_rate": 4.520889229589881e-06, + "logits/chosen": 3.2602806091308594, + "logits/rejected": 3.3141136169433594, + "logps/chosen": -194.94781494140625, + "logps/rejected": -201.62060546875, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.686182975769043, + "rewards/margins": 0.639920711517334, + "rewards/rejected": -15.326103210449219, + "step": 2359 + }, + { + "epoch": 1.628601000517509, + "grad_norm": 0.3675234317779541, + "learning_rate": 4.522805672671522e-06, + "logits/chosen": 3.4373815059661865, + "logits/rejected": 3.698330879211426, + "logps/chosen": -163.35272216796875, + "logps/rejected": -185.43453979492188, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.60878849029541, + "rewards/margins": 2.2226881980895996, + "rewards/rejected": -13.831477165222168, + "step": 2360 + }, + { + "epoch": 1.6292910125927205, + "grad_norm": 1.6209481954574585, + "learning_rate": 4.524722115753162e-06, + "logits/chosen": 3.661072015762329, + "logits/rejected": 3.719532012939453, + "logps/chosen": -183.7778778076172, + "logps/rejected": -186.4136199951172, + "loss": 0.6208, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.810298919677734, + "rewards/margins": 0.26401591300964355, + "rewards/rejected": -14.07431411743164, + "step": 2361 + }, + { + "epoch": 1.6299810246679316, + "grad_norm": 0.8838686347007751, + "learning_rate": 4.526638558834803e-06, + "logits/chosen": 3.2606067657470703, + "logits/rejected": 3.25055193901062, + "logps/chosen": -179.0072021484375, + "logps/rejected": -182.71595764160156, + "loss": 0.6119, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.056522369384766, + "rewards/margins": 0.3893338441848755, + "rewards/rejected": -13.445856094360352, + "step": 2362 + }, + { + "epoch": 1.630671036743143, + "grad_norm": 0.34257403016090393, + "learning_rate": 4.528555001916444e-06, + "logits/chosen": 3.3340461254119873, + "logits/rejected": 3.3340461254119873, + "logps/chosen": -187.90582275390625, + "logps/rejected": -187.90582275390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.045310020446777, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -14.045310020446777, + "step": 2363 + }, + { + "epoch": 1.6313610488183543, + "grad_norm": 0.2424132525920868, + "learning_rate": 4.530471444998084e-06, + "logits/chosen": 3.5134260654449463, + "logits/rejected": 3.5322372913360596, + "logps/chosen": -187.68057250976562, + "logps/rejected": -193.5188751220703, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.929224014282227, + "rewards/margins": 0.6068010330200195, + "rewards/rejected": -14.536026000976562, + "step": 2364 + }, + { + "epoch": 1.6320510608935657, + "grad_norm": 0.2854008972644806, + "learning_rate": 4.5323878880797246e-06, + "logits/chosen": 3.4506282806396484, + "logits/rejected": 3.663205146789551, + "logps/chosen": -159.69996643066406, + "logps/rejected": -180.53836059570312, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.232086181640625, + "rewards/margins": 2.014172077178955, + "rewards/rejected": -13.246257781982422, + "step": 2365 + }, + { + "epoch": 1.632741072968777, + "grad_norm": 7.385499477386475, + "learning_rate": 4.5343043311613645e-06, + "logits/chosen": 2.9523963928222656, + "logits/rejected": 3.0399622917175293, + "logps/chosen": -173.40664672851562, + "logps/rejected": -180.07833862304688, + "loss": 0.6322, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.357660293579102, + "rewards/margins": 0.6815587878227234, + "rewards/rejected": -13.039217948913574, + "step": 2366 + }, + { + "epoch": 1.6334310850439882, + "grad_norm": 0.257523775100708, + "learning_rate": 4.536220774243005e-06, + "logits/chosen": 3.483527660369873, + "logits/rejected": 3.538328170776367, + "logps/chosen": -180.3206329345703, + "logps/rejected": -194.65158081054688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.144842147827148, + "rewards/margins": 1.3766381740570068, + "rewards/rejected": -14.521479606628418, + "step": 2367 + }, + { + "epoch": 1.6341210971191997, + "grad_norm": 0.29263123869895935, + "learning_rate": 4.538137217324646e-06, + "logits/chosen": 3.5132791996002197, + "logits/rejected": 3.4781904220581055, + "logps/chosen": -184.65863037109375, + "logps/rejected": -197.02272033691406, + "loss": 0.5214, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.711248397827148, + "rewards/margins": 1.2787476778030396, + "rewards/rejected": -14.989995956420898, + "step": 2368 + }, + { + "epoch": 1.6348111091944109, + "grad_norm": 0.329292356967926, + "learning_rate": 4.540053660406286e-06, + "logits/chosen": 3.268117904663086, + "logits/rejected": 3.268117904663086, + "logps/chosen": -179.34906005859375, + "logps/rejected": -179.3490753173828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.026611328125, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.026611328125, + "step": 2369 + }, + { + "epoch": 1.6355011212696222, + "grad_norm": 0.5862146615982056, + "learning_rate": 4.541970103487927e-06, + "logits/chosen": 3.3669686317443848, + "logits/rejected": 3.376455545425415, + "logps/chosen": -182.71080017089844, + "logps/rejected": -187.19850158691406, + "loss": 0.6091, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.588244438171387, + "rewards/margins": 0.48057377338409424, + "rewards/rejected": -14.068818092346191, + "step": 2370 + }, + { + "epoch": 1.6361911333448336, + "grad_norm": 0.36490970849990845, + "learning_rate": 4.543886546569568e-06, + "logits/chosen": 3.1365585327148438, + "logits/rejected": 3.124630928039551, + "logps/chosen": -169.96902465820312, + "logps/rejected": -190.4148406982422, + "loss": 0.5217, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.294179916381836, + "rewards/margins": 2.038325786590576, + "rewards/rejected": -14.33250617980957, + "step": 2371 + }, + { + "epoch": 1.6368811454200447, + "grad_norm": 0.26523691415786743, + "learning_rate": 4.545802989651208e-06, + "logits/chosen": 3.504302501678467, + "logits/rejected": 3.504302501678467, + "logps/chosen": -195.0523681640625, + "logps/rejected": -195.0523681640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.601736068725586, + "rewards/margins": 0.0, + "rewards/rejected": -14.601736068725586, + "step": 2372 + }, + { + "epoch": 1.6375711574952563, + "grad_norm": 0.26995736360549927, + "learning_rate": 4.5477194327328485e-06, + "logits/chosen": 3.477008819580078, + "logits/rejected": 3.477008819580078, + "logps/chosen": -165.22512817382812, + "logps/rejected": -165.22512817382812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.782352447509766, + "rewards/margins": 0.0, + "rewards/rejected": -11.782352447509766, + "step": 2373 + }, + { + "epoch": 1.6382611695704674, + "grad_norm": 0.3123026490211487, + "learning_rate": 4.5496358758144885e-06, + "logits/chosen": 3.322209358215332, + "logits/rejected": 3.343923330307007, + "logps/chosen": -173.72216796875, + "logps/rejected": -202.7161102294922, + "loss": 0.4345, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.57794189453125, + "rewards/margins": 2.961380958557129, + "rewards/rejected": -15.539321899414062, + "step": 2374 + }, + { + "epoch": 1.6389511816456788, + "grad_norm": 2.3679494857788086, + "learning_rate": 4.551552318896129e-06, + "logits/chosen": 2.985915184020996, + "logits/rejected": 3.217334032058716, + "logps/chosen": -153.0816650390625, + "logps/rejected": -188.66717529296875, + "loss": 0.3624, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.973612785339355, + "rewards/margins": 3.3384275436401367, + "rewards/rejected": -14.312040328979492, + "step": 2375 + }, + { + "epoch": 1.6396411937208901, + "grad_norm": 0.29809388518333435, + "learning_rate": 4.553468761977769e-06, + "logits/chosen": 3.387075185775757, + "logits/rejected": 3.387075185775757, + "logps/chosen": -181.82974243164062, + "logps/rejected": -181.82974243164062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.41087532043457, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.41087532043457, + "step": 2376 + }, + { + "epoch": 1.6403312057961015, + "grad_norm": 0.3483920097351074, + "learning_rate": 4.55538520505941e-06, + "logits/chosen": 3.5360279083251953, + "logits/rejected": 3.5360279083251953, + "logps/chosen": -195.4157257080078, + "logps/rejected": -195.4157257080078, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.733932495117188, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.733932495117188, + "step": 2377 + }, + { + "epoch": 1.6410212178713128, + "grad_norm": 0.3659403622150421, + "learning_rate": 4.55730164814105e-06, + "logits/chosen": 2.9180479049682617, + "logits/rejected": 2.9600398540496826, + "logps/chosen": -172.00790405273438, + "logps/rejected": -184.50827026367188, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.39944839477539, + "rewards/margins": 1.2696810960769653, + "rewards/rejected": -13.669129371643066, + "step": 2378 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 0.3532155454158783, + "learning_rate": 4.559218091222692e-06, + "logits/chosen": 3.0856354236602783, + "logits/rejected": 3.0856354236602783, + "logps/chosen": -189.19970703125, + "logps/rejected": -189.19970703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.236776351928711, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.236776351928711, + "step": 2379 + }, + { + "epoch": 1.6424012420217355, + "grad_norm": 0.2597045302391052, + "learning_rate": 4.561134534304332e-06, + "logits/chosen": 3.256416082382202, + "logits/rejected": 3.467360496520996, + "logps/chosen": -172.4664306640625, + "logps/rejected": -183.50750732421875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.385448455810547, + "rewards/margins": 1.1519789695739746, + "rewards/rejected": -13.53742790222168, + "step": 2380 + }, + { + "epoch": 1.6430912540969467, + "grad_norm": 30.713855743408203, + "learning_rate": 4.5630509773859725e-06, + "logits/chosen": 3.1540417671203613, + "logits/rejected": 3.1854054927825928, + "logps/chosen": -186.4920196533203, + "logps/rejected": -184.0877685546875, + "loss": 0.9308, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.795523643493652, + "rewards/margins": -0.3145565986633301, + "rewards/rejected": -13.48096752166748, + "step": 2381 + }, + { + "epoch": 1.643781266172158, + "grad_norm": 2.300520658493042, + "learning_rate": 4.5649674204676125e-06, + "logits/chosen": 3.224828004837036, + "logits/rejected": 3.190303325653076, + "logps/chosen": -179.33456420898438, + "logps/rejected": -180.532958984375, + "loss": 0.6326, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.126153945922852, + "rewards/margins": 0.18256473541259766, + "rewards/rejected": -13.308717727661133, + "step": 2382 + }, + { + "epoch": 1.6444712782473694, + "grad_norm": 1.9186164140701294, + "learning_rate": 4.566883863549253e-06, + "logits/chosen": 3.220966339111328, + "logits/rejected": 3.296550750732422, + "logps/chosen": -188.5120391845703, + "logps/rejected": -191.35414123535156, + "loss": 0.6191, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.028055191040039, + "rewards/margins": 0.28082966804504395, + "rewards/rejected": -14.308884620666504, + "step": 2383 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 0.3223550617694855, + "learning_rate": 4.568800306630893e-06, + "logits/chosen": 3.2606420516967773, + "logits/rejected": 3.2606420516967773, + "logps/chosen": -176.930419921875, + "logps/rejected": -176.930419921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.916267395019531, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.916267395019531, + "step": 2384 + }, + { + "epoch": 1.645851302397792, + "grad_norm": 0.2906126379966736, + "learning_rate": 4.570716749712534e-06, + "logits/chosen": 3.2894363403320312, + "logits/rejected": 3.4851911067962646, + "logps/chosen": -172.6111602783203, + "logps/rejected": -181.88671875, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.488531112670898, + "rewards/margins": 0.9174251556396484, + "rewards/rejected": -13.405956268310547, + "step": 2385 + }, + { + "epoch": 1.6465413144730032, + "grad_norm": 0.2609223425388336, + "learning_rate": 4.572633192794174e-06, + "logits/chosen": 3.1881656646728516, + "logits/rejected": 3.1881656646728516, + "logps/chosen": -177.30419921875, + "logps/rejected": -177.30421447753906, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.805319786071777, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -12.805320739746094, + "step": 2386 + }, + { + "epoch": 1.6472313265482146, + "grad_norm": 0.30919861793518066, + "learning_rate": 4.574549635875815e-06, + "logits/chosen": 3.2677111625671387, + "logits/rejected": 3.647122859954834, + "logps/chosen": -167.10694885253906, + "logps/rejected": -183.36241149902344, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.922332763671875, + "rewards/margins": 1.6492195129394531, + "rewards/rejected": -13.571552276611328, + "step": 2387 + }, + { + "epoch": 1.647921338623426, + "grad_norm": 0.2871488034725189, + "learning_rate": 4.576466078957456e-06, + "logits/chosen": 3.4101052284240723, + "logits/rejected": 3.4101052284240723, + "logps/chosen": -172.22251892089844, + "logps/rejected": -172.22251892089844, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.382831573486328, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.382833480834961, + "step": 2388 + }, + { + "epoch": 1.648611350698637, + "grad_norm": 0.2967953681945801, + "learning_rate": 4.578382522039096e-06, + "logits/chosen": 2.9949235916137695, + "logits/rejected": 3.0390219688415527, + "logps/chosen": -141.76473999023438, + "logps/rejected": -165.67868041992188, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.329526901245117, + "rewards/margins": 2.353929042816162, + "rewards/rejected": -11.683457374572754, + "step": 2389 + }, + { + "epoch": 1.6493013627738486, + "grad_norm": 0.2561856806278229, + "learning_rate": 4.5802989651207365e-06, + "logits/chosen": 3.3506293296813965, + "logits/rejected": 3.358147144317627, + "logps/chosen": -160.72605895996094, + "logps/rejected": -167.8965606689453, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.359313011169434, + "rewards/margins": 0.7356197834014893, + "rewards/rejected": -12.094934463500977, + "step": 2390 + }, + { + "epoch": 1.6499913748490598, + "grad_norm": 0.24624128639698029, + "learning_rate": 4.5822154082023764e-06, + "logits/chosen": 3.347370147705078, + "logits/rejected": 3.387478828430176, + "logps/chosen": -157.06947326660156, + "logps/rejected": -170.3729705810547, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.063761711120605, + "rewards/margins": 1.2808754444122314, + "rewards/rejected": -12.344636917114258, + "step": 2391 + }, + { + "epoch": 1.6506813869242711, + "grad_norm": 18.967138290405273, + "learning_rate": 4.584131851284017e-06, + "logits/chosen": 3.254556894302368, + "logits/rejected": 3.2084274291992188, + "logps/chosen": -173.64892578125, + "logps/rejected": -166.83143615722656, + "loss": 1.309, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.840580940246582, + "rewards/margins": -0.702027440071106, + "rewards/rejected": -12.138554573059082, + "step": 2392 + }, + { + "epoch": 1.6513713989994825, + "grad_norm": 0.2544289827346802, + "learning_rate": 4.586048294365657e-06, + "logits/chosen": 3.2679293155670166, + "logits/rejected": 3.2679293155670166, + "logps/chosen": -183.7111053466797, + "logps/rejected": -183.7111053466797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.535675048828125, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.535675048828125, + "step": 2393 + }, + { + "epoch": 1.6520614110746938, + "grad_norm": 10.236274719238281, + "learning_rate": 4.587964737447298e-06, + "logits/chosen": 3.0653254985809326, + "logits/rejected": 3.109706163406372, + "logps/chosen": -155.5262451171875, + "logps/rejected": -150.5304412841797, + "loss": 1.0573, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.874217987060547, + "rewards/margins": -0.4473746418952942, + "rewards/rejected": -10.426843643188477, + "step": 2394 + }, + { + "epoch": 1.6527514231499052, + "grad_norm": 0.32825517654418945, + "learning_rate": 4.589881180528939e-06, + "logits/chosen": 3.074436902999878, + "logits/rejected": 3.1437430381774902, + "logps/chosen": -165.42251586914062, + "logps/rejected": -174.99559020996094, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.595854759216309, + "rewards/margins": 0.9762894511222839, + "rewards/rejected": -12.572144508361816, + "step": 2395 + }, + { + "epoch": 1.6534414352251163, + "grad_norm": 4.306396007537842, + "learning_rate": 4.59179762361058e-06, + "logits/chosen": 3.234755277633667, + "logits/rejected": 3.265166759490967, + "logps/chosen": -171.39541625976562, + "logps/rejected": -184.09303283691406, + "loss": 0.559, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.312630653381348, + "rewards/margins": 1.3497493267059326, + "rewards/rejected": -13.66238021850586, + "step": 2396 + }, + { + "epoch": 1.654131447300328, + "grad_norm": 0.28771311044692993, + "learning_rate": 4.59371406669222e-06, + "logits/chosen": 3.1893954277038574, + "logits/rejected": 3.394181966781616, + "logps/chosen": -165.7399139404297, + "logps/rejected": -176.64462280273438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.61617374420166, + "rewards/margins": 1.1168569326400757, + "rewards/rejected": -12.733031272888184, + "step": 2397 + }, + { + "epoch": 1.654821459375539, + "grad_norm": 0.26988768577575684, + "learning_rate": 4.5956305097738604e-06, + "logits/chosen": 3.1952390670776367, + "logits/rejected": 3.1952390670776367, + "logps/chosen": -171.79458618164062, + "logps/rejected": -171.79458618164062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.442866325378418, + "rewards/margins": 0.0, + "rewards/rejected": -12.442866325378418, + "step": 2398 + }, + { + "epoch": 1.6555114714507504, + "grad_norm": 11.760420799255371, + "learning_rate": 4.5975469528555e-06, + "logits/chosen": 3.495399236679077, + "logits/rejected": 3.501312494277954, + "logps/chosen": -173.38424682617188, + "logps/rejected": -180.39971923828125, + "loss": 0.8979, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.637349128723145, + "rewards/margins": 0.7199851274490356, + "rewards/rejected": -13.35733413696289, + "step": 2399 + }, + { + "epoch": 1.6562014835259617, + "grad_norm": 0.3041383624076843, + "learning_rate": 4.599463395937141e-06, + "logits/chosen": 3.0062575340270996, + "logits/rejected": 3.0062575340270996, + "logps/chosen": -172.2608642578125, + "logps/rejected": -172.2608642578125, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.518880844116211, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.518882751464844, + "step": 2400 + }, + { + "epoch": 1.6568914956011729, + "grad_norm": 0.4898880422115326, + "learning_rate": 4.601379839018781e-06, + "logits/chosen": 3.1498498916625977, + "logits/rejected": 3.3109700679779053, + "logps/chosen": -158.7936553955078, + "logps/rejected": -171.0146942138672, + "loss": 0.5223, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.34383773803711, + "rewards/margins": 1.2291617393493652, + "rewards/rejected": -12.572999000549316, + "step": 2401 + }, + { + "epoch": 1.6575815076763845, + "grad_norm": 0.30562371015548706, + "learning_rate": 4.603296282100422e-06, + "logits/chosen": 3.3747358322143555, + "logits/rejected": 3.3747358322143555, + "logps/chosen": -170.56375122070312, + "logps/rejected": -170.56375122070312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.320699691772461, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.320699691772461, + "step": 2402 + }, + { + "epoch": 1.6582715197515956, + "grad_norm": 21.347763061523438, + "learning_rate": 4.605212725182062e-06, + "logits/chosen": 2.889159679412842, + "logits/rejected": 2.9528274536132812, + "logps/chosen": -153.1903076171875, + "logps/rejected": -153.70867919921875, + "loss": 1.2948, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.675622940063477, + "rewards/margins": 0.02323627471923828, + "rewards/rejected": -10.698859214782715, + "step": 2403 + }, + { + "epoch": 1.658961531826807, + "grad_norm": 0.21886037290096283, + "learning_rate": 4.607129168263703e-06, + "logits/chosen": 3.3092634677886963, + "logits/rejected": 3.3752360343933105, + "logps/chosen": -155.15805053710938, + "logps/rejected": -166.03909301757812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.762435913085938, + "rewards/margins": 1.0222359895706177, + "rewards/rejected": -11.784671783447266, + "step": 2404 + }, + { + "epoch": 1.6596515439020183, + "grad_norm": 0.3184148967266083, + "learning_rate": 4.609045611345344e-06, + "logits/chosen": 2.695242166519165, + "logits/rejected": 2.695242166519165, + "logps/chosen": -186.60995483398438, + "logps/rejected": -186.60997009277344, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.885334014892578, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -13.885334014892578, + "step": 2405 + }, + { + "epoch": 1.6603415559772297, + "grad_norm": 0.3008895218372345, + "learning_rate": 4.6109620544269836e-06, + "logits/chosen": 3.3351259231567383, + "logits/rejected": 3.3351259231567383, + "logps/chosen": -188.12501525878906, + "logps/rejected": -188.12501525878906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.98017406463623, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.98017406463623, + "step": 2406 + }, + { + "epoch": 1.661031568052441, + "grad_norm": 18.712051391601562, + "learning_rate": 4.612878497508624e-06, + "logits/chosen": 3.1781833171844482, + "logits/rejected": 3.107959508895874, + "logps/chosen": -189.8718719482422, + "logps/rejected": -185.23699951171875, + "loss": 1.0519, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.276123046875, + "rewards/margins": -0.4417504072189331, + "rewards/rejected": -13.834373474121094, + "step": 2407 + }, + { + "epoch": 1.6617215801276521, + "grad_norm": 0.6564086079597473, + "learning_rate": 4.614794940590265e-06, + "logits/chosen": 3.0585474967956543, + "logits/rejected": 3.2911009788513184, + "logps/chosen": -155.67652893066406, + "logps/rejected": -182.49850463867188, + "loss": 0.4375, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.840398788452148, + "rewards/margins": 2.6738474369049072, + "rewards/rejected": -13.514245986938477, + "step": 2408 + }, + { + "epoch": 1.6624115922028635, + "grad_norm": 0.2709024250507355, + "learning_rate": 4.616711383671905e-06, + "logits/chosen": 3.1390573978424072, + "logits/rejected": 3.2410669326782227, + "logps/chosen": -177.46102905273438, + "logps/rejected": -189.6409149169922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.020465850830078, + "rewards/margins": 1.1553078889846802, + "rewards/rejected": -14.175774574279785, + "step": 2409 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 0.2788844108581543, + "learning_rate": 4.618627826753546e-06, + "logits/chosen": 3.160966396331787, + "logits/rejected": 3.2818806171417236, + "logps/chosen": -155.77976989746094, + "logps/rejected": -167.8250274658203, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.691317558288574, + "rewards/margins": 1.245446801185608, + "rewards/rejected": -11.93676471710205, + "step": 2410 + }, + { + "epoch": 1.6637916163532862, + "grad_norm": 0.2574860453605652, + "learning_rate": 4.620544269835186e-06, + "logits/chosen": 2.970693588256836, + "logits/rejected": 3.035170078277588, + "logps/chosen": -164.02529907226562, + "logps/rejected": -186.8133544921875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.633035659790039, + "rewards/margins": 2.256133556365967, + "rewards/rejected": -13.889169692993164, + "step": 2411 + }, + { + "epoch": 1.6644816284284976, + "grad_norm": 0.2779202163219452, + "learning_rate": 4.622460712916827e-06, + "logits/chosen": 3.30953311920166, + "logits/rejected": 3.2779860496520996, + "logps/chosen": -158.4410858154297, + "logps/rejected": -170.26028442382812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.091829299926758, + "rewards/margins": 1.1486923694610596, + "rewards/rejected": -12.240522384643555, + "step": 2412 + }, + { + "epoch": 1.6651716405037087, + "grad_norm": 0.25822484493255615, + "learning_rate": 4.624377155998468e-06, + "logits/chosen": 3.3791656494140625, + "logits/rejected": 3.3791656494140625, + "logps/chosen": -187.06857299804688, + "logps/rejected": -187.06854248046875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.872583389282227, + "rewards/margins": 0.0, + "rewards/rejected": -13.87258243560791, + "step": 2413 + }, + { + "epoch": 1.6658616525789203, + "grad_norm": 0.2976377308368683, + "learning_rate": 4.6262935990801075e-06, + "logits/chosen": 2.690429210662842, + "logits/rejected": 2.690429210662842, + "logps/chosen": -174.13912963867188, + "logps/rejected": -174.13912963867188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.637956619262695, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.637956619262695, + "step": 2414 + }, + { + "epoch": 1.6665516646541314, + "grad_norm": 0.5541545748710632, + "learning_rate": 4.628210042161748e-06, + "logits/chosen": 3.033717155456543, + "logits/rejected": 3.088526487350464, + "logps/chosen": -192.68359375, + "logps/rejected": -197.3588409423828, + "loss": 0.6096, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.297059059143066, + "rewards/margins": 0.46112918853759766, + "rewards/rejected": -14.758188247680664, + "step": 2415 + }, + { + "epoch": 1.6672416767293428, + "grad_norm": 0.30849021673202515, + "learning_rate": 4.630126485243388e-06, + "logits/chosen": 3.3025918006896973, + "logits/rejected": 3.3025918006896973, + "logps/chosen": -170.374755859375, + "logps/rejected": -170.374755859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.154848098754883, + "rewards/margins": 0.0, + "rewards/rejected": -12.154848098754883, + "step": 2416 + }, + { + "epoch": 1.6679316888045541, + "grad_norm": 0.28101247549057007, + "learning_rate": 4.632042928325029e-06, + "logits/chosen": 3.2451364994049072, + "logits/rejected": 3.2451364994049072, + "logps/chosen": -179.69703674316406, + "logps/rejected": -179.69703674316406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.244888305664062, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.244888305664062, + "step": 2417 + }, + { + "epoch": 1.6686217008797652, + "grad_norm": 1.010712742805481, + "learning_rate": 4.633959371406669e-06, + "logits/chosen": 3.0552525520324707, + "logits/rejected": 3.1425867080688477, + "logps/chosen": -162.89932250976562, + "logps/rejected": -172.55128479003906, + "loss": 0.526, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.631072998046875, + "rewards/margins": 0.9313453435897827, + "rewards/rejected": -12.562417984008789, + "step": 2418 + }, + { + "epoch": 1.6693117129549768, + "grad_norm": 0.33295467495918274, + "learning_rate": 4.63587581448831e-06, + "logits/chosen": 3.3137166500091553, + "logits/rejected": 3.3137166500091553, + "logps/chosen": -160.0235595703125, + "logps/rejected": -160.0235595703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.0845365524292, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -11.0845365524292, + "step": 2419 + }, + { + "epoch": 1.670001725030188, + "grad_norm": 0.37523841857910156, + "learning_rate": 4.637792257569951e-06, + "logits/chosen": 3.151212692260742, + "logits/rejected": 3.3034403324127197, + "logps/chosen": -182.28631591796875, + "logps/rejected": -189.14739990234375, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.515811920166016, + "rewards/margins": 0.6641631126403809, + "rewards/rejected": -14.179975509643555, + "step": 2420 + }, + { + "epoch": 1.6706917371053993, + "grad_norm": 0.21859610080718994, + "learning_rate": 4.6397087006515916e-06, + "logits/chosen": 2.6970808506011963, + "logits/rejected": 3.082855701446533, + "logps/chosen": -128.60171508789062, + "logps/rejected": -173.90853881835938, + "loss": 0.3467, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.10966682434082, + "rewards/margins": 4.4483747482299805, + "rewards/rejected": -12.558042526245117, + "step": 2421 + }, + { + "epoch": 1.6713817491806107, + "grad_norm": 0.2788715660572052, + "learning_rate": 4.6416251437332315e-06, + "logits/chosen": 3.4329426288604736, + "logits/rejected": 3.45291805267334, + "logps/chosen": -165.06414794921875, + "logps/rejected": -175.4991455078125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.676695823669434, + "rewards/margins": 1.1042945384979248, + "rewards/rejected": -12.780990600585938, + "step": 2422 + }, + { + "epoch": 1.672071761255822, + "grad_norm": 0.34288784861564636, + "learning_rate": 4.643541586814872e-06, + "logits/chosen": 3.1552906036376953, + "logits/rejected": 3.1552906036376953, + "logps/chosen": -162.9656982421875, + "logps/rejected": -162.9656982421875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.428398132324219, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.428398132324219, + "step": 2423 + }, + { + "epoch": 1.6727617733310334, + "grad_norm": 0.2858389914035797, + "learning_rate": 4.645458029896512e-06, + "logits/chosen": 3.290245532989502, + "logits/rejected": 3.5329883098602295, + "logps/chosen": -154.60975646972656, + "logps/rejected": -168.25540161132812, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.858673095703125, + "rewards/margins": 1.3613245487213135, + "rewards/rejected": -12.219998359680176, + "step": 2424 + }, + { + "epoch": 1.6734517854062445, + "grad_norm": 0.3617730736732483, + "learning_rate": 4.647374472978153e-06, + "logits/chosen": 2.8982086181640625, + "logits/rejected": 2.8982086181640625, + "logps/chosen": -169.12002563476562, + "logps/rejected": -169.12002563476562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.298349380493164, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.298349380493164, + "step": 2425 + }, + { + "epoch": 1.674141797481456, + "grad_norm": 0.905072033405304, + "learning_rate": 4.649290916059793e-06, + "logits/chosen": 2.93550443649292, + "logits/rejected": 2.9223215579986572, + "logps/chosen": -168.73870849609375, + "logps/rejected": -171.023681640625, + "loss": 0.6207, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.029481887817383, + "rewards/margins": 0.2645772695541382, + "rewards/rejected": -12.294059753417969, + "step": 2426 + }, + { + "epoch": 1.6748318095566672, + "grad_norm": 0.3368476331233978, + "learning_rate": 4.651207359141434e-06, + "logits/chosen": 3.282675266265869, + "logits/rejected": 3.282675266265869, + "logps/chosen": -173.3116455078125, + "logps/rejected": -173.3116455078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.389565467834473, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -12.38956356048584, + "step": 2427 + }, + { + "epoch": 1.6755218216318786, + "grad_norm": 8.332663536071777, + "learning_rate": 4.653123802223075e-06, + "logits/chosen": 3.030872344970703, + "logits/rejected": 3.1827855110168457, + "logps/chosen": -136.56234741210938, + "logps/rejected": -142.40350341796875, + "loss": 0.5807, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.966545104980469, + "rewards/margins": 0.6341326236724854, + "rewards/rejected": -9.600677490234375, + "step": 2428 + }, + { + "epoch": 1.67621183370709, + "grad_norm": 0.3397826850414276, + "learning_rate": 4.655040245304715e-06, + "logits/chosen": 2.9261462688446045, + "logits/rejected": 3.005873680114746, + "logps/chosen": -152.49058532714844, + "logps/rejected": -165.300048828125, + "loss": 0.5234, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.56471061706543, + "rewards/margins": 1.2404543161392212, + "rewards/rejected": -11.805164337158203, + "step": 2429 + }, + { + "epoch": 1.676901845782301, + "grad_norm": 0.3103277385234833, + "learning_rate": 4.6569566883863555e-06, + "logits/chosen": 3.389517068862915, + "logits/rejected": 3.389517068862915, + "logps/chosen": -177.15489196777344, + "logps/rejected": -177.15489196777344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.882469177246094, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.882467269897461, + "step": 2430 + }, + { + "epoch": 1.6775918578575126, + "grad_norm": 0.24471944570541382, + "learning_rate": 4.6588731314679955e-06, + "logits/chosen": 3.524237632751465, + "logits/rejected": 3.502854108810425, + "logps/chosen": -165.42068481445312, + "logps/rejected": -181.8551025390625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.795676231384277, + "rewards/margins": 1.5745749473571777, + "rewards/rejected": -13.370250701904297, + "step": 2431 + }, + { + "epoch": 1.6782818699327238, + "grad_norm": 0.28009238839149475, + "learning_rate": 4.660789574549636e-06, + "logits/chosen": 2.9621500968933105, + "logits/rejected": 3.1331095695495605, + "logps/chosen": -148.88235473632812, + "logps/rejected": -156.95364379882812, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.235517501831055, + "rewards/margins": 0.8116045594215393, + "rewards/rejected": -11.047122955322266, + "step": 2432 + }, + { + "epoch": 1.6789718820079351, + "grad_norm": 0.29148149490356445, + "learning_rate": 4.662706017631276e-06, + "logits/chosen": 3.102436065673828, + "logits/rejected": 3.238213539123535, + "logps/chosen": -167.966064453125, + "logps/rejected": -175.57293701171875, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.982667922973633, + "rewards/margins": 0.7749612331390381, + "rewards/rejected": -12.75762939453125, + "step": 2433 + }, + { + "epoch": 1.6796618940831465, + "grad_norm": 0.30032244324684143, + "learning_rate": 4.664622460712917e-06, + "logits/chosen": 3.3386082649230957, + "logits/rejected": 3.461811065673828, + "logps/chosen": -172.0433807373047, + "logps/rejected": -184.28309631347656, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.394021034240723, + "rewards/margins": 1.150357961654663, + "rewards/rejected": -13.544379234313965, + "step": 2434 + }, + { + "epoch": 1.6803519061583576, + "grad_norm": 0.38689181208610535, + "learning_rate": 4.666538903794557e-06, + "logits/chosen": 2.968626022338867, + "logits/rejected": 2.90434193611145, + "logps/chosen": -174.50747680664062, + "logps/rejected": -179.97610473632812, + "loss": 0.6092, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.571192741394043, + "rewards/margins": 0.47646665573120117, + "rewards/rejected": -13.047658920288086, + "step": 2435 + }, + { + "epoch": 1.6810419182335692, + "grad_norm": 0.3393733501434326, + "learning_rate": 4.668455346876199e-06, + "logits/chosen": 3.0869290828704834, + "logits/rejected": 3.0869290828704834, + "logps/chosen": -172.26095581054688, + "logps/rejected": -172.26095581054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.257661819458008, + "rewards/margins": 0.0, + "rewards/rejected": -12.257661819458008, + "step": 2436 + }, + { + "epoch": 1.6817319303087803, + "grad_norm": 0.41197606921195984, + "learning_rate": 4.670371789957839e-06, + "logits/chosen": 3.102308750152588, + "logits/rejected": 3.159670352935791, + "logps/chosen": -159.86863708496094, + "logps/rejected": -176.31143188476562, + "loss": 0.5221, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.145042419433594, + "rewards/margins": 1.621659517288208, + "rewards/rejected": -12.766700744628906, + "step": 2437 + }, + { + "epoch": 1.6824219423839917, + "grad_norm": 0.2919153571128845, + "learning_rate": 4.6722882330394795e-06, + "logits/chosen": 3.0865283012390137, + "logits/rejected": 3.0865283012390137, + "logps/chosen": -168.3263397216797, + "logps/rejected": -168.3263397216797, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.087458610534668, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -12.087458610534668, + "step": 2438 + }, + { + "epoch": 1.683111954459203, + "grad_norm": 14.60275650024414, + "learning_rate": 4.6742046761211195e-06, + "logits/chosen": 2.9638712406158447, + "logits/rejected": 3.061711072921753, + "logps/chosen": -184.06185913085938, + "logps/rejected": -180.35684204101562, + "loss": 1.3946, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.72042179107666, + "rewards/margins": -0.40659207105636597, + "rewards/rejected": -13.31382942199707, + "step": 2439 + }, + { + "epoch": 1.6838019665344144, + "grad_norm": 0.3536617159843445, + "learning_rate": 4.67612111920276e-06, + "logits/chosen": 2.8397879600524902, + "logits/rejected": 2.8397879600524902, + "logps/chosen": -173.2107696533203, + "logps/rejected": -173.2107696533203, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.565591812133789, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.565591812133789, + "step": 2440 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 0.31995922327041626, + "learning_rate": 4.6780375622844e-06, + "logits/chosen": 3.198533058166504, + "logits/rejected": 3.370521068572998, + "logps/chosen": -159.6470947265625, + "logps/rejected": -167.306640625, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.351802825927734, + "rewards/margins": 0.763408362865448, + "rewards/rejected": -12.115209579467773, + "step": 2441 + }, + { + "epoch": 1.6851819906848369, + "grad_norm": 0.3661266267299652, + "learning_rate": 4.679954005366041e-06, + "logits/chosen": 3.0935370922088623, + "logits/rejected": 3.0935370922088623, + "logps/chosen": -182.12466430664062, + "logps/rejected": -182.12466430664062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.368650436401367, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.368650436401367, + "step": 2442 + }, + { + "epoch": 1.6858720027600484, + "grad_norm": 0.2741176187992096, + "learning_rate": 4.681870448447681e-06, + "logits/chosen": 3.152418851852417, + "logits/rejected": 3.3134539127349854, + "logps/chosen": -155.81936645507812, + "logps/rejected": -163.72259521484375, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.9036283493042, + "rewards/margins": 0.7513613104820251, + "rewards/rejected": -11.654989242553711, + "step": 2443 + }, + { + "epoch": 1.6865620148352596, + "grad_norm": 0.3220284581184387, + "learning_rate": 4.683786891529322e-06, + "logits/chosen": 3.0607595443725586, + "logits/rejected": 3.206190586090088, + "logps/chosen": -157.3566131591797, + "logps/rejected": -164.65106201171875, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.116788864135742, + "rewards/margins": 0.7349548935890198, + "rewards/rejected": -11.851743698120117, + "step": 2444 + }, + { + "epoch": 1.687252026910471, + "grad_norm": 0.2763231098651886, + "learning_rate": 4.685703334610963e-06, + "logits/chosen": 3.11920166015625, + "logits/rejected": 3.275860548019409, + "logps/chosen": -156.36328125, + "logps/rejected": -164.29959106445312, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.723321914672852, + "rewards/margins": 0.791727602481842, + "rewards/rejected": -11.515049934387207, + "step": 2445 + }, + { + "epoch": 1.6879420389856823, + "grad_norm": 0.33057448267936707, + "learning_rate": 4.687619777692603e-06, + "logits/chosen": 3.0595171451568604, + "logits/rejected": 3.150925397872925, + "logps/chosen": -169.9598388671875, + "logps/rejected": -178.92515563964844, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.19857120513916, + "rewards/margins": 0.9167281985282898, + "rewards/rejected": -13.115300178527832, + "step": 2446 + }, + { + "epoch": 1.6886320510608934, + "grad_norm": 15.560511589050293, + "learning_rate": 4.6895362207742434e-06, + "logits/chosen": 3.3778767585754395, + "logits/rejected": 3.4220573902130127, + "logps/chosen": -170.33993530273438, + "logps/rejected": -168.3707733154297, + "loss": 0.8304, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.206235885620117, + "rewards/margins": -0.20103812217712402, + "rewards/rejected": -12.00519847869873, + "step": 2447 + }, + { + "epoch": 1.689322063136105, + "grad_norm": 0.2945723235607147, + "learning_rate": 4.691452663855884e-06, + "logits/chosen": 3.3417391777038574, + "logits/rejected": 3.3417391777038574, + "logps/chosen": -174.701171875, + "logps/rejected": -174.701171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.494208335876465, + "rewards/margins": 0.0, + "rewards/rejected": -12.494208335876465, + "step": 2448 + }, + { + "epoch": 1.6900120752113161, + "grad_norm": 0.2927975058555603, + "learning_rate": 4.693369106937524e-06, + "logits/chosen": 3.1963062286376953, + "logits/rejected": 3.1963062286376953, + "logps/chosen": -176.70379638671875, + "logps/rejected": -176.7037811279297, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.87859058380127, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -12.87859058380127, + "step": 2449 + }, + { + "epoch": 1.6907020872865275, + "grad_norm": 0.3568231463432312, + "learning_rate": 4.695285550019165e-06, + "logits/chosen": 2.9888343811035156, + "logits/rejected": 2.9888343811035156, + "logps/chosen": -167.46783447265625, + "logps/rejected": -167.46783447265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.07444953918457, + "rewards/margins": 0.0, + "rewards/rejected": -12.07444953918457, + "step": 2450 + }, + { + "epoch": 1.6913920993617388, + "grad_norm": 0.29311588406562805, + "learning_rate": 4.697201993100805e-06, + "logits/chosen": 2.9752776622772217, + "logits/rejected": 3.007875919342041, + "logps/chosen": -149.7742156982422, + "logps/rejected": -159.56527709960938, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.176366806030273, + "rewards/margins": 0.9600156545639038, + "rewards/rejected": -11.136382102966309, + "step": 2451 + }, + { + "epoch": 1.6920821114369502, + "grad_norm": 19.161134719848633, + "learning_rate": 4.699118436182446e-06, + "logits/chosen": 2.748316526412964, + "logits/rejected": 2.715367555618286, + "logps/chosen": -189.235107421875, + "logps/rejected": -186.76837158203125, + "loss": 0.7844, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.157224655151367, + "rewards/margins": -0.14341974258422852, + "rewards/rejected": -14.013805389404297, + "step": 2452 + }, + { + "epoch": 1.6927721235121616, + "grad_norm": 0.2762950360774994, + "learning_rate": 4.701034879264087e-06, + "logits/chosen": 2.553483009338379, + "logits/rejected": 2.625589370727539, + "logps/chosen": -159.84536743164062, + "logps/rejected": -179.9337615966797, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.019611358642578, + "rewards/margins": 2.02498197555542, + "rewards/rejected": -13.044593811035156, + "step": 2453 + }, + { + "epoch": 1.6934621355873727, + "grad_norm": 0.26130616664886475, + "learning_rate": 4.702951322345727e-06, + "logits/chosen": 2.820814609527588, + "logits/rejected": 2.724883556365967, + "logps/chosen": -154.82102966308594, + "logps/rejected": -183.14138793945312, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.590734481811523, + "rewards/margins": 2.999521017074585, + "rewards/rejected": -13.590254783630371, + "step": 2454 + }, + { + "epoch": 1.694152147662584, + "grad_norm": 19.936670303344727, + "learning_rate": 4.704867765427367e-06, + "logits/chosen": 2.950310468673706, + "logits/rejected": 2.9396584033966064, + "logps/chosen": -162.35194396972656, + "logps/rejected": -157.6425323486328, + "loss": 1.6359, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.698375701904297, + "rewards/margins": -0.4302772283554077, + "rewards/rejected": -11.268097877502441, + "step": 2455 + }, + { + "epoch": 1.6948421597377954, + "grad_norm": 0.2728313207626343, + "learning_rate": 4.706784208509007e-06, + "logits/chosen": 2.913499355316162, + "logits/rejected": 2.912059783935547, + "logps/chosen": -145.85040283203125, + "logps/rejected": -167.49581909179688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.826272010803223, + "rewards/margins": 2.153409242630005, + "rewards/rejected": -11.979681015014648, + "step": 2456 + }, + { + "epoch": 1.6955321718130067, + "grad_norm": 0.32285767793655396, + "learning_rate": 4.708700651590648e-06, + "logits/chosen": 2.988798141479492, + "logits/rejected": 2.988798141479492, + "logps/chosen": -179.7933807373047, + "logps/rejected": -179.7933807373047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.038872718811035, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.038872718811035, + "step": 2457 + }, + { + "epoch": 1.696222183888218, + "grad_norm": 0.48539283871650696, + "learning_rate": 4.710617094672288e-06, + "logits/chosen": 2.912813663482666, + "logits/rejected": 2.9847733974456787, + "logps/chosen": -151.53997802734375, + "logps/rejected": -174.94000244140625, + "loss": 0.4381, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.44998550415039, + "rewards/margins": 2.323211193084717, + "rewards/rejected": -12.773197174072266, + "step": 2458 + }, + { + "epoch": 1.6969121959634292, + "grad_norm": 0.32475247979164124, + "learning_rate": 4.712533537753929e-06, + "logits/chosen": 2.939847469329834, + "logits/rejected": 2.939847469329834, + "logps/chosen": -171.3551025390625, + "logps/rejected": -171.3551025390625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.45322036743164, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.453219413757324, + "step": 2459 + }, + { + "epoch": 1.6976022080386408, + "grad_norm": 0.31692132353782654, + "learning_rate": 4.71444998083557e-06, + "logits/chosen": 2.929886817932129, + "logits/rejected": 2.887943744659424, + "logps/chosen": -160.26959228515625, + "logps/rejected": -175.17930603027344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.282124519348145, + "rewards/margins": 1.4278247356414795, + "rewards/rejected": -12.709949493408203, + "step": 2460 + }, + { + "epoch": 1.698292220113852, + "grad_norm": 0.3361772298812866, + "learning_rate": 4.716366423917211e-06, + "logits/chosen": 3.0700182914733887, + "logits/rejected": 3.0700182914733887, + "logps/chosen": -173.24624633789062, + "logps/rejected": -173.24624633789062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.517884254455566, + "rewards/margins": 0.0, + "rewards/rejected": -12.517884254455566, + "step": 2461 + }, + { + "epoch": 1.6989822321890633, + "grad_norm": 9.153141975402832, + "learning_rate": 4.7182828669988506e-06, + "logits/chosen": 2.489513397216797, + "logits/rejected": 2.514881134033203, + "logps/chosen": -162.15496826171875, + "logps/rejected": -160.57479858398438, + "loss": 0.7694, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.448274612426758, + "rewards/margins": -0.12326419353485107, + "rewards/rejected": -11.325010299682617, + "step": 2462 + }, + { + "epoch": 1.6996722442642747, + "grad_norm": 0.7842038869857788, + "learning_rate": 4.720199310080491e-06, + "logits/chosen": 2.9360148906707764, + "logits/rejected": 3.232560634613037, + "logps/chosen": -176.06930541992188, + "logps/rejected": -185.78945922851562, + "loss": 0.5253, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.633002281188965, + "rewards/margins": 1.031815767288208, + "rewards/rejected": -13.664817810058594, + "step": 2463 + }, + { + "epoch": 1.7003622563394858, + "grad_norm": 0.4052512049674988, + "learning_rate": 4.722115753162131e-06, + "logits/chosen": 2.8700244426727295, + "logits/rejected": 2.888707160949707, + "logps/chosen": -175.98277282714844, + "logps/rejected": -193.09591674804688, + "loss": 0.5211, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.874042510986328, + "rewards/margins": 1.7532410621643066, + "rewards/rejected": -14.627283096313477, + "step": 2464 + }, + { + "epoch": 1.7010522684146974, + "grad_norm": 0.2952350974082947, + "learning_rate": 4.724032196243772e-06, + "logits/chosen": 3.055828809738159, + "logits/rejected": 3.055828809738159, + "logps/chosen": -164.53262329101562, + "logps/rejected": -164.5326385498047, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.481254577636719, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.481254577636719, + "step": 2465 + }, + { + "epoch": 1.7017422804899085, + "grad_norm": 0.957371711730957, + "learning_rate": 4.725948639325412e-06, + "logits/chosen": 3.1409988403320312, + "logits/rejected": 3.213902473449707, + "logps/chosen": -167.35137939453125, + "logps/rejected": -183.26756286621094, + "loss": 0.5265, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.98259162902832, + "rewards/margins": 1.5704829692840576, + "rewards/rejected": -13.553075790405273, + "step": 2466 + }, + { + "epoch": 1.7024322925651199, + "grad_norm": 0.3320868909358978, + "learning_rate": 4.727865082407053e-06, + "logits/chosen": 2.9026405811309814, + "logits/rejected": 2.9026405811309814, + "logps/chosen": -158.15438842773438, + "logps/rejected": -158.15438842773438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.867927551269531, + "rewards/margins": 0.0, + "rewards/rejected": -10.867927551269531, + "step": 2467 + }, + { + "epoch": 1.7031223046403312, + "grad_norm": 4.046403408050537, + "learning_rate": 4.729781525488694e-06, + "logits/chosen": 2.64841365814209, + "logits/rejected": 2.936039924621582, + "logps/chosen": -145.86843872070312, + "logps/rejected": -163.0233154296875, + "loss": 0.4835, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.950754165649414, + "rewards/margins": 1.7289668321609497, + "rewards/rejected": -11.679720878601074, + "step": 2468 + }, + { + "epoch": 1.7038123167155426, + "grad_norm": 0.315886527299881, + "learning_rate": 4.731697968570334e-06, + "logits/chosen": 2.664245128631592, + "logits/rejected": 2.717978000640869, + "logps/chosen": -151.5982208251953, + "logps/rejected": -165.6397247314453, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.373342514038086, + "rewards/margins": 1.4558249711990356, + "rewards/rejected": -11.829167366027832, + "step": 2469 + }, + { + "epoch": 1.704502328790754, + "grad_norm": 0.3157491683959961, + "learning_rate": 4.7336144116519746e-06, + "logits/chosen": 2.5598320960998535, + "logits/rejected": 2.605651378631592, + "logps/chosen": -148.22227478027344, + "logps/rejected": -166.2677001953125, + "loss": 0.521, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.156171798706055, + "rewards/margins": 1.7492952346801758, + "rewards/rejected": -11.90546703338623, + "step": 2470 + }, + { + "epoch": 1.705192340865965, + "grad_norm": 0.2906196117401123, + "learning_rate": 4.7355308547336145e-06, + "logits/chosen": 2.4330992698669434, + "logits/rejected": 2.5161654949188232, + "logps/chosen": -167.45040893554688, + "logps/rejected": -175.482421875, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.01200008392334, + "rewards/margins": 0.7908148765563965, + "rewards/rejected": -12.802815437316895, + "step": 2471 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.2905770242214203, + "learning_rate": 4.737447297815255e-06, + "logits/chosen": 2.838223695755005, + "logits/rejected": 3.012925148010254, + "logps/chosen": -156.6262969970703, + "logps/rejected": -168.14056396484375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.934103965759277, + "rewards/margins": 1.1408690214157104, + "rewards/rejected": -12.074973106384277, + "step": 2472 + }, + { + "epoch": 1.7065723650163878, + "grad_norm": 0.294026255607605, + "learning_rate": 4.739363740896895e-06, + "logits/chosen": 3.112912654876709, + "logits/rejected": 3.0716400146484375, + "logps/chosen": -169.24937438964844, + "logps/rejected": -179.65435791015625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.991141319274902, + "rewards/margins": 1.0504783391952515, + "rewards/rejected": -13.041620254516602, + "step": 2473 + }, + { + "epoch": 1.7072623770915991, + "grad_norm": 0.36130377650260925, + "learning_rate": 4.741280183978536e-06, + "logits/chosen": 3.170478343963623, + "logits/rejected": 3.170478343963623, + "logps/chosen": -162.5847625732422, + "logps/rejected": -162.5847625732422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.60098648071289, + "rewards/margins": 0.0, + "rewards/rejected": -11.60098648071289, + "step": 2474 + }, + { + "epoch": 1.7079523891668105, + "grad_norm": 0.25688520073890686, + "learning_rate": 4.743196627060176e-06, + "logits/chosen": 3.1365389823913574, + "logits/rejected": 3.3668997287750244, + "logps/chosen": -174.10231018066406, + "logps/rejected": -182.88685607910156, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.445377349853516, + "rewards/margins": 0.9171577095985413, + "rewards/rejected": -13.36253547668457, + "step": 2475 + }, + { + "epoch": 1.7086424012420216, + "grad_norm": 0.30248314142227173, + "learning_rate": 4.745113070141818e-06, + "logits/chosen": 3.0870909690856934, + "logits/rejected": 3.1197190284729004, + "logps/chosen": -156.80235290527344, + "logps/rejected": -166.48204040527344, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.017899513244629, + "rewards/margins": 0.7538074851036072, + "rewards/rejected": -11.771706581115723, + "step": 2476 + }, + { + "epoch": 1.7093324133172332, + "grad_norm": 0.9259410500526428, + "learning_rate": 4.747029513223458e-06, + "logits/chosen": 2.9757022857666016, + "logits/rejected": 3.0794310569763184, + "logps/chosen": -151.07948303222656, + "logps/rejected": -179.20521545410156, + "loss": 0.4379, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.277257919311523, + "rewards/margins": 2.743847131729126, + "rewards/rejected": -13.021102905273438, + "step": 2477 + }, + { + "epoch": 1.7100224253924443, + "grad_norm": 0.35838109254837036, + "learning_rate": 4.7489459563050985e-06, + "logits/chosen": 3.075636625289917, + "logits/rejected": 3.075636625289917, + "logps/chosen": -176.03224182128906, + "logps/rejected": -176.03225708007812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.791215896606445, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.791215896606445, + "step": 2478 + }, + { + "epoch": 1.7107124374676557, + "grad_norm": 0.34558621048927307, + "learning_rate": 4.7508623993867385e-06, + "logits/chosen": 2.562812566757202, + "logits/rejected": 2.657766819000244, + "logps/chosen": -149.8324432373047, + "logps/rejected": -166.03079223632812, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.029743194580078, + "rewards/margins": 1.581954836845398, + "rewards/rejected": -11.61169719696045, + "step": 2479 + }, + { + "epoch": 1.711402449542867, + "grad_norm": 0.3146887719631195, + "learning_rate": 4.752778842468379e-06, + "logits/chosen": 2.973529100418091, + "logits/rejected": 2.973529100418091, + "logps/chosen": -177.7252960205078, + "logps/rejected": -177.7252960205078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.832332611083984, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.832332611083984, + "step": 2480 + }, + { + "epoch": 1.7120924616180782, + "grad_norm": 3.945387125015259, + "learning_rate": 4.754695285550019e-06, + "logits/chosen": 2.577240467071533, + "logits/rejected": 2.7618606090545654, + "logps/chosen": -147.80870056152344, + "logps/rejected": -157.1527557373047, + "loss": 0.5833, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.974908828735352, + "rewards/margins": 0.9564175009727478, + "rewards/rejected": -10.931325912475586, + "step": 2481 + }, + { + "epoch": 1.7127824736932897, + "grad_norm": 0.28311625123023987, + "learning_rate": 4.75661172863166e-06, + "logits/chosen": 2.680962562561035, + "logits/rejected": 2.7647335529327393, + "logps/chosen": -134.6162872314453, + "logps/rejected": -167.40780639648438, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.69615364074707, + "rewards/margins": 3.2950804233551025, + "rewards/rejected": -11.991233825683594, + "step": 2482 + }, + { + "epoch": 1.7134724857685009, + "grad_norm": 0.33241188526153564, + "learning_rate": 4.7585281717133e-06, + "logits/chosen": 2.7601988315582275, + "logits/rejected": 2.7025933265686035, + "logps/chosen": -172.5880126953125, + "logps/rejected": -184.14419555664062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.466904640197754, + "rewards/margins": 1.1655910015106201, + "rewards/rejected": -13.632495880126953, + "step": 2483 + }, + { + "epoch": 1.7141624978437122, + "grad_norm": 0.5559474229812622, + "learning_rate": 4.760444614794941e-06, + "logits/chosen": 3.094181537628174, + "logits/rejected": 3.2372560501098633, + "logps/chosen": -154.73876953125, + "logps/rejected": -172.14718627929688, + "loss": 0.523, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.526742935180664, + "rewards/margins": 1.7752234935760498, + "rewards/rejected": -12.301965713500977, + "step": 2484 + }, + { + "epoch": 1.7148525099189236, + "grad_norm": 0.3725583851337433, + "learning_rate": 4.762361057876582e-06, + "logits/chosen": 2.7297439575195312, + "logits/rejected": 2.7297439575195312, + "logps/chosen": -163.34048461914062, + "logps/rejected": -163.34048461914062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.612309455871582, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.612309455871582, + "step": 2485 + }, + { + "epoch": 1.715542521994135, + "grad_norm": 0.3457132875919342, + "learning_rate": 4.764277500958222e-06, + "logits/chosen": 3.0106046199798584, + "logits/rejected": 3.0106046199798584, + "logps/chosen": -173.66696166992188, + "logps/rejected": -173.66696166992188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.766633987426758, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.766633987426758, + "step": 2486 + }, + { + "epoch": 1.7162325340693463, + "grad_norm": 0.31874921917915344, + "learning_rate": 4.7661939440398625e-06, + "logits/chosen": 2.666917324066162, + "logits/rejected": 2.666917324066162, + "logps/chosen": -144.21298217773438, + "logps/rejected": -144.21298217773438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -9.855203628540039, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -9.855202674865723, + "step": 2487 + }, + { + "epoch": 1.7169225461445574, + "grad_norm": 15.047748565673828, + "learning_rate": 4.7681103871215024e-06, + "logits/chosen": 2.9773848056793213, + "logits/rejected": 2.824481248855591, + "logps/chosen": -171.389892578125, + "logps/rejected": -162.92861938476562, + "loss": 1.4486, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.4719877243042, + "rewards/margins": -0.8419291973114014, + "rewards/rejected": -11.630058288574219, + "step": 2488 + }, + { + "epoch": 1.717612558219769, + "grad_norm": 0.27259883284568787, + "learning_rate": 4.770026830203143e-06, + "logits/chosen": 2.638899087905884, + "logits/rejected": 2.930643081665039, + "logps/chosen": -146.21827697753906, + "logps/rejected": -163.7481689453125, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.806150436401367, + "rewards/margins": 1.7118624448776245, + "rewards/rejected": -11.518013000488281, + "step": 2489 + }, + { + "epoch": 1.7183025702949801, + "grad_norm": 1.1053322553634644, + "learning_rate": 4.771943273284784e-06, + "logits/chosen": 3.0348129272460938, + "logits/rejected": 3.0837454795837402, + "logps/chosen": -174.34930419921875, + "logps/rejected": -177.63282775878906, + "loss": 0.6151, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.74730396270752, + "rewards/margins": 0.33004969358444214, + "rewards/rejected": -13.077353477478027, + "step": 2490 + }, + { + "epoch": 1.7189925823701915, + "grad_norm": 2.9467270374298096, + "learning_rate": 4.773859716366424e-06, + "logits/chosen": 2.929415225982666, + "logits/rejected": 2.9404892921447754, + "logps/chosen": -142.76889038085938, + "logps/rejected": -157.17312622070312, + "loss": 0.4625, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.302530288696289, + "rewards/margins": 1.471065640449524, + "rewards/rejected": -10.773595809936523, + "step": 2491 + }, + { + "epoch": 1.7196825944454028, + "grad_norm": 0.3322790861129761, + "learning_rate": 4.775776159448065e-06, + "logits/chosen": 2.893260955810547, + "logits/rejected": 2.962928056716919, + "logps/chosen": -154.11097717285156, + "logps/rejected": -160.56390380859375, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.740456581115723, + "rewards/margins": 0.6779066324234009, + "rewards/rejected": -11.418363571166992, + "step": 2492 + }, + { + "epoch": 1.720372606520614, + "grad_norm": 0.32347583770751953, + "learning_rate": 4.777692602529706e-06, + "logits/chosen": 2.9272186756134033, + "logits/rejected": 2.9272186756134033, + "logps/chosen": -157.7705535888672, + "logps/rejected": -157.77056884765625, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.075159072875977, + "rewards/margins": 7.748603820800781e-07, + "rewards/rejected": -11.075159072875977, + "step": 2493 + }, + { + "epoch": 1.7210626185958255, + "grad_norm": 0.3071165978908539, + "learning_rate": 4.779609045611346e-06, + "logits/chosen": 3.2793312072753906, + "logits/rejected": 3.280313730239868, + "logps/chosen": -170.76483154296875, + "logps/rejected": -176.71414184570312, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.353496551513672, + "rewards/margins": 0.602418065071106, + "rewards/rejected": -12.955914497375488, + "step": 2494 + }, + { + "epoch": 1.7217526306710367, + "grad_norm": 1.1797161102294922, + "learning_rate": 4.7815254886929865e-06, + "logits/chosen": 2.5567195415496826, + "logits/rejected": 2.7754015922546387, + "logps/chosen": -144.02459716796875, + "logps/rejected": -152.96234130859375, + "loss": 0.5259, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.63925552368164, + "rewards/margins": 0.940706729888916, + "rewards/rejected": -10.579961776733398, + "step": 2495 + }, + { + "epoch": 1.722442642746248, + "grad_norm": 0.3518033027648926, + "learning_rate": 4.7834419317746264e-06, + "logits/chosen": 2.9274466037750244, + "logits/rejected": 2.9096405506134033, + "logps/chosen": -153.05026245117188, + "logps/rejected": -167.13766479492188, + "loss": 0.521, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.72481918334961, + "rewards/margins": 1.3512816429138184, + "rewards/rejected": -12.076101303100586, + "step": 2496 + }, + { + "epoch": 1.7231326548214594, + "grad_norm": 0.31176701188087463, + "learning_rate": 4.785358374856267e-06, + "logits/chosen": 2.8977067470550537, + "logits/rejected": 2.8852884769439697, + "logps/chosen": -170.89675903320312, + "logps/rejected": -177.79605102539062, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.149009704589844, + "rewards/margins": 0.7550017833709717, + "rewards/rejected": -12.904010772705078, + "step": 2497 + }, + { + "epoch": 1.7238226668966707, + "grad_norm": 1.708695888519287, + "learning_rate": 4.787274817937907e-06, + "logits/chosen": 2.9730417728424072, + "logits/rejected": 3.006324052810669, + "logps/chosen": -168.12881469726562, + "logps/rejected": -171.35653686523438, + "loss": 0.6155, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.965133666992188, + "rewards/margins": 0.3243081569671631, + "rewards/rejected": -12.28944206237793, + "step": 2498 + }, + { + "epoch": 1.724512678971882, + "grad_norm": 0.33872753381729126, + "learning_rate": 4.789191261019548e-06, + "logits/chosen": 3.038386344909668, + "logits/rejected": 3.081413745880127, + "logps/chosen": -166.75167846679688, + "logps/rejected": -174.746826171875, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.70124626159668, + "rewards/margins": 0.8534741401672363, + "rewards/rejected": -12.554719924926758, + "step": 2499 + }, + { + "epoch": 1.7252026910470932, + "grad_norm": 0.5103698968887329, + "learning_rate": 4.791107704101189e-06, + "logits/chosen": 2.932252883911133, + "logits/rejected": 2.933326244354248, + "logps/chosen": -156.08143615722656, + "logps/rejected": -171.002197265625, + "loss": 0.5223, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.83668041229248, + "rewards/margins": 1.4886252880096436, + "rewards/rejected": -12.325304985046387, + "step": 2500 + }, + { + "epoch": 1.7258927031223048, + "grad_norm": 0.32177406549453735, + "learning_rate": 4.79302414718283e-06, + "logits/chosen": 2.9851393699645996, + "logits/rejected": 3.1364688873291016, + "logps/chosen": -158.84939575195312, + "logps/rejected": -183.2070770263672, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.1715087890625, + "rewards/margins": 2.362076759338379, + "rewards/rejected": -13.533585548400879, + "step": 2501 + }, + { + "epoch": 1.726582715197516, + "grad_norm": 0.3213590085506439, + "learning_rate": 4.79494059026447e-06, + "logits/chosen": 3.1624698638916016, + "logits/rejected": 3.2077512741088867, + "logps/chosen": -161.73036193847656, + "logps/rejected": -177.47329711914062, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.307476043701172, + "rewards/margins": 1.578743815422058, + "rewards/rejected": -12.88621997833252, + "step": 2502 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.3223232328891754, + "learning_rate": 4.7968570333461104e-06, + "logits/chosen": 2.8150734901428223, + "logits/rejected": 2.8150734901428223, + "logps/chosen": -175.57461547851562, + "logps/rejected": -175.57461547851562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.077768325805664, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.077768325805664, + "step": 2503 + }, + { + "epoch": 1.7279627393479386, + "grad_norm": 11.021171569824219, + "learning_rate": 4.79877347642775e-06, + "logits/chosen": 3.2669310569763184, + "logits/rejected": 3.270725727081299, + "logps/chosen": -167.35140991210938, + "logps/rejected": -165.46490478515625, + "loss": 0.7894, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.8722562789917, + "rewards/margins": -0.14992815256118774, + "rewards/rejected": -11.722329139709473, + "step": 2504 + }, + { + "epoch": 1.7286527514231498, + "grad_norm": 0.3433478772640228, + "learning_rate": 4.800689919509391e-06, + "logits/chosen": 3.096965789794922, + "logits/rejected": 3.096965789794922, + "logps/chosen": -175.93148803710938, + "logps/rejected": -175.93148803710938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.869256019592285, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.869256019592285, + "step": 2505 + }, + { + "epoch": 1.7293427634983614, + "grad_norm": 6.139048099517822, + "learning_rate": 4.802606362591031e-06, + "logits/chosen": 2.932159900665283, + "logits/rejected": 2.9555118083953857, + "logps/chosen": -155.4864044189453, + "logps/rejected": -173.55763244628906, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": -10.840506553649902, + "rewards/margins": 1.8358556032180786, + "rewards/rejected": -12.676361083984375, + "step": 2506 + }, + { + "epoch": 1.7300327755735725, + "grad_norm": 0.3158016502857208, + "learning_rate": 4.804522805672672e-06, + "logits/chosen": 2.782195568084717, + "logits/rejected": 2.7580087184906006, + "logps/chosen": -127.0285415649414, + "logps/rejected": -144.01321411132812, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.082086563110352, + "rewards/margins": 1.6181652545928955, + "rewards/rejected": -9.700252532958984, + "step": 2507 + }, + { + "epoch": 1.7307227876487838, + "grad_norm": 0.2443859726190567, + "learning_rate": 4.806439248754312e-06, + "logits/chosen": 2.877067804336548, + "logits/rejected": 3.0606236457824707, + "logps/chosen": -165.44009399414062, + "logps/rejected": -187.70945739746094, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.773181915283203, + "rewards/margins": 2.1684837341308594, + "rewards/rejected": -13.941665649414062, + "step": 2508 + }, + { + "epoch": 1.7314127997239952, + "grad_norm": 0.372781902551651, + "learning_rate": 4.808355691835953e-06, + "logits/chosen": 2.8841660022735596, + "logits/rejected": 3.0299670696258545, + "logps/chosen": -146.14718627929688, + "logps/rejected": -166.603515625, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.847390174865723, + "rewards/margins": 2.0982918739318848, + "rewards/rejected": -11.94568157196045, + "step": 2509 + }, + { + "epoch": 1.7321028117992063, + "grad_norm": 0.2613013684749603, + "learning_rate": 4.810272134917594e-06, + "logits/chosen": 3.3140065670013428, + "logits/rejected": 3.4626975059509277, + "logps/chosen": -168.3192138671875, + "logps/rejected": -177.36473083496094, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.98592472076416, + "rewards/margins": 0.9370571970939636, + "rewards/rejected": -12.922981262207031, + "step": 2510 + }, + { + "epoch": 1.732792823874418, + "grad_norm": 21.2016658782959, + "learning_rate": 4.8121885779992336e-06, + "logits/chosen": 3.1667442321777344, + "logits/rejected": 3.208885431289673, + "logps/chosen": -172.85879516601562, + "logps/rejected": -168.37359619140625, + "loss": 1.0658, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.531564712524414, + "rewards/margins": -0.45609474182128906, + "rewards/rejected": -12.075469970703125, + "step": 2511 + }, + { + "epoch": 1.733482835949629, + "grad_norm": 0.3329792320728302, + "learning_rate": 4.814105021080874e-06, + "logits/chosen": 3.2411251068115234, + "logits/rejected": 3.384772777557373, + "logps/chosen": -164.8385009765625, + "logps/rejected": -171.80294799804688, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.652822494506836, + "rewards/margins": 0.759796142578125, + "rewards/rejected": -12.412618637084961, + "step": 2512 + }, + { + "epoch": 1.7341728480248404, + "grad_norm": 0.34288349747657776, + "learning_rate": 4.816021464162514e-06, + "logits/chosen": 2.82255482673645, + "logits/rejected": 2.8436508178710938, + "logps/chosen": -170.82949829101562, + "logps/rejected": -178.95187377929688, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.241433143615723, + "rewards/margins": 0.8157453536987305, + "rewards/rejected": -13.057180404663086, + "step": 2513 + }, + { + "epoch": 1.7348628601000518, + "grad_norm": 14.967436790466309, + "learning_rate": 4.817937907244155e-06, + "logits/chosen": 3.1502366065979004, + "logits/rejected": 3.358154058456421, + "logps/chosen": -151.6383819580078, + "logps/rejected": -159.15635681152344, + "loss": 0.6353, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.385807991027832, + "rewards/margins": 0.8006850481033325, + "rewards/rejected": -11.186493873596191, + "step": 2514 + }, + { + "epoch": 1.735552872175263, + "grad_norm": 0.3956120014190674, + "learning_rate": 4.819854350325795e-06, + "logits/chosen": 2.8870091438293457, + "logits/rejected": 2.8870091438293457, + "logps/chosen": -165.15138244628906, + "logps/rejected": -165.1513671875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.665794372558594, + "rewards/margins": -5.364418029785156e-07, + "rewards/rejected": -11.665794372558594, + "step": 2515 + }, + { + "epoch": 1.7362428842504745, + "grad_norm": 0.2790236175060272, + "learning_rate": 4.821770793407436e-06, + "logits/chosen": 3.1952121257781982, + "logits/rejected": 3.1952121257781982, + "logps/chosen": -187.5858154296875, + "logps/rejected": -187.5858154296875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.017738342285156, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.017738342285156, + "step": 2516 + }, + { + "epoch": 1.7369328963256856, + "grad_norm": 0.359019011259079, + "learning_rate": 4.823687236489077e-06, + "logits/chosen": 3.1909422874450684, + "logits/rejected": 3.1909422874450684, + "logps/chosen": -177.98806762695312, + "logps/rejected": -177.98806762695312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.800582885742188, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.800582885742188, + "step": 2517 + }, + { + "epoch": 1.7376229084008972, + "grad_norm": 0.3595275282859802, + "learning_rate": 4.825603679570718e-06, + "logits/chosen": 3.5066471099853516, + "logits/rejected": 3.5066471099853516, + "logps/chosen": -180.39675903320312, + "logps/rejected": -180.39675903320312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.210487365722656, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.210487365722656, + "step": 2518 + }, + { + "epoch": 1.7383129204761083, + "grad_norm": 0.2841463088989258, + "learning_rate": 4.8275201226523575e-06, + "logits/chosen": 3.2795681953430176, + "logits/rejected": 3.3042078018188477, + "logps/chosen": -157.74395751953125, + "logps/rejected": -164.58026123046875, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.144299507141113, + "rewards/margins": 0.7128251791000366, + "rewards/rejected": -11.857124328613281, + "step": 2519 + }, + { + "epoch": 1.7390029325513197, + "grad_norm": 21.679384231567383, + "learning_rate": 4.829436565733998e-06, + "logits/chosen": 2.65535569190979, + "logits/rejected": 2.8044724464416504, + "logps/chosen": -161.8657684326172, + "logps/rejected": -173.18458557128906, + "loss": 0.7901, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.311214447021484, + "rewards/margins": 1.0962083339691162, + "rewards/rejected": -12.40742301940918, + "step": 2520 + }, + { + "epoch": 1.739692944626531, + "grad_norm": 0.3968138098716736, + "learning_rate": 4.831353008815638e-06, + "logits/chosen": 2.8737611770629883, + "logits/rejected": 2.8737611770629883, + "logps/chosen": -167.91342163085938, + "logps/rejected": -167.91342163085938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.325582504272461, + "rewards/margins": 0.0, + "rewards/rejected": -12.325582504272461, + "step": 2521 + }, + { + "epoch": 1.7403829567017421, + "grad_norm": 0.29855212569236755, + "learning_rate": 4.833269451897279e-06, + "logits/chosen": 3.085895538330078, + "logits/rejected": 3.204789876937866, + "logps/chosen": -167.6271209716797, + "logps/rejected": -181.21995544433594, + "loss": 0.5211, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.004691123962402, + "rewards/margins": 1.4135417938232422, + "rewards/rejected": -13.418233871459961, + "step": 2522 + }, + { + "epoch": 1.7410729687769537, + "grad_norm": 0.355939656496048, + "learning_rate": 4.835185894978919e-06, + "logits/chosen": 3.445438861846924, + "logits/rejected": 3.445438861846924, + "logps/chosen": -186.5699462890625, + "logps/rejected": -186.5699462890625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.89628791809082, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.896286964416504, + "step": 2523 + }, + { + "epoch": 1.7417629808521649, + "grad_norm": 0.3160251975059509, + "learning_rate": 4.83710233806056e-06, + "logits/chosen": 3.0780787467956543, + "logits/rejected": 3.0780787467956543, + "logps/chosen": -200.43023681640625, + "logps/rejected": -200.43023681640625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -15.212474822998047, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -15.212474822998047, + "step": 2524 + }, + { + "epoch": 1.7424529929273762, + "grad_norm": 3.0382065773010254, + "learning_rate": 4.839018781142201e-06, + "logits/chosen": 3.351508140563965, + "logits/rejected": 3.678825855255127, + "logps/chosen": -164.33651733398438, + "logps/rejected": -178.22312927246094, + "loss": 0.5371, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.585736274719238, + "rewards/margins": 1.444687843322754, + "rewards/rejected": -13.030424118041992, + "step": 2525 + }, + { + "epoch": 1.7431430050025876, + "grad_norm": 1.2610691785812378, + "learning_rate": 4.840935224223841e-06, + "logits/chosen": 3.1559829711914062, + "logits/rejected": 3.161675453186035, + "logps/chosen": -173.80587768554688, + "logps/rejected": -181.3028564453125, + "loss": 0.5326, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.473089218139648, + "rewards/margins": 0.7760634422302246, + "rewards/rejected": -13.249153137207031, + "step": 2526 + }, + { + "epoch": 1.7438330170777987, + "grad_norm": 0.3690810203552246, + "learning_rate": 4.8428516673054815e-06, + "logits/chosen": 3.2182722091674805, + "logits/rejected": 3.186674118041992, + "logps/chosen": -162.468505859375, + "logps/rejected": -178.86111450195312, + "loss": 0.5217, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.361368179321289, + "rewards/margins": 1.6951181888580322, + "rewards/rejected": -13.056486129760742, + "step": 2527 + }, + { + "epoch": 1.7445230291530103, + "grad_norm": 0.32212620973587036, + "learning_rate": 4.8447681103871215e-06, + "logits/chosen": 3.174872636795044, + "logits/rejected": 3.362621784210205, + "logps/chosen": -141.20443725585938, + "logps/rejected": -180.20945739746094, + "loss": 0.3472, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.355030059814453, + "rewards/margins": 3.8185954093933105, + "rewards/rejected": -13.173625946044922, + "step": 2528 + }, + { + "epoch": 1.7452130412282214, + "grad_norm": 1.9407905340194702, + "learning_rate": 4.846684553468762e-06, + "logits/chosen": 3.1488287448883057, + "logits/rejected": 3.2091288566589355, + "logps/chosen": -146.42556762695312, + "logps/rejected": -161.4666748046875, + "loss": 0.536, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.609125137329102, + "rewards/margins": 1.5171865224838257, + "rewards/rejected": -11.126311302185059, + "step": 2529 + }, + { + "epoch": 1.7459030533034328, + "grad_norm": 12.301592826843262, + "learning_rate": 4.848600996550403e-06, + "logits/chosen": 3.3195648193359375, + "logits/rejected": 3.3060503005981445, + "logps/chosen": -172.81942749023438, + "logps/rejected": -174.03582763671875, + "loss": 0.776, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.41636848449707, + "rewards/margins": 0.1372973918914795, + "rewards/rejected": -12.553666114807129, + "step": 2530 + }, + { + "epoch": 1.7465930653786441, + "grad_norm": 0.3368041515350342, + "learning_rate": 4.850517439632043e-06, + "logits/chosen": 3.335125207901001, + "logits/rejected": 3.335125207901001, + "logps/chosen": -178.9221954345703, + "logps/rejected": -178.9221954345703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.040342330932617, + "rewards/margins": 0.0, + "rewards/rejected": -13.040342330932617, + "step": 2531 + }, + { + "epoch": 1.7472830774538555, + "grad_norm": 0.2755817472934723, + "learning_rate": 4.852433882713684e-06, + "logits/chosen": 2.796135902404785, + "logits/rejected": 3.083220958709717, + "logps/chosen": -155.51914978027344, + "logps/rejected": -175.02442932128906, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.848625183105469, + "rewards/margins": 1.8845030069351196, + "rewards/rejected": -12.733129501342773, + "step": 2532 + }, + { + "epoch": 1.7479730895290668, + "grad_norm": 0.492969274520874, + "learning_rate": 4.854350325795325e-06, + "logits/chosen": 3.267416477203369, + "logits/rejected": 3.4652061462402344, + "logps/chosen": -166.257080078125, + "logps/rejected": -172.14022827148438, + "loss": 0.6082, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.876701354980469, + "rewards/margins": 0.5333845615386963, + "rewards/rejected": -12.410085678100586, + "step": 2533 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 0.4442753791809082, + "learning_rate": 4.856266768876965e-06, + "logits/chosen": 3.3505518436431885, + "logits/rejected": 3.4728031158447266, + "logps/chosen": -175.02706909179688, + "logps/rejected": -180.6429443359375, + "loss": 0.6079, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.577678680419922, + "rewards/margins": 0.5571370124816895, + "rewards/rejected": -13.13481616973877, + "step": 2534 + }, + { + "epoch": 1.7493531136794895, + "grad_norm": 0.7594671845436096, + "learning_rate": 4.8581832119586055e-06, + "logits/chosen": 2.880460739135742, + "logits/rejected": 2.8978044986724854, + "logps/chosen": -155.30679321289062, + "logps/rejected": -159.29025268554688, + "loss": 0.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.824766159057617, + "rewards/margins": 0.4076426029205322, + "rewards/rejected": -11.232409477233887, + "step": 2535 + }, + { + "epoch": 1.7500431257547007, + "grad_norm": 0.3328269124031067, + "learning_rate": 4.8600996550402455e-06, + "logits/chosen": 3.165738344192505, + "logits/rejected": 3.1882288455963135, + "logps/chosen": -162.9261474609375, + "logps/rejected": -170.81640625, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.435328483581543, + "rewards/margins": 0.7821711301803589, + "rewards/rejected": -12.217499732971191, + "step": 2536 + }, + { + "epoch": 1.750733137829912, + "grad_norm": 15.696822166442871, + "learning_rate": 4.862016098121886e-06, + "logits/chosen": 2.85026478767395, + "logits/rejected": 3.0540995597839355, + "logps/chosen": -153.73013305664062, + "logps/rejected": -166.39776611328125, + "loss": 0.6619, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.554285049438477, + "rewards/margins": 1.3015328645706177, + "rewards/rejected": -11.855817794799805, + "step": 2537 + }, + { + "epoch": 1.7514231499051234, + "grad_norm": 11.260299682617188, + "learning_rate": 4.863932541203526e-06, + "logits/chosen": 3.3716976642608643, + "logits/rejected": 3.2372047901153564, + "logps/chosen": -175.22305297851562, + "logps/rejected": -167.8702392578125, + "loss": 1.3125, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.805551528930664, + "rewards/margins": -0.7055274248123169, + "rewards/rejected": -12.10002326965332, + "step": 2538 + }, + { + "epoch": 1.7521131619803345, + "grad_norm": 3.862272024154663, + "learning_rate": 4.865848984285167e-06, + "logits/chosen": 3.5628552436828613, + "logits/rejected": 3.707951307296753, + "logps/chosen": -169.71852111816406, + "logps/rejected": -176.67372131347656, + "loss": 0.6088, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.116510391235352, + "rewards/margins": 0.7074819803237915, + "rewards/rejected": -12.823992729187012, + "step": 2539 + }, + { + "epoch": 1.752803174055546, + "grad_norm": 0.3395425081253052, + "learning_rate": 4.867765427366807e-06, + "logits/chosen": 3.5971875190734863, + "logits/rejected": 3.5971875190734863, + "logps/chosen": -166.9953155517578, + "logps/rejected": -166.9953155517578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.915645599365234, + "rewards/margins": 0.0, + "rewards/rejected": -11.915645599365234, + "step": 2540 + }, + { + "epoch": 1.7534931861307572, + "grad_norm": 0.41667884588241577, + "learning_rate": 4.869681870448449e-06, + "logits/chosen": 3.2718987464904785, + "logits/rejected": 3.2718987464904785, + "logps/chosen": -173.14669799804688, + "logps/rejected": -173.14669799804688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.734797477722168, + "rewards/margins": 0.0, + "rewards/rejected": -12.734797477722168, + "step": 2541 + }, + { + "epoch": 1.7541831982059686, + "grad_norm": 0.3343859314918518, + "learning_rate": 4.871598313530089e-06, + "logits/chosen": 3.445357322692871, + "logits/rejected": 3.503173351287842, + "logps/chosen": -173.14317321777344, + "logps/rejected": -184.5075225830078, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.562188148498535, + "rewards/margins": 1.144051194190979, + "rewards/rejected": -13.706239700317383, + "step": 2542 + }, + { + "epoch": 1.75487321028118, + "grad_norm": 0.8818022608757019, + "learning_rate": 4.8735147566117295e-06, + "logits/chosen": 3.405445098876953, + "logits/rejected": 3.486858367919922, + "logps/chosen": -165.05105590820312, + "logps/rejected": -174.64590454101562, + "loss": 0.5259, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.672765731811523, + "rewards/margins": 0.9299132823944092, + "rewards/rejected": -12.602677345275879, + "step": 2543 + }, + { + "epoch": 1.7555632223563913, + "grad_norm": 0.3245061933994293, + "learning_rate": 4.8754311996933695e-06, + "logits/chosen": 3.302790403366089, + "logits/rejected": 3.302790403366089, + "logps/chosen": -155.31283569335938, + "logps/rejected": -155.31283569335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.643250465393066, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -10.643250465393066, + "step": 2544 + }, + { + "epoch": 1.7562532344316026, + "grad_norm": 0.3731518089771271, + "learning_rate": 4.87734764277501e-06, + "logits/chosen": 3.3748159408569336, + "logits/rejected": 3.508779525756836, + "logps/chosen": -156.51162719726562, + "logps/rejected": -166.8878936767578, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.779504776000977, + "rewards/margins": 1.0492973327636719, + "rewards/rejected": -11.828802108764648, + "step": 2545 + }, + { + "epoch": 1.7569432465068138, + "grad_norm": 0.47085338830947876, + "learning_rate": 4.87926408585665e-06, + "logits/chosen": 3.434870719909668, + "logits/rejected": 3.5191938877105713, + "logps/chosen": -170.31793212890625, + "logps/rejected": -174.96908569335938, + "loss": 0.6101, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.250808715820312, + "rewards/margins": 0.440108060836792, + "rewards/rejected": -12.690917015075684, + "step": 2546 + }, + { + "epoch": 1.7576332585820253, + "grad_norm": 0.2481614053249359, + "learning_rate": 4.881180528938291e-06, + "logits/chosen": 3.3288025856018066, + "logits/rejected": 3.530118465423584, + "logps/chosen": -166.61618041992188, + "logps/rejected": -187.155029296875, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.643438339233398, + "rewards/margins": 2.0526723861694336, + "rewards/rejected": -13.696111679077148, + "step": 2547 + }, + { + "epoch": 1.7583232706572365, + "grad_norm": 0.26073595881462097, + "learning_rate": 4.883096972019931e-06, + "logits/chosen": 3.4846813678741455, + "logits/rejected": 3.690429925918579, + "logps/chosen": -159.55502319335938, + "logps/rejected": -179.17034912109375, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.114446640014648, + "rewards/margins": 1.969496250152588, + "rewards/rejected": -13.083942413330078, + "step": 2548 + }, + { + "epoch": 1.7590132827324478, + "grad_norm": 0.39448264241218567, + "learning_rate": 4.885013415101572e-06, + "logits/chosen": 3.126138687133789, + "logits/rejected": 3.1747395992279053, + "logps/chosen": -178.12744140625, + "logps/rejected": -186.12730407714844, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.175217628479004, + "rewards/margins": 0.870477020740509, + "rewards/rejected": -14.045694351196289, + "step": 2549 + }, + { + "epoch": 1.7597032948076592, + "grad_norm": 0.24946506321430206, + "learning_rate": 4.886929858183213e-06, + "logits/chosen": 3.81876802444458, + "logits/rejected": 3.81876802444458, + "logps/chosen": -173.71047973632812, + "logps/rejected": -173.71047973632812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.5587158203125, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.5587158203125, + "step": 2550 + }, + { + "epoch": 1.7603933068828703, + "grad_norm": 0.25684112310409546, + "learning_rate": 4.888846301264853e-06, + "logits/chosen": 3.0746185779571533, + "logits/rejected": 3.1751363277435303, + "logps/chosen": -146.80194091796875, + "logps/rejected": -161.6472930908203, + "loss": 0.5205, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.779150009155273, + "rewards/margins": 1.5173556804656982, + "rewards/rejected": -11.296504974365234, + "step": 2551 + }, + { + "epoch": 1.761083318958082, + "grad_norm": 0.363826185464859, + "learning_rate": 4.8907627443464934e-06, + "logits/chosen": 3.6900570392608643, + "logits/rejected": 3.6900570392608643, + "logps/chosen": -165.2058563232422, + "logps/rejected": -165.2058563232422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.636987686157227, + "rewards/margins": 0.0, + "rewards/rejected": -11.636987686157227, + "step": 2552 + }, + { + "epoch": 1.761773331033293, + "grad_norm": 0.27685824036598206, + "learning_rate": 4.892679187428133e-06, + "logits/chosen": 3.3838188648223877, + "logits/rejected": 3.3534584045410156, + "logps/chosen": -160.89212036132812, + "logps/rejected": -169.8721923828125, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.356904983520508, + "rewards/margins": 0.9574567675590515, + "rewards/rejected": -12.314361572265625, + "step": 2553 + }, + { + "epoch": 1.7624633431085044, + "grad_norm": 3.312548875808716, + "learning_rate": 4.894595630509774e-06, + "logits/chosen": 3.7341556549072266, + "logits/rejected": 3.7613041400909424, + "logps/chosen": -178.82534790039062, + "logps/rejected": -181.60659790039062, + "loss": 0.6293, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.081491470336914, + "rewards/margins": 0.20095640420913696, + "rewards/rejected": -13.282447814941406, + "step": 2554 + }, + { + "epoch": 1.7631533551837157, + "grad_norm": 0.24381481111049652, + "learning_rate": 4.896512073591414e-06, + "logits/chosen": 3.6673147678375244, + "logits/rejected": 3.713182210922241, + "logps/chosen": -155.92962646484375, + "logps/rejected": -169.503173828125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.776984214782715, + "rewards/margins": 1.3921048641204834, + "rewards/rejected": -12.169088363647461, + "step": 2555 + }, + { + "epoch": 1.7638433672589269, + "grad_norm": 0.30712637305259705, + "learning_rate": 4.898428516673055e-06, + "logits/chosen": 3.587191104888916, + "logits/rejected": 3.848489761352539, + "logps/chosen": -164.62771606445312, + "logps/rejected": -180.98681640625, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.75675106048584, + "rewards/margins": 1.6906706094741821, + "rewards/rejected": -13.44742202758789, + "step": 2556 + }, + { + "epoch": 1.7645333793341385, + "grad_norm": 0.7531607151031494, + "learning_rate": 4.900344959754696e-06, + "logits/chosen": 3.6191232204437256, + "logits/rejected": 3.7655832767486572, + "logps/chosen": -144.442138671875, + "logps/rejected": -169.24618530273438, + "loss": 0.4374, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.807225227355957, + "rewards/margins": 2.3122806549072266, + "rewards/rejected": -12.1195068359375, + "step": 2557 + }, + { + "epoch": 1.7652233914093496, + "grad_norm": 15.264294624328613, + "learning_rate": 4.902261402836337e-06, + "logits/chosen": 3.134157419204712, + "logits/rejected": 3.075847864151001, + "logps/chosen": -162.3153076171875, + "logps/rejected": -154.469970703125, + "loss": 1.3705, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.628218650817871, + "rewards/margins": -0.7637474536895752, + "rewards/rejected": -10.864471435546875, + "step": 2558 + }, + { + "epoch": 1.765913403484561, + "grad_norm": 0.2192639857530594, + "learning_rate": 4.904177845917977e-06, + "logits/chosen": 2.979945182800293, + "logits/rejected": 3.4339606761932373, + "logps/chosen": -152.26564025878906, + "logps/rejected": -194.33358764648438, + "loss": 0.3469, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.374959945678711, + "rewards/margins": 4.272477149963379, + "rewards/rejected": -14.647438049316406, + "step": 2559 + }, + { + "epoch": 1.7666034155597723, + "grad_norm": 0.34784582257270813, + "learning_rate": 4.906094288999617e-06, + "logits/chosen": 3.3324291706085205, + "logits/rejected": 3.368255853652954, + "logps/chosen": -160.04769897460938, + "logps/rejected": -169.01161193847656, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.165916442871094, + "rewards/margins": 0.9281625151634216, + "rewards/rejected": -12.09407901763916, + "step": 2560 + }, + { + "epoch": 1.7672934276349836, + "grad_norm": 1.114651083946228, + "learning_rate": 4.908010732081257e-06, + "logits/chosen": 3.1713008880615234, + "logits/rejected": 3.2554123401641846, + "logps/chosen": -142.202880859375, + "logps/rejected": -157.72900390625, + "loss": 0.5288, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.405977249145508, + "rewards/margins": 1.600285291671753, + "rewards/rejected": -11.006263732910156, + "step": 2561 + }, + { + "epoch": 1.767983439710195, + "grad_norm": 0.275625079870224, + "learning_rate": 4.909927175162898e-06, + "logits/chosen": 3.0954365730285645, + "logits/rejected": 3.0954365730285645, + "logps/chosen": -163.4427490234375, + "logps/rejected": -163.4427490234375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.67591381072998, + "rewards/margins": -4.172325134277344e-07, + "rewards/rejected": -11.675914764404297, + "step": 2562 + }, + { + "epoch": 1.7686734517854061, + "grad_norm": 10.788700103759766, + "learning_rate": 4.911843618244538e-06, + "logits/chosen": 3.0855488777160645, + "logits/rejected": 3.1161534786224365, + "logps/chosen": -153.97621154785156, + "logps/rejected": -152.6216278076172, + "loss": 1.1955, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.479458808898926, + "rewards/margins": -0.014548897743225098, + "rewards/rejected": -10.464910507202148, + "step": 2563 + }, + { + "epoch": 1.7693634638606177, + "grad_norm": 0.460530549287796, + "learning_rate": 4.913760061326179e-06, + "logits/chosen": 3.1185874938964844, + "logits/rejected": 3.1185874938964844, + "logps/chosen": -146.04385375976562, + "logps/rejected": -146.04385375976562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -9.790677070617676, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -9.790677070617676, + "step": 2564 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 11.726475715637207, + "learning_rate": 4.91567650440782e-06, + "logits/chosen": 3.6792473793029785, + "logits/rejected": 3.6139819622039795, + "logps/chosen": -165.32235717773438, + "logps/rejected": -164.01341247558594, + "loss": 1.2121, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.813492774963379, + "rewards/margins": -0.03721761703491211, + "rewards/rejected": -11.776275634765625, + "step": 2565 + }, + { + "epoch": 1.7707434880110402, + "grad_norm": 0.3227214813232422, + "learning_rate": 4.91759294748946e-06, + "logits/chosen": 3.2920634746551514, + "logits/rejected": 3.3804867267608643, + "logps/chosen": -152.88009643554688, + "logps/rejected": -171.22198486328125, + "loss": 0.5211, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.49655532836914, + "rewards/margins": 1.8432034254074097, + "rewards/rejected": -12.33975887298584, + "step": 2566 + }, + { + "epoch": 1.7714335000862516, + "grad_norm": 0.2536389231681824, + "learning_rate": 4.919509390571101e-06, + "logits/chosen": 3.292407751083374, + "logits/rejected": 3.292407751083374, + "logps/chosen": -185.02732849121094, + "logps/rejected": -185.02732849121094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.716520309448242, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -13.716520309448242, + "step": 2567 + }, + { + "epoch": 1.7721235121614627, + "grad_norm": 0.3310226500034332, + "learning_rate": 4.9214258336527405e-06, + "logits/chosen": 3.4324300289154053, + "logits/rejected": 3.4324300289154053, + "logps/chosen": -170.81085205078125, + "logps/rejected": -170.81085205078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.167621612548828, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.167619705200195, + "step": 2568 + }, + { + "epoch": 1.7728135242366743, + "grad_norm": 0.35400545597076416, + "learning_rate": 4.923342276734381e-06, + "logits/chosen": 3.34501576423645, + "logits/rejected": 3.297741651535034, + "logps/chosen": -168.88973999023438, + "logps/rejected": -179.75160217285156, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.120455741882324, + "rewards/margins": 1.1026644706726074, + "rewards/rejected": -13.22312068939209, + "step": 2569 + }, + { + "epoch": 1.7735035363118854, + "grad_norm": 0.2973395884037018, + "learning_rate": 4.925258719816022e-06, + "logits/chosen": 3.6969223022460938, + "logits/rejected": 3.6969223022460938, + "logps/chosen": -176.44271850585938, + "logps/rejected": -176.44271850585938, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.807701110839844, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.807701110839844, + "step": 2570 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.9046458005905151, + "learning_rate": 4.927175162897662e-06, + "logits/chosen": 3.719597578048706, + "logits/rejected": 3.823859691619873, + "logps/chosen": -166.03297424316406, + "logps/rejected": -179.02699279785156, + "loss": 0.5238, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.642208099365234, + "rewards/margins": 1.2923989295959473, + "rewards/rejected": -12.934608459472656, + "step": 2571 + }, + { + "epoch": 1.774883560462308, + "grad_norm": 0.3728271424770355, + "learning_rate": 4.929091605979303e-06, + "logits/chosen": 3.2920761108398438, + "logits/rejected": 3.328968048095703, + "logps/chosen": -157.625244140625, + "logps/rejected": -163.60296630859375, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.865413665771484, + "rewards/margins": 0.5662451982498169, + "rewards/rejected": -11.431658744812012, + "step": 2572 + }, + { + "epoch": 1.7755735725375192, + "grad_norm": 0.34108617901802063, + "learning_rate": 4.931008049060944e-06, + "logits/chosen": 3.145775079727173, + "logits/rejected": 3.2238593101501465, + "logps/chosen": -135.64369201660156, + "logps/rejected": -147.91998291015625, + "loss": 0.5218, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.840810775756836, + "rewards/margins": 1.2404769659042358, + "rewards/rejected": -10.081287384033203, + "step": 2573 + }, + { + "epoch": 1.7762635846127308, + "grad_norm": 0.3525688648223877, + "learning_rate": 4.932924492142584e-06, + "logits/chosen": 3.473775863647461, + "logits/rejected": 3.50584077835083, + "logps/chosen": -175.34451293945312, + "logps/rejected": -183.29669189453125, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.705940246582031, + "rewards/margins": 0.8297933340072632, + "rewards/rejected": -13.535734176635742, + "step": 2574 + }, + { + "epoch": 1.776953596687942, + "grad_norm": 19.46826171875, + "learning_rate": 4.9348409352242246e-06, + "logits/chosen": 3.36812686920166, + "logits/rejected": 3.4385485649108887, + "logps/chosen": -154.1933135986328, + "logps/rejected": -171.44888305664062, + "loss": 1.0387, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.569250106811523, + "rewards/margins": 1.823177456855774, + "rewards/rejected": -12.392427444458008, + "step": 2575 + }, + { + "epoch": 1.7776436087631533, + "grad_norm": 0.2640335261821747, + "learning_rate": 4.9367573783058645e-06, + "logits/chosen": 3.517094850540161, + "logits/rejected": 3.5115838050842285, + "logps/chosen": -174.30043029785156, + "logps/rejected": -182.67063903808594, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.632530212402344, + "rewards/margins": 0.8785814046859741, + "rewards/rejected": -13.511112213134766, + "step": 2576 + }, + { + "epoch": 1.7783336208383647, + "grad_norm": 0.24150210618972778, + "learning_rate": 4.938673821387505e-06, + "logits/chosen": 3.448068857192993, + "logits/rejected": 3.448068857192993, + "logps/chosen": -157.95025634765625, + "logps/rejected": -157.95025634765625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.032209396362305, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.032210350036621, + "step": 2577 + }, + { + "epoch": 1.779023632913576, + "grad_norm": 0.3326169550418854, + "learning_rate": 4.940590264469145e-06, + "logits/chosen": 3.557126045227051, + "logits/rejected": 3.557126045227051, + "logps/chosen": -159.6382293701172, + "logps/rejected": -159.63824462890625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.847007751464844, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -10.847007751464844, + "step": 2578 + }, + { + "epoch": 1.7797136449887874, + "grad_norm": 0.3263240158557892, + "learning_rate": 4.942506707550786e-06, + "logits/chosen": 3.503265142440796, + "logits/rejected": 3.503265142440796, + "logps/chosen": -167.13201904296875, + "logps/rejected": -167.13201904296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.94113540649414, + "rewards/margins": 0.0, + "rewards/rejected": -11.94113540649414, + "step": 2579 + }, + { + "epoch": 1.7804036570639985, + "grad_norm": 0.3114745318889618, + "learning_rate": 4.944423150632426e-06, + "logits/chosen": 3.229041576385498, + "logits/rejected": 3.229041576385498, + "logps/chosen": -173.62330627441406, + "logps/rejected": -173.62330627441406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.461377143859863, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.461377143859863, + "step": 2580 + }, + { + "epoch": 1.78109366913921, + "grad_norm": 0.291610449552536, + "learning_rate": 4.946339593714068e-06, + "logits/chosen": 2.9874911308288574, + "logits/rejected": 3.1369285583496094, + "logps/chosen": -150.5833740234375, + "logps/rejected": -174.06979370117188, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.35555648803711, + "rewards/margins": 2.2625181674957275, + "rewards/rejected": -12.618074417114258, + "step": 2581 + }, + { + "epoch": 1.7817836812144212, + "grad_norm": 0.26566281914711, + "learning_rate": 4.948256036795708e-06, + "logits/chosen": 3.3401169776916504, + "logits/rejected": 3.39663028717041, + "logps/chosen": -156.5752410888672, + "logps/rejected": -167.92041015625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.926424026489258, + "rewards/margins": 1.1563063859939575, + "rewards/rejected": -12.082731246948242, + "step": 2582 + }, + { + "epoch": 1.7824736932896326, + "grad_norm": 0.29652678966522217, + "learning_rate": 4.9501724798773485e-06, + "logits/chosen": 3.408900499343872, + "logits/rejected": 3.4614102840423584, + "logps/chosen": -154.43785095214844, + "logps/rejected": -163.8179931640625, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.628077507019043, + "rewards/margins": 0.9465031027793884, + "rewards/rejected": -11.574580192565918, + "step": 2583 + }, + { + "epoch": 1.783163705364844, + "grad_norm": 0.25619736313819885, + "learning_rate": 4.9520889229589885e-06, + "logits/chosen": 3.4799857139587402, + "logits/rejected": 3.4799857139587402, + "logps/chosen": -160.16751098632812, + "logps/rejected": -160.16751098632812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.369211196899414, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.369211196899414, + "step": 2584 + }, + { + "epoch": 1.783853717440055, + "grad_norm": 10.41987133026123, + "learning_rate": 4.954005366040629e-06, + "logits/chosen": 3.491225242614746, + "logits/rejected": 3.4797306060791016, + "logps/chosen": -152.1064910888672, + "logps/rejected": -150.89453125, + "loss": 0.7521, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.508659362792969, + "rewards/margins": -0.09885001182556152, + "rewards/rejected": -10.409809112548828, + "step": 2585 + }, + { + "epoch": 1.7845437295152666, + "grad_norm": 0.3064979910850525, + "learning_rate": 4.955921809122269e-06, + "logits/chosen": 3.4231643676757812, + "logits/rejected": 3.512622356414795, + "logps/chosen": -142.4476776123047, + "logps/rejected": -155.71722412109375, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.490285873413086, + "rewards/margins": 1.3354125022888184, + "rewards/rejected": -10.825697898864746, + "step": 2586 + }, + { + "epoch": 1.7852337415904778, + "grad_norm": 1.9592475891113281, + "learning_rate": 4.95783825220391e-06, + "logits/chosen": 3.8062477111816406, + "logits/rejected": 3.7494699954986572, + "logps/chosen": -178.353271484375, + "logps/rejected": -181.57672119140625, + "loss": 0.6164, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.134410858154297, + "rewards/margins": 0.3120373487472534, + "rewards/rejected": -13.446447372436523, + "step": 2587 + }, + { + "epoch": 1.7859237536656891, + "grad_norm": 0.27495449781417847, + "learning_rate": 4.95975469528555e-06, + "logits/chosen": 3.5150794982910156, + "logits/rejected": 3.5150794982910156, + "logps/chosen": -166.07464599609375, + "logps/rejected": -166.07464599609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.841846466064453, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.84184455871582, + "step": 2588 + }, + { + "epoch": 1.7866137657409005, + "grad_norm": 0.2501257359981537, + "learning_rate": 4.961671138367191e-06, + "logits/chosen": 3.34558367729187, + "logits/rejected": 3.605186939239502, + "logps/chosen": -142.9654083251953, + "logps/rejected": -168.9971160888672, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.642705917358398, + "rewards/margins": 2.53291392326355, + "rewards/rejected": -12.175620079040527, + "step": 2589 + }, + { + "epoch": 1.7873037778161118, + "grad_norm": 0.33252328634262085, + "learning_rate": 4.963587581448832e-06, + "logits/chosen": 3.508449077606201, + "logits/rejected": 3.508449077606201, + "logps/chosen": -163.31149291992188, + "logps/rejected": -163.31149291992188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.409757614135742, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.409757614135742, + "step": 2590 + }, + { + "epoch": 1.7879937898913232, + "grad_norm": 0.3439583480358124, + "learning_rate": 4.965504024530472e-06, + "logits/chosen": 3.6286261081695557, + "logits/rejected": 3.6286261081695557, + "logps/chosen": -158.4906005859375, + "logps/rejected": -158.4906005859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.065194129943848, + "rewards/margins": 0.0, + "rewards/rejected": -11.065194129943848, + "step": 2591 + }, + { + "epoch": 1.7886838019665343, + "grad_norm": 0.269671767950058, + "learning_rate": 4.9674204676121125e-06, + "logits/chosen": 3.7477598190307617, + "logits/rejected": 3.787046432495117, + "logps/chosen": -159.5475616455078, + "logps/rejected": -180.70095825195312, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.363201141357422, + "rewards/margins": 2.0296103954315186, + "rewards/rejected": -13.392810821533203, + "step": 2592 + }, + { + "epoch": 1.789373814041746, + "grad_norm": 0.2227470874786377, + "learning_rate": 4.9693369106937525e-06, + "logits/chosen": 3.146362543106079, + "logits/rejected": 3.2721307277679443, + "logps/chosen": -157.22402954101562, + "logps/rejected": -176.31094360351562, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.917181968688965, + "rewards/margins": 1.946640968322754, + "rewards/rejected": -12.863823890686035, + "step": 2593 + }, + { + "epoch": 1.790063826116957, + "grad_norm": 0.2863227427005768, + "learning_rate": 4.971253353775393e-06, + "logits/chosen": 3.5424835681915283, + "logits/rejected": 3.4853458404541016, + "logps/chosen": -145.58657836914062, + "logps/rejected": -151.52093505859375, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.954972267150879, + "rewards/margins": 0.5633859634399414, + "rewards/rejected": -10.518357276916504, + "step": 2594 + }, + { + "epoch": 1.7907538381921684, + "grad_norm": 0.294097900390625, + "learning_rate": 4.973169796857033e-06, + "logits/chosen": 3.220416784286499, + "logits/rejected": 3.3094077110290527, + "logps/chosen": -161.0115203857422, + "logps/rejected": -172.4808807373047, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.441373825073242, + "rewards/margins": 1.1591508388519287, + "rewards/rejected": -12.60052490234375, + "step": 2595 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 0.4142448604106903, + "learning_rate": 4.975086239938674e-06, + "logits/chosen": 3.472136974334717, + "logits/rejected": 3.509178638458252, + "logps/chosen": -155.5373077392578, + "logps/rejected": -170.0557861328125, + "loss": 0.5219, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.764873504638672, + "rewards/margins": 1.481286883354187, + "rewards/rejected": -12.246162414550781, + "step": 2596 + }, + { + "epoch": 1.7921338623425909, + "grad_norm": 0.2770462930202484, + "learning_rate": 4.977002683020315e-06, + "logits/chosen": 3.161485195159912, + "logits/rejected": 3.161485195159912, + "logps/chosen": -153.55389404296875, + "logps/rejected": -153.55389404296875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.646844863891602, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -10.646844863891602, + "step": 2597 + }, + { + "epoch": 1.7928238744178024, + "grad_norm": 0.4048641622066498, + "learning_rate": 4.978919126101956e-06, + "logits/chosen": 3.168916702270508, + "logits/rejected": 3.3789284229278564, + "logps/chosen": -134.93350219726562, + "logps/rejected": -157.61065673828125, + "loss": 0.4364, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.645803451538086, + "rewards/margins": 2.3988778591156006, + "rewards/rejected": -11.044681549072266, + "step": 2598 + }, + { + "epoch": 1.7935138864930136, + "grad_norm": 0.32778480648994446, + "learning_rate": 4.980835569183596e-06, + "logits/chosen": 3.6949164867401123, + "logits/rejected": 3.6949164867401123, + "logps/chosen": -157.23068237304688, + "logps/rejected": -157.23069763183594, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.01551342010498, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.01551342010498, + "step": 2599 + }, + { + "epoch": 1.794203898568225, + "grad_norm": 0.365308940410614, + "learning_rate": 4.9827520122652365e-06, + "logits/chosen": 3.769416570663452, + "logits/rejected": 3.7306559085845947, + "logps/chosen": -178.33206176757812, + "logps/rejected": -183.3362274169922, + "loss": 0.6085, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.044170379638672, + "rewards/margins": 0.5189082026481628, + "rewards/rejected": -13.563078880310059, + "step": 2600 + }, + { + "epoch": 1.7948939106434363, + "grad_norm": 1.0458526611328125, + "learning_rate": 4.9846684553468764e-06, + "logits/chosen": 3.3604073524475098, + "logits/rejected": 3.3912811279296875, + "logps/chosen": -144.44265747070312, + "logps/rejected": -147.105224609375, + "loss": 0.6154, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.500351905822754, + "rewards/margins": 0.32544147968292236, + "rewards/rejected": -9.825793266296387, + "step": 2601 + }, + { + "epoch": 1.7955839227186474, + "grad_norm": 0.34975317120552063, + "learning_rate": 4.986584898428517e-06, + "logits/chosen": 3.385373830795288, + "logits/rejected": 3.385373830795288, + "logps/chosen": -173.16909790039062, + "logps/rejected": -173.16909790039062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.49960994720459, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.499610900878906, + "step": 2602 + }, + { + "epoch": 1.796273934793859, + "grad_norm": 38.41809844970703, + "learning_rate": 4.988501341510157e-06, + "logits/chosen": 3.514451265335083, + "logits/rejected": 3.3959834575653076, + "logps/chosen": -173.87033081054688, + "logps/rejected": -167.40924072265625, + "loss": 1.265, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.561524391174316, + "rewards/margins": -0.6578149795532227, + "rewards/rejected": -11.903709411621094, + "step": 2603 + }, + { + "epoch": 1.7969639468690701, + "grad_norm": 0.2844322621822357, + "learning_rate": 4.990417784591798e-06, + "logits/chosen": 3.433346748352051, + "logits/rejected": 3.466688632965088, + "logps/chosen": -163.53109741210938, + "logps/rejected": -171.09715270996094, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.735818862915039, + "rewards/margins": 0.7169240713119507, + "rewards/rejected": -12.452743530273438, + "step": 2604 + }, + { + "epoch": 1.7976539589442815, + "grad_norm": 0.32865363359451294, + "learning_rate": 4.992334227673439e-06, + "logits/chosen": 3.324831247329712, + "logits/rejected": 3.322146415710449, + "logps/chosen": -167.70262145996094, + "logps/rejected": -175.23025512695312, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.819938659667969, + "rewards/margins": 0.7943887114524841, + "rewards/rejected": -12.614326477050781, + "step": 2605 + }, + { + "epoch": 1.7983439710194928, + "grad_norm": 0.20491139590740204, + "learning_rate": 4.994250670755079e-06, + "logits/chosen": 3.1251368522644043, + "logits/rejected": 3.331495523452759, + "logps/chosen": -162.06747436523438, + "logps/rejected": -185.69606018066406, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.575004577636719, + "rewards/margins": 2.397916316986084, + "rewards/rejected": -13.972921371459961, + "step": 2606 + }, + { + "epoch": 1.7990339830947042, + "grad_norm": 0.26955661177635193, + "learning_rate": 4.99616711383672e-06, + "logits/chosen": 3.3279075622558594, + "logits/rejected": 3.3279075622558594, + "logps/chosen": -182.62716674804688, + "logps/rejected": -182.62716674804688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.494017601013184, + "rewards/margins": 0.0, + "rewards/rejected": -13.494017601013184, + "step": 2607 + }, + { + "epoch": 1.7997239951699155, + "grad_norm": 0.2928895354270935, + "learning_rate": 4.99808355691836e-06, + "logits/chosen": 2.974637508392334, + "logits/rejected": 3.0745952129364014, + "logps/chosen": -164.5579376220703, + "logps/rejected": -188.51181030273438, + "loss": 0.4344, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.633235931396484, + "rewards/margins": 2.4510884284973145, + "rewards/rejected": -14.08432388305664, + "step": 2608 + }, + { + "epoch": 1.8004140072451267, + "grad_norm": 25.975534439086914, + "learning_rate": 5e-06, + "logits/chosen": 3.5533175468444824, + "logits/rejected": 3.379587411880493, + "logps/chosen": -150.550537109375, + "logps/rejected": -154.4773406982422, + "loss": 0.6885, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.180889129638672, + "rewards/margins": 0.31468263268470764, + "rewards/rejected": -10.495572090148926, + "step": 2609 + }, + { + "epoch": 1.8011040193203383, + "grad_norm": 0.33774128556251526, + "learning_rate": 4.997123130034523e-06, + "logits/chosen": 3.250422954559326, + "logits/rejected": 3.3060483932495117, + "logps/chosen": -163.09312438964844, + "logps/rejected": -170.603515625, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.441843032836914, + "rewards/margins": 0.8348260521888733, + "rewards/rejected": -12.276668548583984, + "step": 2610 + }, + { + "epoch": 1.8017940313955494, + "grad_norm": 0.2955133318901062, + "learning_rate": 4.994246260069046e-06, + "logits/chosen": 3.1649911403656006, + "logits/rejected": 3.1269869804382324, + "logps/chosen": -141.534912109375, + "logps/rejected": -163.62887573242188, + "loss": 0.5214, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.47008991241455, + "rewards/margins": 2.088622808456421, + "rewards/rejected": -11.55871295928955, + "step": 2611 + }, + { + "epoch": 1.8024840434707607, + "grad_norm": 0.31349363923072815, + "learning_rate": 4.991369390103568e-06, + "logits/chosen": 3.3978054523468018, + "logits/rejected": 3.3996269702911377, + "logps/chosen": -159.4285888671875, + "logps/rejected": -167.62130737304688, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.138330459594727, + "rewards/margins": 0.832923412322998, + "rewards/rejected": -11.97125244140625, + "step": 2612 + }, + { + "epoch": 1.803174055545972, + "grad_norm": 0.5165104866027832, + "learning_rate": 4.98849252013809e-06, + "logits/chosen": 3.3422436714172363, + "logits/rejected": 3.710822582244873, + "logps/chosen": -143.28695678710938, + "logps/rejected": -169.53033447265625, + "loss": 0.4353, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.476659774780273, + "rewards/margins": 2.648106575012207, + "rewards/rejected": -12.12476634979248, + "step": 2613 + }, + { + "epoch": 1.8038640676211832, + "grad_norm": 0.27749207615852356, + "learning_rate": 4.985615650172613e-06, + "logits/chosen": 3.6602718830108643, + "logits/rejected": 3.6602718830108643, + "logps/chosen": -170.66458129882812, + "logps/rejected": -170.66458129882812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.39027214050293, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.39027214050293, + "step": 2614 + }, + { + "epoch": 1.8045540796963948, + "grad_norm": 12.153928756713867, + "learning_rate": 4.982738780207135e-06, + "logits/chosen": 3.3106017112731934, + "logits/rejected": 3.2993862628936768, + "logps/chosen": -162.28164672851562, + "logps/rejected": -166.95706176757812, + "loss": 0.944, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.593525886535645, + "rewards/margins": 0.5204352140426636, + "rewards/rejected": -12.113961219787598, + "step": 2615 + }, + { + "epoch": 1.805244091771606, + "grad_norm": 0.34783825278282166, + "learning_rate": 4.979861910241657e-06, + "logits/chosen": 3.2592034339904785, + "logits/rejected": 3.2592034339904785, + "logps/chosen": -158.66165161132812, + "logps/rejected": -158.66165161132812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.03043270111084, + "rewards/margins": 0.0, + "rewards/rejected": -11.03043270111084, + "step": 2616 + }, + { + "epoch": 1.8059341038468173, + "grad_norm": 11.010542869567871, + "learning_rate": 4.97698504027618e-06, + "logits/chosen": 3.2048821449279785, + "logits/rejected": 3.343700885772705, + "logps/chosen": -192.35037231445312, + "logps/rejected": -198.37344360351562, + "loss": 0.5793, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.43828010559082, + "rewards/margins": 0.6238918304443359, + "rewards/rejected": -15.062171936035156, + "step": 2617 + }, + { + "epoch": 1.8066241159220287, + "grad_norm": 0.3423161804676056, + "learning_rate": 4.974108170310703e-06, + "logits/chosen": 2.9051930904388428, + "logits/rejected": 3.075228691101074, + "logps/chosen": -173.31256103515625, + "logps/rejected": -180.67990112304688, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.452485084533691, + "rewards/margins": 0.7541851997375488, + "rewards/rejected": -13.206669807434082, + "step": 2618 + }, + { + "epoch": 1.80731412799724, + "grad_norm": 0.29174208641052246, + "learning_rate": 4.971231300345225e-06, + "logits/chosen": 3.3828368186950684, + "logits/rejected": 3.3828368186950684, + "logps/chosen": -181.08370971679688, + "logps/rejected": -181.08370971679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.438522338867188, + "rewards/margins": 0.0, + "rewards/rejected": -13.438522338867188, + "step": 2619 + }, + { + "epoch": 1.8080041400724514, + "grad_norm": 0.2716783881187439, + "learning_rate": 4.968354430379747e-06, + "logits/chosen": 3.5326693058013916, + "logits/rejected": 3.7019643783569336, + "logps/chosen": -174.51028442382812, + "logps/rejected": -184.8563232421875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.796390533447266, + "rewards/margins": 1.0402772426605225, + "rewards/rejected": -13.836668014526367, + "step": 2620 + }, + { + "epoch": 1.8086941521476625, + "grad_norm": 0.9841213226318359, + "learning_rate": 4.9654775604142695e-06, + "logits/chosen": 3.7302913665771484, + "logits/rejected": 3.8457062244415283, + "logps/chosen": -173.0205078125, + "logps/rejected": -183.29122924804688, + "loss": 0.5252, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.368585586547852, + "rewards/margins": 1.1152949333190918, + "rewards/rejected": -13.483880996704102, + "step": 2621 + }, + { + "epoch": 1.8093841642228738, + "grad_norm": 0.31895479559898376, + "learning_rate": 4.962600690448792e-06, + "logits/chosen": 3.873100757598877, + "logits/rejected": 3.873100757598877, + "logps/chosen": -184.18548583984375, + "logps/rejected": -184.18548583984375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.608760833740234, + "rewards/margins": 0.0, + "rewards/rejected": -13.608760833740234, + "step": 2622 + }, + { + "epoch": 1.8100741762980852, + "grad_norm": 3.549144744873047, + "learning_rate": 4.959723820483315e-06, + "logits/chosen": 3.6242218017578125, + "logits/rejected": 3.722414255142212, + "logps/chosen": -167.63064575195312, + "logps/rejected": -178.9468994140625, + "loss": 0.5582, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.97749137878418, + "rewards/margins": 1.1160516738891602, + "rewards/rejected": -13.093544006347656, + "step": 2623 + }, + { + "epoch": 1.8107641883732966, + "grad_norm": 0.27453476190567017, + "learning_rate": 4.956846950517837e-06, + "logits/chosen": 3.687819004058838, + "logits/rejected": 3.863170623779297, + "logps/chosen": -165.6399688720703, + "logps/rejected": -178.19879150390625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.748376846313477, + "rewards/margins": 1.2963271141052246, + "rewards/rejected": -13.04470443725586, + "step": 2624 + }, + { + "epoch": 1.811454200448508, + "grad_norm": 0.3075701892375946, + "learning_rate": 4.95397008055236e-06, + "logits/chosen": 3.6007189750671387, + "logits/rejected": 3.6007189750671387, + "logps/chosen": -171.64785766601562, + "logps/rejected": -171.64785766601562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.491552352905273, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.491551399230957, + "step": 2625 + }, + { + "epoch": 1.812144212523719, + "grad_norm": 16.967086791992188, + "learning_rate": 4.951093210586882e-06, + "logits/chosen": 3.843874454498291, + "logits/rejected": 3.848306179046631, + "logps/chosen": -173.09011840820312, + "logps/rejected": -181.41812133789062, + "loss": 0.9658, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.718979835510254, + "rewards/margins": 0.8502994775772095, + "rewards/rejected": -13.569278717041016, + "step": 2626 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 0.33168113231658936, + "learning_rate": 4.9482163406214044e-06, + "logits/chosen": 3.6954102516174316, + "logits/rejected": 3.6954102516174316, + "logps/chosen": -183.83584594726562, + "logps/rejected": -183.83584594726562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.59770393371582, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.59770393371582, + "step": 2627 + }, + { + "epoch": 1.8135242366741418, + "grad_norm": 0.2817605435848236, + "learning_rate": 4.945339470655926e-06, + "logits/chosen": 3.6767024993896484, + "logits/rejected": 3.791578769683838, + "logps/chosen": -160.13624572753906, + "logps/rejected": -185.19210815429688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.19540023803711, + "rewards/margins": 2.5168042182922363, + "rewards/rejected": -13.71220588684082, + "step": 2628 + }, + { + "epoch": 1.814214248749353, + "grad_norm": 0.2735840082168579, + "learning_rate": 4.942462600690449e-06, + "logits/chosen": 3.8049709796905518, + "logits/rejected": 4.07814359664917, + "logps/chosen": -161.42686462402344, + "logps/rejected": -177.62152099609375, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.360635757446289, + "rewards/margins": 1.637160062789917, + "rewards/rejected": -12.997795104980469, + "step": 2629 + }, + { + "epoch": 1.8149042608245645, + "grad_norm": 0.6620225310325623, + "learning_rate": 4.939585730724972e-06, + "logits/chosen": 3.3521056175231934, + "logits/rejected": 3.444913864135742, + "logps/chosen": -171.90817260742188, + "logps/rejected": -176.22250366210938, + "loss": 0.6116, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.339446067810059, + "rewards/margins": 0.39702439308166504, + "rewards/rejected": -12.736470222473145, + "step": 2630 + }, + { + "epoch": 1.8155942728997756, + "grad_norm": 0.20469170808792114, + "learning_rate": 4.936708860759495e-06, + "logits/chosen": 3.444692850112915, + "logits/rejected": 3.464348316192627, + "logps/chosen": -165.06283569335938, + "logps/rejected": -173.8280029296875, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.685647964477539, + "rewards/margins": 0.9350838661193848, + "rewards/rejected": -12.620731353759766, + "step": 2631 + }, + { + "epoch": 1.8162842849749872, + "grad_norm": 0.2962532341480255, + "learning_rate": 4.933831990794017e-06, + "logits/chosen": 3.3605992794036865, + "logits/rejected": 3.3605992794036865, + "logps/chosen": -171.17572021484375, + "logps/rejected": -171.17572021484375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.122194290161133, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.122194290161133, + "step": 2632 + }, + { + "epoch": 1.8169742970501983, + "grad_norm": 2.5120038986206055, + "learning_rate": 4.9309551208285385e-06, + "logits/chosen": 3.5645627975463867, + "logits/rejected": 3.5192878246307373, + "logps/chosen": -161.94497680664062, + "logps/rejected": -173.44039916992188, + "loss": 0.5371, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.486013412475586, + "rewards/margins": 1.2427034378051758, + "rewards/rejected": -12.728717803955078, + "step": 2633 + }, + { + "epoch": 1.8176643091254097, + "grad_norm": 0.5016968846321106, + "learning_rate": 4.928078250863061e-06, + "logits/chosen": 3.7387166023254395, + "logits/rejected": 3.7557530403137207, + "logps/chosen": -169.0670623779297, + "logps/rejected": -174.18505859375, + "loss": 0.6081, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.07010269165039, + "rewards/margins": 0.5480165481567383, + "rewards/rejected": -12.618119239807129, + "step": 2634 + }, + { + "epoch": 1.818354321200621, + "grad_norm": 7.381196022033691, + "learning_rate": 4.925201380897584e-06, + "logits/chosen": 3.854623317718506, + "logits/rejected": 3.7905569076538086, + "logps/chosen": -151.37893676757812, + "logps/rejected": -155.34983825683594, + "loss": 0.5777, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.267945289611816, + "rewards/margins": 0.3762843608856201, + "rewards/rejected": -10.644229888916016, + "step": 2635 + }, + { + "epoch": 1.8190443332758324, + "grad_norm": 0.2987198233604431, + "learning_rate": 4.922324510932106e-06, + "logits/chosen": 3.684018135070801, + "logits/rejected": 3.684018135070801, + "logps/chosen": -173.22344970703125, + "logps/rejected": -173.22348022460938, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.51778793334961, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.51778793334961, + "step": 2636 + }, + { + "epoch": 1.8197343453510437, + "grad_norm": 0.25311437249183655, + "learning_rate": 4.919447640966629e-06, + "logits/chosen": 3.6118361949920654, + "logits/rejected": 3.6311259269714355, + "logps/chosen": -162.64076232910156, + "logps/rejected": -169.1280975341797, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.213164329528809, + "rewards/margins": 0.6137911677360535, + "rewards/rejected": -11.826955795288086, + "step": 2637 + }, + { + "epoch": 1.8204243574262549, + "grad_norm": 0.3170487582683563, + "learning_rate": 4.9165707710011516e-06, + "logits/chosen": 3.801177978515625, + "logits/rejected": 3.8206448554992676, + "logps/chosen": -161.9596405029297, + "logps/rejected": -172.3020477294922, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.474417686462402, + "rewards/margins": 1.0532710552215576, + "rewards/rejected": -12.527688980102539, + "step": 2638 + }, + { + "epoch": 1.8211143695014664, + "grad_norm": 11.270736694335938, + "learning_rate": 4.9136939010356735e-06, + "logits/chosen": 3.307501792907715, + "logits/rejected": 3.371725559234619, + "logps/chosen": -143.42367553710938, + "logps/rejected": -153.21604919433594, + "loss": 0.786, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.67510986328125, + "rewards/margins": 0.9893713593482971, + "rewards/rejected": -10.664481163024902, + "step": 2639 + }, + { + "epoch": 1.8218043815766776, + "grad_norm": 0.7357524037361145, + "learning_rate": 4.9108170310701954e-06, + "logits/chosen": 3.4198286533355713, + "logits/rejected": 3.566553831100464, + "logps/chosen": -150.3473663330078, + "logps/rejected": -165.8728485107422, + "loss": 0.5242, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.213821411132812, + "rewards/margins": 1.6075550317764282, + "rewards/rejected": -11.821375846862793, + "step": 2640 + }, + { + "epoch": 1.822494393651889, + "grad_norm": 0.28638818860054016, + "learning_rate": 4.907940161104718e-06, + "logits/chosen": 3.5800745487213135, + "logits/rejected": 3.66302227973938, + "logps/chosen": -149.97943115234375, + "logps/rejected": -175.30667114257812, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.156217575073242, + "rewards/margins": 2.4983291625976562, + "rewards/rejected": -12.654546737670898, + "step": 2641 + }, + { + "epoch": 1.8231844057271003, + "grad_norm": 0.245305597782135, + "learning_rate": 4.905063291139241e-06, + "logits/chosen": 3.4951694011688232, + "logits/rejected": 3.5983388423919678, + "logps/chosen": -147.89483642578125, + "logps/rejected": -169.88380432128906, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.114801406860352, + "rewards/margins": 2.1532931327819824, + "rewards/rejected": -12.268095016479492, + "step": 2642 + }, + { + "epoch": 1.8238744178023114, + "grad_norm": 2.7828755378723145, + "learning_rate": 4.902186421173764e-06, + "logits/chosen": 3.83535099029541, + "logits/rejected": 3.9705801010131836, + "logps/chosen": -162.80419921875, + "logps/rejected": -165.1387176513672, + "loss": 0.6204, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.381813049316406, + "rewards/margins": 0.2673187255859375, + "rewards/rejected": -11.649131774902344, + "step": 2643 + }, + { + "epoch": 1.824564429877523, + "grad_norm": 0.3444865643978119, + "learning_rate": 4.899309551208286e-06, + "logits/chosen": 3.5281457901000977, + "logits/rejected": 3.659149646759033, + "logps/chosen": -155.81124877929688, + "logps/rejected": -166.2014923095703, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.68525505065918, + "rewards/margins": 1.0801684856414795, + "rewards/rejected": -11.765422821044922, + "step": 2644 + }, + { + "epoch": 1.8252544419527341, + "grad_norm": 0.35407859086990356, + "learning_rate": 4.8964326812428085e-06, + "logits/chosen": 3.566377639770508, + "logits/rejected": 3.6592116355895996, + "logps/chosen": -163.0404510498047, + "logps/rejected": -167.58828735351562, + "loss": 0.6083, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.47622299194336, + "rewards/margins": 0.5325711965560913, + "rewards/rejected": -12.008793830871582, + "step": 2645 + }, + { + "epoch": 1.8259444540279455, + "grad_norm": 0.31219226121902466, + "learning_rate": 4.89355581127733e-06, + "logits/chosen": 3.8206090927124023, + "logits/rejected": 3.817117691040039, + "logps/chosen": -174.7467803955078, + "logps/rejected": -184.83834838867188, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.679058074951172, + "rewards/margins": 1.028109073638916, + "rewards/rejected": -13.70716667175293, + "step": 2646 + }, + { + "epoch": 1.8266344661031568, + "grad_norm": 3.9643056392669678, + "learning_rate": 4.890678941311853e-06, + "logits/chosen": 3.518872022628784, + "logits/rejected": 3.5539791584014893, + "logps/chosen": -167.23715209960938, + "logps/rejected": -169.40582275390625, + "loss": 0.6255, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.857931137084961, + "rewards/margins": 0.22620540857315063, + "rewards/rejected": -12.084136962890625, + "step": 2647 + }, + { + "epoch": 1.827324478178368, + "grad_norm": 0.3263109624385834, + "learning_rate": 4.887802071346375e-06, + "logits/chosen": 3.4609551429748535, + "logits/rejected": 3.560579538345337, + "logps/chosen": -137.35650634765625, + "logps/rejected": -155.01669311523438, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.986595153808594, + "rewards/margins": 1.8406368494033813, + "rewards/rejected": -10.827231407165527, + "step": 2648 + }, + { + "epoch": 1.8280144902535795, + "grad_norm": 0.366956889629364, + "learning_rate": 4.884925201380898e-06, + "logits/chosen": 3.67134952545166, + "logits/rejected": 3.764963150024414, + "logps/chosen": -156.5235595703125, + "logps/rejected": -167.53819274902344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.86268424987793, + "rewards/margins": 1.118631362915039, + "rewards/rejected": -11.981315612792969, + "step": 2649 + }, + { + "epoch": 1.8287045023287907, + "grad_norm": 1.8437581062316895, + "learning_rate": 4.882048331415421e-06, + "logits/chosen": 3.586388349533081, + "logits/rejected": 3.5190138816833496, + "logps/chosen": -173.88514709472656, + "logps/rejected": -185.37179565429688, + "loss": 0.5272, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.603364944458008, + "rewards/margins": 1.1009912490844727, + "rewards/rejected": -13.704355239868164, + "step": 2650 + }, + { + "epoch": 1.829394514404002, + "grad_norm": 17.99570083618164, + "learning_rate": 4.879171461449943e-06, + "logits/chosen": 3.887874126434326, + "logits/rejected": 3.9011850357055664, + "logps/chosen": -148.8737335205078, + "logps/rejected": -165.20068359375, + "loss": 0.619, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.153707504272461, + "rewards/margins": 1.6426379680633545, + "rewards/rejected": -11.796346664428711, + "step": 2651 + }, + { + "epoch": 1.8300845264792134, + "grad_norm": 15.315940856933594, + "learning_rate": 4.876294591484465e-06, + "logits/chosen": 3.702117681503296, + "logits/rejected": 3.688267230987549, + "logps/chosen": -166.8756866455078, + "logps/rejected": -167.14230346679688, + "loss": 0.6787, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.982978820800781, + "rewards/margins": 0.030766606330871582, + "rewards/rejected": -12.013744354248047, + "step": 2652 + }, + { + "epoch": 1.8307745385544247, + "grad_norm": 20.738399505615234, + "learning_rate": 4.873417721518987e-06, + "logits/chosen": 3.362485885620117, + "logits/rejected": 3.2435879707336426, + "logps/chosen": -143.31617736816406, + "logps/rejected": -154.69891357421875, + "loss": 0.7982, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.467254638671875, + "rewards/margins": 1.1313304901123047, + "rewards/rejected": -10.598584175109863, + "step": 2653 + }, + { + "epoch": 1.831464550629636, + "grad_norm": 0.24829213321208954, + "learning_rate": 4.87054085155351e-06, + "logits/chosen": 3.8353397846221924, + "logits/rejected": 3.9189064502716064, + "logps/chosen": -171.30262756347656, + "logps/rejected": -181.73532104492188, + "loss": 0.6065, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.211043357849121, + "rewards/margins": 1.1410824060440063, + "rewards/rejected": -13.35212516784668, + "step": 2654 + }, + { + "epoch": 1.8321545627048472, + "grad_norm": 0.31415075063705444, + "learning_rate": 4.867663981588033e-06, + "logits/chosen": 3.5753936767578125, + "logits/rejected": 3.5753936767578125, + "logps/chosen": -175.26522827148438, + "logps/rejected": -175.26522827148438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.733980178833008, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.733980178833008, + "step": 2655 + }, + { + "epoch": 1.8328445747800588, + "grad_norm": 0.3160412609577179, + "learning_rate": 4.864787111622555e-06, + "logits/chosen": 3.58209228515625, + "logits/rejected": 3.58209228515625, + "logps/chosen": -164.08389282226562, + "logps/rejected": -164.08389282226562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.521697998046875, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.521697998046875, + "step": 2656 + }, + { + "epoch": 1.83353458685527, + "grad_norm": 0.25674062967300415, + "learning_rate": 4.8619102416570775e-06, + "logits/chosen": 3.725432872772217, + "logits/rejected": 3.725432872772217, + "logps/chosen": -175.03985595703125, + "logps/rejected": -175.03985595703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.782265663146973, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.782264709472656, + "step": 2657 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 0.3766227662563324, + "learning_rate": 4.8590333716916e-06, + "logits/chosen": 3.6522293090820312, + "logits/rejected": 3.6522293090820312, + "logps/chosen": -172.84848022460938, + "logps/rejected": -172.84848022460938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.59717845916748, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.597179412841797, + "step": 2658 + }, + { + "epoch": 1.8349146110056926, + "grad_norm": 10.492596626281738, + "learning_rate": 4.856156501726122e-06, + "logits/chosen": 3.7161715030670166, + "logits/rejected": 3.6858086585998535, + "logps/chosen": -158.3363494873047, + "logps/rejected": -162.1343994140625, + "loss": 0.6542, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.129997253417969, + "rewards/margins": 0.41420477628707886, + "rewards/rejected": -11.544200897216797, + "step": 2659 + }, + { + "epoch": 1.8356046230809038, + "grad_norm": 0.29518771171569824, + "learning_rate": 4.853279631760644e-06, + "logits/chosen": 3.7790699005126953, + "logits/rejected": 3.877032995223999, + "logps/chosen": -169.23672485351562, + "logps/rejected": -178.19198608398438, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.939809799194336, + "rewards/margins": 0.9414569735527039, + "rewards/rejected": -12.881265640258789, + "step": 2660 + }, + { + "epoch": 1.8362946351561154, + "grad_norm": 0.43575435876846313, + "learning_rate": 4.850402761795167e-06, + "logits/chosen": 3.504912853240967, + "logits/rejected": 3.504912853240967, + "logps/chosen": -144.49183654785156, + "logps/rejected": -144.49183654785156, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.627524375915527, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -9.627524375915527, + "step": 2661 + }, + { + "epoch": 1.8369846472313265, + "grad_norm": 0.2898285388946533, + "learning_rate": 4.84752589182969e-06, + "logits/chosen": 3.7398011684417725, + "logits/rejected": 3.730907917022705, + "logps/chosen": -162.48812866210938, + "logps/rejected": -176.39712524414062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.368907928466797, + "rewards/margins": 1.4037816524505615, + "rewards/rejected": -12.772688865661621, + "step": 2662 + }, + { + "epoch": 1.8376746593065378, + "grad_norm": 0.35876455903053284, + "learning_rate": 4.8446490218642125e-06, + "logits/chosen": 3.840803384780884, + "logits/rejected": 3.950474739074707, + "logps/chosen": -153.36505126953125, + "logps/rejected": -159.2494354248047, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.507428169250488, + "rewards/margins": 0.6043636798858643, + "rewards/rejected": -11.111791610717773, + "step": 2663 + }, + { + "epoch": 1.8383646713817492, + "grad_norm": 0.3507258892059326, + "learning_rate": 4.841772151898735e-06, + "logits/chosen": 3.946500062942505, + "logits/rejected": 3.946500062942505, + "logps/chosen": -176.32357788085938, + "logps/rejected": -176.32357788085938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.909671783447266, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -12.909671783447266, + "step": 2664 + }, + { + "epoch": 1.8390546834569605, + "grad_norm": 3.0753443241119385, + "learning_rate": 4.838895281933257e-06, + "logits/chosen": 3.6761927604675293, + "logits/rejected": 3.7866718769073486, + "logps/chosen": -152.32391357421875, + "logps/rejected": -163.02542114257812, + "loss": 0.5453, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.630744934082031, + "rewards/margins": 1.098980188369751, + "rewards/rejected": -11.729724884033203, + "step": 2665 + }, + { + "epoch": 1.839744695532172, + "grad_norm": 0.37036970257759094, + "learning_rate": 4.836018411967779e-06, + "logits/chosen": 3.2798099517822266, + "logits/rejected": 3.2798099517822266, + "logps/chosen": -168.56863403320312, + "logps/rejected": -168.56863403320312, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.989412307739258, + "rewards/margins": 6.556510925292969e-07, + "rewards/rejected": -11.989413261413574, + "step": 2666 + }, + { + "epoch": 1.840434707607383, + "grad_norm": 0.35121220350265503, + "learning_rate": 4.833141542002302e-06, + "logits/chosen": 3.6048128604888916, + "logits/rejected": 3.6048128604888916, + "logps/chosen": -155.04689025878906, + "logps/rejected": -155.04689025878906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.496118545532227, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -10.496118545532227, + "step": 2667 + }, + { + "epoch": 1.8411247196825944, + "grad_norm": 0.28707706928253174, + "learning_rate": 4.830264672036825e-06, + "logits/chosen": 3.772566318511963, + "logits/rejected": 3.8318862915039062, + "logps/chosen": -161.64505004882812, + "logps/rejected": -173.08705139160156, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.297569274902344, + "rewards/margins": 1.1657062768936157, + "rewards/rejected": -12.463275909423828, + "step": 2668 + }, + { + "epoch": 1.8418147317578057, + "grad_norm": 0.8836772441864014, + "learning_rate": 4.827387802071347e-06, + "logits/chosen": 3.8179163932800293, + "logits/rejected": 3.9681832790374756, + "logps/chosen": -164.2490234375, + "logps/rejected": -174.61593627929688, + "loss": 0.5258, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.604574203491211, + "rewards/margins": 1.0390493869781494, + "rewards/rejected": -12.643623352050781, + "step": 2669 + }, + { + "epoch": 1.842504743833017, + "grad_norm": 0.3554425835609436, + "learning_rate": 4.824510932105869e-06, + "logits/chosen": 3.6587727069854736, + "logits/rejected": 3.8110508918762207, + "logps/chosen": -150.7467041015625, + "logps/rejected": -182.3098907470703, + "loss": 0.4335, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.108409881591797, + "rewards/margins": 3.254058361053467, + "rewards/rejected": -13.362467765808105, + "step": 2670 + }, + { + "epoch": 1.8431947559082285, + "grad_norm": 0.4048422574996948, + "learning_rate": 4.821634062140392e-06, + "logits/chosen": 4.035915851593018, + "logits/rejected": 4.035915851593018, + "logps/chosen": -160.83799743652344, + "logps/rejected": -160.8380126953125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.124649047851562, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.124649047851562, + "step": 2671 + }, + { + "epoch": 1.8438847679834396, + "grad_norm": 0.30616188049316406, + "learning_rate": 4.818757192174914e-06, + "logits/chosen": 3.654582977294922, + "logits/rejected": 3.633455514907837, + "logps/chosen": -161.1248016357422, + "logps/rejected": -166.24652099609375, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.208915710449219, + "rewards/margins": 0.5644080638885498, + "rewards/rejected": -11.773324012756348, + "step": 2672 + }, + { + "epoch": 1.8445747800586512, + "grad_norm": 0.3425862193107605, + "learning_rate": 4.815880322209436e-06, + "logits/chosen": 3.5119404792785645, + "logits/rejected": 3.5864968299865723, + "logps/chosen": -146.6673126220703, + "logps/rejected": -155.37075805664062, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.92930793762207, + "rewards/margins": 0.8891448974609375, + "rewards/rejected": -10.818452835083008, + "step": 2673 + }, + { + "epoch": 1.8452647921338623, + "grad_norm": 0.2763172686100006, + "learning_rate": 4.813003452243959e-06, + "logits/chosen": 3.5321857929229736, + "logits/rejected": 3.827996015548706, + "logps/chosen": -152.61378479003906, + "logps/rejected": -172.026611328125, + "loss": 0.52, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.377873420715332, + "rewards/margins": 1.9874141216278076, + "rewards/rejected": -12.365287780761719, + "step": 2674 + }, + { + "epoch": 1.8459548042090737, + "grad_norm": 0.2996695041656494, + "learning_rate": 4.8101265822784815e-06, + "logits/chosen": 3.503725290298462, + "logits/rejected": 3.6283884048461914, + "logps/chosen": -164.7156982421875, + "logps/rejected": -176.6575469970703, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.534366607666016, + "rewards/margins": 1.1099153757095337, + "rewards/rejected": -12.644282341003418, + "step": 2675 + }, + { + "epoch": 1.846644816284285, + "grad_norm": 0.24807648360729218, + "learning_rate": 4.807249712313004e-06, + "logits/chosen": 3.9617929458618164, + "logits/rejected": 4.0135345458984375, + "logps/chosen": -159.52859497070312, + "logps/rejected": -172.90988159179688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.218768119812012, + "rewards/margins": 1.335407018661499, + "rewards/rejected": -12.55417537689209, + "step": 2676 + }, + { + "epoch": 1.8473348283594961, + "grad_norm": 0.2987838089466095, + "learning_rate": 4.804372842347526e-06, + "logits/chosen": 3.723869800567627, + "logits/rejected": 3.7166483402252197, + "logps/chosen": -170.73544311523438, + "logps/rejected": -177.55380249023438, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.162416458129883, + "rewards/margins": 0.690354585647583, + "rewards/rejected": -12.852770805358887, + "step": 2677 + }, + { + "epoch": 1.8480248404347077, + "grad_norm": 0.2857268452644348, + "learning_rate": 4.801495972382049e-06, + "logits/chosen": 3.520482063293457, + "logits/rejected": 3.754368782043457, + "logps/chosen": -140.07144165039062, + "logps/rejected": -168.51773071289062, + "loss": 0.4343, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.17223072052002, + "rewards/margins": 2.885411262512207, + "rewards/rejected": -12.057641983032227, + "step": 2678 + }, + { + "epoch": 1.8487148525099188, + "grad_norm": 0.4472706615924835, + "learning_rate": 4.798619102416571e-06, + "logits/chosen": 3.808471918106079, + "logits/rejected": 3.808471918106079, + "logps/chosen": -158.45269775390625, + "logps/rejected": -158.45269775390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.153619766235352, + "rewards/margins": 0.0, + "rewards/rejected": -11.153619766235352, + "step": 2679 + }, + { + "epoch": 1.8494048645851302, + "grad_norm": 0.31134915351867676, + "learning_rate": 4.795742232451094e-06, + "logits/chosen": 3.5484492778778076, + "logits/rejected": 3.644472599029541, + "logps/chosen": -180.20053100585938, + "logps/rejected": -186.01107788085938, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.270774841308594, + "rewards/margins": 0.572913646697998, + "rewards/rejected": -13.843688011169434, + "step": 2680 + }, + { + "epoch": 1.8500948766603416, + "grad_norm": 20.499319076538086, + "learning_rate": 4.792865362485616e-06, + "logits/chosen": 3.297342300415039, + "logits/rejected": 3.3570046424865723, + "logps/chosen": -167.3927001953125, + "logps/rejected": -175.8374786376953, + "loss": 1.1495, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.053985595703125, + "rewards/margins": 0.8490316867828369, + "rewards/rejected": -12.903017044067383, + "step": 2681 + }, + { + "epoch": 1.850784888735553, + "grad_norm": 1.271390438079834, + "learning_rate": 4.7899884925201384e-06, + "logits/chosen": 3.2846782207489014, + "logits/rejected": 3.586369514465332, + "logps/chosen": -143.61614990234375, + "logps/rejected": -156.86007690429688, + "loss": 0.5325, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.519023895263672, + "rewards/margins": 1.3536617755889893, + "rewards/rejected": -10.872686386108398, + "step": 2682 + }, + { + "epoch": 1.8514749008107643, + "grad_norm": 0.331287145614624, + "learning_rate": 4.787111622554661e-06, + "logits/chosen": 3.61311936378479, + "logits/rejected": 3.96220064163208, + "logps/chosen": -134.52487182617188, + "logps/rejected": -169.93484497070312, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.617887496948242, + "rewards/margins": 3.564852714538574, + "rewards/rejected": -12.182741165161133, + "step": 2683 + }, + { + "epoch": 1.8521649128859754, + "grad_norm": 0.3221052587032318, + "learning_rate": 4.784234752589184e-06, + "logits/chosen": 3.668459892272949, + "logits/rejected": 3.7684736251831055, + "logps/chosen": -143.18515014648438, + "logps/rejected": -155.92324829101562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.475249290466309, + "rewards/margins": 1.2875778675079346, + "rewards/rejected": -10.76282787322998, + "step": 2684 + }, + { + "epoch": 1.852854924961187, + "grad_norm": 0.3155815899372101, + "learning_rate": 4.781357882623706e-06, + "logits/chosen": 3.4331109523773193, + "logits/rejected": 3.483264207839966, + "logps/chosen": -153.93692016601562, + "logps/rejected": -166.6867218017578, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.71253490447998, + "rewards/margins": 1.254188895225525, + "rewards/rejected": -11.966723442077637, + "step": 2685 + }, + { + "epoch": 1.853544937036398, + "grad_norm": 0.3495463728904724, + "learning_rate": 4.778481012658228e-06, + "logits/chosen": 3.6666266918182373, + "logits/rejected": 3.6666266918182373, + "logps/chosen": -176.66807556152344, + "logps/rejected": -176.66807556152344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.840008735656738, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.840008735656738, + "step": 2686 + }, + { + "epoch": 1.8542349491116095, + "grad_norm": 0.3366701304912567, + "learning_rate": 4.775604142692751e-06, + "logits/chosen": 3.678701400756836, + "logits/rejected": 3.678701400756836, + "logps/chosen": -171.6487274169922, + "logps/rejected": -171.6487274169922, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.216596603393555, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.216595649719238, + "step": 2687 + }, + { + "epoch": 1.8549249611868208, + "grad_norm": 0.246785968542099, + "learning_rate": 4.772727272727273e-06, + "logits/chosen": 3.2735378742218018, + "logits/rejected": 3.4909050464630127, + "logps/chosen": -149.564453125, + "logps/rejected": -181.35922241210938, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.257940292358398, + "rewards/margins": 3.167818546295166, + "rewards/rejected": -13.425760269165039, + "step": 2688 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 0.2564345598220825, + "learning_rate": 4.769850402761795e-06, + "logits/chosen": 3.86793851852417, + "logits/rejected": 3.86793851852417, + "logps/chosen": -177.60682678222656, + "logps/rejected": -177.6068115234375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.916845321655273, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.91684627532959, + "step": 2689 + }, + { + "epoch": 1.8563049853372435, + "grad_norm": 0.3184705376625061, + "learning_rate": 4.766973532796318e-06, + "logits/chosen": 3.1950955390930176, + "logits/rejected": 3.1950955390930176, + "logps/chosen": -161.5384521484375, + "logps/rejected": -161.5384521484375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.403695106506348, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.403695106506348, + "step": 2690 + }, + { + "epoch": 1.8569949974124547, + "grad_norm": 0.27921396493911743, + "learning_rate": 4.764096662830841e-06, + "logits/chosen": 3.464106798171997, + "logits/rejected": 3.532099962234497, + "logps/chosen": -147.31265258789062, + "logps/rejected": -159.525146484375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.848640441894531, + "rewards/margins": 1.242223858833313, + "rewards/rejected": -11.090865135192871, + "step": 2691 + }, + { + "epoch": 1.857685009487666, + "grad_norm": 0.9056383967399597, + "learning_rate": 4.761219792865363e-06, + "logits/chosen": 3.45683217048645, + "logits/rejected": 3.55389666557312, + "logps/chosen": -160.42315673828125, + "logps/rejected": -184.64093017578125, + "loss": 0.4369, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.263936996459961, + "rewards/margins": 2.3274011611938477, + "rewards/rejected": -13.591337203979492, + "step": 2692 + }, + { + "epoch": 1.8583750215628774, + "grad_norm": 0.25523993372917175, + "learning_rate": 4.758342922899885e-06, + "logits/chosen": 3.837651252746582, + "logits/rejected": 3.9049367904663086, + "logps/chosen": -166.945068359375, + "logps/rejected": -180.32191467285156, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.827878952026367, + "rewards/margins": 1.3818252086639404, + "rewards/rejected": -13.20970344543457, + "step": 2693 + }, + { + "epoch": 1.8590650336380885, + "grad_norm": 1.6888160705566406, + "learning_rate": 4.7554660529344075e-06, + "logits/chosen": 3.560394287109375, + "logits/rejected": 3.573127031326294, + "logps/chosen": -164.16065979003906, + "logps/rejected": -167.416748046875, + "loss": 0.6157, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.518610000610352, + "rewards/margins": 0.3209608197212219, + "rewards/rejected": -11.839570999145508, + "step": 2694 + }, + { + "epoch": 1.8597550457133, + "grad_norm": 3.474874496459961, + "learning_rate": 4.75258918296893e-06, + "logits/chosen": 3.654144763946533, + "logits/rejected": 3.7395496368408203, + "logps/chosen": -180.2510528564453, + "logps/rejected": -182.9131317138672, + "loss": 0.625, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.405707359313965, + "rewards/margins": 0.2292919158935547, + "rewards/rejected": -13.63499927520752, + "step": 2695 + }, + { + "epoch": 1.8604450577885112, + "grad_norm": 0.3061971366405487, + "learning_rate": 4.749712313003453e-06, + "logits/chosen": 3.2576394081115723, + "logits/rejected": 3.431973457336426, + "logps/chosen": -161.22422790527344, + "logps/rejected": -169.13198852539062, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.445103645324707, + "rewards/margins": 0.8043115735054016, + "rewards/rejected": -12.24941635131836, + "step": 2696 + }, + { + "epoch": 1.8611350698637226, + "grad_norm": 10.913841247558594, + "learning_rate": 4.746835443037975e-06, + "logits/chosen": 3.357875347137451, + "logits/rejected": 3.5020389556884766, + "logps/chosen": -181.82437133789062, + "logps/rejected": -192.13790893554688, + "loss": 0.5978, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.497347831726074, + "rewards/margins": 0.9803758263587952, + "rewards/rejected": -14.477723121643066, + "step": 2697 + }, + { + "epoch": 1.861825081938934, + "grad_norm": 1.2888638973236084, + "learning_rate": 4.743958573072498e-06, + "logits/chosen": 3.387479782104492, + "logits/rejected": 3.419034957885742, + "logps/chosen": -166.35858154296875, + "logps/rejected": -169.34361267089844, + "loss": 0.6153, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.650030136108398, + "rewards/margins": 0.3266524076461792, + "rewards/rejected": -11.976682662963867, + "step": 2698 + }, + { + "epoch": 1.8625150940141453, + "grad_norm": 0.3839147686958313, + "learning_rate": 4.74108170310702e-06, + "logits/chosen": 3.309812307357788, + "logits/rejected": 3.651643753051758, + "logps/chosen": -130.8850555419922, + "logps/rejected": -164.9280548095703, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.226483345031738, + "rewards/margins": 3.344787120819092, + "rewards/rejected": -11.571270942687988, + "step": 2699 + }, + { + "epoch": 1.8632051060893566, + "grad_norm": 17.37859535217285, + "learning_rate": 4.7382048331415425e-06, + "logits/chosen": 3.568861484527588, + "logits/rejected": 3.5850281715393066, + "logps/chosen": -170.37994384765625, + "logps/rejected": -167.70742797851562, + "loss": 0.8421, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.251241683959961, + "rewards/margins": -0.21496695280075073, + "rewards/rejected": -12.036273956298828, + "step": 2700 + }, + { + "epoch": 1.8638951181645678, + "grad_norm": 7.671998977661133, + "learning_rate": 4.735327963176064e-06, + "logits/chosen": 3.34035325050354, + "logits/rejected": 3.331683397293091, + "logps/chosen": -163.00941467285156, + "logps/rejected": -162.8755645751953, + "loss": 0.7116, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.615753173828125, + "rewards/margins": -0.03451335430145264, + "rewards/rejected": -11.581239700317383, + "step": 2701 + }, + { + "epoch": 1.8645851302397793, + "grad_norm": 0.6812795996665955, + "learning_rate": 4.732451093210587e-06, + "logits/chosen": 3.3988633155822754, + "logits/rejected": 3.3828485012054443, + "logps/chosen": -161.82369995117188, + "logps/rejected": -166.4271697998047, + "loss": 0.609, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.432615280151367, + "rewards/margins": 0.488287091255188, + "rewards/rejected": -11.920902252197266, + "step": 2702 + }, + { + "epoch": 1.8652751423149905, + "grad_norm": 0.5711163878440857, + "learning_rate": 4.72957422324511e-06, + "logits/chosen": 3.3461475372314453, + "logits/rejected": 3.3536057472229004, + "logps/chosen": -159.52841186523438, + "logps/rejected": -180.55421447753906, + "loss": 0.5236, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.27517318725586, + "rewards/margins": 2.0978760719299316, + "rewards/rejected": -13.37304973602295, + "step": 2703 + }, + { + "epoch": 1.8659651543902018, + "grad_norm": 0.2815801501274109, + "learning_rate": 4.726697353279633e-06, + "logits/chosen": 3.715928316116333, + "logits/rejected": 3.715928316116333, + "logps/chosen": -179.2823486328125, + "logps/rejected": -179.2823486328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.180521011352539, + "rewards/margins": 0.0, + "rewards/rejected": -13.180521011352539, + "step": 2704 + }, + { + "epoch": 1.8666551664654132, + "grad_norm": 2.413001775741577, + "learning_rate": 4.723820483314155e-06, + "logits/chosen": 3.4956836700439453, + "logits/rejected": 3.5081796646118164, + "logps/chosen": -149.70114135742188, + "logps/rejected": -153.6912841796875, + "loss": 0.6161, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.988996505737305, + "rewards/margins": 0.31654441356658936, + "rewards/rejected": -10.305541038513184, + "step": 2705 + }, + { + "epoch": 1.8673451785406243, + "grad_norm": 0.3022140860557556, + "learning_rate": 4.7209436133486766e-06, + "logits/chosen": 3.8135666847229004, + "logits/rejected": 3.8135666847229004, + "logps/chosen": -172.3008270263672, + "logps/rejected": -172.3008270263672, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.58517837524414, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.58517837524414, + "step": 2706 + }, + { + "epoch": 1.868035190615836, + "grad_norm": 32.25492858886719, + "learning_rate": 4.718066743383199e-06, + "logits/chosen": 3.443582534790039, + "logits/rejected": 3.550442934036255, + "logps/chosen": -155.574951171875, + "logps/rejected": -160.4385223388672, + "loss": 1.3741, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.740592002868652, + "rewards/margins": 0.4534027576446533, + "rewards/rejected": -11.193994522094727, + "step": 2707 + }, + { + "epoch": 1.868725202691047, + "grad_norm": 0.2810054123401642, + "learning_rate": 4.715189873417722e-06, + "logits/chosen": 3.72871470451355, + "logits/rejected": 3.72871470451355, + "logps/chosen": -157.5092010498047, + "logps/rejected": -157.5092010498047, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.014680862426758, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.014680862426758, + "step": 2708 + }, + { + "epoch": 1.8694152147662584, + "grad_norm": 0.2345244139432907, + "learning_rate": 4.712313003452244e-06, + "logits/chosen": 3.8085641860961914, + "logits/rejected": 3.8128902912139893, + "logps/chosen": -163.55770874023438, + "logps/rejected": -179.22959899902344, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.426778793334961, + "rewards/margins": 1.5841741561889648, + "rewards/rejected": -13.010953903198242, + "step": 2709 + }, + { + "epoch": 1.8701052268414697, + "grad_norm": 0.31305694580078125, + "learning_rate": 4.709436133486767e-06, + "logits/chosen": 3.4125900268554688, + "logits/rejected": 3.558213233947754, + "logps/chosen": -162.8515625, + "logps/rejected": -170.43817138671875, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.379732131958008, + "rewards/margins": 0.7179762125015259, + "rewards/rejected": -12.097707748413086, + "step": 2710 + }, + { + "epoch": 1.870795238916681, + "grad_norm": 0.25551265478134155, + "learning_rate": 4.70655926352129e-06, + "logits/chosen": 3.580963134765625, + "logits/rejected": 3.580963134765625, + "logps/chosen": -157.9304656982422, + "logps/rejected": -157.9304656982422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.000770568847656, + "rewards/margins": 0.0, + "rewards/rejected": -11.000770568847656, + "step": 2711 + }, + { + "epoch": 1.8714852509918924, + "grad_norm": 0.28995800018310547, + "learning_rate": 4.7036823935558115e-06, + "logits/chosen": 3.6366100311279297, + "logits/rejected": 3.6366100311279297, + "logps/chosen": -170.94943237304688, + "logps/rejected": -170.94943237304688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.361091613769531, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.361091613769531, + "step": 2712 + }, + { + "epoch": 1.8721752630671036, + "grad_norm": 0.3368758261203766, + "learning_rate": 4.700805523590334e-06, + "logits/chosen": 3.200892925262451, + "logits/rejected": 3.3541858196258545, + "logps/chosen": -142.26321411132812, + "logps/rejected": -167.95001220703125, + "loss": 0.4361, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.36870288848877, + "rewards/margins": 2.562929391860962, + "rewards/rejected": -11.931632041931152, + "step": 2713 + }, + { + "epoch": 1.8728652751423152, + "grad_norm": 0.2392859011888504, + "learning_rate": 4.697928653624856e-06, + "logits/chosen": 3.4408113956451416, + "logits/rejected": 3.631340980529785, + "logps/chosen": -162.2567596435547, + "logps/rejected": -175.49473571777344, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.711297988891602, + "rewards/margins": 1.3747828006744385, + "rewards/rejected": -13.086080551147461, + "step": 2714 + }, + { + "epoch": 1.8735552872175263, + "grad_norm": 0.3235894739627838, + "learning_rate": 4.695051783659379e-06, + "logits/chosen": 3.860145092010498, + "logits/rejected": 3.860145092010498, + "logps/chosen": -165.9365234375, + "logps/rejected": -165.9365234375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.93726921081543, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.93726921081543, + "step": 2715 + }, + { + "epoch": 1.8742452992927376, + "grad_norm": 15.68840217590332, + "learning_rate": 4.692174913693902e-06, + "logits/chosen": 3.2925257682800293, + "logits/rejected": 3.4537739753723145, + "logps/chosen": -145.10855102539062, + "logps/rejected": -155.64524841308594, + "loss": 1.0921, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.854839324951172, + "rewards/margins": 0.9361169338226318, + "rewards/rejected": -10.790956497192383, + "step": 2716 + }, + { + "epoch": 1.874935311367949, + "grad_norm": 0.2856079339981079, + "learning_rate": 4.689298043728424e-06, + "logits/chosen": 3.9724578857421875, + "logits/rejected": 4.09848690032959, + "logps/chosen": -167.0583953857422, + "logps/rejected": -180.59994506835938, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.969649314880371, + "rewards/margins": 1.3982062339782715, + "rewards/rejected": -13.367855072021484, + "step": 2717 + }, + { + "epoch": 1.8756253234431601, + "grad_norm": 0.2636300325393677, + "learning_rate": 4.6864211737629465e-06, + "logits/chosen": 3.847090244293213, + "logits/rejected": 3.7945127487182617, + "logps/chosen": -159.61199951171875, + "logps/rejected": -167.34188842773438, + "loss": 0.6067, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.293169021606445, + "rewards/margins": 0.7861629724502563, + "rewards/rejected": -12.07933235168457, + "step": 2718 + }, + { + "epoch": 1.8763153355183717, + "grad_norm": 0.3721313774585724, + "learning_rate": 4.683544303797468e-06, + "logits/chosen": 3.7114334106445312, + "logits/rejected": 3.7114334106445312, + "logps/chosen": -163.944091796875, + "logps/rejected": -163.944091796875, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.64657974243164, + "rewards/margins": 5.364418029785156e-07, + "rewards/rejected": -11.646580696105957, + "step": 2719 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 0.4606885313987732, + "learning_rate": 4.680667433831991e-06, + "logits/chosen": 3.3364250659942627, + "logits/rejected": 3.570565700531006, + "logps/chosen": -151.54397583007812, + "logps/rejected": -168.01657104492188, + "loss": 0.5225, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.48320198059082, + "rewards/margins": 1.599740743637085, + "rewards/rejected": -12.082942008972168, + "step": 2720 + }, + { + "epoch": 1.8776953596687942, + "grad_norm": 0.3310142159461975, + "learning_rate": 4.677790563866514e-06, + "logits/chosen": 3.659846067428589, + "logits/rejected": 4.070322513580322, + "logps/chosen": -144.23348999023438, + "logps/rejected": -162.62167358398438, + "loss": 0.5213, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.613138198852539, + "rewards/margins": 1.8079628944396973, + "rewards/rejected": -11.421100616455078, + "step": 2721 + }, + { + "epoch": 1.8783853717440055, + "grad_norm": 0.2460835874080658, + "learning_rate": 4.674913693901036e-06, + "logits/chosen": 3.741567373275757, + "logits/rejected": 3.9288575649261475, + "logps/chosen": -186.74639892578125, + "logps/rejected": -194.4251251220703, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.772951126098633, + "rewards/margins": 0.7258773446083069, + "rewards/rejected": -14.49882698059082, + "step": 2722 + }, + { + "epoch": 1.8790753838192167, + "grad_norm": 0.22332194447517395, + "learning_rate": 4.672036823935559e-06, + "logits/chosen": 3.4315788745880127, + "logits/rejected": 3.6542389392852783, + "logps/chosen": -166.19729614257812, + "logps/rejected": -183.9752197265625, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.84014892578125, + "rewards/margins": 1.7997487783432007, + "rewards/rejected": -13.639898300170898, + "step": 2723 + }, + { + "epoch": 1.8797653958944283, + "grad_norm": 0.3072826564311981, + "learning_rate": 4.6691599539700814e-06, + "logits/chosen": 3.6299946308135986, + "logits/rejected": 3.6299946308135986, + "logps/chosen": -177.28941345214844, + "logps/rejected": -177.28939819335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.145575523376465, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -13.145574569702148, + "step": 2724 + }, + { + "epoch": 1.8804554079696394, + "grad_norm": 2.2925775051116943, + "learning_rate": 4.666283084004603e-06, + "logits/chosen": 3.483880043029785, + "logits/rejected": 3.5366830825805664, + "logps/chosen": -151.05899047851562, + "logps/rejected": -164.31785583496094, + "loss": 0.5346, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.474736213684082, + "rewards/margins": 1.2790743112564087, + "rewards/rejected": -11.753809928894043, + "step": 2725 + }, + { + "epoch": 1.8811454200448507, + "grad_norm": 0.2574285864830017, + "learning_rate": 4.663406214039125e-06, + "logits/chosen": 3.824337959289551, + "logits/rejected": 3.886054515838623, + "logps/chosen": -167.64366149902344, + "logps/rejected": -180.2270050048828, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.855348587036133, + "rewards/margins": 1.2819803953170776, + "rewards/rejected": -13.1373291015625, + "step": 2726 + }, + { + "epoch": 1.881835432120062, + "grad_norm": 0.25690093636512756, + "learning_rate": 4.660529344073648e-06, + "logits/chosen": 3.888145685195923, + "logits/rejected": 3.888145685195923, + "logps/chosen": -163.98269653320312, + "logps/rejected": -163.98268127441406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.448078155517578, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -11.448078155517578, + "step": 2727 + }, + { + "epoch": 1.8825254441952735, + "grad_norm": 0.2196885347366333, + "learning_rate": 4.657652474108171e-06, + "logits/chosen": 4.013396739959717, + "logits/rejected": 4.087175369262695, + "logps/chosen": -154.35104370117188, + "logps/rejected": -164.33641052246094, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.64404010772705, + "rewards/margins": 1.0049769878387451, + "rewards/rejected": -11.649017333984375, + "step": 2728 + }, + { + "epoch": 1.8832154562704848, + "grad_norm": 0.3667171895503998, + "learning_rate": 4.654775604142694e-06, + "logits/chosen": 3.867479085922241, + "logits/rejected": 3.867479085922241, + "logps/chosen": -176.19921875, + "logps/rejected": -176.19920349121094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.909624099731445, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.909624099731445, + "step": 2729 + }, + { + "epoch": 1.883905468345696, + "grad_norm": 0.2680812478065491, + "learning_rate": 4.6518987341772155e-06, + "logits/chosen": 3.8488142490386963, + "logits/rejected": 3.909322500228882, + "logps/chosen": -159.88067626953125, + "logps/rejected": -172.4598388671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.240371704101562, + "rewards/margins": 1.1581270694732666, + "rewards/rejected": -12.39849853515625, + "step": 2730 + }, + { + "epoch": 1.8845954804209075, + "grad_norm": 0.23860682547092438, + "learning_rate": 4.649021864211738e-06, + "logits/chosen": 3.3216545581817627, + "logits/rejected": 3.3525469303131104, + "logps/chosen": -157.4074249267578, + "logps/rejected": -170.17066955566406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.09427261352539, + "rewards/margins": 1.2894995212554932, + "rewards/rejected": -12.383771896362305, + "step": 2731 + }, + { + "epoch": 1.8852854924961187, + "grad_norm": 0.23598900437355042, + "learning_rate": 4.64614499424626e-06, + "logits/chosen": 3.445662021636963, + "logits/rejected": 3.445662021636963, + "logps/chosen": -162.9345245361328, + "logps/rejected": -162.9345245361328, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.50472640991211, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.504724502563477, + "step": 2732 + }, + { + "epoch": 1.88597550457133, + "grad_norm": 0.5547542572021484, + "learning_rate": 4.643268124280783e-06, + "logits/chosen": 3.9689345359802246, + "logits/rejected": 4.053926467895508, + "logps/chosen": -175.76158142089844, + "logps/rejected": -181.2239990234375, + "loss": 0.6086, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.54316234588623, + "rewards/margins": 0.5104392766952515, + "rewards/rejected": -13.05360221862793, + "step": 2733 + }, + { + "epoch": 1.8866655166465414, + "grad_norm": 0.2551807761192322, + "learning_rate": 4.640391254315305e-06, + "logits/chosen": 3.660003423690796, + "logits/rejected": 3.660003423690796, + "logps/chosen": -181.1978302001953, + "logps/rejected": -181.1978302001953, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.259330749511719, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.259329795837402, + "step": 2734 + }, + { + "epoch": 1.8873555287217525, + "grad_norm": 0.2990206778049469, + "learning_rate": 4.637514384349828e-06, + "logits/chosen": 3.6296815872192383, + "logits/rejected": 3.6296815872192383, + "logps/chosen": -168.71258544921875, + "logps/rejected": -168.71258544921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.132318496704102, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -12.132318496704102, + "step": 2735 + }, + { + "epoch": 1.888045540796964, + "grad_norm": 0.21142497658729553, + "learning_rate": 4.6346375143843505e-06, + "logits/chosen": 3.708188533782959, + "logits/rejected": 3.864443302154541, + "logps/chosen": -161.33843994140625, + "logps/rejected": -182.69195556640625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.441938400268555, + "rewards/margins": 2.1665546894073486, + "rewards/rejected": -13.60849380493164, + "step": 2736 + }, + { + "epoch": 1.8887355528721752, + "grad_norm": 0.3222752511501312, + "learning_rate": 4.631760644418873e-06, + "logits/chosen": 3.8573150634765625, + "logits/rejected": 3.9980766773223877, + "logps/chosen": -171.2647247314453, + "logps/rejected": -179.64492797851562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.383882522583008, + "rewards/margins": 0.8612095713615417, + "rewards/rejected": -13.245092391967773, + "step": 2737 + }, + { + "epoch": 1.8894255649473866, + "grad_norm": 16.342288970947266, + "learning_rate": 4.628883774453395e-06, + "logits/chosen": 4.037964820861816, + "logits/rejected": 4.006084442138672, + "logps/chosen": -170.06655883789062, + "logps/rejected": -168.29722595214844, + "loss": 0.7946, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.238227844238281, + "rewards/margins": -0.1567631959915161, + "rewards/rejected": -12.081463813781738, + "step": 2738 + }, + { + "epoch": 1.890115577022598, + "grad_norm": 0.2712470591068268, + "learning_rate": 4.626006904487917e-06, + "logits/chosen": 3.338941812515259, + "logits/rejected": 3.448312520980835, + "logps/chosen": -165.84814453125, + "logps/rejected": -179.93853759765625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.079565048217773, + "rewards/margins": 1.4313782453536987, + "rewards/rejected": -13.510943412780762, + "step": 2739 + }, + { + "epoch": 1.890805589097809, + "grad_norm": 25.956588745117188, + "learning_rate": 4.62313003452244e-06, + "logits/chosen": 3.6898412704467773, + "logits/rejected": 3.6511218547821045, + "logps/chosen": -171.33474731445312, + "logps/rejected": -166.61581420898438, + "loss": 1.0274, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.377172470092773, + "rewards/margins": -0.41653501987457275, + "rewards/rejected": -11.960638046264648, + "step": 2740 + }, + { + "epoch": 1.8914956011730206, + "grad_norm": 0.2569011151790619, + "learning_rate": 4.620253164556963e-06, + "logits/chosen": 3.513408660888672, + "logits/rejected": 3.513408660888672, + "logps/chosen": -177.6995849609375, + "logps/rejected": -177.69956970214844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.94720458984375, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.94720458984375, + "step": 2741 + }, + { + "epoch": 1.8921856132482318, + "grad_norm": 0.25105801224708557, + "learning_rate": 4.617376294591485e-06, + "logits/chosen": 3.8285279273986816, + "logits/rejected": 3.801589012145996, + "logps/chosen": -165.3302001953125, + "logps/rejected": -180.27059936523438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.796487808227539, + "rewards/margins": 1.4835988283157349, + "rewards/rejected": -13.280086517333984, + "step": 2742 + }, + { + "epoch": 1.8928756253234431, + "grad_norm": 0.2959364354610443, + "learning_rate": 4.614499424626007e-06, + "logits/chosen": 3.5767393112182617, + "logits/rejected": 3.5767393112182617, + "logps/chosen": -178.42062377929688, + "logps/rejected": -178.42062377929688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.075766563415527, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.075766563415527, + "step": 2743 + }, + { + "epoch": 1.8935656373986545, + "grad_norm": 0.2650206983089447, + "learning_rate": 4.61162255466053e-06, + "logits/chosen": 3.5897631645202637, + "logits/rejected": 3.813138008117676, + "logps/chosen": -170.3375701904297, + "logps/rejected": -178.1217041015625, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.275307655334473, + "rewards/margins": 0.7196840047836304, + "rewards/rejected": -12.994991302490234, + "step": 2744 + }, + { + "epoch": 1.8942556494738658, + "grad_norm": 0.4039059579372406, + "learning_rate": 4.608745684695052e-06, + "logits/chosen": 3.456273078918457, + "logits/rejected": 3.6479620933532715, + "logps/chosen": -182.5213623046875, + "logps/rejected": -189.34262084960938, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.569925308227539, + "rewards/margins": 0.6454290151596069, + "rewards/rejected": -14.215353012084961, + "step": 2745 + }, + { + "epoch": 1.8949456615490772, + "grad_norm": 9.204022407531738, + "learning_rate": 4.605868814729574e-06, + "logits/chosen": 3.2786669731140137, + "logits/rejected": 3.3065953254699707, + "logps/chosen": -152.3188934326172, + "logps/rejected": -158.22850036621094, + "loss": 0.5824, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.609367370605469, + "rewards/margins": 0.5246288776397705, + "rewards/rejected": -11.133995056152344, + "step": 2746 + }, + { + "epoch": 1.8956356736242883, + "grad_norm": 0.24307043850421906, + "learning_rate": 4.602991944764097e-06, + "logits/chosen": 3.4343044757843018, + "logits/rejected": 3.568314552307129, + "logps/chosen": -172.7633056640625, + "logps/rejected": -180.24774169921875, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.515716552734375, + "rewards/margins": 0.7549203038215637, + "rewards/rejected": -13.270637512207031, + "step": 2747 + }, + { + "epoch": 1.8963256856994999, + "grad_norm": 0.32336270809173584, + "learning_rate": 4.6001150747986196e-06, + "logits/chosen": 3.3044321537017822, + "logits/rejected": 3.3651249408721924, + "logps/chosen": -160.68402099609375, + "logps/rejected": -172.28897094726562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.32443618774414, + "rewards/margins": 1.1500719785690308, + "rewards/rejected": -12.474506378173828, + "step": 2748 + }, + { + "epoch": 1.897015697774711, + "grad_norm": 0.23273751139640808, + "learning_rate": 4.597238204833142e-06, + "logits/chosen": 3.5917389392852783, + "logits/rejected": 3.5917389392852783, + "logps/chosen": -187.6075897216797, + "logps/rejected": -187.6075897216797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.080663681030273, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.080663681030273, + "step": 2749 + }, + { + "epoch": 1.8977057098499224, + "grad_norm": 29.636091232299805, + "learning_rate": 4.594361334867664e-06, + "logits/chosen": 3.9746084213256836, + "logits/rejected": 3.841341972351074, + "logps/chosen": -180.19561767578125, + "logps/rejected": -185.41712951660156, + "loss": 0.952, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.128456115722656, + "rewards/margins": 0.5588404536247253, + "rewards/rejected": -13.687295913696289, + "step": 2750 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 0.27454525232315063, + "learning_rate": 4.591484464902187e-06, + "logits/chosen": 3.923656940460205, + "logits/rejected": 4.020589828491211, + "logps/chosen": -171.89437866210938, + "logps/rejected": -183.63995361328125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.414995193481445, + "rewards/margins": 1.2234737873077393, + "rewards/rejected": -13.638468742370605, + "step": 2751 + }, + { + "epoch": 1.8990857340003449, + "grad_norm": 0.2642892897129059, + "learning_rate": 4.588607594936709e-06, + "logits/chosen": 3.40462064743042, + "logits/rejected": 3.40462064743042, + "logps/chosen": -167.71644592285156, + "logps/rejected": -167.71644592285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.020687103271484, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.020685195922852, + "step": 2752 + }, + { + "epoch": 1.8997757460755564, + "grad_norm": 0.3382692337036133, + "learning_rate": 4.585730724971232e-06, + "logits/chosen": 3.069045066833496, + "logits/rejected": 3.213157892227173, + "logps/chosen": -148.55516052246094, + "logps/rejected": -173.52488708496094, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.298254013061523, + "rewards/margins": 2.4447338581085205, + "rewards/rejected": -12.742987632751465, + "step": 2753 + }, + { + "epoch": 1.9004657581507676, + "grad_norm": 0.2609366178512573, + "learning_rate": 4.582853855005754e-06, + "logits/chosen": 3.4179728031158447, + "logits/rejected": 3.5038177967071533, + "logps/chosen": -170.39306640625, + "logps/rejected": -181.6898193359375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.084688186645508, + "rewards/margins": 1.164488673210144, + "rewards/rejected": -13.249176979064941, + "step": 2754 + }, + { + "epoch": 1.901155770225979, + "grad_norm": 0.2384345680475235, + "learning_rate": 4.5799769850402765e-06, + "logits/chosen": 3.5480566024780273, + "logits/rejected": 3.7939796447753906, + "logps/chosen": -173.01315307617188, + "logps/rejected": -193.66676330566406, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.515522956848145, + "rewards/margins": 2.100881338119507, + "rewards/rejected": -14.616405487060547, + "step": 2755 + }, + { + "epoch": 1.9018457823011903, + "grad_norm": 3.1771862506866455, + "learning_rate": 4.577100115074799e-06, + "logits/chosen": 3.564580202102661, + "logits/rejected": 3.6767568588256836, + "logps/chosen": -172.34719848632812, + "logps/rejected": -175.10430908203125, + "loss": 0.6248, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.496097564697266, + "rewards/margins": 0.23069190979003906, + "rewards/rejected": -12.726788520812988, + "step": 2756 + }, + { + "epoch": 1.9025357943764016, + "grad_norm": 1.407544493675232, + "learning_rate": 4.574223245109322e-06, + "logits/chosen": 3.958979606628418, + "logits/rejected": 3.9931135177612305, + "logps/chosen": -167.1262664794922, + "logps/rejected": -171.04183959960938, + "loss": 0.6124, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.110694885253906, + "rewards/margins": 0.37918245792388916, + "rewards/rejected": -12.489877700805664, + "step": 2757 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 0.2487802952528, + "learning_rate": 4.571346375143844e-06, + "logits/chosen": 3.171173572540283, + "logits/rejected": 3.354923963546753, + "logps/chosen": -140.47950744628906, + "logps/rejected": -163.04115295410156, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.319817543029785, + "rewards/margins": 2.368867874145508, + "rewards/rejected": -11.688685417175293, + "step": 2758 + }, + { + "epoch": 1.9039158185268241, + "grad_norm": 0.2500327527523041, + "learning_rate": 4.568469505178366e-06, + "logits/chosen": 3.6546125411987305, + "logits/rejected": 3.6546125411987305, + "logps/chosen": -169.50872802734375, + "logps/rejected": -169.50872802734375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.056116104125977, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.056117057800293, + "step": 2759 + }, + { + "epoch": 1.9046058306020357, + "grad_norm": 1.1371674537658691, + "learning_rate": 4.565592635212889e-06, + "logits/chosen": 3.4264307022094727, + "logits/rejected": 3.4124948978424072, + "logps/chosen": -170.16342163085938, + "logps/rejected": -174.12820434570312, + "loss": 0.6124, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.280619621276855, + "rewards/margins": 0.3779103755950928, + "rewards/rejected": -12.658531188964844, + "step": 2760 + }, + { + "epoch": 1.9052958426772468, + "grad_norm": 22.654804229736328, + "learning_rate": 4.562715765247411e-06, + "logits/chosen": 3.4333081245422363, + "logits/rejected": 3.571654796600342, + "logps/chosen": -160.77569580078125, + "logps/rejected": -177.8072509765625, + "loss": 0.6753, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.350494384765625, + "rewards/margins": 1.766008734703064, + "rewards/rejected": -13.11650276184082, + "step": 2761 + }, + { + "epoch": 1.9059858547524582, + "grad_norm": 0.2573438286781311, + "learning_rate": 4.559838895281933e-06, + "logits/chosen": 3.3674488067626953, + "logits/rejected": 3.449148178100586, + "logps/chosen": -163.3751983642578, + "logps/rejected": -175.7633056640625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.683363914489746, + "rewards/margins": 1.262710690498352, + "rewards/rejected": -12.946075439453125, + "step": 2762 + }, + { + "epoch": 1.9066758668276695, + "grad_norm": 0.2945878505706787, + "learning_rate": 4.556962025316456e-06, + "logits/chosen": 3.268129825592041, + "logits/rejected": 3.9096546173095703, + "logps/chosen": -134.56768798828125, + "logps/rejected": -170.6721649169922, + "loss": 0.4336, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.650848388671875, + "rewards/margins": 3.496518611907959, + "rewards/rejected": -12.147366523742676, + "step": 2763 + }, + { + "epoch": 1.9073658789028807, + "grad_norm": 6.678410053253174, + "learning_rate": 4.554085155350979e-06, + "logits/chosen": 3.4248647689819336, + "logits/rejected": 3.458021640777588, + "logps/chosen": -169.07464599609375, + "logps/rejected": -170.02679443359375, + "loss": 0.649, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.105363845825195, + "rewards/margins": 0.11310577392578125, + "rewards/rejected": -12.218469619750977, + "step": 2764 + }, + { + "epoch": 1.9080558909780923, + "grad_norm": 0.28913986682891846, + "learning_rate": 4.551208285385501e-06, + "logits/chosen": 3.375943183898926, + "logits/rejected": 3.375943183898926, + "logps/chosen": -177.26979064941406, + "logps/rejected": -177.26979064941406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.07375717163086, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.073755264282227, + "step": 2765 + }, + { + "epoch": 1.9087459030533034, + "grad_norm": 0.3064856231212616, + "learning_rate": 4.548331415420024e-06, + "logits/chosen": 3.512608289718628, + "logits/rejected": 3.512608289718628, + "logps/chosen": -190.1348876953125, + "logps/rejected": -190.1348876953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.256490707397461, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.256490707397461, + "step": 2766 + }, + { + "epoch": 1.9094359151285147, + "grad_norm": 0.48377856612205505, + "learning_rate": 4.5454545454545455e-06, + "logits/chosen": 3.2898638248443604, + "logits/rejected": 3.3993420600891113, + "logps/chosen": -147.34933471679688, + "logps/rejected": -155.06640625, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.959678649902344, + "rewards/margins": 0.8041096329689026, + "rewards/rejected": -10.763788223266602, + "step": 2767 + }, + { + "epoch": 1.910125927203726, + "grad_norm": 0.2538195848464966, + "learning_rate": 4.542577675489068e-06, + "logits/chosen": 3.7301366329193115, + "logits/rejected": 3.7367231845855713, + "logps/chosen": -168.9781494140625, + "logps/rejected": -179.01756286621094, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.97909927368164, + "rewards/margins": 1.02794349193573, + "rewards/rejected": -13.007043838500977, + "step": 2768 + }, + { + "epoch": 1.9108159392789372, + "grad_norm": 0.25160136818885803, + "learning_rate": 4.539700805523591e-06, + "logits/chosen": 3.3162732124328613, + "logits/rejected": 3.383152484893799, + "logps/chosen": -168.37582397460938, + "logps/rejected": -175.4296875, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.292781829833984, + "rewards/margins": 0.7357381582260132, + "rewards/rejected": -13.028520584106445, + "step": 2769 + }, + { + "epoch": 1.9115059513541488, + "grad_norm": 0.29526573419570923, + "learning_rate": 4.536823935558113e-06, + "logits/chosen": 2.8544504642486572, + "logits/rejected": 3.0943515300750732, + "logps/chosen": -151.04293823242188, + "logps/rejected": -179.35130310058594, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.385229110717773, + "rewards/margins": 2.864464044570923, + "rewards/rejected": -13.24969482421875, + "step": 2770 + }, + { + "epoch": 1.91219596342936, + "grad_norm": 0.3223128914833069, + "learning_rate": 4.533947065592636e-06, + "logits/chosen": 3.3722195625305176, + "logits/rejected": 3.4004287719726562, + "logps/chosen": -160.6815643310547, + "logps/rejected": -167.09005737304688, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.189496994018555, + "rewards/margins": 0.6759755611419678, + "rewards/rejected": -11.865472793579102, + "step": 2771 + }, + { + "epoch": 1.9128859755045713, + "grad_norm": 0.2302238643169403, + "learning_rate": 4.531070195627158e-06, + "logits/chosen": 3.4546613693237305, + "logits/rejected": 3.5860111713409424, + "logps/chosen": -162.31890869140625, + "logps/rejected": -192.01812744140625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.68846321105957, + "rewards/margins": 2.8584346771240234, + "rewards/rejected": -14.54689884185791, + "step": 2772 + }, + { + "epoch": 1.9135759875797826, + "grad_norm": 0.4011881351470947, + "learning_rate": 4.5281933256616805e-06, + "logits/chosen": 3.079252243041992, + "logits/rejected": 3.187532424926758, + "logps/chosen": -154.2786865234375, + "logps/rejected": -170.6278076171875, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.609609603881836, + "rewards/margins": 1.615229845046997, + "rewards/rejected": -12.22484016418457, + "step": 2773 + }, + { + "epoch": 1.914265999654994, + "grad_norm": 7.629494667053223, + "learning_rate": 4.525316455696203e-06, + "logits/chosen": 3.338573932647705, + "logits/rejected": 3.427194118499756, + "logps/chosen": -145.68287658691406, + "logps/rejected": -167.0631561279297, + "loss": 0.4857, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.91614818572998, + "rewards/margins": 1.9187664985656738, + "rewards/rejected": -11.834915161132812, + "step": 2774 + }, + { + "epoch": 1.9149560117302054, + "grad_norm": 0.23543640971183777, + "learning_rate": 4.522439585730725e-06, + "logits/chosen": 3.478262424468994, + "logits/rejected": 3.5864107608795166, + "logps/chosen": -187.39129638671875, + "logps/rejected": -197.3307647705078, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.008560180664062, + "rewards/margins": 0.9956997036933899, + "rewards/rejected": -15.004261016845703, + "step": 2775 + }, + { + "epoch": 1.9156460238054165, + "grad_norm": 0.2944900691509247, + "learning_rate": 4.519562715765248e-06, + "logits/chosen": 3.6526758670806885, + "logits/rejected": 3.7105610370635986, + "logps/chosen": -176.22821044921875, + "logps/rejected": -187.00418090820312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.916913986206055, + "rewards/margins": 0.9892070293426514, + "rewards/rejected": -13.906120300292969, + "step": 2776 + }, + { + "epoch": 1.916336035880628, + "grad_norm": 18.11322021484375, + "learning_rate": 4.516685845799771e-06, + "logits/chosen": 3.4478421211242676, + "logits/rejected": 3.5310590267181396, + "logps/chosen": -151.45263671875, + "logps/rejected": -161.09732055664062, + "loss": 0.8719, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.463774681091309, + "rewards/margins": 0.8854828476905823, + "rewards/rejected": -11.349257469177246, + "step": 2777 + }, + { + "epoch": 1.9170260479558392, + "grad_norm": 0.19522695243358612, + "learning_rate": 4.513808975834293e-06, + "logits/chosen": 2.9396462440490723, + "logits/rejected": 3.165661334991455, + "logps/chosen": -158.93429565429688, + "logps/rejected": -183.1268768310547, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.036651611328125, + "rewards/margins": 2.4620423316955566, + "rewards/rejected": -13.498695373535156, + "step": 2778 + }, + { + "epoch": 1.9177160600310506, + "grad_norm": 0.2858593463897705, + "learning_rate": 4.510932105868815e-06, + "logits/chosen": 3.328749418258667, + "logits/rejected": 3.328749418258667, + "logps/chosen": -176.9235076904297, + "logps/rejected": -176.9235076904297, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.818243026733398, + "rewards/margins": 0.0, + "rewards/rejected": -12.818243026733398, + "step": 2779 + }, + { + "epoch": 1.918406072106262, + "grad_norm": 0.2916182279586792, + "learning_rate": 4.508055235903337e-06, + "logits/chosen": 3.1375324726104736, + "logits/rejected": 3.4170544147491455, + "logps/chosen": -150.6352081298828, + "logps/rejected": -181.97769165039062, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.318990707397461, + "rewards/margins": 3.0580644607543945, + "rewards/rejected": -13.377056121826172, + "step": 2780 + }, + { + "epoch": 1.919096084181473, + "grad_norm": 0.3115704655647278, + "learning_rate": 4.50517836593786e-06, + "logits/chosen": 3.5530292987823486, + "logits/rejected": 3.6274099349975586, + "logps/chosen": -179.73318481445312, + "logps/rejected": -186.09912109375, + "loss": 0.6072, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.253608703613281, + "rewards/margins": 0.6476553678512573, + "rewards/rejected": -13.901262283325195, + "step": 2781 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 1.1569052934646606, + "learning_rate": 4.502301495972383e-06, + "logits/chosen": 3.7670693397521973, + "logits/rejected": 3.7044358253479004, + "logps/chosen": -175.97052001953125, + "logps/rejected": -179.09100341796875, + "loss": 0.6125, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.783279418945312, + "rewards/margins": 0.3777047395706177, + "rewards/rejected": -13.160983085632324, + "step": 2782 + }, + { + "epoch": 1.9204761083318957, + "grad_norm": 0.28113070130348206, + "learning_rate": 4.499424626006905e-06, + "logits/chosen": 3.8119330406188965, + "logits/rejected": 3.993446111679077, + "logps/chosen": -174.4956512451172, + "logps/rejected": -183.4563446044922, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.618501663208008, + "rewards/margins": 0.9238720536231995, + "rewards/rejected": -13.542373657226562, + "step": 2783 + }, + { + "epoch": 1.921166120407107, + "grad_norm": 0.18526478111743927, + "learning_rate": 4.496547756041428e-06, + "logits/chosen": 3.1894748210906982, + "logits/rejected": 3.3314998149871826, + "logps/chosen": -160.52163696289062, + "logps/rejected": -195.40675354003906, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.321535110473633, + "rewards/margins": 3.587096691131592, + "rewards/rejected": -14.908632278442383, + "step": 2784 + }, + { + "epoch": 1.9218561324823185, + "grad_norm": 0.3281995356082916, + "learning_rate": 4.4936708860759495e-06, + "logits/chosen": 3.356642723083496, + "logits/rejected": 3.356642723083496, + "logps/chosen": -175.70382690429688, + "logps/rejected": -175.70382690429688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.828838348388672, + "rewards/margins": 0.0, + "rewards/rejected": -12.828838348388672, + "step": 2785 + }, + { + "epoch": 1.9225461445575296, + "grad_norm": 0.25748637318611145, + "learning_rate": 4.490794016110472e-06, + "logits/chosen": 3.3602123260498047, + "logits/rejected": 3.3559679985046387, + "logps/chosen": -157.73846435546875, + "logps/rejected": -184.82525634765625, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.164237976074219, + "rewards/margins": 2.706890106201172, + "rewards/rejected": -13.87112808227539, + "step": 2786 + }, + { + "epoch": 1.9232361566327412, + "grad_norm": 2.1843225955963135, + "learning_rate": 4.487917146144994e-06, + "logits/chosen": 3.7943358421325684, + "logits/rejected": 3.699338436126709, + "logps/chosen": -183.3787841796875, + "logps/rejected": -185.33673095703125, + "loss": 0.6251, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.498533248901367, + "rewards/margins": 0.22910022735595703, + "rewards/rejected": -13.727633476257324, + "step": 2787 + }, + { + "epoch": 1.9239261687079523, + "grad_norm": 0.2298513948917389, + "learning_rate": 4.485040276179517e-06, + "logits/chosen": 3.2122249603271484, + "logits/rejected": 3.2271270751953125, + "logps/chosen": -163.49880981445312, + "logps/rejected": -174.82643127441406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.711655616760254, + "rewards/margins": 1.148584246635437, + "rewards/rejected": -12.86023998260498, + "step": 2788 + }, + { + "epoch": 1.9246161807831637, + "grad_norm": 0.3106127977371216, + "learning_rate": 4.48216340621404e-06, + "logits/chosen": 3.5488874912261963, + "logits/rejected": 3.699904203414917, + "logps/chosen": -157.4092254638672, + "logps/rejected": -165.3446044921875, + "loss": 0.6067, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.984188079833984, + "rewards/margins": 0.800639808177948, + "rewards/rejected": -11.78482723236084, + "step": 2789 + }, + { + "epoch": 1.925306192858375, + "grad_norm": 0.29321563243865967, + "learning_rate": 4.479286536248562e-06, + "logits/chosen": 3.5059196949005127, + "logits/rejected": 3.5059196949005127, + "logps/chosen": -184.60025024414062, + "logps/rejected": -184.60023498535156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.58078384399414, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.58078384399414, + "step": 2790 + }, + { + "epoch": 1.9259962049335864, + "grad_norm": 0.23330141603946686, + "learning_rate": 4.4764096662830845e-06, + "logits/chosen": 3.2079358100891113, + "logits/rejected": 3.385793447494507, + "logps/chosen": -159.29879760742188, + "logps/rejected": -187.00912475585938, + "loss": 0.4337, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.102688789367676, + "rewards/margins": 2.7929961681365967, + "rewards/rejected": -13.895685195922852, + "step": 2791 + }, + { + "epoch": 1.9266862170087977, + "grad_norm": 0.2990197539329529, + "learning_rate": 4.4735327963176064e-06, + "logits/chosen": 3.346794605255127, + "logits/rejected": 3.480147123336792, + "logps/chosen": -151.34046936035156, + "logps/rejected": -172.6951141357422, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.626072883605957, + "rewards/margins": 2.06298828125, + "rewards/rejected": -12.689061164855957, + "step": 2792 + }, + { + "epoch": 1.9273762290840089, + "grad_norm": 0.24179020524024963, + "learning_rate": 4.470655926352129e-06, + "logits/chosen": 3.2609949111938477, + "logits/rejected": 3.384519577026367, + "logps/chosen": -162.18040466308594, + "logps/rejected": -182.60955810546875, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.230664253234863, + "rewards/margins": 2.03816819190979, + "rewards/rejected": -13.26883316040039, + "step": 2793 + }, + { + "epoch": 1.9280662411592204, + "grad_norm": 11.69057559967041, + "learning_rate": 4.467779056386652e-06, + "logits/chosen": 3.2847588062286377, + "logits/rejected": 3.314824342727661, + "logps/chosen": -176.14649963378906, + "logps/rejected": -175.2977752685547, + "loss": 1.4315, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.976703643798828, + "rewards/margins": -0.09609705209732056, + "rewards/rejected": -12.880606651306152, + "step": 2794 + }, + { + "epoch": 1.9287562532344316, + "grad_norm": 0.2787216305732727, + "learning_rate": 4.464902186421174e-06, + "logits/chosen": 3.2656307220458984, + "logits/rejected": 3.526146173477173, + "logps/chosen": -179.87942504882812, + "logps/rejected": -189.20623779296875, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.34704875946045, + "rewards/margins": 0.9180007576942444, + "rewards/rejected": -14.265049934387207, + "step": 2795 + }, + { + "epoch": 1.929446265309643, + "grad_norm": 0.28076839447021484, + "learning_rate": 4.462025316455697e-06, + "logits/chosen": 3.280503749847412, + "logits/rejected": 3.4201388359069824, + "logps/chosen": -180.5030517578125, + "logps/rejected": -186.96551513671875, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.435829162597656, + "rewards/margins": 0.6172103881835938, + "rewards/rejected": -14.05303955078125, + "step": 2796 + }, + { + "epoch": 1.9301362773848543, + "grad_norm": 0.4302978217601776, + "learning_rate": 4.4591484464902195e-06, + "logits/chosen": 3.450653553009033, + "logits/rejected": 3.450653553009033, + "logps/chosen": -177.16778564453125, + "logps/rejected": -177.16778564453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.20901107788086, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.209010124206543, + "step": 2797 + }, + { + "epoch": 1.9308262894600654, + "grad_norm": 6.786022663116455, + "learning_rate": 4.456271576524741e-06, + "logits/chosen": 3.205557346343994, + "logits/rejected": 3.182001829147339, + "logps/chosen": -161.53457641601562, + "logps/rejected": -161.76806640625, + "loss": 0.667, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.300422668457031, + "rewards/margins": 0.05912572145462036, + "rewards/rejected": -11.359546661376953, + "step": 2798 + }, + { + "epoch": 1.931516301535277, + "grad_norm": 16.291845321655273, + "learning_rate": 4.453394706559263e-06, + "logits/chosen": 3.1978843212127686, + "logits/rejected": 3.183098077774048, + "logps/chosen": -160.55320739746094, + "logps/rejected": -155.75128173828125, + "loss": 1.0691, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.25306224822998, + "rewards/margins": -0.45944690704345703, + "rewards/rejected": -10.793615341186523, + "step": 2799 + }, + { + "epoch": 1.9322063136104881, + "grad_norm": 4.51871395111084, + "learning_rate": 4.450517836593786e-06, + "logits/chosen": 2.9784915447235107, + "logits/rejected": 3.2669517993927, + "logps/chosen": -111.56396484375, + "logps/rejected": -145.57577514648438, + "loss": 0.3036, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.835465431213379, + "rewards/margins": 3.253345489501953, + "rewards/rejected": -10.088811874389648, + "step": 2800 + }, + { + "epoch": 1.9328963256856995, + "grad_norm": 0.31286194920539856, + "learning_rate": 4.447640966628309e-06, + "logits/chosen": 3.456482172012329, + "logits/rejected": 3.456482172012329, + "logps/chosen": -167.65406799316406, + "logps/rejected": -167.654052734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.904035568237305, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -11.904035568237305, + "step": 2801 + }, + { + "epoch": 1.9335863377609108, + "grad_norm": 0.30805763602256775, + "learning_rate": 4.444764096662832e-06, + "logits/chosen": 3.332749843597412, + "logits/rejected": 3.3325798511505127, + "logps/chosen": -166.32659912109375, + "logps/rejected": -176.35055541992188, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.850899696350098, + "rewards/margins": 1.0919172763824463, + "rewards/rejected": -12.942816734313965, + "step": 2802 + }, + { + "epoch": 1.9342763498361222, + "grad_norm": 0.3137570917606354, + "learning_rate": 4.4418872266973536e-06, + "logits/chosen": 3.2882728576660156, + "logits/rejected": 3.303349494934082, + "logps/chosen": -165.6127166748047, + "logps/rejected": -171.64501953125, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.908140182495117, + "rewards/margins": 0.6282888650894165, + "rewards/rejected": -12.536429405212402, + "step": 2803 + }, + { + "epoch": 1.9349663619113335, + "grad_norm": 0.302746444940567, + "learning_rate": 4.439010356731876e-06, + "logits/chosen": 3.285430669784546, + "logits/rejected": 3.307438850402832, + "logps/chosen": -178.73654174804688, + "logps/rejected": -183.95452880859375, + "loss": 0.6083, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.112105369567871, + "rewards/margins": 0.5261302590370178, + "rewards/rejected": -13.638235092163086, + "step": 2804 + }, + { + "epoch": 1.9356563739865447, + "grad_norm": 0.391905814409256, + "learning_rate": 4.436133486766398e-06, + "logits/chosen": 3.183659791946411, + "logits/rejected": 3.3363442420959473, + "logps/chosen": -142.57369995117188, + "logps/rejected": -166.50144958496094, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.459012985229492, + "rewards/margins": 2.393038272857666, + "rewards/rejected": -11.852051734924316, + "step": 2805 + }, + { + "epoch": 1.9363463860617562, + "grad_norm": 0.2908235192298889, + "learning_rate": 4.433256616800921e-06, + "logits/chosen": 3.2254555225372314, + "logits/rejected": 3.2254555225372314, + "logps/chosen": -169.180419921875, + "logps/rejected": -169.180419921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.184091567993164, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.184091567993164, + "step": 2806 + }, + { + "epoch": 1.9370363981369674, + "grad_norm": 0.27530890703201294, + "learning_rate": 4.430379746835443e-06, + "logits/chosen": 3.601879119873047, + "logits/rejected": 3.6843762397766113, + "logps/chosen": -179.1155242919922, + "logps/rejected": -191.86868286132812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.137577056884766, + "rewards/margins": 1.2707974910736084, + "rewards/rejected": -14.408374786376953, + "step": 2807 + }, + { + "epoch": 1.9377264102121787, + "grad_norm": 0.446663498878479, + "learning_rate": 4.427502876869966e-06, + "logits/chosen": 3.3261866569519043, + "logits/rejected": 3.3388278484344482, + "logps/chosen": -157.25677490234375, + "logps/rejected": -162.2996826171875, + "loss": 0.6128, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.027447700500488, + "rewards/margins": 0.3713025152683258, + "rewards/rejected": -11.398750305175781, + "step": 2808 + }, + { + "epoch": 1.93841642228739, + "grad_norm": 0.312203973531723, + "learning_rate": 4.4246260069044885e-06, + "logits/chosen": 3.9385018348693848, + "logits/rejected": 4.138514995574951, + "logps/chosen": -176.69342041015625, + "logps/rejected": -183.87684631347656, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.944676399230957, + "rewards/margins": 0.6766402721405029, + "rewards/rejected": -13.621316909790039, + "step": 2809 + }, + { + "epoch": 1.9391064343626012, + "grad_norm": 5.996253967285156, + "learning_rate": 4.4217491369390104e-06, + "logits/chosen": 3.3041605949401855, + "logits/rejected": 3.3698410987854004, + "logps/chosen": -166.3886260986328, + "logps/rejected": -173.60397338867188, + "loss": 0.5597, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.743066787719727, + "rewards/margins": 0.7567404508590698, + "rewards/rejected": -12.499807357788086, + "step": 2810 + }, + { + "epoch": 1.9397964464378128, + "grad_norm": 1.2919620275497437, + "learning_rate": 4.418872266973533e-06, + "logits/chosen": 3.4294028282165527, + "logits/rejected": 3.451388120651245, + "logps/chosen": -152.725830078125, + "logps/rejected": -166.89852905273438, + "loss": 0.5337, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.331379890441895, + "rewards/margins": 1.4689161777496338, + "rewards/rejected": -11.80029582977295, + "step": 2811 + }, + { + "epoch": 1.940486458513024, + "grad_norm": 4.2946367263793945, + "learning_rate": 4.415995397008055e-06, + "logits/chosen": 3.658846855163574, + "logits/rejected": 3.7329630851745605, + "logps/chosen": -162.7409210205078, + "logps/rejected": -181.07327270507812, + "loss": 0.4667, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.417168617248535, + "rewards/margins": 1.855790376663208, + "rewards/rejected": -13.27295970916748, + "step": 2812 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.2595219314098358, + "learning_rate": 4.413118527042578e-06, + "logits/chosen": 3.4200665950775146, + "logits/rejected": 3.4200665950775146, + "logps/chosen": -175.67572021484375, + "logps/rejected": -175.67572021484375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.74948501586914, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.74948501586914, + "step": 2813 + }, + { + "epoch": 1.9418664826634466, + "grad_norm": 0.38421985507011414, + "learning_rate": 4.410241657077101e-06, + "logits/chosen": 3.4821767807006836, + "logits/rejected": 3.556227684020996, + "logps/chosen": -177.8404541015625, + "logps/rejected": -182.71807861328125, + "loss": 0.6084, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.956184387207031, + "rewards/margins": 0.5231709480285645, + "rewards/rejected": -13.47935676574707, + "step": 2814 + }, + { + "epoch": 1.9425564947386578, + "grad_norm": 0.2724284827709198, + "learning_rate": 4.4073647871116235e-06, + "logits/chosen": 3.5486137866973877, + "logits/rejected": 3.6801998615264893, + "logps/chosen": -186.9138946533203, + "logps/rejected": -195.31268310546875, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.908463478088379, + "rewards/margins": 0.8189578056335449, + "rewards/rejected": -14.727420806884766, + "step": 2815 + }, + { + "epoch": 1.9432465068138693, + "grad_norm": 0.329088032245636, + "learning_rate": 4.404487917146145e-06, + "logits/chosen": 3.691866874694824, + "logits/rejected": 3.691866874694824, + "logps/chosen": -172.6165313720703, + "logps/rejected": -172.6165313720703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.59548568725586, + "rewards/margins": 0.0, + "rewards/rejected": -12.59548568725586, + "step": 2816 + }, + { + "epoch": 1.9439365188890805, + "grad_norm": 0.2673545479774475, + "learning_rate": 4.401611047180668e-06, + "logits/chosen": 3.7382826805114746, + "logits/rejected": 3.8301548957824707, + "logps/chosen": -168.82293701171875, + "logps/rejected": -183.00961303710938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.04597282409668, + "rewards/margins": 1.459219217300415, + "rewards/rejected": -13.505191802978516, + "step": 2817 + }, + { + "epoch": 1.9446265309642918, + "grad_norm": 0.3027840554714203, + "learning_rate": 4.39873417721519e-06, + "logits/chosen": 3.711658000946045, + "logits/rejected": 3.711658000946045, + "logps/chosen": -180.9027862548828, + "logps/rejected": -180.9027862548828, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.276816368103027, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.276816368103027, + "step": 2818 + }, + { + "epoch": 1.9453165430395032, + "grad_norm": 0.3022303283214569, + "learning_rate": 4.395857307249713e-06, + "logits/chosen": 3.7474687099456787, + "logits/rejected": 3.7474687099456787, + "logps/chosen": -179.24432373046875, + "logps/rejected": -179.24432373046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.068750381469727, + "rewards/margins": 0.0, + "rewards/rejected": -13.068750381469727, + "step": 2819 + }, + { + "epoch": 1.9460065551147145, + "grad_norm": 0.2593997120857239, + "learning_rate": 4.392980437284235e-06, + "logits/chosen": 3.555026054382324, + "logits/rejected": 3.7105355262756348, + "logps/chosen": -176.8770751953125, + "logps/rejected": -185.89173889160156, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.899709701538086, + "rewards/margins": 0.9122428297996521, + "rewards/rejected": -13.811952590942383, + "step": 2820 + }, + { + "epoch": 1.946696567189926, + "grad_norm": 12.616358757019043, + "learning_rate": 4.390103567318758e-06, + "logits/chosen": 3.965710163116455, + "logits/rejected": 4.040050506591797, + "logps/chosen": -193.00001525878906, + "logps/rejected": -189.11428833007812, + "loss": 1.0121, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.563578605651855, + "rewards/margins": -0.40059876441955566, + "rewards/rejected": -14.162980079650879, + "step": 2821 + }, + { + "epoch": 1.947386579265137, + "grad_norm": 0.24908895790576935, + "learning_rate": 4.38722669735328e-06, + "logits/chosen": 3.6012258529663086, + "logits/rejected": 3.6396331787109375, + "logps/chosen": -161.5787353515625, + "logps/rejected": -171.75161743164062, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.478219032287598, + "rewards/margins": 0.9744905233383179, + "rewards/rejected": -12.452710151672363, + "step": 2822 + }, + { + "epoch": 1.9480765913403486, + "grad_norm": 0.3905836045742035, + "learning_rate": 4.384349827387802e-06, + "logits/chosen": 3.746753454208374, + "logits/rejected": 3.746753454208374, + "logps/chosen": -163.4184112548828, + "logps/rejected": -163.4184112548828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.560827255249023, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -11.560826301574707, + "step": 2823 + }, + { + "epoch": 1.9487666034155597, + "grad_norm": 0.2818002998828888, + "learning_rate": 4.381472957422325e-06, + "logits/chosen": 3.5228443145751953, + "logits/rejected": 3.6087145805358887, + "logps/chosen": -172.28924560546875, + "logps/rejected": -188.2456817626953, + "loss": 0.5206, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.527592658996582, + "rewards/margins": 1.6432011127471924, + "rewards/rejected": -14.170793533325195, + "step": 2824 + }, + { + "epoch": 1.949456615490771, + "grad_norm": 0.819961428642273, + "learning_rate": 4.378596087456847e-06, + "logits/chosen": 3.340906858444214, + "logits/rejected": 3.660367488861084, + "logps/chosen": -154.92935180664062, + "logps/rejected": -187.02915954589844, + "loss": 0.3532, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.751527786254883, + "rewards/margins": 3.176562786102295, + "rewards/rejected": -13.928091049194336, + "step": 2825 + }, + { + "epoch": 1.9501466275659824, + "grad_norm": 0.25885623693466187, + "learning_rate": 4.37571921749137e-06, + "logits/chosen": 3.283871650695801, + "logits/rejected": 3.440001964569092, + "logps/chosen": -168.36904907226562, + "logps/rejected": -176.34375, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.27131175994873, + "rewards/margins": 0.7790913581848145, + "rewards/rejected": -13.050403594970703, + "step": 2826 + }, + { + "epoch": 1.9508366396411936, + "grad_norm": 0.20007793605327606, + "learning_rate": 4.3728423475258925e-06, + "logits/chosen": 3.4839727878570557, + "logits/rejected": 3.542207956314087, + "logps/chosen": -169.38560485839844, + "logps/rejected": -180.86317443847656, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.973038673400879, + "rewards/margins": 1.19489586353302, + "rewards/rejected": -13.16793441772461, + "step": 2827 + }, + { + "epoch": 1.9515266517164052, + "grad_norm": 0.31375402212142944, + "learning_rate": 4.3699654775604145e-06, + "logits/chosen": 3.7591652870178223, + "logits/rejected": 3.7591652870178223, + "logps/chosen": -153.85372924804688, + "logps/rejected": -153.85372924804688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.452235221862793, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -10.452235221862793, + "step": 2828 + }, + { + "epoch": 1.9522166637916163, + "grad_norm": 22.087732315063477, + "learning_rate": 4.367088607594937e-06, + "logits/chosen": 3.3836803436279297, + "logits/rejected": 3.381406784057617, + "logps/chosen": -181.37705993652344, + "logps/rejected": -178.16204833984375, + "loss": 0.9863, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.426091194152832, + "rewards/margins": -0.37369972467422485, + "rewards/rejected": -13.05239200592041, + "step": 2829 + }, + { + "epoch": 1.9529066758668276, + "grad_norm": 0.3303023874759674, + "learning_rate": 4.364211737629459e-06, + "logits/chosen": 3.5466384887695312, + "logits/rejected": 3.5466384887695312, + "logps/chosen": -162.9943389892578, + "logps/rejected": -162.99435424804688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.447458267211914, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -11.447458267211914, + "step": 2830 + }, + { + "epoch": 1.953596687942039, + "grad_norm": 0.2589588165283203, + "learning_rate": 4.361334867663982e-06, + "logits/chosen": 3.670518636703491, + "logits/rejected": 3.804551124572754, + "logps/chosen": -171.58865356445312, + "logps/rejected": -185.74917602539062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.375720977783203, + "rewards/margins": 1.4276461601257324, + "rewards/rejected": -13.803367614746094, + "step": 2831 + }, + { + "epoch": 1.9542867000172504, + "grad_norm": 0.2683882713317871, + "learning_rate": 4.358457997698504e-06, + "logits/chosen": 3.684464693069458, + "logits/rejected": 3.763859987258911, + "logps/chosen": -161.70431518554688, + "logps/rejected": -174.68487548828125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.346609115600586, + "rewards/margins": 1.302681565284729, + "rewards/rejected": -12.649290084838867, + "step": 2832 + }, + { + "epoch": 1.9549767120924617, + "grad_norm": 0.3664386570453644, + "learning_rate": 4.355581127733027e-06, + "logits/chosen": 3.862668037414551, + "logits/rejected": 3.862668037414551, + "logps/chosen": -174.51930236816406, + "logps/rejected": -174.519287109375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.629562377929688, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.629561424255371, + "step": 2833 + }, + { + "epoch": 1.9556667241676728, + "grad_norm": 0.3423043191432953, + "learning_rate": 4.3527042577675494e-06, + "logits/chosen": 3.6287338733673096, + "logits/rejected": 4.129047393798828, + "logps/chosen": -151.12355041503906, + "logps/rejected": -180.89244079589844, + "loss": 0.3499, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.40894889831543, + "rewards/margins": 3.0913124084472656, + "rewards/rejected": -13.500261306762695, + "step": 2834 + }, + { + "epoch": 1.9563567362428842, + "grad_norm": 16.5445556640625, + "learning_rate": 4.349827387802072e-06, + "logits/chosen": 3.8006796836853027, + "logits/rejected": 3.823607921600342, + "logps/chosen": -166.858642578125, + "logps/rejected": -181.51097106933594, + "loss": 0.7078, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.91103744506836, + "rewards/margins": 1.4315638542175293, + "rewards/rejected": -13.342601776123047, + "step": 2835 + }, + { + "epoch": 1.9570467483180956, + "grad_norm": 0.3428533971309662, + "learning_rate": 4.346950517836594e-06, + "logits/chosen": 3.7623023986816406, + "logits/rejected": 3.7623023986816406, + "logps/chosen": -182.14093017578125, + "logps/rejected": -182.14093017578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.583656311035156, + "rewards/margins": 0.0, + "rewards/rejected": -13.583656311035156, + "step": 2836 + }, + { + "epoch": 1.957736760393307, + "grad_norm": 0.2681604027748108, + "learning_rate": 4.344073647871117e-06, + "logits/chosen": 3.369469165802002, + "logits/rejected": 3.369469165802002, + "logps/chosen": -168.31385803222656, + "logps/rejected": -168.3138427734375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.972055435180664, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.972054481506348, + "step": 2837 + }, + { + "epoch": 1.9584267724685183, + "grad_norm": 0.26169469952583313, + "learning_rate": 4.341196777905639e-06, + "logits/chosen": 3.660891056060791, + "logits/rejected": 3.722012996673584, + "logps/chosen": -173.4158172607422, + "logps/rejected": -180.69113159179688, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.369840621948242, + "rewards/margins": 0.769752025604248, + "rewards/rejected": -13.139593124389648, + "step": 2838 + }, + { + "epoch": 1.9591167845437294, + "grad_norm": 0.2565465569496155, + "learning_rate": 4.338319907940162e-06, + "logits/chosen": 3.5202505588531494, + "logits/rejected": 3.5430476665496826, + "logps/chosen": -184.2696533203125, + "logps/rejected": -193.12066650390625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.723000526428223, + "rewards/margins": 0.8626099824905396, + "rewards/rejected": -14.585610389709473, + "step": 2839 + }, + { + "epoch": 1.959806796618941, + "grad_norm": 19.636953353881836, + "learning_rate": 4.3354430379746835e-06, + "logits/chosen": 3.7335205078125, + "logits/rejected": 3.653104782104492, + "logps/chosen": -180.70925903320312, + "logps/rejected": -177.99801635742188, + "loss": 0.8863, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.323781967163086, + "rewards/margins": -0.26574695110321045, + "rewards/rejected": -13.058035850524902, + "step": 2840 + }, + { + "epoch": 1.960496808694152, + "grad_norm": 0.3471769094467163, + "learning_rate": 4.332566168009206e-06, + "logits/chosen": 3.7033469676971436, + "logits/rejected": 3.7033469676971436, + "logps/chosen": -167.528564453125, + "logps/rejected": -167.528564453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.886224746704102, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.886224746704102, + "step": 2841 + }, + { + "epoch": 1.9611868207693635, + "grad_norm": 0.28109487891197205, + "learning_rate": 4.329689298043729e-06, + "logits/chosen": 3.610375165939331, + "logits/rejected": 3.565453290939331, + "logps/chosen": -178.6652374267578, + "logps/rejected": -187.27915954589844, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.995033264160156, + "rewards/margins": 0.8395074605941772, + "rewards/rejected": -13.834541320800781, + "step": 2842 + }, + { + "epoch": 1.9618768328445748, + "grad_norm": 0.3490496277809143, + "learning_rate": 4.326812428078251e-06, + "logits/chosen": 3.7055411338806152, + "logits/rejected": 3.812958240509033, + "logps/chosen": -149.0777587890625, + "logps/rejected": -154.34164428710938, + "loss": 0.6083, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.201545715332031, + "rewards/margins": 0.5273498892784119, + "rewards/rejected": -10.728896141052246, + "step": 2843 + }, + { + "epoch": 1.962566844919786, + "grad_norm": 9.86543083190918, + "learning_rate": 4.323935558112774e-06, + "logits/chosen": 3.4560165405273438, + "logits/rejected": 3.3924319744110107, + "logps/chosen": -179.91744995117188, + "logps/rejected": -180.03433227539062, + "loss": 0.6696, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.334587097167969, + "rewards/margins": 0.052667856216430664, + "rewards/rejected": -13.38725471496582, + "step": 2844 + }, + { + "epoch": 1.9632568569949975, + "grad_norm": 0.30929920077323914, + "learning_rate": 4.321058688147296e-06, + "logits/chosen": 3.612014055252075, + "logits/rejected": 3.8223278522491455, + "logps/chosen": -145.29403686523438, + "logps/rejected": -152.39813232421875, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.85416030883789, + "rewards/margins": 0.7493494153022766, + "rewards/rejected": -10.603509902954102, + "step": 2845 + }, + { + "epoch": 1.9639468690702087, + "grad_norm": 7.616428852081299, + "learning_rate": 4.3181818181818185e-06, + "logits/chosen": 3.3142597675323486, + "logits/rejected": 3.467393636703491, + "logps/chosen": -152.13369750976562, + "logps/rejected": -173.3803253173828, + "loss": 0.4566, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.449499130249023, + "rewards/margins": 2.2267990112304688, + "rewards/rejected": -12.676298141479492, + "step": 2846 + }, + { + "epoch": 1.96463688114542, + "grad_norm": 0.24713902175426483, + "learning_rate": 4.315304948216341e-06, + "logits/chosen": 3.8760275840759277, + "logits/rejected": 4.064461708068848, + "logps/chosen": -159.49888610839844, + "logps/rejected": -182.62109375, + "loss": 0.4348, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.116109848022461, + "rewards/margins": 2.282031297683716, + "rewards/rejected": -13.398141860961914, + "step": 2847 + }, + { + "epoch": 1.9653268932206314, + "grad_norm": 0.3131541907787323, + "learning_rate": 4.312428078250863e-06, + "logits/chosen": 3.744859218597412, + "logits/rejected": 3.744859218597412, + "logps/chosen": -183.42483520507812, + "logps/rejected": -183.42483520507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.716280937194824, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.716279983520508, + "step": 2848 + }, + { + "epoch": 1.9660169052958427, + "grad_norm": 22.618925094604492, + "learning_rate": 4.309551208285386e-06, + "logits/chosen": 3.509705066680908, + "logits/rejected": 3.460988759994507, + "logps/chosen": -167.3538360595703, + "logps/rejected": -174.5848846435547, + "loss": 1.0811, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.880327224731445, + "rewards/margins": 0.7026752233505249, + "rewards/rejected": -12.583002090454102, + "step": 2849 + }, + { + "epoch": 1.966706917371054, + "grad_norm": 0.31075870990753174, + "learning_rate": 4.306674338319909e-06, + "logits/chosen": 3.626481771469116, + "logits/rejected": 3.6342356204986572, + "logps/chosen": -156.54859924316406, + "logps/rejected": -187.45211791992188, + "loss": 0.4347, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.915562629699707, + "rewards/margins": 3.1005334854125977, + "rewards/rejected": -14.016096115112305, + "step": 2850 + }, + { + "epoch": 1.9673969294462652, + "grad_norm": 0.21563448011875153, + "learning_rate": 4.303797468354431e-06, + "logits/chosen": 3.493840456008911, + "logits/rejected": 3.598299741744995, + "logps/chosen": -155.8846435546875, + "logps/rejected": -184.87576293945312, + "loss": 0.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.909268379211426, + "rewards/margins": 2.9797210693359375, + "rewards/rejected": -13.88899040222168, + "step": 2851 + }, + { + "epoch": 1.9680869415214768, + "grad_norm": 0.3619476854801178, + "learning_rate": 4.300920598388953e-06, + "logits/chosen": 3.486367702484131, + "logits/rejected": 3.486367702484131, + "logps/chosen": -179.47503662109375, + "logps/rejected": -179.47503662109375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.106897354125977, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.106897354125977, + "step": 2852 + }, + { + "epoch": 1.968776953596688, + "grad_norm": 0.3327350914478302, + "learning_rate": 4.298043728423475e-06, + "logits/chosen": 4.037607669830322, + "logits/rejected": 4.037607669830322, + "logps/chosen": -167.9430694580078, + "logps/rejected": -167.94305419921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.060501098632812, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.060501098632812, + "step": 2853 + }, + { + "epoch": 1.9694669656718993, + "grad_norm": 0.3015710711479187, + "learning_rate": 4.295166858457998e-06, + "logits/chosen": 3.925225257873535, + "logits/rejected": 3.925225257873535, + "logps/chosen": -159.93380737304688, + "logps/rejected": -159.93380737304688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.376029968261719, + "rewards/margins": 0.0, + "rewards/rejected": -11.376029968261719, + "step": 2854 + }, + { + "epoch": 1.9701569777471106, + "grad_norm": 0.3621940016746521, + "learning_rate": 4.292289988492521e-06, + "logits/chosen": 3.528651714324951, + "logits/rejected": 3.4870033264160156, + "logps/chosen": -166.19212341308594, + "logps/rejected": -174.00799560546875, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.95134162902832, + "rewards/margins": 0.8868016004562378, + "rewards/rejected": -12.838142395019531, + "step": 2855 + }, + { + "epoch": 1.9708469898223218, + "grad_norm": 1.3421814441680908, + "learning_rate": 4.289413118527043e-06, + "logits/chosen": 3.448965311050415, + "logits/rejected": 3.513892650604248, + "logps/chosen": -169.28384399414062, + "logps/rejected": -183.57940673828125, + "loss": 0.449, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.146596908569336, + "rewards/margins": 1.3589236736297607, + "rewards/rejected": -13.50551986694336, + "step": 2856 + }, + { + "epoch": 1.9715370018975333, + "grad_norm": 0.3240394592285156, + "learning_rate": 4.286536248561566e-06, + "logits/chosen": 3.568107843399048, + "logits/rejected": 3.635477066040039, + "logps/chosen": -155.04713439941406, + "logps/rejected": -165.45140075683594, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.872735977172852, + "rewards/margins": 0.9741135835647583, + "rewards/rejected": -11.84684944152832, + "step": 2857 + }, + { + "epoch": 1.9722270139727445, + "grad_norm": 0.2854647636413574, + "learning_rate": 4.2836593785960876e-06, + "logits/chosen": 3.7897348403930664, + "logits/rejected": 3.7897348403930664, + "logps/chosen": -168.269287109375, + "logps/rejected": -168.269287109375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.905067443847656, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.905067443847656, + "step": 2858 + }, + { + "epoch": 1.9729170260479558, + "grad_norm": 0.30067211389541626, + "learning_rate": 4.28078250863061e-06, + "logits/chosen": 3.4939513206481934, + "logits/rejected": 3.5627012252807617, + "logps/chosen": -167.10365295410156, + "logps/rejected": -174.22647094726562, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.924074172973633, + "rewards/margins": 0.6597108840942383, + "rewards/rejected": -12.583785057067871, + "step": 2859 + }, + { + "epoch": 1.9736070381231672, + "grad_norm": 0.44093379378318787, + "learning_rate": 4.277905638665132e-06, + "logits/chosen": 3.692631721496582, + "logits/rejected": 3.692631721496582, + "logps/chosen": -165.45883178710938, + "logps/rejected": -165.45883178710938, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.741809844970703, + "rewards/margins": 1.1920928955078125e-06, + "rewards/rejected": -11.741811752319336, + "step": 2860 + }, + { + "epoch": 1.9742970501983783, + "grad_norm": 1.9263050556182861, + "learning_rate": 4.275028768699655e-06, + "logits/chosen": 3.8296806812286377, + "logits/rejected": 4.012835502624512, + "logps/chosen": -173.67630004882812, + "logps/rejected": -187.45840454101562, + "loss": 0.5367, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.649480819702148, + "rewards/margins": 1.3456820249557495, + "rewards/rejected": -13.995162010192871, + "step": 2861 + }, + { + "epoch": 1.9749870622735899, + "grad_norm": 0.2557711601257324, + "learning_rate": 4.272151898734178e-06, + "logits/chosen": 3.821492910385132, + "logits/rejected": 3.821492910385132, + "logps/chosen": -163.845458984375, + "logps/rejected": -163.845458984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.634981155395508, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.634979248046875, + "step": 2862 + }, + { + "epoch": 1.975677074348801, + "grad_norm": 0.2350674718618393, + "learning_rate": 4.2692750287687e-06, + "logits/chosen": 3.710731029510498, + "logits/rejected": 3.7619080543518066, + "logps/chosen": -167.8184814453125, + "logps/rejected": -176.52108764648438, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.965917587280273, + "rewards/margins": 0.8728161454200745, + "rewards/rejected": -12.838733673095703, + "step": 2863 + }, + { + "epoch": 1.9763670864240124, + "grad_norm": 0.7172220945358276, + "learning_rate": 4.2663981588032225e-06, + "logits/chosen": 3.6410021781921387, + "logits/rejected": 3.973881721496582, + "logps/chosen": -153.3594970703125, + "logps/rejected": -169.98712158203125, + "loss": 0.4395, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.504883766174316, + "rewards/margins": 1.6261067390441895, + "rewards/rejected": -12.130990982055664, + "step": 2864 + }, + { + "epoch": 1.9770570984992237, + "grad_norm": 0.2847665548324585, + "learning_rate": 4.2635212888377444e-06, + "logits/chosen": 3.6097588539123535, + "logits/rejected": 3.79415225982666, + "logps/chosen": -167.22930908203125, + "logps/rejected": -177.60211181640625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.126571655273438, + "rewards/margins": 1.0081760883331299, + "rewards/rejected": -13.134747505187988, + "step": 2865 + }, + { + "epoch": 1.977747110574435, + "grad_norm": 0.23771318793296814, + "learning_rate": 4.260644418872267e-06, + "logits/chosen": 4.0217790603637695, + "logits/rejected": 4.0217790603637695, + "logps/chosen": -174.04312133789062, + "logps/rejected": -174.04312133789062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.583524703979492, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.583524703979492, + "step": 2866 + }, + { + "epoch": 1.9784371226496464, + "grad_norm": 0.3236018121242523, + "learning_rate": 4.25776754890679e-06, + "logits/chosen": 3.732093572616577, + "logits/rejected": 3.732093572616577, + "logps/chosen": -160.9734344482422, + "logps/rejected": -160.9734344482422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.428167343139648, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.428167343139648, + "step": 2867 + }, + { + "epoch": 1.9791271347248576, + "grad_norm": 7.78354549407959, + "learning_rate": 4.254890678941313e-06, + "logits/chosen": 3.7841224670410156, + "logits/rejected": 4.082548141479492, + "logps/chosen": -164.26068115234375, + "logps/rejected": -178.10903930664062, + "loss": 0.5541, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.727197647094727, + "rewards/margins": 1.3539789915084839, + "rewards/rejected": -13.0811767578125, + "step": 2868 + }, + { + "epoch": 1.9798171468000691, + "grad_norm": 0.2749975621700287, + "learning_rate": 4.252013808975835e-06, + "logits/chosen": 3.899340867996216, + "logits/rejected": 4.122406005859375, + "logps/chosen": -162.85858154296875, + "logps/rejected": -170.82586669921875, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.615789413452148, + "rewards/margins": 0.83230060338974, + "rewards/rejected": -12.448089599609375, + "step": 2869 + }, + { + "epoch": 1.9805071588752803, + "grad_norm": 0.26827630400657654, + "learning_rate": 4.2491369390103575e-06, + "logits/chosen": 3.498840808868408, + "logits/rejected": 3.665513753890991, + "logps/chosen": -153.02017211914062, + "logps/rejected": -164.7652587890625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.47331428527832, + "rewards/margins": 1.2298469543457031, + "rewards/rejected": -11.703161239624023, + "step": 2870 + }, + { + "epoch": 1.9811971709504916, + "grad_norm": 0.2974972128868103, + "learning_rate": 4.246260069044879e-06, + "logits/chosen": 3.8074793815612793, + "logits/rejected": 3.8074793815612793, + "logps/chosen": -184.3673858642578, + "logps/rejected": -184.3673858642578, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.632896423339844, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.632896423339844, + "step": 2871 + }, + { + "epoch": 1.981887183025703, + "grad_norm": 0.2733505666255951, + "learning_rate": 4.243383199079402e-06, + "logits/chosen": 3.675471067428589, + "logits/rejected": 3.675471067428589, + "logps/chosen": -193.48248291015625, + "logps/rejected": -193.48248291015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.657530784606934, + "rewards/margins": 0.0, + "rewards/rejected": -14.657530784606934, + "step": 2872 + }, + { + "epoch": 1.9825771951009141, + "grad_norm": 0.22939585149288177, + "learning_rate": 4.240506329113924e-06, + "logits/chosen": 3.5750129222869873, + "logits/rejected": 3.5944881439208984, + "logps/chosen": -169.03305053710938, + "logps/rejected": -183.73948669433594, + "loss": 0.5208, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.119977951049805, + "rewards/margins": 1.5369060039520264, + "rewards/rejected": -13.65688419342041, + "step": 2873 + }, + { + "epoch": 1.9832672071761257, + "grad_norm": 0.3299712538719177, + "learning_rate": 4.237629459148447e-06, + "logits/chosen": 3.8128247261047363, + "logits/rejected": 3.8128247261047363, + "logps/chosen": -174.27781677246094, + "logps/rejected": -174.27781677246094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.681306838989258, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.681306838989258, + "step": 2874 + }, + { + "epoch": 1.9839572192513368, + "grad_norm": 0.2628491520881653, + "learning_rate": 4.23475258918297e-06, + "logits/chosen": 3.935149669647217, + "logits/rejected": 3.935149669647217, + "logps/chosen": -178.31944274902344, + "logps/rejected": -178.31942749023438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.003177642822266, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -13.003177642822266, + "step": 2875 + }, + { + "epoch": 1.9846472313265482, + "grad_norm": 0.2472691535949707, + "learning_rate": 4.231875719217492e-06, + "logits/chosen": 3.8953821659088135, + "logits/rejected": 4.0324296951293945, + "logps/chosen": -159.07373046875, + "logps/rejected": -178.68087768554688, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.115768432617188, + "rewards/margins": 1.9935619831085205, + "rewards/rejected": -13.109329223632812, + "step": 2876 + }, + { + "epoch": 1.9853372434017595, + "grad_norm": 0.23878465592861176, + "learning_rate": 4.228998849252014e-06, + "logits/chosen": 3.90602445602417, + "logits/rejected": 4.082590103149414, + "logps/chosen": -171.12939453125, + "logps/rejected": -195.009765625, + "loss": 0.5202, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.299280166625977, + "rewards/margins": 2.32222318649292, + "rewards/rejected": -14.621501922607422, + "step": 2877 + }, + { + "epoch": 1.986027255476971, + "grad_norm": 0.30079540610313416, + "learning_rate": 4.226121979286536e-06, + "logits/chosen": 3.5049803256988525, + "logits/rejected": 3.609212636947632, + "logps/chosen": -178.762451171875, + "logps/rejected": -185.0229034423828, + "loss": 0.6077, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.96522331237793, + "rewards/margins": 0.5841350555419922, + "rewards/rejected": -13.549359321594238, + "step": 2878 + }, + { + "epoch": 1.9867172675521823, + "grad_norm": 0.27676355838775635, + "learning_rate": 4.223245109321059e-06, + "logits/chosen": 3.582731246948242, + "logits/rejected": 3.7048754692077637, + "logps/chosen": -166.38031005859375, + "logps/rejected": -180.97134399414062, + "loss": 0.521, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.873945236206055, + "rewards/margins": 1.3604212999343872, + "rewards/rejected": -13.234367370605469, + "step": 2879 + }, + { + "epoch": 1.9874072796273934, + "grad_norm": 0.2953374981880188, + "learning_rate": 4.220368239355582e-06, + "logits/chosen": 3.70674991607666, + "logits/rejected": 3.750349521636963, + "logps/chosen": -171.47789001464844, + "logps/rejected": -184.2628173828125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.15196704864502, + "rewards/margins": 1.309470772743225, + "rewards/rejected": -13.461438179016113, + "step": 2880 + }, + { + "epoch": 1.9880972917026047, + "grad_norm": 0.3892366886138916, + "learning_rate": 4.217491369390104e-06, + "logits/chosen": 3.593724489212036, + "logits/rejected": 3.7048027515411377, + "logps/chosen": -147.11102294921875, + "logps/rejected": -171.29693603515625, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.953903198242188, + "rewards/margins": 2.444077253341675, + "rewards/rejected": -12.397981643676758, + "step": 2881 + }, + { + "epoch": 1.988787303777816, + "grad_norm": 0.26588907837867737, + "learning_rate": 4.2146144994246265e-06, + "logits/chosen": 3.3439619541168213, + "logits/rejected": 3.6208407878875732, + "logps/chosen": -173.44775390625, + "logps/rejected": -193.7316436767578, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.51018238067627, + "rewards/margins": 1.9915574789047241, + "rewards/rejected": -14.501739501953125, + "step": 2882 + }, + { + "epoch": 1.9894773158530275, + "grad_norm": 0.34574294090270996, + "learning_rate": 4.2117376294591485e-06, + "logits/chosen": 3.1877763271331787, + "logits/rejected": 3.1877763271331787, + "logps/chosen": -164.68112182617188, + "logps/rejected": -164.68112182617188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.787700653076172, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.787700653076172, + "step": 2883 + }, + { + "epoch": 1.9901673279282388, + "grad_norm": 0.41742441058158875, + "learning_rate": 4.208860759493671e-06, + "logits/chosen": 3.1364917755126953, + "logits/rejected": 3.078326463699341, + "logps/chosen": -152.31475830078125, + "logps/rejected": -157.54234313964844, + "loss": 0.6089, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.636666297912598, + "rewards/margins": 0.4943103790283203, + "rewards/rejected": -11.130976676940918, + "step": 2884 + }, + { + "epoch": 1.99085734000345, + "grad_norm": 0.24640445411205292, + "learning_rate": 4.205983889528193e-06, + "logits/chosen": 3.6542272567749023, + "logits/rejected": 3.6542272567749023, + "logps/chosen": -171.47491455078125, + "logps/rejected": -171.47491455078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.29412841796875, + "rewards/margins": 0.0, + "rewards/rejected": -12.29412841796875, + "step": 2885 + }, + { + "epoch": 1.9915473520786615, + "grad_norm": 0.2542819380760193, + "learning_rate": 4.203107019562716e-06, + "logits/chosen": 3.7281577587127686, + "logits/rejected": 3.7184255123138428, + "logps/chosen": -181.576904296875, + "logps/rejected": -190.57705688476562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.301644325256348, + "rewards/margins": 0.9471811056137085, + "rewards/rejected": -14.248825073242188, + "step": 2886 + }, + { + "epoch": 1.9922373641538726, + "grad_norm": 13.777369499206543, + "learning_rate": 4.200230149597239e-06, + "logits/chosen": 3.7784533500671387, + "logits/rejected": 3.7882862091064453, + "logps/chosen": -181.4228057861328, + "logps/rejected": -179.27330017089844, + "loss": 0.8279, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.336213111877441, + "rewards/margins": -0.19809657335281372, + "rewards/rejected": -13.138116836547852, + "step": 2887 + }, + { + "epoch": 1.992927376229084, + "grad_norm": 0.2403673529624939, + "learning_rate": 4.1973532796317615e-06, + "logits/chosen": 3.6513595581054688, + "logits/rejected": 3.7336831092834473, + "logps/chosen": -174.34799194335938, + "logps/rejected": -186.27828979492188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.543437004089355, + "rewards/margins": 1.1817682981491089, + "rewards/rejected": -13.725205421447754, + "step": 2888 + }, + { + "epoch": 1.9936173883042954, + "grad_norm": 0.24705630540847778, + "learning_rate": 4.1944764096662834e-06, + "logits/chosen": 3.7532718181610107, + "logits/rejected": 3.9604737758636475, + "logps/chosen": -169.4977264404297, + "logps/rejected": -190.1123046875, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.17373275756836, + "rewards/margins": 2.0599493980407715, + "rewards/rejected": -14.233682632446289, + "step": 2889 + }, + { + "epoch": 1.9943074003795065, + "grad_norm": 9.71937370300293, + "learning_rate": 4.191599539700806e-06, + "logits/chosen": 3.8908486366271973, + "logits/rejected": 3.8502933979034424, + "logps/chosen": -158.92001342773438, + "logps/rejected": -183.032470703125, + "loss": 0.4928, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.166133880615234, + "rewards/margins": 2.4047017097473145, + "rewards/rejected": -13.57083511352539, + "step": 2890 + }, + { + "epoch": 1.994997412454718, + "grad_norm": 0.32057949900627136, + "learning_rate": 4.188722669735328e-06, + "logits/chosen": 3.0765538215637207, + "logits/rejected": 3.3443007469177246, + "logps/chosen": -146.70632934570312, + "logps/rejected": -166.94924926757812, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.97822380065918, + "rewards/margins": 1.9501186609268188, + "rewards/rejected": -11.928342819213867, + "step": 2891 + }, + { + "epoch": 1.9956874245299292, + "grad_norm": 0.3298564553260803, + "learning_rate": 4.185845799769851e-06, + "logits/chosen": 3.3738155364990234, + "logits/rejected": 3.3738155364990234, + "logps/chosen": -175.62631225585938, + "logps/rejected": -175.62631225585938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.944750785827637, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.944750785827637, + "step": 2892 + }, + { + "epoch": 1.9963774366051406, + "grad_norm": 0.2439972162246704, + "learning_rate": 4.182968929804373e-06, + "logits/chosen": 3.7478604316711426, + "logits/rejected": 3.821718215942383, + "logps/chosen": -165.1958465576172, + "logps/rejected": -190.56993103027344, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.833754539489746, + "rewards/margins": 2.568751811981201, + "rewards/rejected": -14.402506828308105, + "step": 2893 + }, + { + "epoch": 1.997067448680352, + "grad_norm": 0.47782063484191895, + "learning_rate": 4.180092059838896e-06, + "logits/chosen": 4.041431427001953, + "logits/rejected": 4.178961277008057, + "logps/chosen": -170.15762329101562, + "logps/rejected": -175.34268188476562, + "loss": 0.609, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.239713668823242, + "rewards/margins": 0.49004000425338745, + "rewards/rejected": -12.729753494262695, + "step": 2894 + }, + { + "epoch": 1.9977574607555633, + "grad_norm": 0.21613097190856934, + "learning_rate": 4.177215189873418e-06, + "logits/chosen": 3.851412773132324, + "logits/rejected": 3.945155620574951, + "logps/chosen": -185.85708618164062, + "logps/rejected": -196.41876220703125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.08791732788086, + "rewards/margins": 1.0641800165176392, + "rewards/rejected": -15.15209674835205, + "step": 2895 + }, + { + "epoch": 1.9984474728307746, + "grad_norm": 1.3986449241638184, + "learning_rate": 4.17433831990794e-06, + "logits/chosen": 3.626668691635132, + "logits/rejected": 3.691685676574707, + "logps/chosen": -153.9080352783203, + "logps/rejected": -168.9247589111328, + "loss": 0.5279, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.6642484664917, + "rewards/margins": 1.5100808143615723, + "rewards/rejected": -12.17432975769043, + "step": 2896 + }, + { + "epoch": 1.9991374849059858, + "grad_norm": 0.5098597407341003, + "learning_rate": 4.171461449942463e-06, + "logits/chosen": 3.4052035808563232, + "logits/rejected": 3.7062323093414307, + "logps/chosen": -186.32386779785156, + "logps/rejected": -201.33139038085938, + "loss": 0.524, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.073335647583008, + "rewards/margins": 1.4896739721298218, + "rewards/rejected": -15.563009262084961, + "step": 2897 + }, + { + "epoch": 1.9998274969811973, + "grad_norm": 0.7078998684883118, + "learning_rate": 4.168584579976985e-06, + "logits/chosen": 3.768958568572998, + "logits/rejected": 3.846449375152588, + "logps/chosen": -180.42181396484375, + "logps/rejected": -195.32376098632812, + "loss": 0.5229, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.178245544433594, + "rewards/margins": 1.4954791069030762, + "rewards/rejected": -14.673725128173828, + "step": 2898 + }, + { + "epoch": 2.000690012075211, + "grad_norm": 0.31047847867012024, + "learning_rate": 4.165707710011508e-06, + "logits/chosen": 3.5884392261505127, + "logits/rejected": 3.6042964458465576, + "logps/chosen": -181.5784912109375, + "logps/rejected": -189.13795471191406, + "loss": 0.7799, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -13.4262056350708, + "rewards/margins": 0.7646964192390442, + "rewards/rejected": -14.190900802612305, + "step": 2899 + }, + { + "epoch": 2.0013800241504227, + "grad_norm": 0.30077579617500305, + "learning_rate": 4.1628308400460306e-06, + "logits/chosen": 3.75142502784729, + "logits/rejected": 3.75142502784729, + "logps/chosen": -180.49679565429688, + "logps/rejected": -180.49679565429688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.265694618225098, + "rewards/margins": 0.0, + "rewards/rejected": -13.265695571899414, + "step": 2900 + }, + { + "epoch": 2.002070036225634, + "grad_norm": 0.29270821809768677, + "learning_rate": 4.1599539700805525e-06, + "logits/chosen": 3.6916110515594482, + "logits/rejected": 3.6916110515594482, + "logps/chosen": -178.0675506591797, + "logps/rejected": -178.0675506591797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.056965827941895, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.056965827941895, + "step": 2901 + }, + { + "epoch": 2.0027600483008454, + "grad_norm": 0.26060324907302856, + "learning_rate": 4.157077100115075e-06, + "logits/chosen": 3.905566453933716, + "logits/rejected": 3.905566453933716, + "logps/chosen": -187.858154296875, + "logps/rejected": -187.858154296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.95327377319336, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.95327377319336, + "step": 2902 + }, + { + "epoch": 2.0034500603760566, + "grad_norm": 0.28880617022514343, + "learning_rate": 4.154200230149597e-06, + "logits/chosen": 3.5013461112976074, + "logits/rejected": 3.6574015617370605, + "logps/chosen": -156.7721710205078, + "logps/rejected": -165.10423278808594, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.90169906616211, + "rewards/margins": 0.8403088450431824, + "rewards/rejected": -11.742008209228516, + "step": 2903 + }, + { + "epoch": 2.0041400724512677, + "grad_norm": 0.3515578806400299, + "learning_rate": 4.15132336018412e-06, + "logits/chosen": 3.627990484237671, + "logits/rejected": 3.670616865158081, + "logps/chosen": -153.74537658691406, + "logps/rejected": -161.890869140625, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.71385383605957, + "rewards/margins": 0.8344686031341553, + "rewards/rejected": -11.548322677612305, + "step": 2904 + }, + { + "epoch": 2.0048300845264793, + "grad_norm": 1.3828924894332886, + "learning_rate": 4.148446490218642e-06, + "logits/chosen": 3.6777286529541016, + "logits/rejected": 3.650692939758301, + "logps/chosen": -165.00926208496094, + "logps/rejected": -168.69866943359375, + "loss": 0.614, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.026511192321777, + "rewards/margins": 0.34828096628189087, + "rewards/rejected": -12.374792098999023, + "step": 2905 + }, + { + "epoch": 2.0055200966016904, + "grad_norm": 22.85452651977539, + "learning_rate": 4.145569620253165e-06, + "logits/chosen": 3.712876558303833, + "logits/rejected": 3.5629165172576904, + "logps/chosen": -181.6834716796875, + "logps/rejected": -177.159423828125, + "loss": 1.0525, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.369363784790039, + "rewards/margins": -0.44238007068634033, + "rewards/rejected": -12.926982879638672, + "step": 2906 + }, + { + "epoch": 2.006210108676902, + "grad_norm": 0.2844538688659668, + "learning_rate": 4.1426927502876875e-06, + "logits/chosen": 3.020758867263794, + "logits/rejected": 3.3452272415161133, + "logps/chosen": -153.29454040527344, + "logps/rejected": -192.44769287109375, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.518903732299805, + "rewards/margins": 3.9064884185791016, + "rewards/rejected": -14.425392150878906, + "step": 2907 + }, + { + "epoch": 2.006900120752113, + "grad_norm": 14.163536071777344, + "learning_rate": 4.13981588032221e-06, + "logits/chosen": 3.580458402633667, + "logits/rejected": 3.6259419918060303, + "logps/chosen": -171.4974365234375, + "logps/rejected": -177.97659301757812, + "loss": 0.944, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.308167457580566, + "rewards/margins": 0.5999364852905273, + "rewards/rejected": -12.908103942871094, + "step": 2908 + }, + { + "epoch": 2.0075901328273247, + "grad_norm": 0.27678847312927246, + "learning_rate": 4.136939010356732e-06, + "logits/chosen": 3.4328460693359375, + "logits/rejected": 3.6911416053771973, + "logps/chosen": -158.2637939453125, + "logps/rejected": -172.14892578125, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.071067810058594, + "rewards/margins": 1.4386249780654907, + "rewards/rejected": -12.50969409942627, + "step": 2909 + }, + { + "epoch": 2.008280144902536, + "grad_norm": 0.3138757646083832, + "learning_rate": 4.134062140391255e-06, + "logits/chosen": 3.6565980911254883, + "logits/rejected": 3.6565980911254883, + "logps/chosen": -167.7318878173828, + "logps/rejected": -167.73190307617188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.845304489135742, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.845305442810059, + "step": 2910 + }, + { + "epoch": 2.008970156977747, + "grad_norm": 1.6074297428131104, + "learning_rate": 4.131185270425777e-06, + "logits/chosen": 3.3134117126464844, + "logits/rejected": 3.4177842140197754, + "logps/chosen": -146.35037231445312, + "logps/rejected": -190.564208984375, + "loss": 0.2707, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.80459213256836, + "rewards/margins": 4.468129634857178, + "rewards/rejected": -14.272721290588379, + "step": 2911 + }, + { + "epoch": 2.0096601690529585, + "grad_norm": 0.25677546858787537, + "learning_rate": 4.1283084004603e-06, + "logits/chosen": 3.277796506881714, + "logits/rejected": 3.4480605125427246, + "logps/chosen": -150.8811492919922, + "logps/rejected": -174.9714813232422, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.315153121948242, + "rewards/margins": 2.3614487648010254, + "rewards/rejected": -12.67660140991211, + "step": 2912 + }, + { + "epoch": 2.0103501811281697, + "grad_norm": 0.23070073127746582, + "learning_rate": 4.1254315304948216e-06, + "logits/chosen": 3.2997894287109375, + "logits/rejected": 3.6517419815063477, + "logps/chosen": -166.45745849609375, + "logps/rejected": -185.82461547851562, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.780975341796875, + "rewards/margins": 2.0163116455078125, + "rewards/rejected": -13.797286987304688, + "step": 2913 + }, + { + "epoch": 2.0110401932033812, + "grad_norm": 13.284418106079102, + "learning_rate": 4.122554660529344e-06, + "logits/chosen": 3.494861602783203, + "logits/rejected": 3.527716636657715, + "logps/chosen": -175.75100708007812, + "logps/rejected": -178.01458740234375, + "loss": 0.7752, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.907400131225586, + "rewards/margins": 0.25892174243927, + "rewards/rejected": -13.166322708129883, + "step": 2914 + }, + { + "epoch": 2.0117302052785924, + "grad_norm": 0.3136395812034607, + "learning_rate": 4.119677790563867e-06, + "logits/chosen": 3.4347715377807617, + "logits/rejected": 3.4347715377807617, + "logps/chosen": -163.37823486328125, + "logps/rejected": -163.37823486328125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.694129943847656, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.694129943847656, + "step": 2915 + }, + { + "epoch": 2.0124202173538035, + "grad_norm": 0.28192365169525146, + "learning_rate": 4.116800920598389e-06, + "logits/chosen": 3.291520357131958, + "logits/rejected": 3.291520357131958, + "logps/chosen": -180.1761932373047, + "logps/rejected": -180.1761932373047, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.34606647491455, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.346065521240234, + "step": 2916 + }, + { + "epoch": 2.013110229429015, + "grad_norm": 0.33890122175216675, + "learning_rate": 4.113924050632912e-06, + "logits/chosen": 3.35611629486084, + "logits/rejected": 3.5457708835601807, + "logps/chosen": -176.2478485107422, + "logps/rejected": -198.7674102783203, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.885013580322266, + "rewards/margins": 2.3511295318603516, + "rewards/rejected": -15.236143112182617, + "step": 2917 + }, + { + "epoch": 2.013800241504226, + "grad_norm": 0.2567324936389923, + "learning_rate": 4.111047180667434e-06, + "logits/chosen": 3.637716770172119, + "logits/rejected": 3.617816925048828, + "logps/chosen": -164.27955627441406, + "logps/rejected": -171.6385040283203, + "loss": 0.6071, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.648775100708008, + "rewards/margins": 0.6654882431030273, + "rewards/rejected": -12.314264297485352, + "step": 2918 + }, + { + "epoch": 2.014490253579438, + "grad_norm": 0.2605314254760742, + "learning_rate": 4.1081703107019565e-06, + "logits/chosen": 3.734795093536377, + "logits/rejected": 3.9637274742126465, + "logps/chosen": -182.34925842285156, + "logps/rejected": -190.88487243652344, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.457340240478516, + "rewards/margins": 0.8777981400489807, + "rewards/rejected": -14.335138320922852, + "step": 2919 + }, + { + "epoch": 2.015180265654649, + "grad_norm": 9.476570129394531, + "learning_rate": 4.105293440736479e-06, + "logits/chosen": 3.8743646144866943, + "logits/rejected": 3.8187828063964844, + "logps/chosen": -174.81924438476562, + "logps/rejected": -174.52041625976562, + "loss": 0.7282, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.72882080078125, + "rewards/margins": -0.06242704391479492, + "rewards/rejected": -12.666393280029297, + "step": 2920 + }, + { + "epoch": 2.01587027772986, + "grad_norm": 0.2793157994747162, + "learning_rate": 4.102416570771002e-06, + "logits/chosen": 3.4133293628692627, + "logits/rejected": 3.657230854034424, + "logps/chosen": -143.93511962890625, + "logps/rejected": -175.359130859375, + "loss": 0.4338, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.644436836242676, + "rewards/margins": 3.151167154312134, + "rewards/rejected": -12.79560375213623, + "step": 2921 + }, + { + "epoch": 2.0165602898050716, + "grad_norm": 0.17137527465820312, + "learning_rate": 4.099539700805524e-06, + "logits/chosen": 3.4707515239715576, + "logits/rejected": 3.9192006587982178, + "logps/chosen": -150.3075408935547, + "logps/rejected": -192.37881469726562, + "loss": 0.4332, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.231407165527344, + "rewards/margins": 4.244173049926758, + "rewards/rejected": -14.475578308105469, + "step": 2922 + }, + { + "epoch": 2.0172503018802828, + "grad_norm": 0.2884292006492615, + "learning_rate": 4.096662830840046e-06, + "logits/chosen": 3.7470052242279053, + "logits/rejected": 3.7470052242279053, + "logps/chosen": -176.97018432617188, + "logps/rejected": -176.97018432617188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.868799209594727, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.86879825592041, + "step": 2923 + }, + { + "epoch": 2.0179403139554943, + "grad_norm": 0.2792048752307892, + "learning_rate": 4.093785960874569e-06, + "logits/chosen": 3.571065664291382, + "logits/rejected": 3.571065664291382, + "logps/chosen": -177.66119384765625, + "logps/rejected": -177.66116333007812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.017767906188965, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.017767906188965, + "step": 2924 + }, + { + "epoch": 2.0186303260307055, + "grad_norm": 0.21879617869853973, + "learning_rate": 4.0909090909090915e-06, + "logits/chosen": 3.472111940383911, + "logits/rejected": 3.472111940383911, + "logps/chosen": -185.25747680664062, + "logps/rejected": -185.25747680664062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.602499008178711, + "rewards/margins": 0.0, + "rewards/rejected": -13.602499008178711, + "step": 2925 + }, + { + "epoch": 2.019320338105917, + "grad_norm": 0.3914102017879486, + "learning_rate": 4.088032220943613e-06, + "logits/chosen": 3.6421470642089844, + "logits/rejected": 3.7056002616882324, + "logps/chosen": -183.6234893798828, + "logps/rejected": -188.7342071533203, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.700727462768555, + "rewards/margins": 0.5428292751312256, + "rewards/rejected": -14.24355697631836, + "step": 2926 + }, + { + "epoch": 2.020010350181128, + "grad_norm": 1.637069582939148, + "learning_rate": 4.085155350978136e-06, + "logits/chosen": 3.7726168632507324, + "logits/rejected": 3.6531476974487305, + "logps/chosen": -173.2632293701172, + "logps/rejected": -185.54006958007812, + "loss": 0.5301, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.42342758178711, + "rewards/margins": 1.2717063426971436, + "rewards/rejected": -13.695135116577148, + "step": 2927 + }, + { + "epoch": 2.0207003622563393, + "grad_norm": 0.3596668243408203, + "learning_rate": 4.082278481012659e-06, + "logits/chosen": 3.8685665130615234, + "logits/rejected": 4.017325401306152, + "logps/chosen": -172.43896484375, + "logps/rejected": -186.74703979492188, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.562838554382324, + "rewards/margins": 1.3614205121994019, + "rewards/rejected": -13.924259185791016, + "step": 2928 + }, + { + "epoch": 2.021390374331551, + "grad_norm": 0.6424103379249573, + "learning_rate": 4.079401611047181e-06, + "logits/chosen": 3.5186591148376465, + "logits/rejected": 3.619837760925293, + "logps/chosen": -177.3912811279297, + "logps/rejected": -188.483154296875, + "loss": 0.5232, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.89396858215332, + "rewards/margins": 1.0991019010543823, + "rewards/rejected": -13.993070602416992, + "step": 2929 + }, + { + "epoch": 2.022080386406762, + "grad_norm": 0.20818020403385162, + "learning_rate": 4.076524741081704e-06, + "logits/chosen": 3.73392915725708, + "logits/rejected": 3.781923294067383, + "logps/chosen": -176.35850524902344, + "logps/rejected": -193.9938201904297, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.804086685180664, + "rewards/margins": 1.7905025482177734, + "rewards/rejected": -14.594589233398438, + "step": 2930 + }, + { + "epoch": 2.0227703984819736, + "grad_norm": 0.25543156266212463, + "learning_rate": 4.073647871116226e-06, + "logits/chosen": 3.435274600982666, + "logits/rejected": 3.5428028106689453, + "logps/chosen": -171.06857299804688, + "logps/rejected": -183.78973388671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.182656288146973, + "rewards/margins": 1.2923862934112549, + "rewards/rejected": -13.475042343139648, + "step": 2931 + }, + { + "epoch": 2.0234604105571847, + "grad_norm": 26.5426025390625, + "learning_rate": 4.070771001150748e-06, + "logits/chosen": 3.5604748725891113, + "logits/rejected": 3.702286958694458, + "logps/chosen": -168.8220672607422, + "logps/rejected": -184.7846221923828, + "loss": 1.0261, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.202515602111816, + "rewards/margins": 1.5677664279937744, + "rewards/rejected": -13.770280838012695, + "step": 2932 + }, + { + "epoch": 2.024150422632396, + "grad_norm": 1.0641251802444458, + "learning_rate": 4.067894131185271e-06, + "logits/chosen": 3.4045629501342773, + "logits/rejected": 3.4623680114746094, + "logps/chosen": -163.40921020507812, + "logps/rejected": -168.681640625, + "loss": 0.6091, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.507513046264648, + "rewards/margins": 0.48441946506500244, + "rewards/rejected": -11.99193286895752, + "step": 2933 + }, + { + "epoch": 2.0248404347076074, + "grad_norm": 13.20490837097168, + "learning_rate": 4.065017261219793e-06, + "logits/chosen": 3.6580190658569336, + "logits/rejected": 3.5985922813415527, + "logps/chosen": -173.98849487304688, + "logps/rejected": -181.651123046875, + "loss": 0.8101, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.483880043029785, + "rewards/margins": 0.8180849552154541, + "rewards/rejected": -13.30196475982666, + "step": 2934 + }, + { + "epoch": 2.0255304467828186, + "grad_norm": 0.27986064553260803, + "learning_rate": 4.062140391254316e-06, + "logits/chosen": 3.541670083999634, + "logits/rejected": 3.541670083999634, + "logps/chosen": -168.12802124023438, + "logps/rejected": -168.12802124023438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.050559997558594, + "rewards/margins": 0.0, + "rewards/rejected": -12.050559997558594, + "step": 2935 + }, + { + "epoch": 2.02622045885803, + "grad_norm": 1.1404879093170166, + "learning_rate": 4.059263521288838e-06, + "logits/chosen": 3.429943799972534, + "logits/rejected": 3.6401844024658203, + "logps/chosen": -164.1864013671875, + "logps/rejected": -181.5780029296875, + "loss": 0.5305, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.81203842163086, + "rewards/margins": 1.701140284538269, + "rewards/rejected": -13.513179779052734, + "step": 2936 + }, + { + "epoch": 2.0269104709332413, + "grad_norm": 0.23472259938716888, + "learning_rate": 4.0563866513233605e-06, + "logits/chosen": 3.640460252761841, + "logits/rejected": 3.806485652923584, + "logps/chosen": -170.37490844726562, + "logps/rejected": -193.91893005371094, + "loss": 0.4342, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.198851585388184, + "rewards/margins": 2.2997007369995117, + "rewards/rejected": -14.498551368713379, + "step": 2937 + }, + { + "epoch": 2.027600483008453, + "grad_norm": 0.31954869627952576, + "learning_rate": 4.0535097813578825e-06, + "logits/chosen": 3.697244644165039, + "logits/rejected": 3.697244644165039, + "logps/chosen": -174.813720703125, + "logps/rejected": -174.813720703125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.765665054321289, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.765666961669922, + "step": 2938 + }, + { + "epoch": 2.028290495083664, + "grad_norm": 0.3712407946586609, + "learning_rate": 4.050632911392405e-06, + "logits/chosen": 3.6779470443725586, + "logits/rejected": 3.8013224601745605, + "logps/chosen": -180.03038024902344, + "logps/rejected": -194.08193969726562, + "loss": 0.5219, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.203561782836914, + "rewards/margins": 1.4154915809631348, + "rewards/rejected": -14.61905288696289, + "step": 2939 + }, + { + "epoch": 2.028980507158875, + "grad_norm": 0.3258289396762848, + "learning_rate": 4.047756041426928e-06, + "logits/chosen": 3.6288211345672607, + "logits/rejected": 3.6288211345672607, + "logps/chosen": -167.81942749023438, + "logps/rejected": -167.81942749023438, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.942031860351562, + "rewards/margins": 6.556510925292969e-07, + "rewards/rejected": -11.942033767700195, + "step": 2940 + }, + { + "epoch": 2.0296705192340867, + "grad_norm": 0.22757075726985931, + "learning_rate": 4.044879171461451e-06, + "logits/chosen": 3.7696118354797363, + "logits/rejected": 3.769536018371582, + "logps/chosen": -170.12136840820312, + "logps/rejected": -181.2346954345703, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.225310325622559, + "rewards/margins": 1.1190710067749023, + "rewards/rejected": -13.344381332397461, + "step": 2941 + }, + { + "epoch": 2.030360531309298, + "grad_norm": 0.3174542486667633, + "learning_rate": 4.042002301495973e-06, + "logits/chosen": 3.537027359008789, + "logits/rejected": 3.7133233547210693, + "logps/chosen": -177.2833251953125, + "logps/rejected": -196.49661254882812, + "loss": 0.5212, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.021991729736328, + "rewards/margins": 1.896545648574829, + "rewards/rejected": -14.918537139892578, + "step": 2942 + }, + { + "epoch": 2.0310505433845094, + "grad_norm": 13.670604705810547, + "learning_rate": 4.0391254315304955e-06, + "logits/chosen": 3.825270414352417, + "logits/rejected": 3.9312472343444824, + "logps/chosen": -170.13583374023438, + "logps/rejected": -177.5233917236328, + "loss": 0.6617, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.359248161315918, + "rewards/margins": 0.6845194101333618, + "rewards/rejected": -13.043766975402832, + "step": 2943 + }, + { + "epoch": 2.0317405554597205, + "grad_norm": 0.26209184527397156, + "learning_rate": 4.0362485615650174e-06, + "logits/chosen": 3.747067928314209, + "logits/rejected": 3.8937244415283203, + "logps/chosen": -182.7408447265625, + "logps/rejected": -191.21987915039062, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.610538482666016, + "rewards/margins": 0.8182505369186401, + "rewards/rejected": -14.428789138793945, + "step": 2944 + }, + { + "epoch": 2.0324305675349317, + "grad_norm": 0.29782819747924805, + "learning_rate": 4.03337169159954e-06, + "logits/chosen": 3.7818145751953125, + "logits/rejected": 3.886381149291992, + "logps/chosen": -158.60797119140625, + "logps/rejected": -173.86441040039062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.962455749511719, + "rewards/margins": 1.449561357498169, + "rewards/rejected": -12.412016868591309, + "step": 2945 + }, + { + "epoch": 2.0331205796101433, + "grad_norm": 0.2875227630138397, + "learning_rate": 4.030494821634062e-06, + "logits/chosen": 3.6368308067321777, + "logits/rejected": 3.6983773708343506, + "logps/chosen": -178.48239135742188, + "logps/rejected": -188.1854248046875, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.13656997680664, + "rewards/margins": 0.9783163070678711, + "rewards/rejected": -14.114886283874512, + "step": 2946 + }, + { + "epoch": 2.0338105916853544, + "grad_norm": 9.900995254516602, + "learning_rate": 4.027617951668585e-06, + "logits/chosen": 3.2494328022003174, + "logits/rejected": 3.5261991024017334, + "logps/chosen": -134.78878784179688, + "logps/rejected": -169.06771850585938, + "loss": 0.413, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.792938232421875, + "rewards/margins": 3.410130500793457, + "rewards/rejected": -12.203069686889648, + "step": 2947 + }, + { + "epoch": 2.034500603760566, + "grad_norm": 0.292036235332489, + "learning_rate": 4.024741081703108e-06, + "logits/chosen": 3.3942699432373047, + "logits/rejected": 3.3942699432373047, + "logps/chosen": -191.2696990966797, + "logps/rejected": -191.26971435546875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.539520263671875, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.539520263671875, + "step": 2948 + }, + { + "epoch": 2.035190615835777, + "grad_norm": 0.30465295910835266, + "learning_rate": 4.02186421173763e-06, + "logits/chosen": 3.520214080810547, + "logits/rejected": 3.520214080810547, + "logps/chosen": -176.41452026367188, + "logps/rejected": -176.41452026367188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.57589340209961, + "rewards/margins": 0.0, + "rewards/rejected": -12.57589340209961, + "step": 2949 + }, + { + "epoch": 2.0358806279109882, + "grad_norm": 0.27282485365867615, + "learning_rate": 4.018987341772152e-06, + "logits/chosen": 3.8088386058807373, + "logits/rejected": 3.9830398559570312, + "logps/chosen": -177.75848388671875, + "logps/rejected": -185.69435119628906, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.814398765563965, + "rewards/margins": 0.8124253153800964, + "rewards/rejected": -13.626824378967285, + "step": 2950 + }, + { + "epoch": 2.0365706399862, + "grad_norm": 0.7281805872917175, + "learning_rate": 4.016110471806674e-06, + "logits/chosen": 3.395407199859619, + "logits/rejected": 3.7541275024414062, + "logps/chosen": -145.58480834960938, + "logps/rejected": -179.75942993164062, + "loss": 0.3517, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.69771671295166, + "rewards/margins": 3.455063819885254, + "rewards/rejected": -13.152780532836914, + "step": 2951 + }, + { + "epoch": 2.037260652061411, + "grad_norm": 0.7348312139511108, + "learning_rate": 4.013233601841197e-06, + "logits/chosen": 3.682508945465088, + "logits/rejected": 3.8413705825805664, + "logps/chosen": -177.3097381591797, + "logps/rejected": -188.7530059814453, + "loss": 0.5265, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.003726959228516, + "rewards/margins": 1.1823341846466064, + "rewards/rejected": -14.186060905456543, + "step": 2952 + }, + { + "epoch": 2.0379506641366225, + "grad_norm": 0.2829853296279907, + "learning_rate": 4.01035673187572e-06, + "logits/chosen": 3.9083311557769775, + "logits/rejected": 3.9083311557769775, + "logps/chosen": -178.49436950683594, + "logps/rejected": -178.49436950683594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.078941345214844, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.078941345214844, + "step": 2953 + }, + { + "epoch": 2.0386406762118336, + "grad_norm": 0.26386332511901855, + "learning_rate": 4.007479861910242e-06, + "logits/chosen": 3.903926372528076, + "logits/rejected": 3.903926372528076, + "logps/chosen": -175.18666076660156, + "logps/rejected": -175.18666076660156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.730316162109375, + "rewards/margins": 0.0, + "rewards/rejected": -12.730316162109375, + "step": 2954 + }, + { + "epoch": 2.0393306882870452, + "grad_norm": 0.27659282088279724, + "learning_rate": 4.0046029919447646e-06, + "logits/chosen": 3.9850049018859863, + "logits/rejected": 3.9850049018859863, + "logps/chosen": -182.8830108642578, + "logps/rejected": -182.8830108642578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.305319786071777, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.305318832397461, + "step": 2955 + }, + { + "epoch": 2.0400207003622564, + "grad_norm": 0.39228302240371704, + "learning_rate": 4.0017261219792865e-06, + "logits/chosen": 3.5599825382232666, + "logits/rejected": 3.655277729034424, + "logps/chosen": -140.4671173095703, + "logps/rejected": -149.9840850830078, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.233808517456055, + "rewards/margins": 0.9275359511375427, + "rewards/rejected": -10.161344528198242, + "step": 2956 + }, + { + "epoch": 2.0407107124374675, + "grad_norm": 0.5027796030044556, + "learning_rate": 3.998849252013809e-06, + "logits/chosen": 3.687162160873413, + "logits/rejected": 3.6510396003723145, + "logps/chosen": -175.96237182617188, + "logps/rejected": -180.85702514648438, + "loss": 0.6085, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.656725883483887, + "rewards/margins": 0.5149834156036377, + "rewards/rejected": -13.171710014343262, + "step": 2957 + }, + { + "epoch": 2.041400724512679, + "grad_norm": 0.23525062203407288, + "learning_rate": 3.995972382048331e-06, + "logits/chosen": 3.7342655658721924, + "logits/rejected": 3.7814104557037354, + "logps/chosen": -175.78506469726562, + "logps/rejected": -186.03289794921875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.744100570678711, + "rewards/margins": 1.034239649772644, + "rewards/rejected": -13.778340339660645, + "step": 2958 + }, + { + "epoch": 2.04209073658789, + "grad_norm": 0.3828960359096527, + "learning_rate": 3.993095512082854e-06, + "logits/chosen": 3.273479700088501, + "logits/rejected": 3.3104772567749023, + "logps/chosen": -154.42942810058594, + "logps/rejected": -170.79971313476562, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.675342559814453, + "rewards/margins": 1.5667824745178223, + "rewards/rejected": -12.242125511169434, + "step": 2959 + }, + { + "epoch": 2.0427807486631018, + "grad_norm": 0.3706636130809784, + "learning_rate": 3.990218642117377e-06, + "logits/chosen": 3.4564366340637207, + "logits/rejected": 3.4564366340637207, + "logps/chosen": -184.3247528076172, + "logps/rejected": -184.3247528076172, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.918161392211914, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.918161392211914, + "step": 2960 + }, + { + "epoch": 2.043470760738313, + "grad_norm": 0.26085206866264343, + "learning_rate": 3.9873417721518995e-06, + "logits/chosen": 3.266727924346924, + "logits/rejected": 3.358581781387329, + "logps/chosen": -170.10232543945312, + "logps/rejected": -183.78558349609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.229047775268555, + "rewards/margins": 1.3993473052978516, + "rewards/rejected": -13.628395080566406, + "step": 2961 + }, + { + "epoch": 2.044160772813524, + "grad_norm": 1.0795360803604126, + "learning_rate": 3.9844649021864214e-06, + "logits/chosen": 3.597313642501831, + "logits/rejected": 3.5254101753234863, + "logps/chosen": -159.1025390625, + "logps/rejected": -171.60888671875, + "loss": 0.5259, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.263042449951172, + "rewards/margins": 1.3183943033218384, + "rewards/rejected": -12.581437110900879, + "step": 2962 + }, + { + "epoch": 2.0448507848887356, + "grad_norm": 0.29172950983047485, + "learning_rate": 3.981588032220944e-06, + "logits/chosen": 3.633554458618164, + "logits/rejected": 3.633554458618164, + "logps/chosen": -184.8323974609375, + "logps/rejected": -184.8323974609375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.665409088134766, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.665409088134766, + "step": 2963 + }, + { + "epoch": 2.0455407969639468, + "grad_norm": 0.9331964254379272, + "learning_rate": 3.978711162255466e-06, + "logits/chosen": 3.398643732070923, + "logits/rejected": 3.431239604949951, + "logps/chosen": -174.09649658203125, + "logps/rejected": -178.1751708984375, + "loss": 0.611, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.943961143493652, + "rewards/margins": 0.4144526720046997, + "rewards/rejected": -13.358413696289062, + "step": 2964 + }, + { + "epoch": 2.0462308090391583, + "grad_norm": 0.2936953008174896, + "learning_rate": 3.975834292289989e-06, + "logits/chosen": 3.8893589973449707, + "logits/rejected": 3.8893589973449707, + "logps/chosen": -174.2474822998047, + "logps/rejected": -174.2474822998047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.77955150604248, + "rewards/margins": 0.0, + "rewards/rejected": -12.77955150604248, + "step": 2965 + }, + { + "epoch": 2.0469208211143695, + "grad_norm": 0.27864545583724976, + "learning_rate": 3.972957422324511e-06, + "logits/chosen": 3.6422982215881348, + "logits/rejected": 3.624535083770752, + "logps/chosen": -182.96978759765625, + "logps/rejected": -193.46725463867188, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.412008285522461, + "rewards/margins": 1.0686867237091064, + "rewards/rejected": -14.480695724487305, + "step": 2966 + }, + { + "epoch": 2.047610833189581, + "grad_norm": 0.2526869475841522, + "learning_rate": 3.970080552359034e-06, + "logits/chosen": 3.5199193954467773, + "logits/rejected": 3.7135825157165527, + "logps/chosen": -168.2657012939453, + "logps/rejected": -178.34439086914062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.978548049926758, + "rewards/margins": 1.033345103263855, + "rewards/rejected": -13.011892318725586, + "step": 2967 + }, + { + "epoch": 2.048300845264792, + "grad_norm": 0.2358790785074234, + "learning_rate": 3.967203682393556e-06, + "logits/chosen": 3.7301361560821533, + "logits/rejected": 3.7301361560821533, + "logps/chosen": -174.95196533203125, + "logps/rejected": -174.95196533203125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.516218185424805, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.516218185424805, + "step": 2968 + }, + { + "epoch": 2.0489908573400033, + "grad_norm": 0.25375521183013916, + "learning_rate": 3.964326812428078e-06, + "logits/chosen": 3.592320442199707, + "logits/rejected": 3.718197822570801, + "logps/chosen": -164.30615234375, + "logps/rejected": -179.70565795898438, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.653251647949219, + "rewards/margins": 1.594454050064087, + "rewards/rejected": -13.247705459594727, + "step": 2969 + }, + { + "epoch": 2.049680869415215, + "grad_norm": 1.3893203735351562, + "learning_rate": 3.961449942462601e-06, + "logits/chosen": 3.5882253646850586, + "logits/rejected": 3.6706557273864746, + "logps/chosen": -174.85772705078125, + "logps/rejected": -178.31829833984375, + "loss": 0.6137, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.702831268310547, + "rewards/margins": 0.35317814350128174, + "rewards/rejected": -13.056007385253906, + "step": 2970 + }, + { + "epoch": 2.050370881490426, + "grad_norm": 0.2866330146789551, + "learning_rate": 3.958573072497123e-06, + "logits/chosen": 3.464296817779541, + "logits/rejected": 3.6121835708618164, + "logps/chosen": -150.70504760742188, + "logps/rejected": -172.2935791015625, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.197476387023926, + "rewards/margins": 2.1826417446136475, + "rewards/rejected": -12.380117416381836, + "step": 2971 + }, + { + "epoch": 2.0510608935656376, + "grad_norm": 0.31595563888549805, + "learning_rate": 3.955696202531646e-06, + "logits/chosen": 3.71187686920166, + "logits/rejected": 3.71187686920166, + "logps/chosen": -187.12545776367188, + "logps/rejected": -187.12545776367188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.994232177734375, + "rewards/margins": 0.0, + "rewards/rejected": -13.994232177734375, + "step": 2972 + }, + { + "epoch": 2.0517509056408487, + "grad_norm": 0.23855867981910706, + "learning_rate": 3.952819332566169e-06, + "logits/chosen": 3.484358310699463, + "logits/rejected": 3.5546207427978516, + "logps/chosen": -163.76809692382812, + "logps/rejected": -185.1053466796875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.647356033325195, + "rewards/margins": 2.0703470706939697, + "rewards/rejected": -13.717704772949219, + "step": 2973 + }, + { + "epoch": 2.05244091771606, + "grad_norm": 1.1368191242218018, + "learning_rate": 3.949942462600691e-06, + "logits/chosen": 3.505641222000122, + "logits/rejected": 3.5468807220458984, + "logps/chosen": -171.5599365234375, + "logps/rejected": -174.8095703125, + "loss": 0.614, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.345027923583984, + "rewards/margins": 0.347675085067749, + "rewards/rejected": -12.692703247070312, + "step": 2974 + }, + { + "epoch": 2.0531309297912714, + "grad_norm": 0.3425810635089874, + "learning_rate": 3.947065592635213e-06, + "logits/chosen": 3.882143497467041, + "logits/rejected": 3.882143497467041, + "logps/chosen": -176.08599853515625, + "logps/rejected": -176.08599853515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.843950271606445, + "rewards/margins": 0.0, + "rewards/rejected": -12.843950271606445, + "step": 2975 + }, + { + "epoch": 2.0538209418664826, + "grad_norm": 0.3727370500564575, + "learning_rate": 3.944188722669735e-06, + "logits/chosen": 3.3808302879333496, + "logits/rejected": 3.5649213790893555, + "logps/chosen": -169.1183624267578, + "logps/rejected": -181.08961486816406, + "loss": 0.5221, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.050323486328125, + "rewards/margins": 1.1862972974777222, + "rewards/rejected": -13.23661994934082, + "step": 2976 + }, + { + "epoch": 2.054510953941694, + "grad_norm": 0.2538484036922455, + "learning_rate": 3.941311852704258e-06, + "logits/chosen": 3.8798956871032715, + "logits/rejected": 3.9206676483154297, + "logps/chosen": -164.8845672607422, + "logps/rejected": -178.4422607421875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.494709014892578, + "rewards/margins": 1.294155240058899, + "rewards/rejected": -12.788864135742188, + "step": 2977 + }, + { + "epoch": 2.0552009660169053, + "grad_norm": 0.5033981204032898, + "learning_rate": 3.938434982738781e-06, + "logits/chosen": 3.3114116191864014, + "logits/rejected": 3.5002260208129883, + "logps/chosen": -159.5379638671875, + "logps/rejected": -182.9064483642578, + "loss": 0.4367, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.207659721374512, + "rewards/margins": 2.353163719177246, + "rewards/rejected": -13.560823440551758, + "step": 2978 + }, + { + "epoch": 2.0558909780921164, + "grad_norm": 0.32839202880859375, + "learning_rate": 3.935558112773303e-06, + "logits/chosen": 3.493955135345459, + "logits/rejected": 3.5227108001708984, + "logps/chosen": -162.32763671875, + "logps/rejected": -172.91702270507812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.429874420166016, + "rewards/margins": 1.0836031436920166, + "rewards/rejected": -12.513477325439453, + "step": 2979 + }, + { + "epoch": 2.056580990167328, + "grad_norm": 0.2724311053752899, + "learning_rate": 3.9326812428078255e-06, + "logits/chosen": 3.443959951400757, + "logits/rejected": 3.5647475719451904, + "logps/chosen": -153.8148651123047, + "logps/rejected": -167.77420043945312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.560622215270996, + "rewards/margins": 1.3867144584655762, + "rewards/rejected": -11.94733715057373, + "step": 2980 + }, + { + "epoch": 2.057271002242539, + "grad_norm": 0.28359055519104004, + "learning_rate": 3.929804372842348e-06, + "logits/chosen": 3.5984840393066406, + "logits/rejected": 3.6879916191101074, + "logps/chosen": -181.62832641601562, + "logps/rejected": -195.46697998046875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.346510887145996, + "rewards/margins": 1.4058992862701416, + "rewards/rejected": -14.752410888671875, + "step": 2981 + }, + { + "epoch": 2.0579610143177507, + "grad_norm": 0.24674023687839508, + "learning_rate": 3.92692750287687e-06, + "logits/chosen": 3.3656859397888184, + "logits/rejected": 3.584536552429199, + "logps/chosen": -147.80418395996094, + "logps/rejected": -171.9712677001953, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.154779434204102, + "rewards/margins": 2.366827964782715, + "rewards/rejected": -12.521608352661133, + "step": 2982 + }, + { + "epoch": 2.058651026392962, + "grad_norm": 0.3205921947956085, + "learning_rate": 3.924050632911393e-06, + "logits/chosen": 3.563549518585205, + "logits/rejected": 3.727128505706787, + "logps/chosen": -162.44374084472656, + "logps/rejected": -174.11544799804688, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.621728897094727, + "rewards/margins": 1.1395812034606934, + "rewards/rejected": -12.761310577392578, + "step": 2983 + }, + { + "epoch": 2.0593410384681734, + "grad_norm": 15.73208999633789, + "learning_rate": 3.921173762945915e-06, + "logits/chosen": 3.4555675983428955, + "logits/rejected": 3.510838508605957, + "logps/chosen": -177.67535400390625, + "logps/rejected": -187.22422790527344, + "loss": 0.6914, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.880596160888672, + "rewards/margins": 1.0342034101486206, + "rewards/rejected": -13.914798736572266, + "step": 2984 + }, + { + "epoch": 2.0600310505433845, + "grad_norm": 0.33414867520332336, + "learning_rate": 3.918296892980438e-06, + "logits/chosen": 3.614351272583008, + "logits/rejected": 3.7290074825286865, + "logps/chosen": -149.95672607421875, + "logps/rejected": -160.81141662597656, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.176322937011719, + "rewards/margins": 1.04668128490448, + "rewards/rejected": -11.223004341125488, + "step": 2985 + }, + { + "epoch": 2.0607210626185957, + "grad_norm": 0.3702697157859802, + "learning_rate": 3.9154200230149604e-06, + "logits/chosen": 3.5171139240264893, + "logits/rejected": 3.5171139240264893, + "logps/chosen": -171.13909912109375, + "logps/rejected": -171.13909912109375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.645493507385254, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -12.645492553710938, + "step": 2986 + }, + { + "epoch": 2.0614110746938072, + "grad_norm": 0.2246735692024231, + "learning_rate": 3.912543153049482e-06, + "logits/chosen": 3.3810136318206787, + "logits/rejected": 3.5235800743103027, + "logps/chosen": -181.13156127929688, + "logps/rejected": -192.17431640625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.38229751586914, + "rewards/margins": 1.1560696363449097, + "rewards/rejected": -14.53836727142334, + "step": 2987 + }, + { + "epoch": 2.0621010867690184, + "grad_norm": 0.3192780315876007, + "learning_rate": 3.909666283084005e-06, + "logits/chosen": 3.3930606842041016, + "logits/rejected": 3.41143536567688, + "logps/chosen": -166.43988037109375, + "logps/rejected": -176.1426239013672, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.795231819152832, + "rewards/margins": 0.9813351631164551, + "rewards/rejected": -12.776566505432129, + "step": 2988 + }, + { + "epoch": 2.06279109884423, + "grad_norm": 12.001955032348633, + "learning_rate": 3.906789413118527e-06, + "logits/chosen": 3.4716475009918213, + "logits/rejected": 3.6093966960906982, + "logps/chosen": -171.89816284179688, + "logps/rejected": -171.7018585205078, + "loss": 0.6755, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.572761535644531, + "rewards/margins": 0.03814089298248291, + "rewards/rejected": -12.610902786254883, + "step": 2989 + }, + { + "epoch": 2.063481110919441, + "grad_norm": 0.35312899947166443, + "learning_rate": 3.90391254315305e-06, + "logits/chosen": 3.698765754699707, + "logits/rejected": 3.698765754699707, + "logps/chosen": -176.71701049804688, + "logps/rejected": -176.71701049804688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.818377494812012, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.818377494812012, + "step": 2990 + }, + { + "epoch": 2.064171122994652, + "grad_norm": 0.2492283582687378, + "learning_rate": 3.901035673187572e-06, + "logits/chosen": 3.4414029121398926, + "logits/rejected": 3.4414029121398926, + "logps/chosen": -177.40045166015625, + "logps/rejected": -177.40045166015625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.034621238708496, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -13.034621238708496, + "step": 2991 + }, + { + "epoch": 2.064861135069864, + "grad_norm": 0.24170172214508057, + "learning_rate": 3.8981588032220945e-06, + "logits/chosen": 3.424039840698242, + "logits/rejected": 3.476691722869873, + "logps/chosen": -170.436279296875, + "logps/rejected": -183.47393798828125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.232617378234863, + "rewards/margins": 1.3394181728363037, + "rewards/rejected": -13.572035789489746, + "step": 2992 + }, + { + "epoch": 2.065551147145075, + "grad_norm": 0.25037506222724915, + "learning_rate": 3.895281933256617e-06, + "logits/chosen": 3.1113076210021973, + "logits/rejected": 3.409979820251465, + "logps/chosen": -143.2530059814453, + "logps/rejected": -177.8230438232422, + "loss": 0.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.586688995361328, + "rewards/margins": 3.445335865020752, + "rewards/rejected": -13.032024383544922, + "step": 2993 + }, + { + "epoch": 2.0662411592202865, + "grad_norm": 0.2434384971857071, + "learning_rate": 3.89240506329114e-06, + "logits/chosen": 3.3907644748687744, + "logits/rejected": 3.4934701919555664, + "logps/chosen": -171.19912719726562, + "logps/rejected": -182.64222717285156, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.474769592285156, + "rewards/margins": 1.0961875915527344, + "rewards/rejected": -13.57095718383789, + "step": 2994 + }, + { + "epoch": 2.0669311712954976, + "grad_norm": 0.21134591102600098, + "learning_rate": 3.889528193325662e-06, + "logits/chosen": 3.6482253074645996, + "logits/rejected": 3.6482253074645996, + "logps/chosen": -187.7246856689453, + "logps/rejected": -187.7246856689453, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.87896728515625, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.878966331481934, + "step": 2995 + }, + { + "epoch": 2.0676211833707088, + "grad_norm": 0.7200548052787781, + "learning_rate": 3.886651323360184e-06, + "logits/chosen": 3.714076519012451, + "logits/rejected": 3.856808662414551, + "logps/chosen": -173.41311645507812, + "logps/rejected": -178.0989990234375, + "loss": 0.6104, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.518206596374512, + "rewards/margins": 0.43189144134521484, + "rewards/rejected": -12.950098991394043, + "step": 2996 + }, + { + "epoch": 2.0683111954459203, + "grad_norm": 0.2740277647972107, + "learning_rate": 3.883774453394707e-06, + "logits/chosen": 3.4947290420532227, + "logits/rejected": 3.5858700275421143, + "logps/chosen": -169.94451904296875, + "logps/rejected": -179.7881317138672, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.153160095214844, + "rewards/margins": 0.9822977781295776, + "rewards/rejected": -13.135457992553711, + "step": 2997 + }, + { + "epoch": 2.0690012075211315, + "grad_norm": 0.2942913770675659, + "learning_rate": 3.8808975834292295e-06, + "logits/chosen": 3.9615750312805176, + "logits/rejected": 3.9615750312805176, + "logps/chosen": -170.3233642578125, + "logps/rejected": -170.3233642578125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.241605758666992, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.241605758666992, + "step": 2998 + }, + { + "epoch": 2.069691219596343, + "grad_norm": 0.2601030468940735, + "learning_rate": 3.8780207134637514e-06, + "logits/chosen": 3.354020118713379, + "logits/rejected": 3.5350499153137207, + "logps/chosen": -159.2332000732422, + "logps/rejected": -166.41586303710938, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.08210563659668, + "rewards/margins": 0.7128074169158936, + "rewards/rejected": -11.794914245605469, + "step": 2999 + }, + { + "epoch": 2.070381231671554, + "grad_norm": 0.25985103845596313, + "learning_rate": 3.875143843498274e-06, + "logits/chosen": 3.464219570159912, + "logits/rejected": 3.464219570159912, + "logps/chosen": -181.1697998046875, + "logps/rejected": -181.1697998046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.451692581176758, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.451692581176758, + "step": 3000 + }, + { + "epoch": 2.0710712437467658, + "grad_norm": 0.31542736291885376, + "learning_rate": 3.872266973532797e-06, + "logits/chosen": 3.7190442085266113, + "logits/rejected": 3.7190442085266113, + "logps/chosen": -182.55215454101562, + "logps/rejected": -182.55215454101562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.592195510864258, + "rewards/margins": 0.0, + "rewards/rejected": -13.592195510864258, + "step": 3001 + }, + { + "epoch": 2.071761255821977, + "grad_norm": 0.3826601803302765, + "learning_rate": 3.869390103567319e-06, + "logits/chosen": 3.4211339950561523, + "logits/rejected": 3.61337947845459, + "logps/chosen": -158.32080078125, + "logps/rejected": -172.9158172607422, + "loss": 0.5213, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.05675220489502, + "rewards/margins": 1.4980802536010742, + "rewards/rejected": -12.55483341217041, + "step": 3002 + }, + { + "epoch": 2.072451267897188, + "grad_norm": 0.172138512134552, + "learning_rate": 3.866513233601842e-06, + "logits/chosen": 3.0729572772979736, + "logits/rejected": 3.4002277851104736, + "logps/chosen": -127.14439392089844, + "logps/rejected": -177.0693359375, + "loss": 0.3472, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.133917808532715, + "rewards/margins": 4.898505210876465, + "rewards/rejected": -13.03242301940918, + "step": 3003 + }, + { + "epoch": 2.0731412799723996, + "grad_norm": 0.32867884635925293, + "learning_rate": 3.863636363636364e-06, + "logits/chosen": 3.5397863388061523, + "logits/rejected": 3.5397863388061523, + "logps/chosen": -179.43028259277344, + "logps/rejected": -179.4302978515625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.214332580566406, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.214332580566406, + "step": 3004 + }, + { + "epoch": 2.0738312920476107, + "grad_norm": 0.3221563994884491, + "learning_rate": 3.860759493670886e-06, + "logits/chosen": 3.7786214351654053, + "logits/rejected": 3.7786214351654053, + "logps/chosen": -185.6652374267578, + "logps/rejected": -185.6652374267578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.961341857910156, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.961341857910156, + "step": 3005 + }, + { + "epoch": 2.0745213041228223, + "grad_norm": 0.306864857673645, + "learning_rate": 3.857882623705409e-06, + "logits/chosen": 3.0909156799316406, + "logits/rejected": 3.4803884029388428, + "logps/chosen": -166.74969482421875, + "logps/rejected": -182.11383056640625, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.900962829589844, + "rewards/margins": 1.573326826095581, + "rewards/rejected": -13.474288940429688, + "step": 3006 + }, + { + "epoch": 2.0752113161980335, + "grad_norm": 16.97545623779297, + "learning_rate": 3.855005753739931e-06, + "logits/chosen": 3.296949863433838, + "logits/rejected": 3.3291237354278564, + "logps/chosen": -151.60247802734375, + "logps/rejected": -167.900390625, + "loss": 0.7311, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.388301849365234, + "rewards/margins": 1.595590591430664, + "rewards/rejected": -11.983893394470215, + "step": 3007 + }, + { + "epoch": 2.0759013282732446, + "grad_norm": 5.2213616371154785, + "learning_rate": 3.852128883774454e-06, + "logits/chosen": 3.3913350105285645, + "logits/rejected": 3.5150036811828613, + "logps/chosen": -152.2689971923828, + "logps/rejected": -172.20330810546875, + "loss": 0.5636, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.618578910827637, + "rewards/margins": 1.8699662685394287, + "rewards/rejected": -12.488545417785645, + "step": 3008 + }, + { + "epoch": 2.076591340348456, + "grad_norm": 0.2782753109931946, + "learning_rate": 3.849252013808976e-06, + "logits/chosen": 3.1820449829101562, + "logits/rejected": 3.240457057952881, + "logps/chosen": -162.01937866210938, + "logps/rejected": -171.82086181640625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.321617126464844, + "rewards/margins": 1.0745363235473633, + "rewards/rejected": -12.396153450012207, + "step": 3009 + }, + { + "epoch": 2.0772813524236673, + "grad_norm": 0.2788500189781189, + "learning_rate": 3.8463751438434986e-06, + "logits/chosen": 3.354295253753662, + "logits/rejected": 3.4534108638763428, + "logps/chosen": -158.84608459472656, + "logps/rejected": -170.7379608154297, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.03288459777832, + "rewards/margins": 1.198154091835022, + "rewards/rejected": -12.231039047241211, + "step": 3010 + }, + { + "epoch": 2.077971364498879, + "grad_norm": 0.35836321115493774, + "learning_rate": 3.8434982738780205e-06, + "logits/chosen": 2.9940075874328613, + "logits/rejected": 2.996025562286377, + "logps/chosen": -144.20101928710938, + "logps/rejected": -154.91986083984375, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.718684196472168, + "rewards/margins": 1.1135098934173584, + "rewards/rejected": -10.832194328308105, + "step": 3011 + }, + { + "epoch": 2.07866137657409, + "grad_norm": 0.27628740668296814, + "learning_rate": 3.840621403912543e-06, + "logits/chosen": 3.577094554901123, + "logits/rejected": 3.716212749481201, + "logps/chosen": -168.30917358398438, + "logps/rejected": -185.84625244140625, + "loss": 0.5205, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.019086837768555, + "rewards/margins": 1.7552416324615479, + "rewards/rejected": -13.774328231811523, + "step": 3012 + }, + { + "epoch": 2.079351388649301, + "grad_norm": 0.2814275920391083, + "learning_rate": 3.837744533947066e-06, + "logits/chosen": 3.3343162536621094, + "logits/rejected": 3.4019687175750732, + "logps/chosen": -172.02110290527344, + "logps/rejected": -184.57301330566406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.418176651000977, + "rewards/margins": 1.248270034790039, + "rewards/rejected": -13.666446685791016, + "step": 3013 + }, + { + "epoch": 2.0800414007245127, + "grad_norm": 0.35261598229408264, + "learning_rate": 3.834867663981589e-06, + "logits/chosen": 3.459207773208618, + "logits/rejected": 3.459207773208618, + "logps/chosen": -166.82504272460938, + "logps/rejected": -166.82504272460938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.856161117553711, + "rewards/margins": 0.0, + "rewards/rejected": -11.856161117553711, + "step": 3014 + }, + { + "epoch": 2.080731412799724, + "grad_norm": 0.24914798140525818, + "learning_rate": 3.831990794016111e-06, + "logits/chosen": 3.2085204124450684, + "logits/rejected": 3.253124713897705, + "logps/chosen": -152.27374267578125, + "logps/rejected": -172.6505584716797, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.377310752868652, + "rewards/margins": 2.10221004486084, + "rewards/rejected": -12.479520797729492, + "step": 3015 + }, + { + "epoch": 2.0814214248749354, + "grad_norm": 0.36960795521736145, + "learning_rate": 3.829113924050633e-06, + "logits/chosen": 3.192922353744507, + "logits/rejected": 3.192922353744507, + "logps/chosen": -172.2458953857422, + "logps/rejected": -172.2458953857422, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.422191619873047, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.422191619873047, + "step": 3016 + }, + { + "epoch": 2.0821114369501466, + "grad_norm": 0.32984963059425354, + "learning_rate": 3.8262370540851554e-06, + "logits/chosen": 3.217780113220215, + "logits/rejected": 3.2912983894348145, + "logps/chosen": -149.34295654296875, + "logps/rejected": -157.08584594726562, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.28825855255127, + "rewards/margins": 0.7680266499519348, + "rewards/rejected": -11.056285858154297, + "step": 3017 + }, + { + "epoch": 2.082801449025358, + "grad_norm": 0.34309419989585876, + "learning_rate": 3.823360184119678e-06, + "logits/chosen": 3.2922427654266357, + "logits/rejected": 3.2922427654266357, + "logps/chosen": -163.91030883789062, + "logps/rejected": -163.91030883789062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.59815788269043, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.598155975341797, + "step": 3018 + }, + { + "epoch": 2.0834914611005693, + "grad_norm": 15.64942741394043, + "learning_rate": 3.820483314154201e-06, + "logits/chosen": 3.235708713531494, + "logits/rejected": 3.1113805770874023, + "logps/chosen": -155.38253784179688, + "logps/rejected": -149.71908569335938, + "loss": 1.1479, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.744095802307129, + "rewards/margins": -0.5397166013717651, + "rewards/rejected": -10.204379081726074, + "step": 3019 + }, + { + "epoch": 2.0841814731757804, + "grad_norm": 0.22159920632839203, + "learning_rate": 3.817606444188723e-06, + "logits/chosen": 3.2040672302246094, + "logits/rejected": 3.5084166526794434, + "logps/chosen": -151.27752685546875, + "logps/rejected": -170.94207763671875, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.295865058898926, + "rewards/margins": 2.0001442432403564, + "rewards/rejected": -12.296009063720703, + "step": 3020 + }, + { + "epoch": 2.084871485250992, + "grad_norm": 0.4165310561656952, + "learning_rate": 3.814729574223246e-06, + "logits/chosen": 3.477670192718506, + "logits/rejected": 3.477670192718506, + "logps/chosen": -158.33668518066406, + "logps/rejected": -158.33668518066406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.088031768798828, + "rewards/margins": 0.0, + "rewards/rejected": -11.088031768798828, + "step": 3021 + }, + { + "epoch": 2.085561497326203, + "grad_norm": 0.2613852620124817, + "learning_rate": 3.811852704257768e-06, + "logits/chosen": 3.229071617126465, + "logits/rejected": 3.5119028091430664, + "logps/chosen": -154.01034545898438, + "logps/rejected": -171.68190002441406, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.686796188354492, + "rewards/margins": 1.7453526258468628, + "rewards/rejected": -12.432147979736328, + "step": 3022 + }, + { + "epoch": 2.0862515094014147, + "grad_norm": 0.4297269284725189, + "learning_rate": 3.8089758342922904e-06, + "logits/chosen": 2.872621536254883, + "logits/rejected": 2.872621536254883, + "logps/chosen": -141.5475311279297, + "logps/rejected": -141.5475311279297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -9.388875007629395, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -9.388875007629395, + "step": 3023 + }, + { + "epoch": 2.086941521476626, + "grad_norm": 1.3914610147476196, + "learning_rate": 3.8060989643268128e-06, + "logits/chosen": 3.2010676860809326, + "logits/rejected": 3.333538055419922, + "logps/chosen": -165.23056030273438, + "logps/rejected": -168.67083740234375, + "loss": 0.6136, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.841854095458984, + "rewards/margins": 0.35515522956848145, + "rewards/rejected": -12.197010040283203, + "step": 3024 + }, + { + "epoch": 2.087631533551837, + "grad_norm": 0.2631312608718872, + "learning_rate": 3.803222094361335e-06, + "logits/chosen": 3.1850953102111816, + "logits/rejected": 3.153050422668457, + "logps/chosen": -164.5714111328125, + "logps/rejected": -181.65533447265625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.644429206848145, + "rewards/margins": 1.6516190767288208, + "rewards/rejected": -13.296048164367676, + "step": 3025 + }, + { + "epoch": 2.0883215456270485, + "grad_norm": 0.2948538362979889, + "learning_rate": 3.800345224395858e-06, + "logits/chosen": 3.4389195442199707, + "logits/rejected": 3.5921742916107178, + "logps/chosen": -167.80889892578125, + "logps/rejected": -190.8231201171875, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.062009811401367, + "rewards/margins": 2.367136001586914, + "rewards/rejected": -14.429145812988281, + "step": 3026 + }, + { + "epoch": 2.0890115577022597, + "grad_norm": 0.27945417165756226, + "learning_rate": 3.7974683544303802e-06, + "logits/chosen": 3.3206069469451904, + "logits/rejected": 3.3206069469451904, + "logps/chosen": -162.37327575683594, + "logps/rejected": -162.37327575683594, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.6812744140625, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.681275367736816, + "step": 3027 + }, + { + "epoch": 2.0897015697774712, + "grad_norm": 0.33045244216918945, + "learning_rate": 3.794591484464902e-06, + "logits/chosen": 3.212877035140991, + "logits/rejected": 3.212877035140991, + "logps/chosen": -144.50294494628906, + "logps/rejected": -144.50294494628906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.666496276855469, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -9.666496276855469, + "step": 3028 + }, + { + "epoch": 2.0903915818526824, + "grad_norm": 0.3240562677383423, + "learning_rate": 3.791714614499425e-06, + "logits/chosen": 3.2841057777404785, + "logits/rejected": 3.2841057777404785, + "logps/chosen": -168.30242919921875, + "logps/rejected": -168.30242919921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.920705795288086, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.920705795288086, + "step": 3029 + }, + { + "epoch": 2.091081593927894, + "grad_norm": 0.36733099818229675, + "learning_rate": 3.7888377445339473e-06, + "logits/chosen": 3.395653009414673, + "logits/rejected": 3.395653009414673, + "logps/chosen": -161.02484130859375, + "logps/rejected": -161.02484130859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.373394966125488, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.373394012451172, + "step": 3030 + }, + { + "epoch": 2.091771606003105, + "grad_norm": 0.34481778740882874, + "learning_rate": 3.78596087456847e-06, + "logits/chosen": 3.45208740234375, + "logits/rejected": 3.45208740234375, + "logps/chosen": -147.94407653808594, + "logps/rejected": -147.94407653808594, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.992287635803223, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -9.992287635803223, + "step": 3031 + }, + { + "epoch": 2.092461618078316, + "grad_norm": 0.3134823441505432, + "learning_rate": 3.783084004602992e-06, + "logits/chosen": 3.0329349040985107, + "logits/rejected": 3.0329349040985107, + "logps/chosen": -173.8105010986328, + "logps/rejected": -173.8105010986328, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.49050235748291, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.490501403808594, + "step": 3032 + }, + { + "epoch": 2.093151630153528, + "grad_norm": 0.3085824251174927, + "learning_rate": 3.7802071346375148e-06, + "logits/chosen": 3.351247787475586, + "logits/rejected": 3.517974853515625, + "logps/chosen": -168.89959716796875, + "logps/rejected": -178.84298706054688, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.012928009033203, + "rewards/margins": 0.9772090911865234, + "rewards/rejected": -12.990135192871094, + "step": 3033 + }, + { + "epoch": 2.093841642228739, + "grad_norm": 0.5487726330757141, + "learning_rate": 3.777330264672037e-06, + "logits/chosen": 3.283681869506836, + "logits/rejected": 3.3253774642944336, + "logps/chosen": -154.55531311035156, + "logps/rejected": -165.2353973388672, + "loss": 0.5241, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.82689094543457, + "rewards/margins": 1.0546916723251343, + "rewards/rejected": -11.881584167480469, + "step": 3034 + }, + { + "epoch": 2.0945316543039505, + "grad_norm": 0.38482239842414856, + "learning_rate": 3.77445339470656e-06, + "logits/chosen": 2.859955310821533, + "logits/rejected": 2.8929781913757324, + "logps/chosen": -160.93190002441406, + "logps/rejected": -166.7154083251953, + "loss": 0.6079, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.384913444519043, + "rewards/margins": 0.561540961265564, + "rewards/rejected": -11.946454048156738, + "step": 3035 + }, + { + "epoch": 2.0952216663791616, + "grad_norm": 0.47636643052101135, + "learning_rate": 3.771576524741082e-06, + "logits/chosen": 3.539910316467285, + "logits/rejected": 3.7155680656433105, + "logps/chosen": -156.7692108154297, + "logps/rejected": -170.13050842285156, + "loss": 0.5228, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.738414764404297, + "rewards/margins": 1.3745155334472656, + "rewards/rejected": -12.112930297851562, + "step": 3036 + }, + { + "epoch": 2.0959116784543728, + "grad_norm": 0.2763042151927948, + "learning_rate": 3.7686996547756046e-06, + "logits/chosen": 3.694524049758911, + "logits/rejected": 3.694524049758911, + "logps/chosen": -186.00619506835938, + "logps/rejected": -186.00619506835938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.772762298583984, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.772762298583984, + "step": 3037 + }, + { + "epoch": 2.0966016905295843, + "grad_norm": 0.3054121136665344, + "learning_rate": 3.765822784810127e-06, + "logits/chosen": 3.1121554374694824, + "logits/rejected": 3.1883959770202637, + "logps/chosen": -145.0128173828125, + "logps/rejected": -158.24539184570312, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.528807640075684, + "rewards/margins": 1.3331329822540283, + "rewards/rejected": -10.86194133758545, + "step": 3038 + }, + { + "epoch": 2.0972917026047955, + "grad_norm": 0.32796695828437805, + "learning_rate": 3.7629459148446497e-06, + "logits/chosen": 3.2867372035980225, + "logits/rejected": 3.489009380340576, + "logps/chosen": -139.5933074951172, + "logps/rejected": -157.13650512695312, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.159894943237305, + "rewards/margins": 1.778156042098999, + "rewards/rejected": -10.938050270080566, + "step": 3039 + }, + { + "epoch": 2.097981714680007, + "grad_norm": 4.055022239685059, + "learning_rate": 3.7600690448791717e-06, + "logits/chosen": 3.197540283203125, + "logits/rejected": 3.2384650707244873, + "logps/chosen": -157.92910766601562, + "logps/rejected": -158.37789916992188, + "loss": 0.6636, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.920620918273926, + "rewards/margins": 0.06835722923278809, + "rewards/rejected": -10.988977432250977, + "step": 3040 + }, + { + "epoch": 2.098671726755218, + "grad_norm": 0.3041250705718994, + "learning_rate": 3.757192174913694e-06, + "logits/chosen": 3.220726490020752, + "logits/rejected": 3.4928200244903564, + "logps/chosen": -158.03329467773438, + "logps/rejected": -183.32435607910156, + "loss": 0.4348, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.96513843536377, + "rewards/margins": 2.5763680934906006, + "rewards/rejected": -13.541505813598633, + "step": 3041 + }, + { + "epoch": 2.0993617388304298, + "grad_norm": 0.2128603756427765, + "learning_rate": 3.7543153049482168e-06, + "logits/chosen": 3.1213667392730713, + "logits/rejected": 3.35408616065979, + "logps/chosen": -162.910400390625, + "logps/rejected": -184.07803344726562, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.58621597290039, + "rewards/margins": 2.030489206314087, + "rewards/rejected": -13.616705894470215, + "step": 3042 + }, + { + "epoch": 2.100051750905641, + "grad_norm": 0.38714295625686646, + "learning_rate": 3.751438434982739e-06, + "logits/chosen": 3.5781350135803223, + "logits/rejected": 3.5781350135803223, + "logps/chosen": -163.22021484375, + "logps/rejected": -163.22021484375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.661724090576172, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.661724090576172, + "step": 3043 + }, + { + "epoch": 2.100741762980852, + "grad_norm": 0.2687162160873413, + "learning_rate": 3.7485615650172615e-06, + "logits/chosen": 3.569420337677002, + "logits/rejected": 3.569420337677002, + "logps/chosen": -169.40206909179688, + "logps/rejected": -169.40206909179688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.197484970092773, + "rewards/margins": 0.0, + "rewards/rejected": -12.197484970092773, + "step": 3044 + }, + { + "epoch": 2.1014317750560636, + "grad_norm": 0.3612144887447357, + "learning_rate": 3.745684695051784e-06, + "logits/chosen": 3.7624480724334717, + "logits/rejected": 3.7624480724334717, + "logps/chosen": -177.41213989257812, + "logps/rejected": -177.4121551513672, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.090195655822754, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.090194702148438, + "step": 3045 + }, + { + "epoch": 2.1021217871312747, + "grad_norm": 0.4754113256931305, + "learning_rate": 3.7428078250863066e-06, + "logits/chosen": 3.4697327613830566, + "logits/rejected": 3.868786334991455, + "logps/chosen": -145.21536254882812, + "logps/rejected": -173.09515380859375, + "loss": 0.3492, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.846521377563477, + "rewards/margins": 2.828962802886963, + "rewards/rejected": -12.675483703613281, + "step": 3046 + }, + { + "epoch": 2.1028117992064863, + "grad_norm": 0.21424952149391174, + "learning_rate": 3.739930955120829e-06, + "logits/chosen": 3.1920793056488037, + "logits/rejected": 3.329481601715088, + "logps/chosen": -164.21372985839844, + "logps/rejected": -191.2279052734375, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.64380168914795, + "rewards/margins": 2.6748557090759277, + "rewards/rejected": -14.318656921386719, + "step": 3047 + }, + { + "epoch": 2.1035018112816974, + "grad_norm": 0.3339514136314392, + "learning_rate": 3.737054085155351e-06, + "logits/chosen": 3.847964286804199, + "logits/rejected": 3.847964286804199, + "logps/chosen": -175.72239685058594, + "logps/rejected": -175.72238159179688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.809866905212402, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.809866905212402, + "step": 3048 + }, + { + "epoch": 2.1041918233569086, + "grad_norm": 0.33503612875938416, + "learning_rate": 3.7341772151898737e-06, + "logits/chosen": 3.5500588417053223, + "logits/rejected": 3.5133094787597656, + "logps/chosen": -167.07125854492188, + "logps/rejected": -177.62774658203125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.710811614990234, + "rewards/margins": 1.0963140726089478, + "rewards/rejected": -12.807125091552734, + "step": 3049 + }, + { + "epoch": 2.10488183543212, + "grad_norm": 0.3529583215713501, + "learning_rate": 3.731300345224396e-06, + "logits/chosen": 3.7027666568756104, + "logits/rejected": 3.7027666568756104, + "logps/chosen": -178.5145263671875, + "logps/rejected": -178.5145263671875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.13160228729248, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.13160228729248, + "step": 3050 + }, + { + "epoch": 2.1055718475073313, + "grad_norm": 0.33258676528930664, + "learning_rate": 3.728423475258919e-06, + "logits/chosen": 3.5762524604797363, + "logits/rejected": 3.5762524604797363, + "logps/chosen": -183.24783325195312, + "logps/rejected": -183.24783325195312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.403372764587402, + "rewards/margins": 0.0, + "rewards/rejected": -13.403372764587402, + "step": 3051 + }, + { + "epoch": 2.106261859582543, + "grad_norm": 26.53217887878418, + "learning_rate": 3.7255466052934407e-06, + "logits/chosen": 3.4283294677734375, + "logits/rejected": 3.3750219345092773, + "logps/chosen": -169.00192260742188, + "logps/rejected": -175.0098876953125, + "loss": 1.0777, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.301647186279297, + "rewards/margins": 0.6203439235687256, + "rewards/rejected": -12.921991348266602, + "step": 3052 + }, + { + "epoch": 2.106951871657754, + "grad_norm": 0.36887794733047485, + "learning_rate": 3.7226697353279635e-06, + "logits/chosen": 3.7197790145874023, + "logits/rejected": 3.7329611778259277, + "logps/chosen": -166.8761749267578, + "logps/rejected": -176.25466918945312, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.88292407989502, + "rewards/margins": 0.9282395839691162, + "rewards/rejected": -12.811163902282715, + "step": 3053 + }, + { + "epoch": 2.107641883732965, + "grad_norm": 0.268073171377182, + "learning_rate": 3.719792865362486e-06, + "logits/chosen": 3.66013503074646, + "logits/rejected": 3.887958288192749, + "logps/chosen": -163.77821350097656, + "logps/rejected": -182.81979370117188, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.541818618774414, + "rewards/margins": 1.9166139364242554, + "rewards/rejected": -13.458431243896484, + "step": 3054 + }, + { + "epoch": 2.1083318958081767, + "grad_norm": 4.460702896118164, + "learning_rate": 3.7169159953970086e-06, + "logits/chosen": 3.400425910949707, + "logits/rejected": 3.403149127960205, + "logps/chosen": -168.3417510986328, + "logps/rejected": -169.6410675048828, + "loss": 0.6526, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.905044555664062, + "rewards/margins": 0.10081273317337036, + "rewards/rejected": -12.00585651397705, + "step": 3055 + }, + { + "epoch": 2.109021907883388, + "grad_norm": 0.3241436183452606, + "learning_rate": 3.7140391254315305e-06, + "logits/chosen": 3.427896022796631, + "logits/rejected": 3.5930466651916504, + "logps/chosen": -144.37252807617188, + "logps/rejected": -166.27764892578125, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.844058990478516, + "rewards/margins": 2.2167630195617676, + "rewards/rejected": -12.060821533203125, + "step": 3056 + }, + { + "epoch": 2.1097119199585994, + "grad_norm": 0.3591931164264679, + "learning_rate": 3.7111622554660533e-06, + "logits/chosen": 3.607452630996704, + "logits/rejected": 3.607452630996704, + "logps/chosen": -162.45899963378906, + "logps/rejected": -162.45899963378906, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.499679565429688, + "rewards/margins": 0.0, + "rewards/rejected": -11.499679565429688, + "step": 3057 + }, + { + "epoch": 2.1104019320338105, + "grad_norm": 0.37362900376319885, + "learning_rate": 3.7082853855005757e-06, + "logits/chosen": 3.6439218521118164, + "logits/rejected": 3.6439218521118164, + "logps/chosen": -175.7281951904297, + "logps/rejected": -175.72817993164062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.823822021484375, + "rewards/margins": 0.0, + "rewards/rejected": -12.823822021484375, + "step": 3058 + }, + { + "epoch": 2.111091944109022, + "grad_norm": 0.3685089349746704, + "learning_rate": 3.7054085155350985e-06, + "logits/chosen": 3.454970359802246, + "logits/rejected": 3.4867680072784424, + "logps/chosen": -166.98635864257812, + "logps/rejected": -179.66671752929688, + "loss": 0.5221, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.006966590881348, + "rewards/margins": 1.2406377792358398, + "rewards/rejected": -13.247604370117188, + "step": 3059 + }, + { + "epoch": 2.1117819561842333, + "grad_norm": 21.183147430419922, + "learning_rate": 3.7025316455696204e-06, + "logits/chosen": 3.5417590141296387, + "logits/rejected": 3.4919614791870117, + "logps/chosen": -182.06808471679688, + "logps/rejected": -173.54727172851562, + "loss": 1.4808, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.282219886779785, + "rewards/margins": -0.8741440176963806, + "rewards/rejected": -12.408076286315918, + "step": 3060 + }, + { + "epoch": 2.1124719682594444, + "grad_norm": 0.39491766691207886, + "learning_rate": 3.6996547756041427e-06, + "logits/chosen": 3.1559336185455322, + "logits/rejected": 3.23637318611145, + "logps/chosen": -163.064697265625, + "logps/rejected": -175.62335205078125, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.693305969238281, + "rewards/margins": 1.300794005393982, + "rewards/rejected": -12.994100570678711, + "step": 3061 + }, + { + "epoch": 2.113161980334656, + "grad_norm": 0.20861895382404327, + "learning_rate": 3.6967779056386655e-06, + "logits/chosen": 3.470932960510254, + "logits/rejected": 4.0002946853637695, + "logps/chosen": -138.2821044921875, + "logps/rejected": -188.15171813964844, + "loss": 0.3466, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.088889122009277, + "rewards/margins": 4.87190055847168, + "rewards/rejected": -13.960789680480957, + "step": 3062 + }, + { + "epoch": 2.113851992409867, + "grad_norm": 0.37492936849594116, + "learning_rate": 3.693901035673188e-06, + "logits/chosen": 3.314831256866455, + "logits/rejected": 3.4146077632904053, + "logps/chosen": -165.413330078125, + "logps/rejected": -191.55902099609375, + "loss": 0.5212, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.898963928222656, + "rewards/margins": 2.535954475402832, + "rewards/rejected": -14.434918403625488, + "step": 3063 + }, + { + "epoch": 2.1145420044850787, + "grad_norm": 24.136409759521484, + "learning_rate": 3.69102416570771e-06, + "logits/chosen": 3.3671209812164307, + "logits/rejected": 3.4099907875061035, + "logps/chosen": -151.73043823242188, + "logps/rejected": -155.4590301513672, + "loss": 0.7171, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.372106552124023, + "rewards/margins": 0.37153440713882446, + "rewards/rejected": -10.743640899658203, + "step": 3064 + }, + { + "epoch": 2.11523201656029, + "grad_norm": 0.23251423239707947, + "learning_rate": 3.6881472957422326e-06, + "logits/chosen": 3.9358181953430176, + "logits/rejected": 4.047115325927734, + "logps/chosen": -174.84689331054688, + "logps/rejected": -185.13345336914062, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.606684684753418, + "rewards/margins": 1.0301953554153442, + "rewards/rejected": -13.636879920959473, + "step": 3065 + }, + { + "epoch": 2.115922028635501, + "grad_norm": 0.3627948462963104, + "learning_rate": 3.6852704257767553e-06, + "logits/chosen": 3.813180923461914, + "logits/rejected": 3.8665060997009277, + "logps/chosen": -175.183349609375, + "logps/rejected": -188.6312255859375, + "loss": 0.5217, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.850014686584473, + "rewards/margins": 1.40877103805542, + "rewards/rejected": -14.258787155151367, + "step": 3066 + }, + { + "epoch": 2.1166120407107125, + "grad_norm": 0.9252438545227051, + "learning_rate": 3.6823935558112777e-06, + "logits/chosen": 3.5493297576904297, + "logits/rejected": 3.5070266723632812, + "logps/chosen": -165.81637573242188, + "logps/rejected": -169.7603759765625, + "loss": 0.6161, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.811785697937012, + "rewards/margins": 0.3166731894016266, + "rewards/rejected": -12.128459930419922, + "step": 3067 + }, + { + "epoch": 2.1173020527859236, + "grad_norm": 0.2706688940525055, + "learning_rate": 3.6795166858457996e-06, + "logits/chosen": 4.129620552062988, + "logits/rejected": 4.129620552062988, + "logps/chosen": -175.64820861816406, + "logps/rejected": -175.64820861816406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.661505699157715, + "rewards/margins": 0.0, + "rewards/rejected": -12.661505699157715, + "step": 3068 + }, + { + "epoch": 2.1179920648611352, + "grad_norm": 0.32873964309692383, + "learning_rate": 3.6766398158803224e-06, + "logits/chosen": 3.2926902770996094, + "logits/rejected": 3.420612096786499, + "logps/chosen": -167.36135864257812, + "logps/rejected": -178.44219970703125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.011576652526855, + "rewards/margins": 1.1359366178512573, + "rewards/rejected": -13.147513389587402, + "step": 3069 + }, + { + "epoch": 2.1186820769363464, + "grad_norm": 0.3377319574356079, + "learning_rate": 3.6737629459148447e-06, + "logits/chosen": 3.3997137546539307, + "logits/rejected": 3.4421768188476562, + "logps/chosen": -153.73353576660156, + "logps/rejected": -164.78619384765625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.501214027404785, + "rewards/margins": 1.2102323770523071, + "rewards/rejected": -11.711446762084961, + "step": 3070 + }, + { + "epoch": 2.1193720890115575, + "grad_norm": 0.30727794766426086, + "learning_rate": 3.6708860759493675e-06, + "logits/chosen": 3.70340633392334, + "logits/rejected": 3.7689576148986816, + "logps/chosen": -176.9243621826172, + "logps/rejected": -183.60128784179688, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.919965744018555, + "rewards/margins": 0.7077969908714294, + "rewards/rejected": -13.627763748168945, + "step": 3071 + }, + { + "epoch": 2.120062101086769, + "grad_norm": 1.5210487842559814, + "learning_rate": 3.6680092059838903e-06, + "logits/chosen": 3.4488320350646973, + "logits/rejected": 3.7327849864959717, + "logps/chosen": -152.42352294921875, + "logps/rejected": -168.09786987304688, + "loss": 0.4465, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.544857025146484, + "rewards/margins": 1.5114595890045166, + "rewards/rejected": -12.056316375732422, + "step": 3072 + }, + { + "epoch": 2.12075211316198, + "grad_norm": 0.3316786289215088, + "learning_rate": 3.6651323360184122e-06, + "logits/chosen": 3.1060922145843506, + "logits/rejected": 3.189110040664673, + "logps/chosen": -157.6856689453125, + "logps/rejected": -170.36328125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.16738510131836, + "rewards/margins": 1.2804745435714722, + "rewards/rejected": -12.447860717773438, + "step": 3073 + }, + { + "epoch": 2.121442125237192, + "grad_norm": 0.31779786944389343, + "learning_rate": 3.6622554660529346e-06, + "logits/chosen": 3.6613688468933105, + "logits/rejected": 3.6613688468933105, + "logps/chosen": -173.72128295898438, + "logps/rejected": -173.72128295898438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.566776275634766, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.566776275634766, + "step": 3074 + }, + { + "epoch": 2.122132137312403, + "grad_norm": 0.29507124423980713, + "learning_rate": 3.6593785960874573e-06, + "logits/chosen": 3.452585220336914, + "logits/rejected": 3.5203709602355957, + "logps/chosen": -157.35989379882812, + "logps/rejected": -167.01779174804688, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.871698379516602, + "rewards/margins": 0.9692870378494263, + "rewards/rejected": -11.840985298156738, + "step": 3075 + }, + { + "epoch": 2.1228221493876145, + "grad_norm": 0.2480309009552002, + "learning_rate": 3.6565017261219797e-06, + "logits/chosen": 3.456557035446167, + "logits/rejected": 3.6721012592315674, + "logps/chosen": -170.23452758789062, + "logps/rejected": -189.64129638671875, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.203668594360352, + "rewards/margins": 1.9801316261291504, + "rewards/rejected": -14.18380069732666, + "step": 3076 + }, + { + "epoch": 2.1235121614628256, + "grad_norm": 0.2650773227214813, + "learning_rate": 3.653624856156502e-06, + "logits/chosen": 3.847933530807495, + "logits/rejected": 3.847933530807495, + "logps/chosen": -186.0601043701172, + "logps/rejected": -186.0601043701172, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.907633781433105, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.907632827758789, + "step": 3077 + }, + { + "epoch": 2.1242021735380368, + "grad_norm": 1.1813664436340332, + "learning_rate": 3.6507479861910244e-06, + "logits/chosen": 3.887871265411377, + "logits/rejected": 3.943726062774658, + "logps/chosen": -179.0543212890625, + "logps/rejected": -183.17327880859375, + "loss": 0.6104, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.138240814208984, + "rewards/margins": 0.4323047399520874, + "rewards/rejected": -13.570545196533203, + "step": 3078 + }, + { + "epoch": 2.1248921856132483, + "grad_norm": 0.2870160937309265, + "learning_rate": 3.647871116225547e-06, + "logits/chosen": 3.5972461700439453, + "logits/rejected": 3.657954692840576, + "logps/chosen": -181.80392456054688, + "logps/rejected": -194.0609130859375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.244568824768066, + "rewards/margins": 1.245650291442871, + "rewards/rejected": -14.490219116210938, + "step": 3079 + }, + { + "epoch": 2.1255821976884595, + "grad_norm": 0.3024778962135315, + "learning_rate": 3.6449942462600695e-06, + "logits/chosen": 3.95438814163208, + "logits/rejected": 3.9666478633880615, + "logps/chosen": -174.72259521484375, + "logps/rejected": -182.21066284179688, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.777321815490723, + "rewards/margins": 0.7435610294342041, + "rewards/rejected": -13.520882606506348, + "step": 3080 + }, + { + "epoch": 2.126272209763671, + "grad_norm": 0.28841695189476013, + "learning_rate": 3.6421173762945915e-06, + "logits/chosen": 3.3502979278564453, + "logits/rejected": 3.393141269683838, + "logps/chosen": -170.38088989257812, + "logps/rejected": -177.14576721191406, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.161198616027832, + "rewards/margins": 0.6602662205696106, + "rewards/rejected": -12.821465492248535, + "step": 3081 + }, + { + "epoch": 2.126962221838882, + "grad_norm": 0.35746079683303833, + "learning_rate": 3.6392405063291142e-06, + "logits/chosen": 4.137859344482422, + "logits/rejected": 4.137859344482422, + "logps/chosen": -167.72341918945312, + "logps/rejected": -167.72341918945312, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.906920433044434, + "rewards/margins": 0.0, + "rewards/rejected": -11.906920433044434, + "step": 3082 + }, + { + "epoch": 2.1276522339140933, + "grad_norm": 0.26097384095191956, + "learning_rate": 3.6363636363636366e-06, + "logits/chosen": 3.798454761505127, + "logits/rejected": 3.9009153842926025, + "logps/chosen": -166.79721069335938, + "logps/rejected": -185.26112365722656, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.046826362609863, + "rewards/margins": 1.9245784282684326, + "rewards/rejected": -13.971405029296875, + "step": 3083 + }, + { + "epoch": 2.128342245989305, + "grad_norm": 0.33288705348968506, + "learning_rate": 3.6334867663981594e-06, + "logits/chosen": 3.4947118759155273, + "logits/rejected": 3.4947118759155273, + "logps/chosen": -174.5012664794922, + "logps/rejected": -174.5012664794922, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.548147201538086, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.548147201538086, + "step": 3084 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 0.37421879172325134, + "learning_rate": 3.6306098964326813e-06, + "logits/chosen": 3.4392054080963135, + "logits/rejected": 3.4392054080963135, + "logps/chosen": -158.73171997070312, + "logps/rejected": -158.73171997070312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.034591674804688, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.034591674804688, + "step": 3085 + }, + { + "epoch": 2.1297222701397276, + "grad_norm": 0.3967350423336029, + "learning_rate": 3.627733026467204e-06, + "logits/chosen": 3.494598865509033, + "logits/rejected": 3.706038475036621, + "logps/chosen": -155.8223876953125, + "logps/rejected": -187.25, + "loss": 0.4351, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.831077575683594, + "rewards/margins": 3.1862874031066895, + "rewards/rejected": -14.017364501953125, + "step": 3086 + }, + { + "epoch": 2.1304122822149387, + "grad_norm": 0.6377840638160706, + "learning_rate": 3.6248561565017264e-06, + "logits/chosen": 3.5155601501464844, + "logits/rejected": 3.695927858352661, + "logps/chosen": -161.59817504882812, + "logps/rejected": -177.45108032226562, + "loss": 0.524, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.540177345275879, + "rewards/margins": 1.587092399597168, + "rewards/rejected": -13.127269744873047, + "step": 3087 + }, + { + "epoch": 2.13110229429015, + "grad_norm": 0.25631019473075867, + "learning_rate": 3.621979286536249e-06, + "logits/chosen": 3.479668378829956, + "logits/rejected": 3.5482280254364014, + "logps/chosen": -176.81185913085938, + "logps/rejected": -189.77023315429688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.863838195800781, + "rewards/margins": 1.3094850778579712, + "rewards/rejected": -14.173323631286621, + "step": 3088 + }, + { + "epoch": 2.1317923063653614, + "grad_norm": 2.03877854347229, + "learning_rate": 3.619102416570771e-06, + "logits/chosen": 3.7536373138427734, + "logits/rejected": 3.8187804222106934, + "logps/chosen": -167.62155151367188, + "logps/rejected": -169.9153289794922, + "loss": 0.6257, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.133825302124023, + "rewards/margins": 0.22418427467346191, + "rewards/rejected": -12.358009338378906, + "step": 3089 + }, + { + "epoch": 2.1324823184405726, + "grad_norm": 0.23719649016857147, + "learning_rate": 3.6162255466052935e-06, + "logits/chosen": 3.776824474334717, + "logits/rejected": 3.787522315979004, + "logps/chosen": -156.13787841796875, + "logps/rejected": -162.58578491210938, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.833555221557617, + "rewards/margins": 0.6544903516769409, + "rewards/rejected": -11.488045692443848, + "step": 3090 + }, + { + "epoch": 2.133172330515784, + "grad_norm": 0.3826068937778473, + "learning_rate": 3.6133486766398162e-06, + "logits/chosen": 3.381066083908081, + "logits/rejected": 3.49590802192688, + "logps/chosen": -155.13893127441406, + "logps/rejected": -163.67079162597656, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.966493606567383, + "rewards/margins": 0.8454667925834656, + "rewards/rejected": -11.811960220336914, + "step": 3091 + }, + { + "epoch": 2.1338623425909953, + "grad_norm": 0.34764647483825684, + "learning_rate": 3.610471806674339e-06, + "logits/chosen": 3.1063666343688965, + "logits/rejected": 3.2929248809814453, + "logps/chosen": -149.0763397216797, + "logps/rejected": -168.52529907226562, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.049656867980957, + "rewards/margins": 2.023245096206665, + "rewards/rejected": -12.07290267944336, + "step": 3092 + }, + { + "epoch": 2.134552354666207, + "grad_norm": 0.26303163170814514, + "learning_rate": 3.607594936708861e-06, + "logits/chosen": 3.695558786392212, + "logits/rejected": 3.695558786392212, + "logps/chosen": -175.6286163330078, + "logps/rejected": -175.6286163330078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.729199409484863, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -12.729198455810547, + "step": 3093 + }, + { + "epoch": 2.135242366741418, + "grad_norm": 2.6088006496429443, + "learning_rate": 3.6047180667433833e-06, + "logits/chosen": 3.5941600799560547, + "logits/rejected": 3.6692070960998535, + "logps/chosen": -175.4548797607422, + "logps/rejected": -188.31959533691406, + "loss": 0.5369, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.687219619750977, + "rewards/margins": 1.2391263246536255, + "rewards/rejected": -13.926345825195312, + "step": 3094 + }, + { + "epoch": 2.135932378816629, + "grad_norm": 1.416197657585144, + "learning_rate": 3.601841196777906e-06, + "logits/chosen": 3.824665069580078, + "logits/rejected": 3.824627637863159, + "logps/chosen": -171.03314208984375, + "logps/rejected": -175.21385192871094, + "loss": 0.6111, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.195510864257812, + "rewards/margins": 0.41010332107543945, + "rewards/rejected": -12.605613708496094, + "step": 3095 + }, + { + "epoch": 2.1366223908918407, + "grad_norm": 6.263790130615234, + "learning_rate": 3.5989643268124284e-06, + "logits/chosen": 3.511629581451416, + "logits/rejected": 3.7378978729248047, + "logps/chosen": -167.21994018554688, + "logps/rejected": -176.78758239746094, + "loss": 0.5565, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.953964233398438, + "rewards/margins": 0.8833484053611755, + "rewards/rejected": -12.837312698364258, + "step": 3096 + }, + { + "epoch": 2.137312402967052, + "grad_norm": 0.21081985533237457, + "learning_rate": 3.5960874568469508e-06, + "logits/chosen": 3.119036912918091, + "logits/rejected": 3.25368070602417, + "logps/chosen": -150.61154174804688, + "logps/rejected": -177.12649536132812, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.133827209472656, + "rewards/margins": 2.8318698406219482, + "rewards/rejected": -12.965697288513184, + "step": 3097 + }, + { + "epoch": 2.1380024150422634, + "grad_norm": 0.3098006844520569, + "learning_rate": 3.593210586881473e-06, + "logits/chosen": 3.5574450492858887, + "logits/rejected": 3.5574450492858887, + "logps/chosen": -177.6591033935547, + "logps/rejected": -177.6591033935547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.015538215637207, + "rewards/margins": 0.0, + "rewards/rejected": -13.015538215637207, + "step": 3098 + }, + { + "epoch": 2.1386924271174745, + "grad_norm": 0.30010247230529785, + "learning_rate": 3.590333716915996e-06, + "logits/chosen": 3.338625431060791, + "logits/rejected": 3.560422420501709, + "logps/chosen": -156.00283813476562, + "logps/rejected": -171.3687286376953, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.849433898925781, + "rewards/margins": 1.6162129640579224, + "rewards/rejected": -12.46564769744873, + "step": 3099 + }, + { + "epoch": 2.1393824391926857, + "grad_norm": 0.2930518090724945, + "learning_rate": 3.5874568469505183e-06, + "logits/chosen": 3.704890489578247, + "logits/rejected": 3.8111135959625244, + "logps/chosen": -178.5370635986328, + "logps/rejected": -191.6861114501953, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.308879852294922, + "rewards/margins": 1.2402359247207642, + "rewards/rejected": -14.549116134643555, + "step": 3100 + }, + { + "epoch": 2.1400724512678972, + "grad_norm": 0.5246613025665283, + "learning_rate": 3.58457997698504e-06, + "logits/chosen": 3.4027817249298096, + "logits/rejected": 3.4027817249298096, + "logps/chosen": -168.86795043945312, + "logps/rejected": -168.86795043945312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.21627140045166, + "rewards/margins": 0.0, + "rewards/rejected": -12.21627140045166, + "step": 3101 + }, + { + "epoch": 2.1407624633431084, + "grad_norm": 0.29808953404426575, + "learning_rate": 3.581703107019563e-06, + "logits/chosen": 3.994661331176758, + "logits/rejected": 4.042994499206543, + "logps/chosen": -170.71636962890625, + "logps/rejected": -183.471435546875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.05871868133545, + "rewards/margins": 1.2991526126861572, + "rewards/rejected": -13.357871055603027, + "step": 3102 + }, + { + "epoch": 2.14145247541832, + "grad_norm": 0.38657036423683167, + "learning_rate": 3.5788262370540853e-06, + "logits/chosen": 3.3192341327667236, + "logits/rejected": 3.373764991760254, + "logps/chosen": -181.62994384765625, + "logps/rejected": -192.47879028320312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.347604751586914, + "rewards/margins": 1.0791659355163574, + "rewards/rejected": -14.426770210266113, + "step": 3103 + }, + { + "epoch": 2.142142487493531, + "grad_norm": 11.15714168548584, + "learning_rate": 3.575949367088608e-06, + "logits/chosen": 3.2783706188201904, + "logits/rejected": 3.4007949829101562, + "logps/chosen": -161.55081176757812, + "logps/rejected": -179.3389434814453, + "loss": 0.7616, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.675322532653809, + "rewards/margins": 1.7172584533691406, + "rewards/rejected": -13.392581939697266, + "step": 3104 + }, + { + "epoch": 2.142832499568742, + "grad_norm": 0.27525749802589417, + "learning_rate": 3.57307249712313e-06, + "logits/chosen": 3.2118000984191895, + "logits/rejected": 3.3269100189208984, + "logps/chosen": -173.58023071289062, + "logps/rejected": -182.27513122558594, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.467869758605957, + "rewards/margins": 0.8964028358459473, + "rewards/rejected": -13.364274024963379, + "step": 3105 + }, + { + "epoch": 2.143522511643954, + "grad_norm": 0.28702083230018616, + "learning_rate": 3.570195627157653e-06, + "logits/chosen": 3.5449182987213135, + "logits/rejected": 3.5700724124908447, + "logps/chosen": -174.85202026367188, + "logps/rejected": -194.43641662597656, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.695758819580078, + "rewards/margins": 2.034364938735962, + "rewards/rejected": -14.730124473571777, + "step": 3106 + }, + { + "epoch": 2.144212523719165, + "grad_norm": 0.31011709570884705, + "learning_rate": 3.567318757192175e-06, + "logits/chosen": 3.604135513305664, + "logits/rejected": 3.652144432067871, + "logps/chosen": -171.45614624023438, + "logps/rejected": -182.97630310058594, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.324460983276367, + "rewards/margins": 1.1444146633148193, + "rewards/rejected": -13.46887493133545, + "step": 3107 + }, + { + "epoch": 2.1449025357943765, + "grad_norm": 0.2733755111694336, + "learning_rate": 3.564441887226698e-06, + "logits/chosen": 3.242828607559204, + "logits/rejected": 3.29758358001709, + "logps/chosen": -153.35484313964844, + "logps/rejected": -178.11927795410156, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.36611270904541, + "rewards/margins": 2.527557611465454, + "rewards/rejected": -12.893670082092285, + "step": 3108 + }, + { + "epoch": 2.1455925478695876, + "grad_norm": 0.320867657661438, + "learning_rate": 3.56156501726122e-06, + "logits/chosen": 3.5283689498901367, + "logits/rejected": 3.7514853477478027, + "logps/chosen": -175.78448486328125, + "logps/rejected": -191.85165405273438, + "loss": 0.5203, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.721982955932617, + "rewards/margins": 1.615321159362793, + "rewards/rejected": -14.337305068969727, + "step": 3109 + }, + { + "epoch": 2.146282559944799, + "grad_norm": 0.32485416531562805, + "learning_rate": 3.5586881472957426e-06, + "logits/chosen": 3.712890625, + "logits/rejected": 3.745166778564453, + "logps/chosen": -183.89358520507812, + "logps/rejected": -190.3535614013672, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.59950065612793, + "rewards/margins": 0.6370092630386353, + "rewards/rejected": -14.236509323120117, + "step": 3110 + }, + { + "epoch": 2.1469725720200104, + "grad_norm": 0.3695944845676422, + "learning_rate": 3.555811277330265e-06, + "logits/chosen": 3.342291831970215, + "logits/rejected": 3.5091500282287598, + "logps/chosen": -181.7886505126953, + "logps/rejected": -193.94287109375, + "loss": 0.5215, + "rewards/accuracies": 0.625, + "rewards/chosen": -13.182632446289062, + "rewards/margins": 1.2581967115402222, + "rewards/rejected": -14.44083023071289, + "step": 3111 + }, + { + "epoch": 2.1476625840952215, + "grad_norm": 0.2658901512622833, + "learning_rate": 3.5529344073647877e-06, + "logits/chosen": 3.428868293762207, + "logits/rejected": 3.682056188583374, + "logps/chosen": -162.952880859375, + "logps/rejected": -185.0281219482422, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.587361335754395, + "rewards/margins": 2.209716796875, + "rewards/rejected": -13.797079086303711, + "step": 3112 + }, + { + "epoch": 2.148352596170433, + "grad_norm": 0.3350929617881775, + "learning_rate": 3.5500575373993097e-06, + "logits/chosen": 3.7237143516540527, + "logits/rejected": 3.7237143516540527, + "logps/chosen": -181.69403076171875, + "logps/rejected": -181.6940460205078, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.345216751098633, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.34521770477295, + "step": 3113 + }, + { + "epoch": 2.149042608245644, + "grad_norm": 0.2791765630245209, + "learning_rate": 3.547180667433832e-06, + "logits/chosen": 3.976315975189209, + "logits/rejected": 4.050331115722656, + "logps/chosen": -175.06985473632812, + "logps/rejected": -186.4180145263672, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.625997543334961, + "rewards/margins": 1.1311359405517578, + "rewards/rejected": -13.757133483886719, + "step": 3114 + }, + { + "epoch": 2.1497326203208558, + "grad_norm": 0.29003024101257324, + "learning_rate": 3.544303797468355e-06, + "logits/chosen": 3.4665098190307617, + "logits/rejected": 3.495917320251465, + "logps/chosen": -174.5611572265625, + "logps/rejected": -183.94827270507812, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.724065780639648, + "rewards/margins": 0.9532690644264221, + "rewards/rejected": -13.67733383178711, + "step": 3115 + }, + { + "epoch": 2.150422632396067, + "grad_norm": 0.3113037943840027, + "learning_rate": 3.541426927502877e-06, + "logits/chosen": 3.084585666656494, + "logits/rejected": 3.1456570625305176, + "logps/chosen": -170.276123046875, + "logps/rejected": -182.427001953125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.222509384155273, + "rewards/margins": 1.271854281425476, + "rewards/rejected": -13.494363784790039, + "step": 3116 + }, + { + "epoch": 2.1511126444712785, + "grad_norm": 6.56858491897583, + "learning_rate": 3.5385500575373995e-06, + "logits/chosen": 3.6674814224243164, + "logits/rejected": 3.7063910961151123, + "logps/chosen": -168.66751098632812, + "logps/rejected": -178.2589111328125, + "loss": 0.5666, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.215085983276367, + "rewards/margins": 0.9680029153823853, + "rewards/rejected": -13.183089256286621, + "step": 3117 + }, + { + "epoch": 2.1518026565464896, + "grad_norm": 0.3061733841896057, + "learning_rate": 3.535673187571922e-06, + "logits/chosen": 3.7381443977355957, + "logits/rejected": 3.7381443977355957, + "logps/chosen": -172.50100708007812, + "logps/rejected": -172.50100708007812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.372366905212402, + "rewards/margins": -2.9802322387695312e-08, + "rewards/rejected": -12.372366905212402, + "step": 3118 + }, + { + "epoch": 2.1524926686217007, + "grad_norm": 0.23010429739952087, + "learning_rate": 3.5327963176064446e-06, + "logits/chosen": 3.3032822608947754, + "logits/rejected": 3.6095833778381348, + "logps/chosen": -124.16244506835938, + "logps/rejected": -156.42593383789062, + "loss": 0.4335, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.534430503845215, + "rewards/margins": 3.206974506378174, + "rewards/rejected": -10.741405487060547, + "step": 3119 + }, + { + "epoch": 2.1531826806969123, + "grad_norm": 0.3506099283695221, + "learning_rate": 3.529919447640967e-06, + "logits/chosen": 3.7285189628601074, + "logits/rejected": 4.018731117248535, + "logps/chosen": -159.85423278808594, + "logps/rejected": -166.250732421875, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.297321319580078, + "rewards/margins": 0.6429771780967712, + "rewards/rejected": -11.940299034118652, + "step": 3120 + }, + { + "epoch": 2.1538726927721235, + "grad_norm": 0.313396155834198, + "learning_rate": 3.5270425776754898e-06, + "logits/chosen": 3.620011568069458, + "logits/rejected": 3.6593313217163086, + "logps/chosen": -180.8699188232422, + "logps/rejected": -188.88955688476562, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.196868896484375, + "rewards/margins": 0.7991305589675903, + "rewards/rejected": -13.996000289916992, + "step": 3121 + }, + { + "epoch": 2.154562704847335, + "grad_norm": 0.2561229169368744, + "learning_rate": 3.5241657077100117e-06, + "logits/chosen": 3.3780722618103027, + "logits/rejected": 3.792642116546631, + "logps/chosen": -161.5539093017578, + "logps/rejected": -192.78846740722656, + "loss": 0.4339, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.311431884765625, + "rewards/margins": 3.092355251312256, + "rewards/rejected": -14.403787612915039, + "step": 3122 + }, + { + "epoch": 2.155252716922546, + "grad_norm": 0.3708925247192383, + "learning_rate": 3.521288837744534e-06, + "logits/chosen": 3.9394423961639404, + "logits/rejected": 3.9394423961639404, + "logps/chosen": -178.84996032714844, + "logps/rejected": -178.84996032714844, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.216646194458008, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.216646194458008, + "step": 3123 + }, + { + "epoch": 2.1559427289977573, + "grad_norm": 0.3091185390949249, + "learning_rate": 3.518411967779057e-06, + "logits/chosen": 3.7387547492980957, + "logits/rejected": 3.9008982181549072, + "logps/chosen": -169.14491271972656, + "logps/rejected": -176.76473999023438, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.141801834106445, + "rewards/margins": 0.7367037534713745, + "rewards/rejected": -12.87850570678711, + "step": 3124 + }, + { + "epoch": 2.156632741072969, + "grad_norm": 11.827475547790527, + "learning_rate": 3.5155350978135796e-06, + "logits/chosen": 3.269395351409912, + "logits/rejected": 3.278613567352295, + "logps/chosen": -177.12417602539062, + "logps/rejected": -186.67868041992188, + "loss": 0.9034, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.867968559265137, + "rewards/margins": 0.9683965444564819, + "rewards/rejected": -13.83636474609375, + "step": 3125 + }, + { + "epoch": 2.15732275314818, + "grad_norm": 0.26507192850112915, + "learning_rate": 3.5126582278481015e-06, + "logits/chosen": 3.43349552154541, + "logits/rejected": 3.5623254776000977, + "logps/chosen": -173.48866271972656, + "logps/rejected": -181.01771545410156, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.559428215026855, + "rewards/margins": 0.7811089754104614, + "rewards/rejected": -13.340538024902344, + "step": 3126 + }, + { + "epoch": 2.1580127652233916, + "grad_norm": 0.3301188349723816, + "learning_rate": 3.509781357882624e-06, + "logits/chosen": 3.316453456878662, + "logits/rejected": 3.316453456878662, + "logps/chosen": -176.99978637695312, + "logps/rejected": -176.99978637695312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.79621696472168, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.796216011047363, + "step": 3127 + }, + { + "epoch": 2.1587027772986027, + "grad_norm": 0.3545786440372467, + "learning_rate": 3.5069044879171466e-06, + "logits/chosen": 3.4344327449798584, + "logits/rejected": 3.4344327449798584, + "logps/chosen": -175.46522521972656, + "logps/rejected": -175.46522521972656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.61242389678955, + "rewards/margins": 0.0, + "rewards/rejected": -12.61242389678955, + "step": 3128 + }, + { + "epoch": 2.159392789373814, + "grad_norm": 0.2763032913208008, + "learning_rate": 3.504027617951669e-06, + "logits/chosen": 3.193571090698242, + "logits/rejected": 3.3789706230163574, + "logps/chosen": -153.0692138671875, + "logps/rejected": -179.91763305664062, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.429718017578125, + "rewards/margins": 2.6861214637756348, + "rewards/rejected": -13.115839004516602, + "step": 3129 + }, + { + "epoch": 2.1600828014490254, + "grad_norm": 16.705751419067383, + "learning_rate": 3.5011507479861913e-06, + "logits/chosen": 3.673527240753174, + "logits/rejected": 3.512326240539551, + "logps/chosen": -188.5072021484375, + "logps/rejected": -194.43923950195312, + "loss": 0.736, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.903596878051758, + "rewards/margins": 0.6696472764015198, + "rewards/rejected": -14.573244094848633, + "step": 3130 + }, + { + "epoch": 2.1607728135242366, + "grad_norm": 0.4222869575023651, + "learning_rate": 3.4982738780207137e-06, + "logits/chosen": 3.694613456726074, + "logits/rejected": 3.7523858547210693, + "logps/chosen": -170.53456115722656, + "logps/rejected": -188.5135498046875, + "loss": 0.5218, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.280510902404785, + "rewards/margins": 1.8317415714263916, + "rewards/rejected": -14.112253189086914, + "step": 3131 + }, + { + "epoch": 2.161462825599448, + "grad_norm": 0.2612754702568054, + "learning_rate": 3.4953970080552365e-06, + "logits/chosen": 3.259260654449463, + "logits/rejected": 3.5906822681427, + "logps/chosen": -155.89930725097656, + "logps/rejected": -179.31446838378906, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.941563606262207, + "rewards/margins": 2.3356432914733887, + "rewards/rejected": -13.277206420898438, + "step": 3132 + }, + { + "epoch": 2.1621528376746593, + "grad_norm": 0.25426971912384033, + "learning_rate": 3.492520138089759e-06, + "logits/chosen": 3.714212417602539, + "logits/rejected": 3.819088935852051, + "logps/chosen": -157.22122192382812, + "logps/rejected": -171.66355895996094, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.8377685546875, + "rewards/margins": 1.3691883087158203, + "rewards/rejected": -12.20695686340332, + "step": 3133 + }, + { + "epoch": 2.162842849749871, + "grad_norm": 0.2591085731983185, + "learning_rate": 3.4896432681242808e-06, + "logits/chosen": 3.353633165359497, + "logits/rejected": 3.3903040885925293, + "logps/chosen": -151.9414520263672, + "logps/rejected": -168.96835327148438, + "loss": 0.5205, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.58837604522705, + "rewards/margins": 1.4905791282653809, + "rewards/rejected": -12.07895565032959, + "step": 3134 + }, + { + "epoch": 2.163532861825082, + "grad_norm": 0.3522898852825165, + "learning_rate": 3.4867663981588035e-06, + "logits/chosen": 3.784907579421997, + "logits/rejected": 3.83304500579834, + "logps/chosen": -190.21099853515625, + "logps/rejected": -196.4478302001953, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.146296501159668, + "rewards/margins": 0.5936368703842163, + "rewards/rejected": -14.739933013916016, + "step": 3135 + }, + { + "epoch": 2.164222873900293, + "grad_norm": 0.32992058992385864, + "learning_rate": 3.483889528193326e-06, + "logits/chosen": 3.6767420768737793, + "logits/rejected": 3.6767420768737793, + "logps/chosen": -192.35533142089844, + "logps/rejected": -192.35533142089844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.651119232177734, + "rewards/margins": 0.0, + "rewards/rejected": -14.651119232177734, + "step": 3136 + }, + { + "epoch": 2.1649128859755047, + "grad_norm": 0.8158502578735352, + "learning_rate": 3.4810126582278487e-06, + "logits/chosen": 3.648158550262451, + "logits/rejected": 3.7557592391967773, + "logps/chosen": -178.9543914794922, + "logps/rejected": -188.6123504638672, + "loss": 0.5283, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.07461929321289, + "rewards/margins": 1.0145864486694336, + "rewards/rejected": -14.089205741882324, + "step": 3137 + }, + { + "epoch": 2.165602898050716, + "grad_norm": 5.471824645996094, + "learning_rate": 3.4781357882623706e-06, + "logits/chosen": 3.7038652896881104, + "logits/rejected": 3.8698582649230957, + "logps/chosen": -158.71400451660156, + "logps/rejected": -180.93893432617188, + "loss": 0.4644, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.191608428955078, + "rewards/margins": 2.075528621673584, + "rewards/rejected": -13.267136573791504, + "step": 3138 + }, + { + "epoch": 2.1662929101259274, + "grad_norm": 0.28424134850502014, + "learning_rate": 3.4752589182968934e-06, + "logits/chosen": 3.785783529281616, + "logits/rejected": 4.146927356719971, + "logps/chosen": -172.2274169921875, + "logps/rejected": -187.12420654296875, + "loss": 0.5206, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.269659042358398, + "rewards/margins": 1.5467002391815186, + "rewards/rejected": -13.816359519958496, + "step": 3139 + }, + { + "epoch": 2.1669829222011385, + "grad_norm": 0.38685914874076843, + "learning_rate": 3.4723820483314157e-06, + "logits/chosen": 3.0370595455169678, + "logits/rejected": 3.1502764225006104, + "logps/chosen": -162.33920288085938, + "logps/rejected": -174.99673461914062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.45108699798584, + "rewards/margins": 1.245158076286316, + "rewards/rejected": -12.696244239807129, + "step": 3140 + }, + { + "epoch": 2.1676729342763497, + "grad_norm": 0.3187330365180969, + "learning_rate": 3.4695051783659385e-06, + "logits/chosen": 3.5054917335510254, + "logits/rejected": 3.609640598297119, + "logps/chosen": -164.9746551513672, + "logps/rejected": -175.3204345703125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.733104705810547, + "rewards/margins": 1.0880742073059082, + "rewards/rejected": -12.821179389953613, + "step": 3141 + }, + { + "epoch": 2.1683629463515612, + "grad_norm": 0.3281504213809967, + "learning_rate": 3.4666283084004604e-06, + "logits/chosen": 3.7281413078308105, + "logits/rejected": 3.7281413078308105, + "logps/chosen": -178.64059448242188, + "logps/rejected": -178.64059448242188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.183734893798828, + "rewards/margins": 0.0, + "rewards/rejected": -13.183734893798828, + "step": 3142 + }, + { + "epoch": 2.1690529584267724, + "grad_norm": 0.44763508439064026, + "learning_rate": 3.4637514384349828e-06, + "logits/chosen": 3.4763712882995605, + "logits/rejected": 3.63847017288208, + "logps/chosen": -169.42575073242188, + "logps/rejected": -188.68406677246094, + "loss": 0.5212, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.060480117797852, + "rewards/margins": 1.9714972972869873, + "rewards/rejected": -14.031976699829102, + "step": 3143 + }, + { + "epoch": 2.169742970501984, + "grad_norm": 0.31697550415992737, + "learning_rate": 3.4608745684695055e-06, + "logits/chosen": 3.772711753845215, + "logits/rejected": 3.874119281768799, + "logps/chosen": -174.78842163085938, + "logps/rejected": -189.41738891601562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.774092674255371, + "rewards/margins": 1.5130417346954346, + "rewards/rejected": -14.287135124206543, + "step": 3144 + }, + { + "epoch": 2.170432982577195, + "grad_norm": 0.43944424390792847, + "learning_rate": 3.4579976985040283e-06, + "logits/chosen": 3.762627601623535, + "logits/rejected": 3.8866820335388184, + "logps/chosen": -183.8735809326172, + "logps/rejected": -194.85763549804688, + "loss": 0.522, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.565155029296875, + "rewards/margins": 1.1901932954788208, + "rewards/rejected": -14.755349159240723, + "step": 3145 + }, + { + "epoch": 2.171122994652406, + "grad_norm": 0.33021414279937744, + "learning_rate": 3.4551208285385502e-06, + "logits/chosen": 3.802645683288574, + "logits/rejected": 3.9105069637298584, + "logps/chosen": -168.29754638671875, + "logps/rejected": -186.26416015625, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.984456062316895, + "rewards/margins": 1.814997911453247, + "rewards/rejected": -13.799454689025879, + "step": 3146 + }, + { + "epoch": 2.171813006727618, + "grad_norm": 0.32856303453445435, + "learning_rate": 3.4522439585730726e-06, + "logits/chosen": 3.399874687194824, + "logits/rejected": 3.4370737075805664, + "logps/chosen": -157.51187133789062, + "logps/rejected": -168.53482055664062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.92734432220459, + "rewards/margins": 1.124921202659607, + "rewards/rejected": -12.052265167236328, + "step": 3147 + }, + { + "epoch": 2.172503018802829, + "grad_norm": 1.4482165575027466, + "learning_rate": 3.4493670886075954e-06, + "logits/chosen": 3.738440990447998, + "logits/rejected": 3.7829904556274414, + "logps/chosen": -166.38307189941406, + "logps/rejected": -170.34735107421875, + "loss": 0.6135, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.737079620361328, + "rewards/margins": 0.357464075088501, + "rewards/rejected": -12.094544410705566, + "step": 3148 + }, + { + "epoch": 2.1731930308780405, + "grad_norm": 2.5614330768585205, + "learning_rate": 3.4464902186421177e-06, + "logits/chosen": 3.580822467803955, + "logits/rejected": 3.953666925430298, + "logps/chosen": -173.70443725585938, + "logps/rejected": -190.28756713867188, + "loss": 0.4659, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.501182556152344, + "rewards/margins": 1.6835880279541016, + "rewards/rejected": -14.184769630432129, + "step": 3149 + }, + { + "epoch": 2.1738830429532516, + "grad_norm": 0.3330516815185547, + "learning_rate": 3.44361334867664e-06, + "logits/chosen": 3.553788661956787, + "logits/rejected": 3.553788661956787, + "logps/chosen": -175.89781188964844, + "logps/rejected": -175.89781188964844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.83427906036377, + "rewards/margins": 0.0, + "rewards/rejected": -12.83427906036377, + "step": 3150 + }, + { + "epoch": 2.174573055028463, + "grad_norm": 0.3176749646663666, + "learning_rate": 3.4407364787111624e-06, + "logits/chosen": 3.7706315517425537, + "logits/rejected": 3.8714020252227783, + "logps/chosen": -180.9268341064453, + "logps/rejected": -188.28897094726562, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.454341888427734, + "rewards/margins": 0.7554368376731873, + "rewards/rejected": -14.20977783203125, + "step": 3151 + }, + { + "epoch": 2.1752630671036743, + "grad_norm": 12.148361206054688, + "learning_rate": 3.437859608745685e-06, + "logits/chosen": 3.5661275386810303, + "logits/rejected": 3.54634690284729, + "logps/chosen": -172.98568725585938, + "logps/rejected": -172.72206115722656, + "loss": 0.7031, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.570670127868652, + "rewards/margins": -0.019113779067993164, + "rewards/rejected": -12.551556587219238, + "step": 3152 + }, + { + "epoch": 2.1759530791788855, + "grad_norm": 0.28889453411102295, + "learning_rate": 3.4349827387802076e-06, + "logits/chosen": 3.6170833110809326, + "logits/rejected": 3.763559341430664, + "logps/chosen": -188.708251953125, + "logps/rejected": -196.7496337890625, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.059881210327148, + "rewards/margins": 0.8213061094284058, + "rewards/rejected": -14.881187438964844, + "step": 3153 + }, + { + "epoch": 2.176643091254097, + "grad_norm": 0.7476423978805542, + "learning_rate": 3.4321058688147295e-06, + "logits/chosen": 3.5089592933654785, + "logits/rejected": 3.7786178588867188, + "logps/chosen": -179.7866973876953, + "logps/rejected": -190.63485717773438, + "loss": 0.5237, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.222639083862305, + "rewards/margins": 1.0426422357559204, + "rewards/rejected": -14.265280723571777, + "step": 3154 + }, + { + "epoch": 2.177333103329308, + "grad_norm": 0.28852754831314087, + "learning_rate": 3.4292289988492523e-06, + "logits/chosen": 4.096513748168945, + "logits/rejected": 4.096513748168945, + "logps/chosen": -188.86953735351562, + "logps/rejected": -188.86953735351562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.170398712158203, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.170398712158203, + "step": 3155 + }, + { + "epoch": 2.1780231154045198, + "grad_norm": 0.31402787566185, + "learning_rate": 3.4263521288837746e-06, + "logits/chosen": 3.5252835750579834, + "logits/rejected": 3.4686615467071533, + "logps/chosen": -172.14810180664062, + "logps/rejected": -189.33773803710938, + "loss": 0.5207, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.440088272094727, + "rewards/margins": 1.7619389295578003, + "rewards/rejected": -14.202028274536133, + "step": 3156 + }, + { + "epoch": 2.178713127479731, + "grad_norm": 0.3663617968559265, + "learning_rate": 3.4234752589182974e-06, + "logits/chosen": 3.4783639907836914, + "logits/rejected": 3.5831682682037354, + "logps/chosen": -169.6768035888672, + "logps/rejected": -182.74765014648438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.222358703613281, + "rewards/margins": 1.3252917528152466, + "rewards/rejected": -13.547651290893555, + "step": 3157 + }, + { + "epoch": 2.179403139554942, + "grad_norm": 0.4120608866214752, + "learning_rate": 3.4205983889528193e-06, + "logits/chosen": 3.325336217880249, + "logits/rejected": 3.325336217880249, + "logps/chosen": -172.29254150390625, + "logps/rejected": -172.29254150390625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.186626434326172, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.186626434326172, + "step": 3158 + }, + { + "epoch": 2.1800931516301536, + "grad_norm": 0.32529497146606445, + "learning_rate": 3.417721518987342e-06, + "logits/chosen": 3.3695244789123535, + "logits/rejected": 3.463054656982422, + "logps/chosen": -148.05117797851562, + "logps/rejected": -156.41378784179688, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.17968463897705, + "rewards/margins": 0.8422024250030518, + "rewards/rejected": -11.02188777923584, + "step": 3159 + }, + { + "epoch": 2.1807831637053647, + "grad_norm": 0.27453291416168213, + "learning_rate": 3.4148446490218644e-06, + "logits/chosen": 3.660276412963867, + "logits/rejected": 3.660276412963867, + "logps/chosen": -183.29905700683594, + "logps/rejected": -183.29905700683594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.413262367248535, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.413262367248535, + "step": 3160 + }, + { + "epoch": 2.1814731757805763, + "grad_norm": 0.2906400263309479, + "learning_rate": 3.411967779056387e-06, + "logits/chosen": 3.3648440837860107, + "logits/rejected": 3.5616097450256348, + "logps/chosen": -175.9322052001953, + "logps/rejected": -187.97482299804688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.839240074157715, + "rewards/margins": 1.2276456356048584, + "rewards/rejected": -14.066884994506836, + "step": 3161 + }, + { + "epoch": 2.1821631878557874, + "grad_norm": 0.32265928387641907, + "learning_rate": 3.409090909090909e-06, + "logits/chosen": 3.3774454593658447, + "logits/rejected": 3.4574472904205322, + "logps/chosen": -177.9739990234375, + "logps/rejected": -186.78472900390625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.104157447814941, + "rewards/margins": 0.8474869728088379, + "rewards/rejected": -13.951643943786621, + "step": 3162 + }, + { + "epoch": 2.1828531999309986, + "grad_norm": 0.46851783990859985, + "learning_rate": 3.4062140391254315e-06, + "logits/chosen": 3.2190446853637695, + "logits/rejected": 3.236888885498047, + "logps/chosen": -141.31298828125, + "logps/rejected": -163.04592895507812, + "loss": 0.5223, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.444993019104004, + "rewards/margins": 2.1134071350097656, + "rewards/rejected": -11.558401107788086, + "step": 3163 + }, + { + "epoch": 2.18354321200621, + "grad_norm": 0.3210011124610901, + "learning_rate": 3.4033371691599543e-06, + "logits/chosen": 3.4742493629455566, + "logits/rejected": 3.4742493629455566, + "logps/chosen": -193.56365966796875, + "logps/rejected": -193.56365966796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.522873878479004, + "rewards/margins": 0.0, + "rewards/rejected": -14.522873878479004, + "step": 3164 + }, + { + "epoch": 2.1842332240814213, + "grad_norm": 0.42957308888435364, + "learning_rate": 3.400460299194477e-06, + "logits/chosen": 3.2047832012176514, + "logits/rejected": 3.3262600898742676, + "logps/chosen": -175.65646362304688, + "logps/rejected": -199.0526123046875, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.872633934020996, + "rewards/margins": 2.265669345855713, + "rewards/rejected": -15.138303756713867, + "step": 3165 + }, + { + "epoch": 2.184923236156633, + "grad_norm": 0.338561087846756, + "learning_rate": 3.397583429228999e-06, + "logits/chosen": 3.7111082077026367, + "logits/rejected": 3.799856185913086, + "logps/chosen": -163.2254638671875, + "logps/rejected": -173.2447509765625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.570826530456543, + "rewards/margins": 0.9916206002235413, + "rewards/rejected": -12.562446594238281, + "step": 3166 + }, + { + "epoch": 2.185613248231844, + "grad_norm": 8.764653205871582, + "learning_rate": 3.3947065592635213e-06, + "logits/chosen": 3.5855650901794434, + "logits/rejected": 3.548560857772827, + "logps/chosen": -177.43618774414062, + "logps/rejected": -176.13804626464844, + "loss": 0.778, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.960204124450684, + "rewards/margins": -0.13494443893432617, + "rewards/rejected": -12.8252592086792, + "step": 3167 + }, + { + "epoch": 2.1863032603070556, + "grad_norm": 0.2825620174407959, + "learning_rate": 3.391829689298044e-06, + "logits/chosen": 3.3035545349121094, + "logits/rejected": 3.490102767944336, + "logps/chosen": -164.275146484375, + "logps/rejected": -181.48744201660156, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.75269603729248, + "rewards/margins": 1.696144700050354, + "rewards/rejected": -13.448841094970703, + "step": 3168 + }, + { + "epoch": 2.1869932723822667, + "grad_norm": 0.2915877401828766, + "learning_rate": 3.3889528193325664e-06, + "logits/chosen": 3.209472417831421, + "logits/rejected": 3.3724100589752197, + "logps/chosen": -162.4218292236328, + "logps/rejected": -183.68722534179688, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.347494125366211, + "rewards/margins": 2.138486862182617, + "rewards/rejected": -13.485980987548828, + "step": 3169 + }, + { + "epoch": 2.187683284457478, + "grad_norm": 0.43122029304504395, + "learning_rate": 3.386075949367089e-06, + "logits/chosen": 3.0818395614624023, + "logits/rejected": 3.2730939388275146, + "logps/chosen": -170.53477478027344, + "logps/rejected": -189.3668670654297, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.115949630737305, + "rewards/margins": 1.8295179605484009, + "rewards/rejected": -13.945467948913574, + "step": 3170 + }, + { + "epoch": 2.1883732965326894, + "grad_norm": 0.3714683949947357, + "learning_rate": 3.383199079401611e-06, + "logits/chosen": 3.5465078353881836, + "logits/rejected": 3.5465078353881836, + "logps/chosen": -177.41221618652344, + "logps/rejected": -177.41221618652344, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.136432647705078, + "rewards/margins": 0.0, + "rewards/rejected": -13.136432647705078, + "step": 3171 + }, + { + "epoch": 2.1890633086079005, + "grad_norm": 0.25808772444725037, + "learning_rate": 3.380322209436134e-06, + "logits/chosen": 3.441544532775879, + "logits/rejected": 3.441544532775879, + "logps/chosen": -167.43283081054688, + "logps/rejected": -167.43283081054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.85025405883789, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -11.85025405883789, + "step": 3172 + }, + { + "epoch": 2.189753320683112, + "grad_norm": 0.31898805499076843, + "learning_rate": 3.3774453394706563e-06, + "logits/chosen": 3.4582958221435547, + "logits/rejected": 3.4606058597564697, + "logps/chosen": -167.08944702148438, + "logps/rejected": -178.42108154296875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.919288635253906, + "rewards/margins": 1.2364007234573364, + "rewards/rejected": -13.155688285827637, + "step": 3173 + }, + { + "epoch": 2.1904433327583233, + "grad_norm": 0.2634897828102112, + "learning_rate": 3.374568469505179e-06, + "logits/chosen": 3.4169795513153076, + "logits/rejected": 3.5736236572265625, + "logps/chosen": -160.5150909423828, + "logps/rejected": -181.6232147216797, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.207621574401855, + "rewards/margins": 2.1483545303344727, + "rewards/rejected": -13.355976104736328, + "step": 3174 + }, + { + "epoch": 2.1911333448335344, + "grad_norm": 0.4012005031108856, + "learning_rate": 3.371691599539701e-06, + "logits/chosen": 3.5115468502044678, + "logits/rejected": 3.4582102298736572, + "logps/chosen": -166.42138671875, + "logps/rejected": -181.70578002929688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.825571060180664, + "rewards/margins": 1.538356065750122, + "rewards/rejected": -13.363926887512207, + "step": 3175 + }, + { + "epoch": 2.191823356908746, + "grad_norm": 0.38421744108200073, + "learning_rate": 3.3688147295742233e-06, + "logits/chosen": 3.527689218521118, + "logits/rejected": 3.527689218521118, + "logps/chosen": -161.63671875, + "logps/rejected": -161.63671875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.325911521911621, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.325911521911621, + "step": 3176 + }, + { + "epoch": 2.192513368983957, + "grad_norm": 2.581186532974243, + "learning_rate": 3.365937859608746e-06, + "logits/chosen": 3.545806646347046, + "logits/rejected": 3.6612632274627686, + "logps/chosen": -170.59637451171875, + "logps/rejected": -182.0271759033203, + "loss": 0.5423, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.359885215759277, + "rewards/margins": 1.072931170463562, + "rewards/rejected": -13.432816505432129, + "step": 3177 + }, + { + "epoch": 2.1932033810591687, + "grad_norm": 0.27799591422080994, + "learning_rate": 3.3630609896432685e-06, + "logits/chosen": 3.3308329582214355, + "logits/rejected": 3.3308329582214355, + "logps/chosen": -157.43240356445312, + "logps/rejected": -157.43240356445312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.083518028259277, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -11.083518981933594, + "step": 3178 + }, + { + "epoch": 2.19389339313438, + "grad_norm": 10.703088760375977, + "learning_rate": 3.360184119677791e-06, + "logits/chosen": 3.1909523010253906, + "logits/rejected": 3.0869219303131104, + "logps/chosen": -156.47796630859375, + "logps/rejected": -167.22384643554688, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.672289848327637, + "rewards/margins": 1.1163021326065063, + "rewards/rejected": -11.788592338562012, + "step": 3179 + }, + { + "epoch": 2.194583405209591, + "grad_norm": 0.31571874022483826, + "learning_rate": 3.357307249712313e-06, + "logits/chosen": 3.5642919540405273, + "logits/rejected": 3.5642919540405273, + "logps/chosen": -182.09078979492188, + "logps/rejected": -182.09078979492188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.335760116577148, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.335760116577148, + "step": 3180 + }, + { + "epoch": 2.1952734172848025, + "grad_norm": 0.37871530652046204, + "learning_rate": 3.354430379746836e-06, + "logits/chosen": 3.51523494720459, + "logits/rejected": 3.51523494720459, + "logps/chosen": -167.15553283691406, + "logps/rejected": -167.15553283691406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.018054962158203, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.018054962158203, + "step": 3181 + }, + { + "epoch": 2.1959634293600137, + "grad_norm": 0.3527064323425293, + "learning_rate": 3.3515535097813583e-06, + "logits/chosen": 3.30087947845459, + "logits/rejected": 3.30087947845459, + "logps/chosen": -154.65951538085938, + "logps/rejected": -154.65951538085938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.601033210754395, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -10.601034164428711, + "step": 3182 + }, + { + "epoch": 2.1966534414352252, + "grad_norm": 0.2929668724536896, + "learning_rate": 3.3486766398158802e-06, + "logits/chosen": 3.596827983856201, + "logits/rejected": 3.6687464714050293, + "logps/chosen": -167.98338317871094, + "logps/rejected": -179.04119873046875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.963802337646484, + "rewards/margins": 1.0893343687057495, + "rewards/rejected": -13.053136825561523, + "step": 3183 + }, + { + "epoch": 2.1973434535104364, + "grad_norm": 0.27806392312049866, + "learning_rate": 3.345799769850403e-06, + "logits/chosen": 3.7688684463500977, + "logits/rejected": 3.7688684463500977, + "logps/chosen": -192.87681579589844, + "logps/rejected": -192.87681579589844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.276277542114258, + "rewards/margins": 0.0, + "rewards/rejected": -14.276277542114258, + "step": 3184 + }, + { + "epoch": 2.198033465585648, + "grad_norm": 0.2952990233898163, + "learning_rate": 3.3429228998849258e-06, + "logits/chosen": 3.5204474925994873, + "logits/rejected": 3.5204474925994873, + "logps/chosen": -155.940673828125, + "logps/rejected": -155.940673828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.759932518005371, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -10.759932518005371, + "step": 3185 + }, + { + "epoch": 2.198723477660859, + "grad_norm": 0.301400750875473, + "learning_rate": 3.340046029919448e-06, + "logits/chosen": 3.770859956741333, + "logits/rejected": 3.8934943675994873, + "logps/chosen": -169.69825744628906, + "logps/rejected": -181.55799865722656, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.918951988220215, + "rewards/margins": 1.202359676361084, + "rewards/rejected": -13.121312141418457, + "step": 3186 + }, + { + "epoch": 2.19941348973607, + "grad_norm": 49.80668640136719, + "learning_rate": 3.33716915995397e-06, + "logits/chosen": 3.289109230041504, + "logits/rejected": 3.20500111579895, + "logps/chosen": -150.74778747558594, + "logps/rejected": -147.36306762695312, + "loss": 0.9843, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.42403793334961, + "rewards/margins": -0.37160414457321167, + "rewards/rejected": -10.052433013916016, + "step": 3187 + }, + { + "epoch": 2.200103501811282, + "grad_norm": 0.47239983081817627, + "learning_rate": 3.334292289988493e-06, + "logits/chosen": 3.3127281665802, + "logits/rejected": 3.5383002758026123, + "logps/chosen": -154.66607666015625, + "logps/rejected": -169.33096313476562, + "loss": 0.5226, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.887622833251953, + "rewards/margins": 1.4620702266693115, + "rewards/rejected": -12.349693298339844, + "step": 3188 + }, + { + "epoch": 2.200793513886493, + "grad_norm": 0.3184202313423157, + "learning_rate": 3.331415420023015e-06, + "logits/chosen": 3.291865348815918, + "logits/rejected": 3.3909945487976074, + "logps/chosen": -177.87786865234375, + "logps/rejected": -187.8731231689453, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.879013061523438, + "rewards/margins": 1.0413455963134766, + "rewards/rejected": -13.920358657836914, + "step": 3189 + }, + { + "epoch": 2.2014835259617045, + "grad_norm": 0.5244048833847046, + "learning_rate": 3.328538550057538e-06, + "logits/chosen": 3.2596988677978516, + "logits/rejected": 3.2596988677978516, + "logps/chosen": -156.68055725097656, + "logps/rejected": -156.68055725097656, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.899811744689941, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -10.899812698364258, + "step": 3190 + }, + { + "epoch": 2.2021735380369156, + "grad_norm": 0.30334317684173584, + "learning_rate": 3.32566168009206e-06, + "logits/chosen": 3.750730276107788, + "logits/rejected": 3.750730276107788, + "logps/chosen": -170.62860107421875, + "logps/rejected": -170.62860107421875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.378670692443848, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.378670692443848, + "step": 3191 + }, + { + "epoch": 2.2028635501121268, + "grad_norm": 0.39568397402763367, + "learning_rate": 3.3227848101265827e-06, + "logits/chosen": 3.3956289291381836, + "logits/rejected": 3.3956289291381836, + "logps/chosen": -178.45159912109375, + "logps/rejected": -178.45159912109375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.888964653015137, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.888964653015137, + "step": 3192 + }, + { + "epoch": 2.2035535621873383, + "grad_norm": 0.3214215934276581, + "learning_rate": 3.319907940161105e-06, + "logits/chosen": 3.532059669494629, + "logits/rejected": 3.5568904876708984, + "logps/chosen": -146.8870086669922, + "logps/rejected": -166.73817443847656, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.890363693237305, + "rewards/margins": 2.02364182472229, + "rewards/rejected": -11.914005279541016, + "step": 3193 + }, + { + "epoch": 2.2042435742625495, + "grad_norm": 0.36575984954833984, + "learning_rate": 3.3170310701956278e-06, + "logits/chosen": 3.392570972442627, + "logits/rejected": 3.5263798236846924, + "logps/chosen": -157.56887817382812, + "logps/rejected": -162.79205322265625, + "loss": 0.6081, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.85721206665039, + "rewards/margins": 0.5440071225166321, + "rewards/rejected": -11.40121841430664, + "step": 3194 + }, + { + "epoch": 2.204933586337761, + "grad_norm": 0.37982290983200073, + "learning_rate": 3.3141542002301497e-06, + "logits/chosen": 3.1929359436035156, + "logits/rejected": 3.1929359436035156, + "logps/chosen": -163.5899658203125, + "logps/rejected": -163.5899658203125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.672637939453125, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.672637939453125, + "step": 3195 + }, + { + "epoch": 2.205623598412972, + "grad_norm": 1.0135974884033203, + "learning_rate": 3.311277330264672e-06, + "logits/chosen": 3.0734312534332275, + "logits/rejected": 3.0912961959838867, + "logps/chosen": -135.2403564453125, + "logps/rejected": -148.14605712890625, + "loss": 0.5271, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.79236888885498, + "rewards/margins": 1.3416649103164673, + "rewards/rejected": -10.134034156799316, + "step": 3196 + }, + { + "epoch": 2.2063136104881833, + "grad_norm": 0.3054846227169037, + "learning_rate": 3.308400460299195e-06, + "logits/chosen": 3.231231451034546, + "logits/rejected": 3.2221693992614746, + "logps/chosen": -171.2197723388672, + "logps/rejected": -183.6493682861328, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.255992889404297, + "rewards/margins": 1.2627406120300293, + "rewards/rejected": -13.518733978271484, + "step": 3197 + }, + { + "epoch": 2.207003622563395, + "grad_norm": 0.47892826795578003, + "learning_rate": 3.305523590333717e-06, + "logits/chosen": 3.380894660949707, + "logits/rejected": 3.4038219451904297, + "logps/chosen": -150.26177978515625, + "logps/rejected": -166.18753051757812, + "loss": 0.5217, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.20768928527832, + "rewards/margins": 1.5530939102172852, + "rewards/rejected": -11.760783195495605, + "step": 3198 + }, + { + "epoch": 2.207693634638606, + "grad_norm": 12.44989013671875, + "learning_rate": 3.3026467203682395e-06, + "logits/chosen": 3.2136027812957764, + "logits/rejected": 3.2981693744659424, + "logps/chosen": -151.21017456054688, + "logps/rejected": -146.99285888671875, + "loss": 0.9808, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.398408889770508, + "rewards/margins": -0.367848664522171, + "rewards/rejected": -10.030559539794922, + "step": 3199 + }, + { + "epoch": 2.2083836467138176, + "grad_norm": 0.29786887764930725, + "learning_rate": 3.299769850402762e-06, + "logits/chosen": 3.218231201171875, + "logits/rejected": 3.3732948303222656, + "logps/chosen": -153.24484252929688, + "logps/rejected": -162.9620361328125, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.617745399475098, + "rewards/margins": 0.9310353994369507, + "rewards/rejected": -11.54878044128418, + "step": 3200 + }, + { + "epoch": 2.2090736587890287, + "grad_norm": 0.28743746876716614, + "learning_rate": 3.2968929804372847e-06, + "logits/chosen": 3.285762071609497, + "logits/rejected": 3.6202099323272705, + "logps/chosen": -150.82586669921875, + "logps/rejected": -173.27584838867188, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.283143997192383, + "rewards/margins": 2.160126209259033, + "rewards/rejected": -12.443270683288574, + "step": 3201 + }, + { + "epoch": 2.2097636708642403, + "grad_norm": 0.2962949872016907, + "learning_rate": 3.294016110471807e-06, + "logits/chosen": 3.2231969833374023, + "logits/rejected": 3.5111145973205566, + "logps/chosen": -130.64027404785156, + "logps/rejected": -153.52743530273438, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.46548843383789, + "rewards/margins": 2.264071226119995, + "rewards/rejected": -10.729558944702148, + "step": 3202 + }, + { + "epoch": 2.2104536829394514, + "grad_norm": 0.3534530699253082, + "learning_rate": 3.2911392405063294e-06, + "logits/chosen": 3.0574777126312256, + "logits/rejected": 3.3154189586639404, + "logps/chosen": -171.36166381835938, + "logps/rejected": -184.7093048095703, + "loss": 0.5209, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.20143985748291, + "rewards/margins": 1.3780734539031982, + "rewards/rejected": -13.579513549804688, + "step": 3203 + }, + { + "epoch": 2.2111436950146626, + "grad_norm": 0.332332968711853, + "learning_rate": 3.2882623705408517e-06, + "logits/chosen": 3.3512113094329834, + "logits/rejected": 3.3512113094329834, + "logps/chosen": -160.42111206054688, + "logps/rejected": -160.42111206054688, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.378311157226562, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -11.378313064575195, + "step": 3204 + }, + { + "epoch": 2.211833707089874, + "grad_norm": 14.024856567382812, + "learning_rate": 3.2853855005753745e-06, + "logits/chosen": 3.0746779441833496, + "logits/rejected": 3.3195462226867676, + "logps/chosen": -141.46485900878906, + "logps/rejected": -168.3465118408203, + "loss": 0.4975, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.42010498046875, + "rewards/margins": 2.6129517555236816, + "rewards/rejected": -12.03305721282959, + "step": 3205 + }, + { + "epoch": 2.2125237191650853, + "grad_norm": 0.28266897797584534, + "learning_rate": 3.282508630609897e-06, + "logits/chosen": 3.0475990772247314, + "logits/rejected": 3.205796480178833, + "logps/chosen": -166.88714599609375, + "logps/rejected": -175.31344604492188, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.944452285766602, + "rewards/margins": 0.8669204711914062, + "rewards/rejected": -12.811372756958008, + "step": 3206 + }, + { + "epoch": 2.213213731240297, + "grad_norm": 0.3625290095806122, + "learning_rate": 3.2796317606444188e-06, + "logits/chosen": 3.3107123374938965, + "logits/rejected": 3.3107123374938965, + "logps/chosen": -175.32696533203125, + "logps/rejected": -175.32699584960938, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.021867752075195, + "rewards/margins": 7.152557373046875e-07, + "rewards/rejected": -13.021868705749512, + "step": 3207 + }, + { + "epoch": 2.213903743315508, + "grad_norm": 0.36133384704589844, + "learning_rate": 3.2767548906789415e-06, + "logits/chosen": 3.2772371768951416, + "logits/rejected": 3.2772371768951416, + "logps/chosen": -184.0084228515625, + "logps/rejected": -184.0084228515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.617753028869629, + "rewards/margins": 0.0, + "rewards/rejected": -13.617753028869629, + "step": 3208 + }, + { + "epoch": 2.2145937553907196, + "grad_norm": 0.3188905417919159, + "learning_rate": 3.273878020713464e-06, + "logits/chosen": 3.4163053035736084, + "logits/rejected": 3.5089612007141113, + "logps/chosen": -152.29730224609375, + "logps/rejected": -167.54942321777344, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.518486022949219, + "rewards/margins": 1.5565569400787354, + "rewards/rejected": -12.075043678283691, + "step": 3209 + }, + { + "epoch": 2.2152837674659307, + "grad_norm": 0.34306472539901733, + "learning_rate": 3.2710011507479867e-06, + "logits/chosen": 3.3911173343658447, + "logits/rejected": 3.3911173343658447, + "logps/chosen": -180.59011840820312, + "logps/rejected": -180.59011840820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.272510528564453, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.272509574890137, + "step": 3210 + }, + { + "epoch": 2.215973779541142, + "grad_norm": 0.3905327618122101, + "learning_rate": 3.2681242807825086e-06, + "logits/chosen": 3.4691286087036133, + "logits/rejected": 3.4502224922180176, + "logps/chosen": -161.9783935546875, + "logps/rejected": -167.93040466308594, + "loss": 0.6082, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.518082618713379, + "rewards/margins": 0.5357064604759216, + "rewards/rejected": -12.053789138793945, + "step": 3211 + }, + { + "epoch": 2.2166637916163534, + "grad_norm": 4.794855117797852, + "learning_rate": 3.2652474108170314e-06, + "logits/chosen": 3.158869504928589, + "logits/rejected": 3.184873104095459, + "logps/chosen": -143.9255828857422, + "logps/rejected": -148.1915283203125, + "loss": 0.5589, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.537846565246582, + "rewards/margins": 0.46229666471481323, + "rewards/rejected": -10.000144004821777, + "step": 3212 + }, + { + "epoch": 2.2173538036915645, + "grad_norm": 0.3512531518936157, + "learning_rate": 3.2623705408515537e-06, + "logits/chosen": 3.4002723693847656, + "logits/rejected": 3.469578504562378, + "logps/chosen": -133.80563354492188, + "logps/rejected": -157.7452850341797, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.701666831970215, + "rewards/margins": 2.356778860092163, + "rewards/rejected": -11.058445930480957, + "step": 3213 + }, + { + "epoch": 2.218043815766776, + "grad_norm": 0.4261552691459656, + "learning_rate": 3.2594936708860765e-06, + "logits/chosen": 3.1299571990966797, + "logits/rejected": 3.3257381916046143, + "logps/chosen": -139.71421813964844, + "logps/rejected": -164.17686462402344, + "loss": 0.4346, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.027080535888672, + "rewards/margins": 2.4701290130615234, + "rewards/rejected": -11.497209548950195, + "step": 3214 + }, + { + "epoch": 2.2187338278419872, + "grad_norm": 0.391846626996994, + "learning_rate": 3.2566168009205984e-06, + "logits/chosen": 3.407115936279297, + "logits/rejected": 3.407115936279297, + "logps/chosen": -164.76025390625, + "logps/rejected": -164.76026916503906, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.879518508911133, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -11.879518508911133, + "step": 3215 + }, + { + "epoch": 2.2194238399171984, + "grad_norm": 0.32940295338630676, + "learning_rate": 3.2537399309551208e-06, + "logits/chosen": 3.1463823318481445, + "logits/rejected": 3.2249531745910645, + "logps/chosen": -181.84817504882812, + "logps/rejected": -188.97645568847656, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.377756118774414, + "rewards/margins": 0.6801124215126038, + "rewards/rejected": -14.057868957519531, + "step": 3216 + }, + { + "epoch": 2.22011385199241, + "grad_norm": 0.3437933027744293, + "learning_rate": 3.2508630609896436e-06, + "logits/chosen": 3.428839683532715, + "logits/rejected": 3.428839683532715, + "logps/chosen": -177.34033203125, + "logps/rejected": -177.34033203125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.050537109375, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.050538063049316, + "step": 3217 + }, + { + "epoch": 2.220803864067621, + "grad_norm": 0.3308676779270172, + "learning_rate": 3.2479861910241663e-06, + "logits/chosen": 3.2220163345336914, + "logits/rejected": 3.254377603530884, + "logps/chosen": -147.98582458496094, + "logps/rejected": -164.82595825195312, + "loss": 0.5206, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.281913757324219, + "rewards/margins": 1.6670677661895752, + "rewards/rejected": -11.948982238769531, + "step": 3218 + }, + { + "epoch": 2.2214938761428327, + "grad_norm": 0.3446705937385559, + "learning_rate": 3.2451093210586883e-06, + "logits/chosen": 3.0467443466186523, + "logits/rejected": 3.0467443466186523, + "logps/chosen": -164.68722534179688, + "logps/rejected": -164.68722534179688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.862144470214844, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -11.862144470214844, + "step": 3219 + }, + { + "epoch": 2.222183888218044, + "grad_norm": 0.3052861988544464, + "learning_rate": 3.2422324510932106e-06, + "logits/chosen": 3.240353584289551, + "logits/rejected": 3.561736583709717, + "logps/chosen": -137.8028106689453, + "logps/rejected": -164.924072265625, + "loss": 0.4346, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.990215301513672, + "rewards/margins": 2.6886353492736816, + "rewards/rejected": -11.678851127624512, + "step": 3220 + }, + { + "epoch": 2.222873900293255, + "grad_norm": 0.33807680010795593, + "learning_rate": 3.2393555811277334e-06, + "logits/chosen": 3.3724887371063232, + "logits/rejected": 3.3724887371063232, + "logps/chosen": -182.2825469970703, + "logps/rejected": -182.2825469970703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.479042053222656, + "rewards/margins": 0.0, + "rewards/rejected": -13.479042053222656, + "step": 3221 + }, + { + "epoch": 2.2235639123684665, + "grad_norm": 0.39971715211868286, + "learning_rate": 3.2364787111622557e-06, + "logits/chosen": 3.2031357288360596, + "logits/rejected": 3.3527708053588867, + "logps/chosen": -138.14468383789062, + "logps/rejected": -163.08740234375, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.067831039428711, + "rewards/margins": 2.5343730449676514, + "rewards/rejected": -11.602203369140625, + "step": 3222 + }, + { + "epoch": 2.2242539244436776, + "grad_norm": 0.3406284749507904, + "learning_rate": 3.2336018411967785e-06, + "logits/chosen": 3.2653450965881348, + "logits/rejected": 3.3336429595947266, + "logps/chosen": -169.71722412109375, + "logps/rejected": -176.6237030029297, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.188287734985352, + "rewards/margins": 0.7123171091079712, + "rewards/rejected": -12.900606155395508, + "step": 3223 + }, + { + "epoch": 2.224943936518889, + "grad_norm": 0.28049492835998535, + "learning_rate": 3.2307249712313004e-06, + "logits/chosen": 3.5975570678710938, + "logits/rejected": 3.6568803787231445, + "logps/chosen": -162.95895385742188, + "logps/rejected": -174.0069580078125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.361223220825195, + "rewards/margins": 1.1195476055145264, + "rewards/rejected": -12.4807710647583, + "step": 3224 + }, + { + "epoch": 2.2256339485941004, + "grad_norm": 0.3991832733154297, + "learning_rate": 3.2278481012658232e-06, + "logits/chosen": 3.552028179168701, + "logits/rejected": 3.552028179168701, + "logps/chosen": -176.921630859375, + "logps/rejected": -176.921630859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.771388053894043, + "rewards/margins": -4.172325134277344e-07, + "rewards/rejected": -12.771388053894043, + "step": 3225 + }, + { + "epoch": 2.226323960669312, + "grad_norm": 11.283940315246582, + "learning_rate": 3.2249712313003456e-06, + "logits/chosen": 3.535529851913452, + "logits/rejected": 3.5176382064819336, + "logps/chosen": -156.60179138183594, + "logps/rejected": -162.03623962402344, + "loss": 1.1804, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.918741226196289, + "rewards/margins": 0.5343104004859924, + "rewards/rejected": -11.453052520751953, + "step": 3226 + }, + { + "epoch": 2.227013972744523, + "grad_norm": 0.3260868787765503, + "learning_rate": 3.2220943613348683e-06, + "logits/chosen": 3.230480194091797, + "logits/rejected": 3.5100622177124023, + "logps/chosen": -137.47093200683594, + "logps/rejected": -177.37362670898438, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.869743347167969, + "rewards/margins": 3.969203472137451, + "rewards/rejected": -12.838947296142578, + "step": 3227 + }, + { + "epoch": 2.227703984819734, + "grad_norm": 0.3462621569633484, + "learning_rate": 3.2192174913693903e-06, + "logits/chosen": 3.865875482559204, + "logits/rejected": 3.865875482559204, + "logps/chosen": -179.35244750976562, + "logps/rejected": -179.35244750976562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.239668846130371, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -13.239667892456055, + "step": 3228 + }, + { + "epoch": 2.2283939968949458, + "grad_norm": 0.48530688881874084, + "learning_rate": 3.2163406214039126e-06, + "logits/chosen": 3.213834762573242, + "logits/rejected": 3.3546957969665527, + "logps/chosen": -170.2274169921875, + "logps/rejected": -175.53262329101562, + "loss": 0.6079, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.233959197998047, + "rewards/margins": 0.5598545670509338, + "rewards/rejected": -12.793811798095703, + "step": 3229 + }, + { + "epoch": 2.229084008970157, + "grad_norm": 0.34218069911003113, + "learning_rate": 3.2134637514384354e-06, + "logits/chosen": 3.323808193206787, + "logits/rejected": 3.323808193206787, + "logps/chosen": -175.19891357421875, + "logps/rejected": -175.19891357421875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.704109191894531, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.704109191894531, + "step": 3230 + }, + { + "epoch": 2.2297740210453685, + "grad_norm": 0.2528841197490692, + "learning_rate": 3.2105868814729578e-06, + "logits/chosen": 3.732083797454834, + "logits/rejected": 4.062394142150879, + "logps/chosen": -157.697998046875, + "logps/rejected": -185.77365112304688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.085638046264648, + "rewards/margins": 2.663952350616455, + "rewards/rejected": -13.749589920043945, + "step": 3231 + }, + { + "epoch": 2.2304640331205796, + "grad_norm": 0.3305690884590149, + "learning_rate": 3.20771001150748e-06, + "logits/chosen": 3.5703372955322266, + "logits/rejected": 3.5839805603027344, + "logps/chosen": -169.23040771484375, + "logps/rejected": -178.8240203857422, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.003662109375, + "rewards/margins": 0.8913246393203735, + "rewards/rejected": -12.894987106323242, + "step": 3232 + }, + { + "epoch": 2.2311540451957907, + "grad_norm": 1.6283562183380127, + "learning_rate": 3.2048331415420025e-06, + "logits/chosen": 3.361968517303467, + "logits/rejected": 3.524806261062622, + "logps/chosen": -156.2862548828125, + "logps/rejected": -170.14447021484375, + "loss": 0.5381, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.750707626342773, + "rewards/margins": 1.3779562711715698, + "rewards/rejected": -12.128664016723633, + "step": 3233 + }, + { + "epoch": 2.2318440572710023, + "grad_norm": 0.3211893141269684, + "learning_rate": 3.2019562715765252e-06, + "logits/chosen": 3.004686117172241, + "logits/rejected": 3.227121114730835, + "logps/chosen": -123.39619445800781, + "logps/rejected": -157.96807861328125, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -7.616551399230957, + "rewards/margins": 3.3735694885253906, + "rewards/rejected": -10.990121841430664, + "step": 3234 + }, + { + "epoch": 2.2325340693462135, + "grad_norm": 1.5450016260147095, + "learning_rate": 3.1990794016110476e-06, + "logits/chosen": 3.216876745223999, + "logits/rejected": 3.260202407836914, + "logps/chosen": -156.54507446289062, + "logps/rejected": -159.84689331054688, + "loss": 0.6131, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.856850624084473, + "rewards/margins": 0.36502528190612793, + "rewards/rejected": -11.22187614440918, + "step": 3235 + }, + { + "epoch": 2.233224081421425, + "grad_norm": 0.3431243300437927, + "learning_rate": 3.1962025316455695e-06, + "logits/chosen": 3.593478202819824, + "logits/rejected": 3.593478202819824, + "logps/chosen": -167.49945068359375, + "logps/rejected": -167.49945068359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.801360130310059, + "rewards/margins": 0.0, + "rewards/rejected": -11.801360130310059, + "step": 3236 + }, + { + "epoch": 2.233914093496636, + "grad_norm": 0.35140880942344666, + "learning_rate": 3.1933256616800923e-06, + "logits/chosen": 3.289346694946289, + "logits/rejected": 3.3524556159973145, + "logps/chosen": -179.58743286132812, + "logps/rejected": -190.16915893554688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.266924858093262, + "rewards/margins": 1.0868358612060547, + "rewards/rejected": -14.353761672973633, + "step": 3237 + }, + { + "epoch": 2.2346041055718473, + "grad_norm": 0.3588889539241791, + "learning_rate": 3.190448791714615e-06, + "logits/chosen": 3.4882099628448486, + "logits/rejected": 3.7377126216888428, + "logps/chosen": -153.8157958984375, + "logps/rejected": -174.81460571289062, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.65540885925293, + "rewards/margins": 2.1253674030303955, + "rewards/rejected": -12.780776977539062, + "step": 3238 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 1.7610490322113037, + "learning_rate": 3.1875719217491374e-06, + "logits/chosen": 3.542273759841919, + "logits/rejected": 3.565218448638916, + "logps/chosen": -166.86981201171875, + "logps/rejected": -170.03085327148438, + "loss": 0.6125, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.874838829040527, + "rewards/margins": 0.3773077726364136, + "rewards/rejected": -12.25214672088623, + "step": 3239 + }, + { + "epoch": 2.23598412972227, + "grad_norm": 0.4059305489063263, + "learning_rate": 3.1846950517836593e-06, + "logits/chosen": 3.5074055194854736, + "logits/rejected": 3.5074055194854736, + "logps/chosen": -158.07054138183594, + "logps/rejected": -158.07054138183594, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.880258560180664, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.880257606506348, + "step": 3240 + }, + { + "epoch": 2.2366741417974816, + "grad_norm": 0.369198739528656, + "learning_rate": 3.181818181818182e-06, + "logits/chosen": 3.242260456085205, + "logits/rejected": 3.242260456085205, + "logps/chosen": -159.4489288330078, + "logps/rejected": -159.4489288330078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.1469087600708, + "rewards/margins": 0.0, + "rewards/rejected": -11.1469087600708, + "step": 3241 + }, + { + "epoch": 2.2373641538726927, + "grad_norm": 0.330642431974411, + "learning_rate": 3.1789413118527045e-06, + "logits/chosen": 3.0038669109344482, + "logits/rejected": 3.5337178707122803, + "logps/chosen": -160.07093811035156, + "logps/rejected": -188.06802368164062, + "loss": 0.4356, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.245170593261719, + "rewards/margins": 2.7777328491210938, + "rewards/rejected": -14.022903442382812, + "step": 3242 + }, + { + "epoch": 2.2380541659479043, + "grad_norm": 0.31806519627571106, + "learning_rate": 3.1760644418872272e-06, + "logits/chosen": 3.6126770973205566, + "logits/rejected": 3.794879913330078, + "logps/chosen": -164.90447998046875, + "logps/rejected": -177.06088256835938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.66777515411377, + "rewards/margins": 1.2019367218017578, + "rewards/rejected": -12.869710922241211, + "step": 3243 + }, + { + "epoch": 2.2387441780231154, + "grad_norm": 8.740852355957031, + "learning_rate": 3.173187571921749e-06, + "logits/chosen": 3.5547235012054443, + "logits/rejected": 3.5989809036254883, + "logps/chosen": -159.96995544433594, + "logps/rejected": -160.4023895263672, + "loss": 0.6768, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.375435829162598, + "rewards/margins": 0.03522920608520508, + "rewards/rejected": -11.410665512084961, + "step": 3244 + }, + { + "epoch": 2.2394341900983266, + "grad_norm": 0.2995139956474304, + "learning_rate": 3.170310701956272e-06, + "logits/chosen": 3.427511215209961, + "logits/rejected": 3.578636646270752, + "logps/chosen": -164.8604278564453, + "logps/rejected": -186.77487182617188, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.733217239379883, + "rewards/margins": 2.084477186203003, + "rewards/rejected": -13.817694664001465, + "step": 3245 + }, + { + "epoch": 2.240124202173538, + "grad_norm": 0.2688613831996918, + "learning_rate": 3.1674338319907943e-06, + "logits/chosen": 3.2491540908813477, + "logits/rejected": 3.6054131984710693, + "logps/chosen": -150.99395751953125, + "logps/rejected": -166.95956420898438, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.192516326904297, + "rewards/margins": 1.6370759010314941, + "rewards/rejected": -11.829591751098633, + "step": 3246 + }, + { + "epoch": 2.2408142142487493, + "grad_norm": 0.4453573524951935, + "learning_rate": 3.164556962025317e-06, + "logits/chosen": 3.1530842781066895, + "logits/rejected": 3.181147575378418, + "logps/chosen": -152.98318481445312, + "logps/rejected": -164.8126220703125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.512391090393066, + "rewards/margins": 1.161292552947998, + "rewards/rejected": -11.673683166503906, + "step": 3247 + }, + { + "epoch": 2.241504226323961, + "grad_norm": 0.3093123435974121, + "learning_rate": 3.161680092059839e-06, + "logits/chosen": 3.234083652496338, + "logits/rejected": 3.332118034362793, + "logps/chosen": -162.42105102539062, + "logps/rejected": -175.71029663085938, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.3035306930542, + "rewards/margins": 1.3453420400619507, + "rewards/rejected": -12.648872375488281, + "step": 3248 + }, + { + "epoch": 2.242194238399172, + "grad_norm": 0.29807206988334656, + "learning_rate": 3.1588032220943614e-06, + "logits/chosen": 3.387840986251831, + "logits/rejected": 3.4809892177581787, + "logps/chosen": -164.6480712890625, + "logps/rejected": -186.025146484375, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.72776985168457, + "rewards/margins": 2.2079944610595703, + "rewards/rejected": -13.93576431274414, + "step": 3249 + }, + { + "epoch": 2.242884250474383, + "grad_norm": 0.3516823947429657, + "learning_rate": 3.155926352128884e-06, + "logits/chosen": 3.1620700359344482, + "logits/rejected": 3.3204379081726074, + "logps/chosen": -158.67469787597656, + "logps/rejected": -167.8087615966797, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.029855728149414, + "rewards/margins": 0.9365963339805603, + "rewards/rejected": -11.966452598571777, + "step": 3250 + }, + { + "epoch": 2.2435742625495947, + "grad_norm": 1.0649452209472656, + "learning_rate": 3.1530494821634065e-06, + "logits/chosen": 3.8866143226623535, + "logits/rejected": 3.859933376312256, + "logps/chosen": -161.90509033203125, + "logps/rejected": -165.06134033203125, + "loss": 0.6156, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.231693267822266, + "rewards/margins": 0.3232119083404541, + "rewards/rejected": -11.554903984069824, + "step": 3251 + }, + { + "epoch": 2.244264274624806, + "grad_norm": 0.2595980763435364, + "learning_rate": 3.150172612197929e-06, + "logits/chosen": 3.078568696975708, + "logits/rejected": 3.1457693576812744, + "logps/chosen": -153.06480407714844, + "logps/rejected": -161.60987854003906, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.598990440368652, + "rewards/margins": 0.8438072204589844, + "rewards/rejected": -11.44279670715332, + "step": 3252 + }, + { + "epoch": 2.2449542867000174, + "grad_norm": 0.3298541009426117, + "learning_rate": 3.147295742232451e-06, + "logits/chosen": 3.2941818237304688, + "logits/rejected": 3.312572717666626, + "logps/chosen": -179.056396484375, + "logps/rejected": -190.8250274658203, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.10116958618164, + "rewards/margins": 1.2208225727081299, + "rewards/rejected": -14.321992874145508, + "step": 3253 + }, + { + "epoch": 2.2456442987752285, + "grad_norm": 0.30723580718040466, + "learning_rate": 3.144418872266974e-06, + "logits/chosen": 3.8198816776275635, + "logits/rejected": 3.8198816776275635, + "logps/chosen": -187.28457641601562, + "logps/rejected": -187.28457641601562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.13399600982666, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -14.133995056152344, + "step": 3254 + }, + { + "epoch": 2.2463343108504397, + "grad_norm": 0.34050801396369934, + "learning_rate": 3.1415420023014963e-06, + "logits/chosen": 3.5777487754821777, + "logits/rejected": 3.6202383041381836, + "logps/chosen": -152.4123992919922, + "logps/rejected": -173.25750732421875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.647698402404785, + "rewards/margins": 1.931276798248291, + "rewards/rejected": -12.578974723815918, + "step": 3255 + }, + { + "epoch": 2.2470243229256512, + "grad_norm": 5.2363505363464355, + "learning_rate": 3.1386651323360182e-06, + "logits/chosen": 3.4403345584869385, + "logits/rejected": 3.5219500064849854, + "logps/chosen": -161.04888916015625, + "logps/rejected": -170.3419189453125, + "loss": 0.5484, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.349357604980469, + "rewards/margins": 0.8611844778060913, + "rewards/rejected": -12.210540771484375, + "step": 3256 + }, + { + "epoch": 2.2477143350008624, + "grad_norm": 1.069831371307373, + "learning_rate": 3.135788262370541e-06, + "logits/chosen": 3.1812233924865723, + "logits/rejected": 3.341430187225342, + "logps/chosen": -156.01065063476562, + "logps/rejected": -173.70346069335938, + "loss": 0.5241, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.984329223632812, + "rewards/margins": 1.7147364616394043, + "rewards/rejected": -12.699066162109375, + "step": 3257 + }, + { + "epoch": 2.248404347076074, + "grad_norm": 0.3654578924179077, + "learning_rate": 3.132911392405064e-06, + "logits/chosen": 3.2260050773620605, + "logits/rejected": 3.263373851776123, + "logps/chosen": -161.2552490234375, + "logps/rejected": -175.5962677001953, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.444560050964355, + "rewards/margins": 1.4585299491882324, + "rewards/rejected": -12.903090476989746, + "step": 3258 + }, + { + "epoch": 2.249094359151285, + "grad_norm": 0.31979086995124817, + "learning_rate": 3.130034522439586e-06, + "logits/chosen": 3.4852747917175293, + "logits/rejected": 3.4852747917175293, + "logps/chosen": -179.6710662841797, + "logps/rejected": -179.6710662841797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.124279022216797, + "rewards/margins": 0.0, + "rewards/rejected": -13.124279022216797, + "step": 3259 + }, + { + "epoch": 2.2497843712264967, + "grad_norm": 4.434333801269531, + "learning_rate": 3.127157652474108e-06, + "logits/chosen": 3.1435070037841797, + "logits/rejected": 3.435502052307129, + "logps/chosen": -152.1072540283203, + "logps/rejected": -165.23239135742188, + "loss": 0.5493, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.493961334228516, + "rewards/margins": 1.2576539516448975, + "rewards/rejected": -11.751615524291992, + "step": 3260 + }, + { + "epoch": 2.250474383301708, + "grad_norm": 0.37729737162590027, + "learning_rate": 3.124280782508631e-06, + "logits/chosen": 3.221302032470703, + "logits/rejected": 3.450850248336792, + "logps/chosen": -147.6201629638672, + "logps/rejected": -177.07595825195312, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.033731460571289, + "rewards/margins": 2.914625644683838, + "rewards/rejected": -12.948356628417969, + "step": 3261 + }, + { + "epoch": 2.251164395376919, + "grad_norm": 0.2771701216697693, + "learning_rate": 3.121403912543153e-06, + "logits/chosen": 3.6145236492156982, + "logits/rejected": 3.697277307510376, + "logps/chosen": -170.62115478515625, + "logps/rejected": -181.63369750976562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.243474960327148, + "rewards/margins": 1.0961637496948242, + "rewards/rejected": -13.339638710021973, + "step": 3262 + }, + { + "epoch": 2.2518544074521305, + "grad_norm": 0.34380432963371277, + "learning_rate": 3.118527042577676e-06, + "logits/chosen": 3.290175199508667, + "logits/rejected": 3.332787036895752, + "logps/chosen": -163.03756713867188, + "logps/rejected": -185.94476318359375, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.534027099609375, + "rewards/margins": 2.305326461791992, + "rewards/rejected": -13.839353561401367, + "step": 3263 + }, + { + "epoch": 2.2525444195273416, + "grad_norm": 0.37194693088531494, + "learning_rate": 3.115650172612198e-06, + "logits/chosen": 3.4286770820617676, + "logits/rejected": 3.4286770820617676, + "logps/chosen": -160.10888671875, + "logps/rejected": -160.10888671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.227303504943848, + "rewards/margins": 0.0, + "rewards/rejected": -11.227303504943848, + "step": 3264 + }, + { + "epoch": 2.253234431602553, + "grad_norm": 4.365417957305908, + "learning_rate": 3.1127733026467207e-06, + "logits/chosen": 3.4465551376342773, + "logits/rejected": 3.5274035930633545, + "logps/chosen": -169.75094604492188, + "logps/rejected": -171.9019775390625, + "loss": 0.6275, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.229179382324219, + "rewards/margins": 0.21224069595336914, + "rewards/rejected": -12.44141960144043, + "step": 3265 + }, + { + "epoch": 2.2539244436777643, + "grad_norm": 0.3068324029445648, + "learning_rate": 3.109896432681243e-06, + "logits/chosen": 3.3532440662384033, + "logits/rejected": 3.585916042327881, + "logps/chosen": -160.77847290039062, + "logps/rejected": -181.79122924804688, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.300227165222168, + "rewards/margins": 2.08034348487854, + "rewards/rejected": -13.380570411682129, + "step": 3266 + }, + { + "epoch": 2.254614455752976, + "grad_norm": 0.3834540843963623, + "learning_rate": 3.107019562715766e-06, + "logits/chosen": 3.4604690074920654, + "logits/rejected": 3.4604690074920654, + "logps/chosen": -165.468017578125, + "logps/rejected": -165.468017578125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.004437446594238, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.004436492919922, + "step": 3267 + }, + { + "epoch": 2.255304467828187, + "grad_norm": 0.28963780403137207, + "learning_rate": 3.1041426927502877e-06, + "logits/chosen": 3.6380038261413574, + "logits/rejected": 3.6380038261413574, + "logps/chosen": -183.0433807373047, + "logps/rejected": -183.04339599609375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.321929931640625, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.321929931640625, + "step": 3268 + }, + { + "epoch": 2.255994479903398, + "grad_norm": 40.844200134277344, + "learning_rate": 3.10126582278481e-06, + "logits/chosen": 3.137763023376465, + "logits/rejected": 3.3287100791931152, + "logps/chosen": -158.1854248046875, + "logps/rejected": -172.44081115722656, + "loss": 0.8722, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.117058753967285, + "rewards/margins": 1.341062307357788, + "rewards/rejected": -12.458120346069336, + "step": 3269 + }, + { + "epoch": 2.2566844919786098, + "grad_norm": 0.2958502173423767, + "learning_rate": 3.098388952819333e-06, + "logits/chosen": 3.338998794555664, + "logits/rejected": 3.3574435710906982, + "logps/chosen": -169.00318908691406, + "logps/rejected": -176.74244689941406, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.013635635375977, + "rewards/margins": 0.798984169960022, + "rewards/rejected": -12.812620162963867, + "step": 3270 + }, + { + "epoch": 2.257374504053821, + "grad_norm": 0.26831644773483276, + "learning_rate": 3.095512082853855e-06, + "logits/chosen": 3.1588854789733887, + "logits/rejected": 3.188328504562378, + "logps/chosen": -169.02967834472656, + "logps/rejected": -185.94033813476562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.162633895874023, + "rewards/margins": 1.5886346101760864, + "rewards/rejected": -13.75126838684082, + "step": 3271 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 0.27483606338500977, + "learning_rate": 3.0926352128883776e-06, + "logits/chosen": 3.542191982269287, + "logits/rejected": 3.646167755126953, + "logps/chosen": -169.57484436035156, + "logps/rejected": -190.84010314941406, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.192548751831055, + "rewards/margins": 2.1653170585632324, + "rewards/rejected": -14.357864379882812, + "step": 3272 + }, + { + "epoch": 2.2587545282042436, + "grad_norm": 0.407900333404541, + "learning_rate": 3.0897583429229e-06, + "logits/chosen": 3.5598483085632324, + "logits/rejected": 3.748004913330078, + "logps/chosen": -158.74349975585938, + "logps/rejected": -174.4892578125, + "loss": 0.5208, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.207359313964844, + "rewards/margins": 1.5332603454589844, + "rewards/rejected": -12.740619659423828, + "step": 3273 + }, + { + "epoch": 2.2594445402794547, + "grad_norm": 0.26255694031715393, + "learning_rate": 3.0868814729574227e-06, + "logits/chosen": 3.532968282699585, + "logits/rejected": 3.5555522441864014, + "logps/chosen": -166.1388702392578, + "logps/rejected": -176.29916381835938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.881420135498047, + "rewards/margins": 1.0254158973693848, + "rewards/rejected": -12.906835556030273, + "step": 3274 + }, + { + "epoch": 2.2601345523546663, + "grad_norm": 0.3344268500804901, + "learning_rate": 3.084004602991945e-06, + "logits/chosen": 3.4128153324127197, + "logits/rejected": 3.660109281539917, + "logps/chosen": -163.93490600585938, + "logps/rejected": -177.05364990234375, + "loss": 0.521, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.483522415161133, + "rewards/margins": 1.3375658988952637, + "rewards/rejected": -12.821087837219238, + "step": 3275 + }, + { + "epoch": 2.2608245644298774, + "grad_norm": 0.35655084252357483, + "learning_rate": 3.081127733026468e-06, + "logits/chosen": 3.1179282665252686, + "logits/rejected": 3.1179282665252686, + "logps/chosen": -172.96922302246094, + "logps/rejected": -172.96922302246094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.516380310058594, + "rewards/margins": -4.172325134277344e-07, + "rewards/rejected": -12.516380310058594, + "step": 3276 + }, + { + "epoch": 2.261514576505089, + "grad_norm": 0.2505456805229187, + "learning_rate": 3.0782508630609897e-06, + "logits/chosen": 3.4488799571990967, + "logits/rejected": 3.53845477104187, + "logps/chosen": -180.04563903808594, + "logps/rejected": -187.12420654296875, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.084596633911133, + "rewards/margins": 0.7373393177986145, + "rewards/rejected": -13.821935653686523, + "step": 3277 + }, + { + "epoch": 2.2622045885803, + "grad_norm": 0.3416574001312256, + "learning_rate": 3.0753739930955125e-06, + "logits/chosen": 3.4418256282806396, + "logits/rejected": 3.4418256282806396, + "logps/chosen": -167.4663848876953, + "logps/rejected": -167.4663848876953, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.927726745605469, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.927726745605469, + "step": 3278 + }, + { + "epoch": 2.2628946006555113, + "grad_norm": 0.257505863904953, + "learning_rate": 3.072497123130035e-06, + "logits/chosen": 3.1618995666503906, + "logits/rejected": 3.189849615097046, + "logps/chosen": -173.84149169921875, + "logps/rejected": -182.77743530273438, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.402953147888184, + "rewards/margins": 0.939875066280365, + "rewards/rejected": -13.342828750610352, + "step": 3279 + }, + { + "epoch": 2.263584612730723, + "grad_norm": 0.2873402237892151, + "learning_rate": 3.0696202531645576e-06, + "logits/chosen": 3.3056750297546387, + "logits/rejected": 3.5645503997802734, + "logps/chosen": -145.64581298828125, + "logps/rejected": -166.47950744628906, + "loss": 0.5201, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.870256423950195, + "rewards/margins": 2.059478282928467, + "rewards/rejected": -11.92973518371582, + "step": 3280 + }, + { + "epoch": 2.264274624805934, + "grad_norm": 0.29140523076057434, + "learning_rate": 3.0667433831990796e-06, + "logits/chosen": 3.0868539810180664, + "logits/rejected": 3.458817958831787, + "logps/chosen": -146.79148864746094, + "logps/rejected": -175.6749267578125, + "loss": 0.434, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.006429672241211, + "rewards/margins": 2.856473207473755, + "rewards/rejected": -12.86290168762207, + "step": 3281 + }, + { + "epoch": 2.2649646368811456, + "grad_norm": 0.35589292645454407, + "learning_rate": 3.063866513233602e-06, + "logits/chosen": 3.2592029571533203, + "logits/rejected": 3.5067806243896484, + "logps/chosen": -158.8264617919922, + "logps/rejected": -178.11651611328125, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.334747314453125, + "rewards/margins": 1.864209532737732, + "rewards/rejected": -13.198957443237305, + "step": 3282 + }, + { + "epoch": 2.2656546489563567, + "grad_norm": 1.2628917694091797, + "learning_rate": 3.0609896432681247e-06, + "logits/chosen": 3.7825889587402344, + "logits/rejected": 3.663022041320801, + "logps/chosen": -166.2305450439453, + "logps/rejected": -179.732177734375, + "loss": 0.5287, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.756160736083984, + "rewards/margins": 1.3865221738815308, + "rewards/rejected": -13.142682075500488, + "step": 3283 + }, + { + "epoch": 2.2663446610315683, + "grad_norm": 0.3223482370376587, + "learning_rate": 3.058112773302647e-06, + "logits/chosen": 3.229311943054199, + "logits/rejected": 3.256258010864258, + "logps/chosen": -161.55844116210938, + "logps/rejected": -167.82614135742188, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.460212707519531, + "rewards/margins": 0.6679533123970032, + "rewards/rejected": -12.128166198730469, + "step": 3284 + }, + { + "epoch": 2.2670346731067794, + "grad_norm": 3.068254232406616, + "learning_rate": 3.0552359033371694e-06, + "logits/chosen": 2.983511447906494, + "logits/rejected": 3.3631110191345215, + "logps/chosen": -147.5644989013672, + "logps/rejected": -184.4166259765625, + "loss": 0.3641, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.897704124450684, + "rewards/margins": 3.6627378463745117, + "rewards/rejected": -13.560441017150879, + "step": 3285 + }, + { + "epoch": 2.2677246851819906, + "grad_norm": 0.29698342084884644, + "learning_rate": 3.0523590333716918e-06, + "logits/chosen": 3.197221517562866, + "logits/rejected": 3.359227180480957, + "logps/chosen": -174.16940307617188, + "logps/rejected": -181.02365112304688, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.682123184204102, + "rewards/margins": 0.701507568359375, + "rewards/rejected": -13.383630752563477, + "step": 3286 + }, + { + "epoch": 2.268414697257202, + "grad_norm": 0.29678764939308167, + "learning_rate": 3.0494821634062145e-06, + "logits/chosen": 3.075212001800537, + "logits/rejected": 3.0816259384155273, + "logps/chosen": -157.774658203125, + "logps/rejected": -169.79067993164062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.018461227416992, + "rewards/margins": 1.2146925926208496, + "rewards/rejected": -12.233154296875, + "step": 3287 + }, + { + "epoch": 2.2691047093324133, + "grad_norm": 0.37394610047340393, + "learning_rate": 3.046605293440737e-06, + "logits/chosen": 3.2521705627441406, + "logits/rejected": 3.4562366008758545, + "logps/chosen": -160.22323608398438, + "logps/rejected": -170.59860229492188, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.403890609741211, + "rewards/margins": 1.007190465927124, + "rewards/rejected": -12.411081314086914, + "step": 3288 + }, + { + "epoch": 2.2697947214076244, + "grad_norm": 0.2664759159088135, + "learning_rate": 3.043728423475259e-06, + "logits/chosen": 3.072535753250122, + "logits/rejected": 3.3565244674682617, + "logps/chosen": -139.64785766601562, + "logps/rejected": -165.12582397460938, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.267251014709473, + "rewards/margins": 2.4467761516571045, + "rewards/rejected": -11.714027404785156, + "step": 3289 + }, + { + "epoch": 2.270484733482836, + "grad_norm": 0.33805006742477417, + "learning_rate": 3.0408515535097816e-06, + "logits/chosen": 3.1596975326538086, + "logits/rejected": 3.389615535736084, + "logps/chosen": -152.12904357910156, + "logps/rejected": -185.76486206054688, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.425542831420898, + "rewards/margins": 3.371872663497925, + "rewards/rejected": -13.797415733337402, + "step": 3290 + }, + { + "epoch": 2.271174745558047, + "grad_norm": 0.3037129044532776, + "learning_rate": 3.037974683544304e-06, + "logits/chosen": 3.4311301708221436, + "logits/rejected": 3.514981746673584, + "logps/chosen": -178.31082153320312, + "logps/rejected": -187.8837127685547, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.96458625793457, + "rewards/margins": 0.940890371799469, + "rewards/rejected": -13.905476570129395, + "step": 3291 + }, + { + "epoch": 2.2718647576332587, + "grad_norm": 0.3320106267929077, + "learning_rate": 3.0350978135788267e-06, + "logits/chosen": 3.2864410877227783, + "logits/rejected": 3.2864410877227783, + "logps/chosen": -173.35684204101562, + "logps/rejected": -173.35684204101562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.60712718963623, + "rewards/margins": 0.0, + "rewards/rejected": -12.60712718963623, + "step": 3292 + }, + { + "epoch": 2.27255476970847, + "grad_norm": 0.38059520721435547, + "learning_rate": 3.0322209436133486e-06, + "logits/chosen": 3.8772153854370117, + "logits/rejected": 4.003552436828613, + "logps/chosen": -178.93621826171875, + "logps/rejected": -185.24911499023438, + "loss": 0.6074, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.164152145385742, + "rewards/margins": 0.6192377805709839, + "rewards/rejected": -13.783390045166016, + "step": 3293 + }, + { + "epoch": 2.2732447817836814, + "grad_norm": 0.2754736542701721, + "learning_rate": 3.0293440736478714e-06, + "logits/chosen": 3.1672213077545166, + "logits/rejected": 3.1672213077545166, + "logps/chosen": -171.26898193359375, + "logps/rejected": -171.26898193359375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.264386177062988, + "rewards/margins": 0.0, + "rewards/rejected": -12.264386177062988, + "step": 3294 + }, + { + "epoch": 2.2739347938588925, + "grad_norm": 0.3430801331996918, + "learning_rate": 3.0264672036823938e-06, + "logits/chosen": 3.2804999351501465, + "logits/rejected": 3.4878957271575928, + "logps/chosen": -147.67298889160156, + "logps/rejected": -160.97120666503906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.852518081665039, + "rewards/margins": 1.255129337310791, + "rewards/rejected": -11.107647895812988, + "step": 3295 + }, + { + "epoch": 2.2746248059341037, + "grad_norm": 0.312234491109848, + "learning_rate": 3.0235903337169165e-06, + "logits/chosen": 3.160038948059082, + "logits/rejected": 3.4317498207092285, + "logps/chosen": -160.28895568847656, + "logps/rejected": -173.63282775878906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.996291160583496, + "rewards/margins": 1.3351112604141235, + "rewards/rejected": -12.331401824951172, + "step": 3296 + }, + { + "epoch": 2.2753148180093152, + "grad_norm": 0.32584506273269653, + "learning_rate": 3.0207134637514385e-06, + "logits/chosen": 3.5208306312561035, + "logits/rejected": 3.5208306312561035, + "logps/chosen": -174.6514434814453, + "logps/rejected": -174.6514434814453, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.682769775390625, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.682769775390625, + "step": 3297 + }, + { + "epoch": 2.2760048300845264, + "grad_norm": 0.23031428456306458, + "learning_rate": 3.0178365937859612e-06, + "logits/chosen": 3.3605775833129883, + "logits/rejected": 3.5449914932250977, + "logps/chosen": -183.95223999023438, + "logps/rejected": -194.6619873046875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.591805458068848, + "rewards/margins": 1.0525974035263062, + "rewards/rejected": -14.644402503967285, + "step": 3298 + }, + { + "epoch": 2.276694842159738, + "grad_norm": 0.3824895918369293, + "learning_rate": 3.0149597238204836e-06, + "logits/chosen": 3.360909938812256, + "logits/rejected": 3.391087055206299, + "logps/chosen": -159.25, + "logps/rejected": -177.23190307617188, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.074003219604492, + "rewards/margins": 1.8394842147827148, + "rewards/rejected": -12.91348648071289, + "step": 3299 + }, + { + "epoch": 2.277384854234949, + "grad_norm": 0.32701510190963745, + "learning_rate": 3.0120828538550064e-06, + "logits/chosen": 3.232374429702759, + "logits/rejected": 3.22853422164917, + "logps/chosen": -156.6307830810547, + "logps/rejected": -164.75506591796875, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.860222816467285, + "rewards/margins": 0.8056145310401917, + "rewards/rejected": -11.665836334228516, + "step": 3300 + }, + { + "epoch": 2.2780748663101607, + "grad_norm": 0.2993943393230438, + "learning_rate": 3.0092059838895283e-06, + "logits/chosen": 3.4029579162597656, + "logits/rejected": 3.4197025299072266, + "logps/chosen": -145.9095916748047, + "logps/rejected": -163.41326904296875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.770410537719727, + "rewards/margins": 1.6831135749816895, + "rewards/rejected": -11.45352554321289, + "step": 3301 + }, + { + "epoch": 2.278764878385372, + "grad_norm": 1.9934366941452026, + "learning_rate": 3.0063291139240506e-06, + "logits/chosen": 3.5063774585723877, + "logits/rejected": 3.7181434631347656, + "logps/chosen": -160.4130096435547, + "logps/rejected": -169.81382751464844, + "loss": 0.5277, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.127304077148438, + "rewards/margins": 0.9112149477005005, + "rewards/rejected": -12.038518905639648, + "step": 3302 + }, + { + "epoch": 2.279454890460583, + "grad_norm": 0.294697642326355, + "learning_rate": 3.0034522439585734e-06, + "logits/chosen": 2.9943928718566895, + "logits/rejected": 3.3727526664733887, + "logps/chosen": -138.22616577148438, + "logps/rejected": -170.58740234375, + "loss": 0.4341, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.967741966247559, + "rewards/margins": 3.2979612350463867, + "rewards/rejected": -12.265703201293945, + "step": 3303 + }, + { + "epoch": 2.2801449025357945, + "grad_norm": 0.31040510535240173, + "learning_rate": 3.0005753739930958e-06, + "logits/chosen": 3.1166634559631348, + "logits/rejected": 3.1466245651245117, + "logps/chosen": -146.20684814453125, + "logps/rejected": -168.82339477539062, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.85452651977539, + "rewards/margins": 2.3741419315338135, + "rewards/rejected": -12.228667259216309, + "step": 3304 + }, + { + "epoch": 2.2808349146110056, + "grad_norm": 0.3050214946269989, + "learning_rate": 2.997698504027618e-06, + "logits/chosen": 3.504633665084839, + "logits/rejected": 3.504633665084839, + "logps/chosen": -144.0345001220703, + "logps/rejected": -144.0345001220703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.718274116516113, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -9.718274116516113, + "step": 3305 + }, + { + "epoch": 2.281524926686217, + "grad_norm": 4.621966361999512, + "learning_rate": 2.9948216340621405e-06, + "logits/chosen": 3.3304944038391113, + "logits/rejected": 3.414414882659912, + "logps/chosen": -167.65733337402344, + "logps/rejected": -169.54212951660156, + "loss": 0.6348, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.083292007446289, + "rewards/margins": 0.17138159275054932, + "rewards/rejected": -12.254674911499023, + "step": 3306 + }, + { + "epoch": 2.2822149387614283, + "grad_norm": 0.34035399556159973, + "learning_rate": 2.9919447640966633e-06, + "logits/chosen": 3.3014121055603027, + "logits/rejected": 3.412200927734375, + "logps/chosen": -176.43099975585938, + "logps/rejected": -184.7320556640625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.891214370727539, + "rewards/margins": 0.9026561975479126, + "rewards/rejected": -13.79387092590332, + "step": 3307 + }, + { + "epoch": 2.2829049508366395, + "grad_norm": 0.328948974609375, + "learning_rate": 2.9890678941311856e-06, + "logits/chosen": 3.440931797027588, + "logits/rejected": 3.440931797027588, + "logps/chosen": -167.83187866210938, + "logps/rejected": -167.83187866210938, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.122620582580566, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.122621536254883, + "step": 3308 + }, + { + "epoch": 2.283594962911851, + "grad_norm": 6.320224761962891, + "learning_rate": 2.9861910241657075e-06, + "logits/chosen": 3.1628715991973877, + "logits/rejected": 3.2689242362976074, + "logps/chosen": -163.6246337890625, + "logps/rejected": -178.8564453125, + "loss": 0.506, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.587078094482422, + "rewards/margins": 1.5190743207931519, + "rewards/rejected": -13.106151580810547, + "step": 3309 + }, + { + "epoch": 2.284284974987062, + "grad_norm": 0.3441672921180725, + "learning_rate": 2.9833141542002303e-06, + "logits/chosen": 3.470331907272339, + "logits/rejected": 3.567570686340332, + "logps/chosen": -160.33551025390625, + "logps/rejected": -172.39886474609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.212913513183594, + "rewards/margins": 1.2199811935424805, + "rewards/rejected": -12.43289566040039, + "step": 3310 + }, + { + "epoch": 2.2849749870622738, + "grad_norm": 0.4145393371582031, + "learning_rate": 2.9804372842347527e-06, + "logits/chosen": 3.314493179321289, + "logits/rejected": 3.3221182823181152, + "logps/chosen": -163.7324676513672, + "logps/rejected": -169.260009765625, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.580963134765625, + "rewards/margins": 0.5712466835975647, + "rewards/rejected": -12.152209281921387, + "step": 3311 + }, + { + "epoch": 2.285664999137485, + "grad_norm": 0.47505244612693787, + "learning_rate": 2.9775604142692754e-06, + "logits/chosen": 3.271836757659912, + "logits/rejected": 3.342108964920044, + "logps/chosen": -172.41250610351562, + "logps/rejected": -187.4066162109375, + "loss": 0.5248, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.414985656738281, + "rewards/margins": 1.4934475421905518, + "rewards/rejected": -13.908432960510254, + "step": 3312 + }, + { + "epoch": 2.286355011212696, + "grad_norm": 0.3249594569206238, + "learning_rate": 2.9746835443037974e-06, + "logits/chosen": 3.5729169845581055, + "logits/rejected": 3.5729169845581055, + "logps/chosen": -181.886962890625, + "logps/rejected": -181.88699340820312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.29404067993164, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.29404067993164, + "step": 3313 + }, + { + "epoch": 2.2870450232879076, + "grad_norm": 0.38986149430274963, + "learning_rate": 2.97180667433832e-06, + "logits/chosen": 3.1590137481689453, + "logits/rejected": 3.1590137481689453, + "logps/chosen": -168.57415771484375, + "logps/rejected": -168.57415771484375, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.16173267364502, + "rewards/margins": 4.172325134277344e-07, + "rewards/rejected": -12.16173267364502, + "step": 3314 + }, + { + "epoch": 2.2877350353631187, + "grad_norm": 0.29048898816108704, + "learning_rate": 2.9689298043728425e-06, + "logits/chosen": 3.4871015548706055, + "logits/rejected": 3.5933072566986084, + "logps/chosen": -154.01071166992188, + "logps/rejected": -161.544189453125, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.823577880859375, + "rewards/margins": 0.7398099899291992, + "rewards/rejected": -11.563387870788574, + "step": 3315 + }, + { + "epoch": 2.2884250474383303, + "grad_norm": 0.3178173303604126, + "learning_rate": 2.9660529344073653e-06, + "logits/chosen": 3.500540018081665, + "logits/rejected": 3.574917793273926, + "logps/chosen": -165.08302307128906, + "logps/rejected": -177.23968505859375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.684224128723145, + "rewards/margins": 1.173521637916565, + "rewards/rejected": -12.857745170593262, + "step": 3316 + }, + { + "epoch": 2.2891150595135414, + "grad_norm": 0.3620028495788574, + "learning_rate": 2.963176064441887e-06, + "logits/chosen": 3.142632007598877, + "logits/rejected": 3.208220958709717, + "logps/chosen": -143.82327270507812, + "logps/rejected": -156.10177612304688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.607903480529785, + "rewards/margins": 1.2388170957565308, + "rewards/rejected": -10.846720695495605, + "step": 3317 + }, + { + "epoch": 2.289805071588753, + "grad_norm": 0.33932116627693176, + "learning_rate": 2.96029919447641e-06, + "logits/chosen": 3.2764320373535156, + "logits/rejected": 3.4773964881896973, + "logps/chosen": -168.43545532226562, + "logps/rejected": -179.5293426513672, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.058477401733398, + "rewards/margins": 1.1357516050338745, + "rewards/rejected": -13.194229125976562, + "step": 3318 + }, + { + "epoch": 2.290495083663964, + "grad_norm": 0.266769677400589, + "learning_rate": 2.9574223245109323e-06, + "logits/chosen": 3.4122233390808105, + "logits/rejected": 3.3856873512268066, + "logps/chosen": -157.0406036376953, + "logps/rejected": -167.29849243164062, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.246638298034668, + "rewards/margins": 0.8737049698829651, + "rewards/rejected": -12.120343208312988, + "step": 3319 + }, + { + "epoch": 2.2911850957391753, + "grad_norm": 0.3794921636581421, + "learning_rate": 2.954545454545455e-06, + "logits/chosen": 3.021615505218506, + "logits/rejected": 3.021615505218506, + "logps/chosen": -170.57147216796875, + "logps/rejected": -170.57147216796875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.157662391662598, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.157662391662598, + "step": 3320 + }, + { + "epoch": 2.291875107814387, + "grad_norm": 0.2632385790348053, + "learning_rate": 2.951668584579977e-06, + "logits/chosen": 3.318060874938965, + "logits/rejected": 3.385791778564453, + "logps/chosen": -164.37261962890625, + "logps/rejected": -178.54127502441406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.779487609863281, + "rewards/margins": 1.4234981536865234, + "rewards/rejected": -13.202985763549805, + "step": 3321 + }, + { + "epoch": 2.292565119889598, + "grad_norm": 0.2976635694503784, + "learning_rate": 2.9487917146144994e-06, + "logits/chosen": 3.5157856941223145, + "logits/rejected": 3.530973434448242, + "logps/chosen": -158.7930145263672, + "logps/rejected": -178.44515991210938, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.077163696289062, + "rewards/margins": 2.019789934158325, + "rewards/rejected": -13.096952438354492, + "step": 3322 + }, + { + "epoch": 2.2932551319648096, + "grad_norm": 0.39770805835723877, + "learning_rate": 2.945914844649022e-06, + "logits/chosen": 3.405081272125244, + "logits/rejected": 3.45379376411438, + "logps/chosen": -161.5802001953125, + "logps/rejected": -173.3265838623047, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.277620315551758, + "rewards/margins": 1.1245414018630981, + "rewards/rejected": -12.402161598205566, + "step": 3323 + }, + { + "epoch": 2.2939451440400207, + "grad_norm": 0.26144155859947205, + "learning_rate": 2.9430379746835445e-06, + "logits/chosen": 3.4315812587738037, + "logits/rejected": 3.462890386581421, + "logps/chosen": -171.182373046875, + "logps/rejected": -179.19944763183594, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.378898620605469, + "rewards/margins": 0.8110430836677551, + "rewards/rejected": -13.18994140625, + "step": 3324 + }, + { + "epoch": 2.294635156115232, + "grad_norm": 0.28641578555107117, + "learning_rate": 2.9401611047180673e-06, + "logits/chosen": 3.5173802375793457, + "logits/rejected": 3.5532307624816895, + "logps/chosen": -179.78909301757812, + "logps/rejected": -187.46957397460938, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.291141510009766, + "rewards/margins": 0.7650759220123291, + "rewards/rejected": -14.056217193603516, + "step": 3325 + }, + { + "epoch": 2.2953251681904434, + "grad_norm": 0.30144327878952026, + "learning_rate": 2.937284234752589e-06, + "logits/chosen": 3.898519992828369, + "logits/rejected": 4.026907920837402, + "logps/chosen": -172.11294555664062, + "logps/rejected": -185.0157470703125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.526805877685547, + "rewards/margins": 1.2659097909927368, + "rewards/rejected": -13.792716979980469, + "step": 3326 + }, + { + "epoch": 2.2960151802656545, + "grad_norm": 0.30855220556259155, + "learning_rate": 2.934407364787112e-06, + "logits/chosen": 3.2055158615112305, + "logits/rejected": 3.2055158615112305, + "logps/chosen": -168.27996826171875, + "logps/rejected": -168.27996826171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.166244506835938, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.166244506835938, + "step": 3327 + }, + { + "epoch": 2.296705192340866, + "grad_norm": 0.272057443857193, + "learning_rate": 2.9315304948216343e-06, + "logits/chosen": 3.331599712371826, + "logits/rejected": 3.5005862712860107, + "logps/chosen": -156.11289978027344, + "logps/rejected": -174.19422912597656, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.889039039611816, + "rewards/margins": 1.7087278366088867, + "rewards/rejected": -12.597766876220703, + "step": 3328 + }, + { + "epoch": 2.2973952044160773, + "grad_norm": 0.28324854373931885, + "learning_rate": 2.928653624856157e-06, + "logits/chosen": 3.7245242595672607, + "logits/rejected": 3.7245242595672607, + "logps/chosen": -176.50425720214844, + "logps/rejected": -176.50425720214844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.943582534790039, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.943582534790039, + "step": 3329 + }, + { + "epoch": 2.2980852164912884, + "grad_norm": 0.3832896947860718, + "learning_rate": 2.925776754890679e-06, + "logits/chosen": 3.4457993507385254, + "logits/rejected": 3.460376501083374, + "logps/chosen": -177.5961456298828, + "logps/rejected": -194.8639373779297, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.091381072998047, + "rewards/margins": 1.6703966856002808, + "rewards/rejected": -14.761777877807617, + "step": 3330 + }, + { + "epoch": 2.2987752285665, + "grad_norm": 21.535667419433594, + "learning_rate": 2.922899884925202e-06, + "logits/chosen": 3.3109140396118164, + "logits/rejected": 3.6108827590942383, + "logps/chosen": -126.71314239501953, + "logps/rejected": -143.21963500976562, + "loss": 1.3196, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.156720161437988, + "rewards/margins": 1.4946086406707764, + "rewards/rejected": -9.651329040527344, + "step": 3331 + }, + { + "epoch": 2.299465240641711, + "grad_norm": 0.3951772153377533, + "learning_rate": 2.920023014959724e-06, + "logits/chosen": 3.1568188667297363, + "logits/rejected": 3.1568188667297363, + "logps/chosen": -153.501708984375, + "logps/rejected": -153.501708984375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.809501647949219, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.809501647949219, + "step": 3332 + }, + { + "epoch": 2.3001552527169227, + "grad_norm": 0.27657926082611084, + "learning_rate": 2.917146144994247e-06, + "logits/chosen": 3.193385124206543, + "logits/rejected": 3.3231911659240723, + "logps/chosen": -163.51620483398438, + "logps/rejected": -172.71624755859375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.58383846282959, + "rewards/margins": 0.943488359451294, + "rewards/rejected": -12.527327537536621, + "step": 3333 + }, + { + "epoch": 2.300845264792134, + "grad_norm": 0.2963782846927643, + "learning_rate": 2.914269275028769e-06, + "logits/chosen": 3.113445281982422, + "logits/rejected": 3.2401390075683594, + "logps/chosen": -156.76785278320312, + "logps/rejected": -171.0270233154297, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.958096504211426, + "rewards/margins": 1.4240553379058838, + "rewards/rejected": -12.38215160369873, + "step": 3334 + }, + { + "epoch": 2.3015352768673454, + "grad_norm": 0.27550771832466125, + "learning_rate": 2.9113924050632912e-06, + "logits/chosen": 3.596061944961548, + "logits/rejected": 3.6457457542419434, + "logps/chosen": -167.01039123535156, + "logps/rejected": -174.4849395751953, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.12460708618164, + "rewards/margins": 0.764427661895752, + "rewards/rejected": -12.88903522491455, + "step": 3335 + }, + { + "epoch": 2.3022252889425565, + "grad_norm": 0.2254650741815567, + "learning_rate": 2.908515535097814e-06, + "logits/chosen": 3.0781123638153076, + "logits/rejected": 3.5189146995544434, + "logps/chosen": -151.18182373046875, + "logps/rejected": -186.40432739257812, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.32077693939209, + "rewards/margins": 3.4325978755950928, + "rewards/rejected": -13.753375053405762, + "step": 3336 + }, + { + "epoch": 2.3029153010177676, + "grad_norm": 0.28591933846473694, + "learning_rate": 2.9056386651323363e-06, + "logits/chosen": 3.268014430999756, + "logits/rejected": 3.3218202590942383, + "logps/chosen": -155.49783325195312, + "logps/rejected": -175.55921936035156, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.691685676574707, + "rewards/margins": 2.0745091438293457, + "rewards/rejected": -12.766195297241211, + "step": 3337 + }, + { + "epoch": 2.3036053130929792, + "grad_norm": 0.3001360595226288, + "learning_rate": 2.9027617951668587e-06, + "logits/chosen": 3.0755538940429688, + "logits/rejected": 3.194371461868286, + "logps/chosen": -168.77581787109375, + "logps/rejected": -178.0375213623047, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.184150695800781, + "rewards/margins": 0.9245474338531494, + "rewards/rejected": -13.108697891235352, + "step": 3338 + }, + { + "epoch": 2.3042953251681904, + "grad_norm": 4.826504230499268, + "learning_rate": 2.899884925201381e-06, + "logits/chosen": 3.6366512775421143, + "logits/rejected": 3.615173101425171, + "logps/chosen": -154.07850646972656, + "logps/rejected": -162.04815673828125, + "loss": 0.5854, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.668987274169922, + "rewards/margins": 0.7478399276733398, + "rewards/rejected": -11.416826248168945, + "step": 3339 + }, + { + "epoch": 2.304985337243402, + "grad_norm": 0.304235577583313, + "learning_rate": 2.897008055235904e-06, + "logits/chosen": 3.4014291763305664, + "logits/rejected": 3.4993772506713867, + "logps/chosen": -171.2777862548828, + "logps/rejected": -178.67759704589844, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.2176513671875, + "rewards/margins": 0.7361840009689331, + "rewards/rejected": -12.953835487365723, + "step": 3340 + }, + { + "epoch": 2.305675349318613, + "grad_norm": 4.349198818206787, + "learning_rate": 2.894131185270426e-06, + "logits/chosen": 3.439751148223877, + "logits/rejected": 3.4760236740112305, + "logps/chosen": -140.90911865234375, + "logps/rejected": -169.1799774169922, + "loss": 0.4537, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.116039276123047, + "rewards/margins": 2.8246309757232666, + "rewards/rejected": -11.940670013427734, + "step": 3341 + }, + { + "epoch": 2.306365361393824, + "grad_norm": 0.2690965235233307, + "learning_rate": 2.891254315304948e-06, + "logits/chosen": 3.285005569458008, + "logits/rejected": 3.318005084991455, + "logps/chosen": -150.08099365234375, + "logps/rejected": -170.77777099609375, + "loss": 0.5202, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.181676864624023, + "rewards/margins": 2.067446708679199, + "rewards/rejected": -12.249123573303223, + "step": 3342 + }, + { + "epoch": 2.3070553734690358, + "grad_norm": 0.3536495268344879, + "learning_rate": 2.888377445339471e-06, + "logits/chosen": 3.235891580581665, + "logits/rejected": 3.3775200843811035, + "logps/chosen": -158.86090087890625, + "logps/rejected": -165.6805419921875, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.227592468261719, + "rewards/margins": 0.651387095451355, + "rewards/rejected": -11.878978729248047, + "step": 3343 + }, + { + "epoch": 2.307745385544247, + "grad_norm": 0.4623779058456421, + "learning_rate": 2.8855005753739932e-06, + "logits/chosen": 3.1501283645629883, + "logits/rejected": 3.1501283645629883, + "logps/chosen": -181.62274169921875, + "logps/rejected": -181.6227264404297, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.43796443939209, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.437965393066406, + "step": 3344 + }, + { + "epoch": 2.3084353976194585, + "grad_norm": 4.873171329498291, + "learning_rate": 2.882623705408516e-06, + "logits/chosen": 3.0661728382110596, + "logits/rejected": 3.2134265899658203, + "logps/chosen": -149.59591674804688, + "logps/rejected": -169.25869750976562, + "loss": 0.469, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.23447322845459, + "rewards/margins": 1.977971076965332, + "rewards/rejected": -12.212444305419922, + "step": 3345 + }, + { + "epoch": 2.3091254096946696, + "grad_norm": 0.28200581669807434, + "learning_rate": 2.879746835443038e-06, + "logits/chosen": 3.744770050048828, + "logits/rejected": 3.7866413593292236, + "logps/chosen": -179.36631774902344, + "logps/rejected": -188.4123077392578, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.116311073303223, + "rewards/margins": 0.9187898635864258, + "rewards/rejected": -14.035100936889648, + "step": 3346 + }, + { + "epoch": 2.3098154217698807, + "grad_norm": 0.2946661412715912, + "learning_rate": 2.8768699654775607e-06, + "logits/chosen": 3.0353052616119385, + "logits/rejected": 3.0886192321777344, + "logps/chosen": -167.1790771484375, + "logps/rejected": -180.6190948486328, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.13055419921875, + "rewards/margins": 1.3310269117355347, + "rewards/rejected": -13.461581230163574, + "step": 3347 + }, + { + "epoch": 2.3105054338450923, + "grad_norm": 0.33567485213279724, + "learning_rate": 2.873993095512083e-06, + "logits/chosen": 3.6451048851013184, + "logits/rejected": 3.6425185203552246, + "logps/chosen": -163.08673095703125, + "logps/rejected": -174.18685913085938, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.46828842163086, + "rewards/margins": 1.1212372779846191, + "rewards/rejected": -12.589526176452637, + "step": 3348 + }, + { + "epoch": 2.3111954459203035, + "grad_norm": 0.3331127464771271, + "learning_rate": 2.871116225546606e-06, + "logits/chosen": 3.539276361465454, + "logits/rejected": 3.636323928833008, + "logps/chosen": -169.68533325195312, + "logps/rejected": -179.86245727539062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.202380180358887, + "rewards/margins": 0.9941622018814087, + "rewards/rejected": -13.196542739868164, + "step": 3349 + }, + { + "epoch": 2.311885457995515, + "grad_norm": 0.3938754200935364, + "learning_rate": 2.8682393555811278e-06, + "logits/chosen": 3.6772096157073975, + "logits/rejected": 3.763465404510498, + "logps/chosen": -180.06246948242188, + "logps/rejected": -185.95127868652344, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.204687118530273, + "rewards/margins": 0.6230696439743042, + "rewards/rejected": -13.827756881713867, + "step": 3350 + }, + { + "epoch": 2.312575470070726, + "grad_norm": 1.1370769739151, + "learning_rate": 2.8653624856156505e-06, + "logits/chosen": 3.37202787399292, + "logits/rejected": 3.6731185913085938, + "logps/chosen": -148.72642517089844, + "logps/rejected": -183.72537231445312, + "loss": 0.4402, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.154830932617188, + "rewards/margins": 3.5462465286254883, + "rewards/rejected": -13.701078414916992, + "step": 3351 + }, + { + "epoch": 2.3132654821459377, + "grad_norm": 0.3121284544467926, + "learning_rate": 2.862485615650173e-06, + "logits/chosen": 3.2942051887512207, + "logits/rejected": 3.3286681175231934, + "logps/chosen": -159.68795776367188, + "logps/rejected": -167.3748016357422, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.251687049865723, + "rewards/margins": 0.8070815205574036, + "rewards/rejected": -12.058768272399902, + "step": 3352 + }, + { + "epoch": 2.313955494221149, + "grad_norm": 0.2967800796031952, + "learning_rate": 2.8596087456846957e-06, + "logits/chosen": 3.377688407897949, + "logits/rejected": 3.517918825149536, + "logps/chosen": -154.37452697753906, + "logps/rejected": -174.278076171875, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.51668643951416, + "rewards/margins": 2.195572853088379, + "rewards/rejected": -12.712259292602539, + "step": 3353 + }, + { + "epoch": 2.31464550629636, + "grad_norm": 0.2667534351348877, + "learning_rate": 2.8567318757192176e-06, + "logits/chosen": 3.2364718914031982, + "logits/rejected": 3.479806900024414, + "logps/chosen": -164.29110717773438, + "logps/rejected": -182.57925415039062, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.454432487487793, + "rewards/margins": 1.8933199644088745, + "rewards/rejected": -13.347752571105957, + "step": 3354 + }, + { + "epoch": 2.3153355183715716, + "grad_norm": 0.40102365612983704, + "learning_rate": 2.85385500575374e-06, + "logits/chosen": 3.661823034286499, + "logits/rejected": 3.661823034286499, + "logps/chosen": -176.52438354492188, + "logps/rejected": -176.52438354492188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.90787124633789, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.90787124633789, + "step": 3355 + }, + { + "epoch": 2.3160255304467827, + "grad_norm": 0.38859644532203674, + "learning_rate": 2.8509781357882627e-06, + "logits/chosen": 3.718921184539795, + "logits/rejected": 3.718921184539795, + "logps/chosen": -187.36032104492188, + "logps/rejected": -187.36032104492188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.928966522216797, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.928966522216797, + "step": 3356 + }, + { + "epoch": 2.3167155425219943, + "grad_norm": 0.3545669913291931, + "learning_rate": 2.848101265822785e-06, + "logits/chosen": 3.688828945159912, + "logits/rejected": 3.846834421157837, + "logps/chosen": -166.90963745117188, + "logps/rejected": -176.98126220703125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.169021606445312, + "rewards/margins": 0.9900768399238586, + "rewards/rejected": -13.159099578857422, + "step": 3357 + }, + { + "epoch": 2.3174055545972054, + "grad_norm": 0.348657488822937, + "learning_rate": 2.8452243958573074e-06, + "logits/chosen": 3.3581104278564453, + "logits/rejected": 3.750011920928955, + "logps/chosen": -143.16168212890625, + "logps/rejected": -183.896484375, + "loss": 0.348, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.676545143127441, + "rewards/margins": 3.977267026901245, + "rewards/rejected": -13.653812408447266, + "step": 3358 + }, + { + "epoch": 2.318095566672417, + "grad_norm": 0.30843275785446167, + "learning_rate": 2.8423475258918298e-06, + "logits/chosen": 3.6457693576812744, + "logits/rejected": 3.6457693576812744, + "logps/chosen": -170.09393310546875, + "logps/rejected": -170.09393310546875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.264288902282715, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.264288902282715, + "step": 3359 + }, + { + "epoch": 2.318785578747628, + "grad_norm": 7.174393177032471, + "learning_rate": 2.8394706559263525e-06, + "logits/chosen": 3.4326682090759277, + "logits/rejected": 3.45951509475708, + "logps/chosen": -156.24481201171875, + "logps/rejected": -189.1204071044922, + "loss": 0.4115, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.744621276855469, + "rewards/margins": 3.2526891231536865, + "rewards/rejected": -13.997310638427734, + "step": 3360 + }, + { + "epoch": 2.3194755908228393, + "grad_norm": 0.4395774304866791, + "learning_rate": 2.836593785960875e-06, + "logits/chosen": 3.4132487773895264, + "logits/rejected": 3.4426705837249756, + "logps/chosen": -158.86212158203125, + "logps/rejected": -164.6463623046875, + "loss": 0.6082, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.135605812072754, + "rewards/margins": 0.5344297885894775, + "rewards/rejected": -11.670036315917969, + "step": 3361 + }, + { + "epoch": 2.320165602898051, + "grad_norm": 0.4704343378543854, + "learning_rate": 2.833716915995397e-06, + "logits/chosen": 3.4303572177886963, + "logits/rejected": 3.5246148109436035, + "logps/chosen": -157.69943237304688, + "logps/rejected": -163.92332458496094, + "loss": 0.6072, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.131600379943848, + "rewards/margins": 0.6489004492759705, + "rewards/rejected": -11.780500411987305, + "step": 3362 + }, + { + "epoch": 2.320855614973262, + "grad_norm": 0.3981803357601166, + "learning_rate": 2.8308400460299196e-06, + "logits/chosen": 3.6731998920440674, + "logits/rejected": 3.711559772491455, + "logps/chosen": -168.404541015625, + "logps/rejected": -175.77919006347656, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.059782028198242, + "rewards/margins": 0.6866154670715332, + "rewards/rejected": -12.746397972106934, + "step": 3363 + }, + { + "epoch": 2.321545627048473, + "grad_norm": 0.35857146978378296, + "learning_rate": 2.827963176064442e-06, + "logits/chosen": 3.3460822105407715, + "logits/rejected": 3.4117074012756348, + "logps/chosen": -142.9737548828125, + "logps/rejected": -156.2600555419922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.625361442565918, + "rewards/margins": 1.2496095895767212, + "rewards/rejected": -10.874971389770508, + "step": 3364 + }, + { + "epoch": 2.3222356391236847, + "grad_norm": 0.4329790472984314, + "learning_rate": 2.8250863060989647e-06, + "logits/chosen": 3.811074733734131, + "logits/rejected": 3.811074733734131, + "logps/chosen": -179.34463500976562, + "logps/rejected": -179.34463500976562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.050477981567383, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.050477981567383, + "step": 3365 + }, + { + "epoch": 2.322925651198896, + "grad_norm": 0.3216456472873688, + "learning_rate": 2.8222094361334867e-06, + "logits/chosen": 3.815866708755493, + "logits/rejected": 3.815866708755493, + "logps/chosen": -179.04278564453125, + "logps/rejected": -179.04278564453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.171258926391602, + "rewards/margins": 0.0, + "rewards/rejected": -13.171258926391602, + "step": 3366 + }, + { + "epoch": 2.3236156632741074, + "grad_norm": 0.3879246413707733, + "learning_rate": 2.8193325661680094e-06, + "logits/chosen": 3.6754353046417236, + "logits/rejected": 3.6754353046417236, + "logps/chosen": -177.82830810546875, + "logps/rejected": -177.82830810546875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.123838424682617, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -13.123838424682617, + "step": 3367 + }, + { + "epoch": 2.3243056753493185, + "grad_norm": 0.39527779817581177, + "learning_rate": 2.8164556962025318e-06, + "logits/chosen": 3.4945127964019775, + "logits/rejected": 3.598767042160034, + "logps/chosen": -157.9376220703125, + "logps/rejected": -177.5278778076172, + "loss": 0.5208, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.91773509979248, + "rewards/margins": 1.8896955251693726, + "rewards/rejected": -12.8074312210083, + "step": 3368 + }, + { + "epoch": 2.32499568742453, + "grad_norm": 0.3187848627567291, + "learning_rate": 2.8135788262370546e-06, + "logits/chosen": 3.3362293243408203, + "logits/rejected": 3.472100257873535, + "logps/chosen": -152.28533935546875, + "logps/rejected": -167.36915588378906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.463475227355957, + "rewards/margins": 1.436980128288269, + "rewards/rejected": -11.900455474853516, + "step": 3369 + }, + { + "epoch": 2.3256856994997412, + "grad_norm": 0.40978217124938965, + "learning_rate": 2.8107019562715765e-06, + "logits/chosen": 3.2013204097747803, + "logits/rejected": 3.484156370162964, + "logps/chosen": -159.85552978515625, + "logps/rejected": -178.1022491455078, + "loss": 0.5238, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.277592658996582, + "rewards/margins": 1.7507896423339844, + "rewards/rejected": -13.028382301330566, + "step": 3370 + }, + { + "epoch": 2.3263757115749524, + "grad_norm": 0.2653023600578308, + "learning_rate": 2.8078250863060993e-06, + "logits/chosen": 3.2954068183898926, + "logits/rejected": 3.5683865547180176, + "logps/chosen": -162.44204711914062, + "logps/rejected": -181.71151733398438, + "loss": 0.5207, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.550260543823242, + "rewards/margins": 1.9109574556350708, + "rewards/rejected": -13.461217880249023, + "step": 3371 + }, + { + "epoch": 2.327065723650164, + "grad_norm": 0.38028043508529663, + "learning_rate": 2.8049482163406216e-06, + "logits/chosen": 3.8155035972595215, + "logits/rejected": 3.8155035972595215, + "logps/chosen": -184.5508270263672, + "logps/rejected": -184.55084228515625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.65976333618164, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.65976333618164, + "step": 3372 + }, + { + "epoch": 2.327755735725375, + "grad_norm": 29.31817054748535, + "learning_rate": 2.8020713463751444e-06, + "logits/chosen": 2.9297332763671875, + "logits/rejected": 3.317584753036499, + "logps/chosen": -140.73472595214844, + "logps/rejected": -166.42181396484375, + "loss": 1.179, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.246517181396484, + "rewards/margins": 2.447248697280884, + "rewards/rejected": -11.693765640258789, + "step": 3373 + }, + { + "epoch": 2.3284457478005867, + "grad_norm": 0.28546881675720215, + "learning_rate": 2.7991944764096663e-06, + "logits/chosen": 3.5376551151275635, + "logits/rejected": 3.769256830215454, + "logps/chosen": -171.01348876953125, + "logps/rejected": -178.56216430664062, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.421992301940918, + "rewards/margins": 0.7777523398399353, + "rewards/rejected": -13.19974422454834, + "step": 3374 + }, + { + "epoch": 2.329135759875798, + "grad_norm": 3.3375444412231445, + "learning_rate": 2.7963176064441887e-06, + "logits/chosen": 3.305823802947998, + "logits/rejected": 3.4323034286499023, + "logps/chosen": -153.74171447753906, + "logps/rejected": -176.8243865966797, + "loss": 0.4493, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.640666007995605, + "rewards/margins": 2.4200031757354736, + "rewards/rejected": -13.0606689453125, + "step": 3375 + }, + { + "epoch": 2.3298257719510094, + "grad_norm": 0.3652627766132355, + "learning_rate": 2.7934407364787114e-06, + "logits/chosen": 3.4467055797576904, + "logits/rejected": 3.549488067626953, + "logps/chosen": -151.292724609375, + "logps/rejected": -174.0531005859375, + "loss": 0.52, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.496960639953613, + "rewards/margins": 2.3358821868896484, + "rewards/rejected": -12.832843780517578, + "step": 3376 + }, + { + "epoch": 2.3305157840262205, + "grad_norm": 0.5153656005859375, + "learning_rate": 2.790563866513234e-06, + "logits/chosen": 3.503641128540039, + "logits/rejected": 3.6127991676330566, + "logps/chosen": -170.0617218017578, + "logps/rejected": -174.97579956054688, + "loss": 0.6089, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.107748031616211, + "rewards/margins": 0.4935489892959595, + "rewards/rejected": -12.601297378540039, + "step": 3377 + }, + { + "epoch": 2.3312057961014316, + "grad_norm": 0.23960629105567932, + "learning_rate": 2.7876869965477566e-06, + "logits/chosen": 3.5199379920959473, + "logits/rejected": 3.623009204864502, + "logps/chosen": -166.15151977539062, + "logps/rejected": -186.494140625, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.7887601852417, + "rewards/margins": 2.0960869789123535, + "rewards/rejected": -13.884848594665527, + "step": 3378 + }, + { + "epoch": 2.331895808176643, + "grad_norm": 0.37259528040885925, + "learning_rate": 2.7848101265822785e-06, + "logits/chosen": 3.5282909870147705, + "logits/rejected": 3.5282909870147705, + "logps/chosen": -167.3544921875, + "logps/rejected": -167.3544921875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.159934997558594, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.159934997558594, + "step": 3379 + }, + { + "epoch": 2.3325858202518543, + "grad_norm": 0.33178070187568665, + "learning_rate": 2.7819332566168013e-06, + "logits/chosen": 3.5479273796081543, + "logits/rejected": 3.63881254196167, + "logps/chosen": -167.22265625, + "logps/rejected": -178.64602661132812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.869312286376953, + "rewards/margins": 1.1789953708648682, + "rewards/rejected": -13.048307418823242, + "step": 3380 + }, + { + "epoch": 2.3332758323270655, + "grad_norm": 0.31792977452278137, + "learning_rate": 2.7790563866513236e-06, + "logits/chosen": 3.1072869300842285, + "logits/rejected": 3.1072869300842285, + "logps/chosen": -157.6538848876953, + "logps/rejected": -157.6538848876953, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.981563568115234, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -10.981563568115234, + "step": 3381 + }, + { + "epoch": 2.333965844402277, + "grad_norm": 0.3372638523578644, + "learning_rate": 2.7761795166858464e-06, + "logits/chosen": 3.114105701446533, + "logits/rejected": 3.156452178955078, + "logps/chosen": -161.75917053222656, + "logps/rejected": -169.7040252685547, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.410516738891602, + "rewards/margins": 0.7926538586616516, + "rewards/rejected": -12.203170776367188, + "step": 3382 + }, + { + "epoch": 2.334655856477488, + "grad_norm": 0.30080246925354004, + "learning_rate": 2.7733026467203683e-06, + "logits/chosen": 3.45251202583313, + "logits/rejected": 3.45251202583313, + "logps/chosen": -173.82461547851562, + "logps/rejected": -173.82461547851562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.830670356750488, + "rewards/margins": 0.0, + "rewards/rejected": -12.830670356750488, + "step": 3383 + }, + { + "epoch": 2.3353458685526998, + "grad_norm": 0.2681029736995697, + "learning_rate": 2.7704257767548907e-06, + "logits/chosen": 3.3788352012634277, + "logits/rejected": 3.4663608074188232, + "logps/chosen": -188.33740234375, + "logps/rejected": -198.28195190429688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.991125106811523, + "rewards/margins": 1.0185174942016602, + "rewards/rejected": -15.009641647338867, + "step": 3384 + }, + { + "epoch": 2.336035880627911, + "grad_norm": 0.2527998983860016, + "learning_rate": 2.7675489067894135e-06, + "logits/chosen": 3.5384066104888916, + "logits/rejected": 3.5311272144317627, + "logps/chosen": -175.78138732910156, + "logps/rejected": -183.15377807617188, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.922684669494629, + "rewards/margins": 0.7896512746810913, + "rewards/rejected": -13.712335586547852, + "step": 3385 + }, + { + "epoch": 2.3367258927031225, + "grad_norm": 0.3031516671180725, + "learning_rate": 2.7646720368239362e-06, + "logits/chosen": 3.23149037361145, + "logits/rejected": 3.4655909538269043, + "logps/chosen": -149.68539428710938, + "logps/rejected": -183.3930206298828, + "loss": 0.4334, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.11643123626709, + "rewards/margins": 3.3330109119415283, + "rewards/rejected": -13.449441909790039, + "step": 3386 + }, + { + "epoch": 2.3374159047783336, + "grad_norm": 0.3408765196800232, + "learning_rate": 2.761795166858458e-06, + "logits/chosen": 2.999620199203491, + "logits/rejected": 2.999620199203491, + "logps/chosen": -182.55516052246094, + "logps/rejected": -182.55516052246094, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.526556015014648, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.526556968688965, + "step": 3387 + }, + { + "epoch": 2.3381059168535447, + "grad_norm": 0.3124777674674988, + "learning_rate": 2.7589182968929805e-06, + "logits/chosen": 3.2644853591918945, + "logits/rejected": 3.415894031524658, + "logps/chosen": -172.65267944335938, + "logps/rejected": -182.7381591796875, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.342954635620117, + "rewards/margins": 0.934958815574646, + "rewards/rejected": -13.277914047241211, + "step": 3388 + }, + { + "epoch": 2.3387959289287563, + "grad_norm": 0.4105788469314575, + "learning_rate": 2.7560414269275033e-06, + "logits/chosen": 3.3900599479675293, + "logits/rejected": 3.3900599479675293, + "logps/chosen": -178.1596221923828, + "logps/rejected": -178.1596221923828, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.02652359008789, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.026524543762207, + "step": 3389 + }, + { + "epoch": 2.3394859410039675, + "grad_norm": 0.36047518253326416, + "learning_rate": 2.7531645569620256e-06, + "logits/chosen": 3.4731087684631348, + "logits/rejected": 3.5309011936187744, + "logps/chosen": -154.61419677734375, + "logps/rejected": -161.8091583251953, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.707584381103516, + "rewards/margins": 0.7214944958686829, + "rewards/rejected": -11.429079055786133, + "step": 3390 + }, + { + "epoch": 2.340175953079179, + "grad_norm": 0.48374709486961365, + "learning_rate": 2.750287686996548e-06, + "logits/chosen": 3.4836065769195557, + "logits/rejected": 3.6860344409942627, + "logps/chosen": -166.92303466796875, + "logps/rejected": -172.29803466796875, + "loss": 0.608, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.980405807495117, + "rewards/margins": 0.5545310974121094, + "rewards/rejected": -12.534936904907227, + "step": 3391 + }, + { + "epoch": 2.34086596515439, + "grad_norm": 0.35737326741218567, + "learning_rate": 2.7474108170310703e-06, + "logits/chosen": 3.6214840412139893, + "logits/rejected": 3.6214840412139893, + "logps/chosen": -161.30889892578125, + "logps/rejected": -161.30889892578125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.246918678283691, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.246917724609375, + "step": 3392 + }, + { + "epoch": 2.3415559772296017, + "grad_norm": 0.34841495752334595, + "learning_rate": 2.744533947065593e-06, + "logits/chosen": 3.4820642471313477, + "logits/rejected": 3.6192312240600586, + "logps/chosen": -144.59291076660156, + "logps/rejected": -156.3488311767578, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.817336082458496, + "rewards/margins": 1.185526967048645, + "rewards/rejected": -11.002862930297852, + "step": 3393 + }, + { + "epoch": 2.342245989304813, + "grad_norm": 0.2990693151950836, + "learning_rate": 2.7416570771001155e-06, + "logits/chosen": 3.354419469833374, + "logits/rejected": 3.3735523223876953, + "logps/chosen": -144.70126342773438, + "logps/rejected": -163.88287353515625, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.81195068359375, + "rewards/margins": 1.848012089729309, + "rewards/rejected": -11.659963607788086, + "step": 3394 + }, + { + "epoch": 2.342936001380024, + "grad_norm": 0.28304004669189453, + "learning_rate": 2.7387802071346374e-06, + "logits/chosen": 3.531515121459961, + "logits/rejected": 3.583038330078125, + "logps/chosen": -177.92156982421875, + "logps/rejected": -191.9962158203125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.09188175201416, + "rewards/margins": 1.4430145025253296, + "rewards/rejected": -14.534896850585938, + "step": 3395 + }, + { + "epoch": 2.3436260134552356, + "grad_norm": 0.47035840153694153, + "learning_rate": 2.73590333716916e-06, + "logits/chosen": 3.237694263458252, + "logits/rejected": 3.237694263458252, + "logps/chosen": -169.64242553710938, + "logps/rejected": -169.64242553710938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.247797012329102, + "rewards/margins": 0.0, + "rewards/rejected": -12.247797012329102, + "step": 3396 + }, + { + "epoch": 2.3443160255304467, + "grad_norm": 0.8166826963424683, + "learning_rate": 2.7330264672036825e-06, + "logits/chosen": 3.3973467350006104, + "logits/rejected": 3.53055739402771, + "logps/chosen": -150.05784606933594, + "logps/rejected": -188.55816650390625, + "loss": 0.4359, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.397079467773438, + "rewards/margins": 3.769608974456787, + "rewards/rejected": -14.166688919067383, + "step": 3397 + }, + { + "epoch": 2.3450060376056583, + "grad_norm": 0.35919997096061707, + "learning_rate": 2.7301495972382053e-06, + "logits/chosen": 2.8777852058410645, + "logits/rejected": 2.9410271644592285, + "logps/chosen": -161.91722106933594, + "logps/rejected": -169.15158081054688, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.442534446716309, + "rewards/margins": 0.7384503483772278, + "rewards/rejected": -12.180984497070312, + "step": 3398 + }, + { + "epoch": 2.3456960496808694, + "grad_norm": 0.3546614646911621, + "learning_rate": 2.7272727272727272e-06, + "logits/chosen": 3.5761165618896484, + "logits/rejected": 3.6188693046569824, + "logps/chosen": -158.18594360351562, + "logps/rejected": -168.42138671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.116697311401367, + "rewards/margins": 0.9999579787254333, + "rewards/rejected": -12.116655349731445, + "step": 3399 + }, + { + "epoch": 2.3463860617560806, + "grad_norm": 0.5750224590301514, + "learning_rate": 2.72439585730725e-06, + "logits/chosen": 2.586477756500244, + "logits/rejected": 2.586477756500244, + "logps/chosen": -147.31065368652344, + "logps/rejected": -147.31065368652344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -9.891740798950195, + "rewards/margins": 0.0, + "rewards/rejected": -9.891740798950195, + "step": 3400 + }, + { + "epoch": 2.347076073831292, + "grad_norm": 0.48305776715278625, + "learning_rate": 2.7215189873417724e-06, + "logits/chosen": 3.564913272857666, + "logits/rejected": 3.564913272857666, + "logps/chosen": -154.29444885253906, + "logps/rejected": -154.29444885253906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.837844848632812, + "rewards/margins": 0.0, + "rewards/rejected": -10.837844848632812, + "step": 3401 + }, + { + "epoch": 2.3477660859065033, + "grad_norm": 0.3297654986381531, + "learning_rate": 2.718642117376295e-06, + "logits/chosen": 2.9303793907165527, + "logits/rejected": 2.9303793907165527, + "logps/chosen": -185.39456176757812, + "logps/rejected": -185.39456176757812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.74839973449707, + "rewards/margins": 0.0, + "rewards/rejected": -13.74839973449707, + "step": 3402 + }, + { + "epoch": 2.348456097981715, + "grad_norm": 0.4550008773803711, + "learning_rate": 2.715765247410817e-06, + "logits/chosen": 3.522639274597168, + "logits/rejected": 3.522639274597168, + "logps/chosen": -176.15463256835938, + "logps/rejected": -176.15463256835938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.792574882507324, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.792574882507324, + "step": 3403 + }, + { + "epoch": 2.349146110056926, + "grad_norm": 0.30351391434669495, + "learning_rate": 2.7128883774453394e-06, + "logits/chosen": 3.1236259937286377, + "logits/rejected": 3.2150092124938965, + "logps/chosen": -151.7105712890625, + "logps/rejected": -172.70382690429688, + "loss": 0.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.382275581359863, + "rewards/margins": 2.151517152786255, + "rewards/rejected": -12.533792495727539, + "step": 3404 + }, + { + "epoch": 2.349836122132137, + "grad_norm": 0.4272667467594147, + "learning_rate": 2.710011507479862e-06, + "logits/chosen": 3.245107650756836, + "logits/rejected": 3.4000425338745117, + "logps/chosen": -151.30975341796875, + "logps/rejected": -159.70050048828125, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.258445739746094, + "rewards/margins": 0.8552520275115967, + "rewards/rejected": -11.113697052001953, + "step": 3405 + }, + { + "epoch": 2.3505261342073487, + "grad_norm": 1.452956199645996, + "learning_rate": 2.707134637514385e-06, + "logits/chosen": 3.3955910205841064, + "logits/rejected": 3.418381452560425, + "logps/chosen": -167.29925537109375, + "logps/rejected": -171.26119995117188, + "loss": 0.6122, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.745481491088867, + "rewards/margins": 0.38223642110824585, + "rewards/rejected": -12.127717971801758, + "step": 3406 + }, + { + "epoch": 2.35121614628256, + "grad_norm": 0.25718697905540466, + "learning_rate": 2.704257767548907e-06, + "logits/chosen": 3.625370979309082, + "logits/rejected": 3.587604284286499, + "logps/chosen": -143.72682189941406, + "logps/rejected": -170.98228454589844, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.656839370727539, + "rewards/margins": 2.695749282836914, + "rewards/rejected": -12.352588653564453, + "step": 3407 + }, + { + "epoch": 2.3519061583577714, + "grad_norm": 0.3959476947784424, + "learning_rate": 2.7013808975834292e-06, + "logits/chosen": 3.150953769683838, + "logits/rejected": 3.5024526119232178, + "logps/chosen": -146.41796875, + "logps/rejected": -170.1367645263672, + "loss": 0.4337, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.855786323547363, + "rewards/margins": 2.523498058319092, + "rewards/rejected": -12.37928295135498, + "step": 3408 + }, + { + "epoch": 2.3525961704329825, + "grad_norm": 0.4624360203742981, + "learning_rate": 2.698504027617952e-06, + "logits/chosen": 3.523617744445801, + "logits/rejected": 3.523617744445801, + "logps/chosen": -159.95858764648438, + "logps/rejected": -159.95858764648438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.451444625854492, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -11.451444625854492, + "step": 3409 + }, + { + "epoch": 2.353286182508194, + "grad_norm": 0.5180379152297974, + "learning_rate": 2.6956271576524744e-06, + "logits/chosen": 3.3510682582855225, + "logits/rejected": 3.2943122386932373, + "logps/chosen": -157.93798828125, + "logps/rejected": -163.3076171875, + "loss": 0.6096, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.97062873840332, + "rewards/margins": 0.4618229269981384, + "rewards/rejected": -11.432451248168945, + "step": 3410 + }, + { + "epoch": 2.3539761945834052, + "grad_norm": 0.29017966985702515, + "learning_rate": 2.6927502876869967e-06, + "logits/chosen": 3.5381460189819336, + "logits/rejected": 3.6585757732391357, + "logps/chosen": -182.57162475585938, + "logps/rejected": -195.04061889648438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.38310718536377, + "rewards/margins": 1.2259844541549683, + "rewards/rejected": -14.609091758728027, + "step": 3411 + }, + { + "epoch": 2.3546662066586164, + "grad_norm": 0.31172889471054077, + "learning_rate": 2.689873417721519e-06, + "logits/chosen": 3.312721014022827, + "logits/rejected": 3.445580244064331, + "logps/chosen": -139.76406860351562, + "logps/rejected": -153.6584930419922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.209405899047852, + "rewards/margins": 1.3552687168121338, + "rewards/rejected": -10.564674377441406, + "step": 3412 + }, + { + "epoch": 2.355356218733828, + "grad_norm": 26.072250366210938, + "learning_rate": 2.686996547756042e-06, + "logits/chosen": 3.1127920150756836, + "logits/rejected": 3.0563249588012695, + "logps/chosen": -156.93365478515625, + "logps/rejected": -165.69129943847656, + "loss": 0.6535, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.820417404174805, + "rewards/margins": 0.8440979719161987, + "rewards/rejected": -11.664515495300293, + "step": 3413 + }, + { + "epoch": 2.356046230809039, + "grad_norm": 0.35559573769569397, + "learning_rate": 2.684119677790564e-06, + "logits/chosen": 3.2551541328430176, + "logits/rejected": 3.283677101135254, + "logps/chosen": -157.22482299804688, + "logps/rejected": -168.52731323242188, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.900609970092773, + "rewards/margins": 1.1214594841003418, + "rewards/rejected": -12.022069931030273, + "step": 3414 + }, + { + "epoch": 2.3567362428842507, + "grad_norm": 0.3875845968723297, + "learning_rate": 2.681242807825086e-06, + "logits/chosen": 3.4997522830963135, + "logits/rejected": 3.6905517578125, + "logps/chosen": -157.95806884765625, + "logps/rejected": -181.33758544921875, + "loss": 0.52, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.999067306518555, + "rewards/margins": 2.3053698539733887, + "rewards/rejected": -13.304437637329102, + "step": 3415 + }, + { + "epoch": 2.357426254959462, + "grad_norm": 0.37388792634010315, + "learning_rate": 2.678365937859609e-06, + "logits/chosen": 3.17732572555542, + "logits/rejected": 3.17732572555542, + "logps/chosen": -174.00057983398438, + "logps/rejected": -174.00059509277344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.631624221801758, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.631624221801758, + "step": 3416 + }, + { + "epoch": 2.358116267034673, + "grad_norm": 0.3211597502231598, + "learning_rate": 2.6754890678941312e-06, + "logits/chosen": 3.667402982711792, + "logits/rejected": 3.574854850769043, + "logps/chosen": -158.00823974609375, + "logps/rejected": -177.163330078125, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.866458892822266, + "rewards/margins": 2.0484533309936523, + "rewards/rejected": -12.914913177490234, + "step": 3417 + }, + { + "epoch": 2.3588062791098845, + "grad_norm": 0.4484611451625824, + "learning_rate": 2.672612197928654e-06, + "logits/chosen": 3.081895589828491, + "logits/rejected": 3.1344449520111084, + "logps/chosen": -144.53768920898438, + "logps/rejected": -157.89059448242188, + "loss": 0.5212, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.724991798400879, + "rewards/margins": 1.39360773563385, + "rewards/rejected": -11.118599891662598, + "step": 3418 + }, + { + "epoch": 2.3594962911850956, + "grad_norm": 0.35218098759651184, + "learning_rate": 2.669735327963176e-06, + "logits/chosen": 3.2915940284729004, + "logits/rejected": 3.3439345359802246, + "logps/chosen": -178.09149169921875, + "logps/rejected": -184.7269287109375, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.853265762329102, + "rewards/margins": 0.6669544577598572, + "rewards/rejected": -13.520220756530762, + "step": 3419 + }, + { + "epoch": 2.360186303260307, + "grad_norm": 0.387844055891037, + "learning_rate": 2.6668584579976987e-06, + "logits/chosen": 3.281162738800049, + "logits/rejected": 3.559096097946167, + "logps/chosen": -149.34271240234375, + "logps/rejected": -170.75497436523438, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.01778507232666, + "rewards/margins": 2.1077475547790527, + "rewards/rejected": -12.125532150268555, + "step": 3420 + }, + { + "epoch": 2.3608763153355183, + "grad_norm": 0.42413532733917236, + "learning_rate": 2.663981588032221e-06, + "logits/chosen": 3.06510329246521, + "logits/rejected": 3.0878162384033203, + "logps/chosen": -139.3246307373047, + "logps/rejected": -159.7057647705078, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.27216625213623, + "rewards/margins": 2.0122244358062744, + "rewards/rejected": -11.284390449523926, + "step": 3421 + }, + { + "epoch": 2.3615663274107295, + "grad_norm": 0.32951945066452026, + "learning_rate": 2.661104718066744e-06, + "logits/chosen": 3.5163445472717285, + "logits/rejected": 3.5163445472717285, + "logps/chosen": -157.61416625976562, + "logps/rejected": -157.61416625976562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.17206859588623, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.17206859588623, + "step": 3422 + }, + { + "epoch": 2.362256339485941, + "grad_norm": 0.39751750230789185, + "learning_rate": 2.6582278481012658e-06, + "logits/chosen": 3.602297306060791, + "logits/rejected": 3.602297306060791, + "logps/chosen": -170.9964599609375, + "logps/rejected": -170.9964599609375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.38095474243164, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -12.38095474243164, + "step": 3423 + }, + { + "epoch": 2.362946351561152, + "grad_norm": 0.48238763213157654, + "learning_rate": 2.6553509781357886e-06, + "logits/chosen": 3.2914249897003174, + "logits/rejected": 3.2914249897003174, + "logps/chosen": -171.85861206054688, + "logps/rejected": -171.85861206054688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.388021469116211, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.388021469116211, + "step": 3424 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.2999078929424286, + "learning_rate": 2.652474108170311e-06, + "logits/chosen": 3.3694567680358887, + "logits/rejected": 3.605449676513672, + "logps/chosen": -150.22842407226562, + "logps/rejected": -174.76858520507812, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.080831527709961, + "rewards/margins": 2.3954062461853027, + "rewards/rejected": -12.476238250732422, + "step": 3425 + }, + { + "epoch": 2.364326375711575, + "grad_norm": 1.0175862312316895, + "learning_rate": 2.6495972382048337e-06, + "logits/chosen": 3.241987466812134, + "logits/rejected": 3.2636146545410156, + "logps/chosen": -162.81106567382812, + "logps/rejected": -166.20394897460938, + "loss": 0.6133, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.698270797729492, + "rewards/margins": 0.36096590757369995, + "rewards/rejected": -12.059236526489258, + "step": 3426 + }, + { + "epoch": 2.3650163877867865, + "grad_norm": 0.31175288558006287, + "learning_rate": 2.646720368239356e-06, + "logits/chosen": 3.505798816680908, + "logits/rejected": 3.6268627643585205, + "logps/chosen": -150.01641845703125, + "logps/rejected": -174.58642578125, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.142814636230469, + "rewards/margins": 2.474623680114746, + "rewards/rejected": -12.617439270019531, + "step": 3427 + }, + { + "epoch": 2.3657063998619976, + "grad_norm": 0.3173881471157074, + "learning_rate": 2.643843498273878e-06, + "logits/chosen": 3.0352933406829834, + "logits/rejected": 3.00610613822937, + "logps/chosen": -157.4697265625, + "logps/rejected": -169.68206787109375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.978570938110352, + "rewards/margins": 1.1508432626724243, + "rewards/rejected": -12.129413604736328, + "step": 3428 + }, + { + "epoch": 2.3663964119372087, + "grad_norm": 3.812925338745117, + "learning_rate": 2.6409666283084007e-06, + "logits/chosen": 3.206913948059082, + "logits/rejected": 3.281477689743042, + "logps/chosen": -162.1302947998047, + "logps/rejected": -171.133544921875, + "loss": 0.5529, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.402321815490723, + "rewards/margins": 0.9490188956260681, + "rewards/rejected": -12.351339340209961, + "step": 3429 + }, + { + "epoch": 2.3670864240124203, + "grad_norm": 0.3320915997028351, + "learning_rate": 2.638089758342923e-06, + "logits/chosen": 3.5515952110290527, + "logits/rejected": 3.5515952110290527, + "logps/chosen": -169.87330627441406, + "logps/rejected": -169.87330627441406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.162829399108887, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.162829399108887, + "step": 3430 + }, + { + "epoch": 2.3677764360876314, + "grad_norm": 0.4355034828186035, + "learning_rate": 2.635212888377446e-06, + "logits/chosen": 3.291311264038086, + "logits/rejected": 3.291311264038086, + "logps/chosen": -189.97573852539062, + "logps/rejected": -189.97573852539062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.290557861328125, + "rewards/margins": 0.0, + "rewards/rejected": -14.290557861328125, + "step": 3431 + }, + { + "epoch": 2.368466448162843, + "grad_norm": 0.34889793395996094, + "learning_rate": 2.632336018411968e-06, + "logits/chosen": 3.548027992248535, + "logits/rejected": 3.702481508255005, + "logps/chosen": -180.17831420898438, + "logps/rejected": -195.7921600341797, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.219803810119629, + "rewards/margins": 1.5729784965515137, + "rewards/rejected": -14.7927827835083, + "step": 3432 + }, + { + "epoch": 2.369156460238054, + "grad_norm": 0.4716157913208008, + "learning_rate": 2.6294591484464906e-06, + "logits/chosen": 3.499983549118042, + "logits/rejected": 3.8091554641723633, + "logps/chosen": -160.44845581054688, + "logps/rejected": -175.1346435546875, + "loss": 0.5227, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.268671035766602, + "rewards/margins": 1.4407387971878052, + "rewards/rejected": -12.709409713745117, + "step": 3433 + }, + { + "epoch": 2.3698464723132657, + "grad_norm": 0.45979395508766174, + "learning_rate": 2.626582278481013e-06, + "logits/chosen": 3.0309557914733887, + "logits/rejected": 3.097755193710327, + "logps/chosen": -126.91552734375, + "logps/rejected": -160.19467163085938, + "loss": 0.4344, + "rewards/accuracies": 0.375, + "rewards/chosen": -7.813767433166504, + "rewards/margins": 3.3640027046203613, + "rewards/rejected": -11.17776870727539, + "step": 3434 + }, + { + "epoch": 2.370536484388477, + "grad_norm": 0.3554493188858032, + "learning_rate": 2.6237054085155357e-06, + "logits/chosen": 3.7595508098602295, + "logits/rejected": 3.7258265018463135, + "logps/chosen": -176.41778564453125, + "logps/rejected": -190.070556640625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.022729873657227, + "rewards/margins": 1.321503758430481, + "rewards/rejected": -14.344234466552734, + "step": 3435 + }, + { + "epoch": 2.371226496463688, + "grad_norm": 0.8316214084625244, + "learning_rate": 2.6208285385500576e-06, + "logits/chosen": 3.4259543418884277, + "logits/rejected": 3.569932460784912, + "logps/chosen": -145.1545867919922, + "logps/rejected": -162.88766479492188, + "loss": 0.5248, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.792831420898438, + "rewards/margins": 1.7431297302246094, + "rewards/rejected": -11.535961151123047, + "step": 3436 + }, + { + "epoch": 2.3719165085388996, + "grad_norm": 0.3489980697631836, + "learning_rate": 2.61795166858458e-06, + "logits/chosen": 3.8059604167938232, + "logits/rejected": 3.8059604167938232, + "logps/chosen": -179.39186096191406, + "logps/rejected": -179.39186096191406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.992786407470703, + "rewards/margins": 0.0, + "rewards/rejected": -12.992786407470703, + "step": 3437 + }, + { + "epoch": 2.3726065206141107, + "grad_norm": 0.4958920478820801, + "learning_rate": 2.6150747986191028e-06, + "logits/chosen": 3.174060344696045, + "logits/rejected": 3.44014310836792, + "logps/chosen": -150.20167541503906, + "logps/rejected": -161.89630126953125, + "loss": 0.5232, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.250189781188965, + "rewards/margins": 1.1902040243148804, + "rewards/rejected": -11.44039535522461, + "step": 3438 + }, + { + "epoch": 2.373296532689322, + "grad_norm": 6.409010410308838, + "learning_rate": 2.612197928653625e-06, + "logits/chosen": 3.7136383056640625, + "logits/rejected": 3.965928554534912, + "logps/chosen": -161.7841033935547, + "logps/rejected": -183.9445037841797, + "loss": 0.4574, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.436681747436523, + "rewards/margins": 2.2961134910583496, + "rewards/rejected": -13.732794761657715, + "step": 3439 + }, + { + "epoch": 2.3739865447645334, + "grad_norm": 0.2798191010951996, + "learning_rate": 2.6093210586881475e-06, + "logits/chosen": 3.7121081352233887, + "logits/rejected": 3.804361343383789, + "logps/chosen": -155.3040771484375, + "logps/rejected": -187.49044799804688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.977466583251953, + "rewards/margins": 2.96651554107666, + "rewards/rejected": -13.943982124328613, + "step": 3440 + }, + { + "epoch": 2.3746765568397445, + "grad_norm": 0.29959243535995483, + "learning_rate": 2.60644418872267e-06, + "logits/chosen": 3.7793354988098145, + "logits/rejected": 3.7793354988098145, + "logps/chosen": -196.02511596679688, + "logps/rejected": -196.02511596679688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.72633171081543, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.726332664489746, + "step": 3441 + }, + { + "epoch": 2.375366568914956, + "grad_norm": 0.27638232707977295, + "learning_rate": 2.6035673187571926e-06, + "logits/chosen": 2.9145820140838623, + "logits/rejected": 3.2199461460113525, + "logps/chosen": -133.400634765625, + "logps/rejected": -172.29345703125, + "loss": 0.4332, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.670937538146973, + "rewards/margins": 3.8821403980255127, + "rewards/rejected": -12.553077697753906, + "step": 3442 + }, + { + "epoch": 2.3760565809901673, + "grad_norm": 0.3409247398376465, + "learning_rate": 2.600690448791715e-06, + "logits/chosen": 3.322794198989868, + "logits/rejected": 3.3250765800476074, + "logps/chosen": -168.1739044189453, + "logps/rejected": -176.90029907226562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.845586776733398, + "rewards/margins": 0.9028546214103699, + "rewards/rejected": -12.748441696166992, + "step": 3443 + }, + { + "epoch": 2.376746593065379, + "grad_norm": 0.39555370807647705, + "learning_rate": 2.5978135788262373e-06, + "logits/chosen": 3.2840960025787354, + "logits/rejected": 3.2840960025787354, + "logps/chosen": -161.49514770507812, + "logps/rejected": -161.49514770507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.234718322753906, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.234718322753906, + "step": 3444 + }, + { + "epoch": 2.37743660514059, + "grad_norm": 5.397103786468506, + "learning_rate": 2.5949367088607596e-06, + "logits/chosen": 2.9688637256622314, + "logits/rejected": 2.86403751373291, + "logps/chosen": -137.1479034423828, + "logps/rejected": -142.98231506347656, + "loss": 0.5801, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.043684959411621, + "rewards/margins": 0.5525926947593689, + "rewards/rejected": -9.596278190612793, + "step": 3445 + }, + { + "epoch": 2.378126617215801, + "grad_norm": 0.8451628088951111, + "learning_rate": 2.5920598388952824e-06, + "logits/chosen": 3.1360912322998047, + "logits/rejected": 3.3940443992614746, + "logps/chosen": -144.98414611816406, + "logps/rejected": -168.47116088867188, + "loss": 0.4446, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.748123168945312, + "rewards/margins": 2.274308681488037, + "rewards/rejected": -12.022432327270508, + "step": 3446 + }, + { + "epoch": 2.3788166292910127, + "grad_norm": 1.9467052221298218, + "learning_rate": 2.5891829689298048e-06, + "logits/chosen": 3.240360736846924, + "logits/rejected": 3.4825596809387207, + "logps/chosen": -152.0430145263672, + "logps/rejected": -188.1483154296875, + "loss": 0.3618, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.349569320678711, + "rewards/margins": 3.583534002304077, + "rewards/rejected": -13.93310260772705, + "step": 3447 + }, + { + "epoch": 2.379506641366224, + "grad_norm": 0.980597972869873, + "learning_rate": 2.5863060989643267e-06, + "logits/chosen": 3.398064374923706, + "logits/rejected": 3.7822513580322266, + "logps/chosen": -149.35830688476562, + "logps/rejected": -161.26132202148438, + "loss": 0.453, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.932437896728516, + "rewards/margins": 1.3648759126663208, + "rewards/rejected": -11.29731273651123, + "step": 3448 + }, + { + "epoch": 2.3801966534414354, + "grad_norm": 3.74599552154541, + "learning_rate": 2.5834292289988495e-06, + "logits/chosen": 3.3542778491973877, + "logits/rejected": 3.524473190307617, + "logps/chosen": -140.67303466796875, + "logps/rejected": -168.42025756835938, + "loss": 0.4542, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.45876407623291, + "rewards/margins": 2.558509588241577, + "rewards/rejected": -12.017273902893066, + "step": 3449 + }, + { + "epoch": 2.3808866655166465, + "grad_norm": 0.3735192120075226, + "learning_rate": 2.580552359033372e-06, + "logits/chosen": 3.1117491722106934, + "logits/rejected": 3.1807308197021484, + "logps/chosen": -145.9117889404297, + "logps/rejected": -158.87225341796875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.998717308044434, + "rewards/margins": 1.3040238618850708, + "rewards/rejected": -11.302741050720215, + "step": 3450 + }, + { + "epoch": 2.381576677591858, + "grad_norm": 0.35038402676582336, + "learning_rate": 2.5776754890678946e-06, + "logits/chosen": 3.5959889888763428, + "logits/rejected": 3.5959889888763428, + "logps/chosen": -183.9690399169922, + "logps/rejected": -183.9690399169922, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.614042282104492, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.614044189453125, + "step": 3451 + }, + { + "epoch": 2.3822666896670692, + "grad_norm": 0.3734563887119293, + "learning_rate": 2.5747986191024165e-06, + "logits/chosen": 3.2266039848327637, + "logits/rejected": 3.2266039848327637, + "logps/chosen": -158.1711883544922, + "logps/rejected": -158.17120361328125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.958240509033203, + "rewards/margins": 4.172325134277344e-07, + "rewards/rejected": -10.958240509033203, + "step": 3452 + }, + { + "epoch": 2.3829567017422804, + "grad_norm": 0.34943631291389465, + "learning_rate": 2.5719217491369393e-06, + "logits/chosen": 3.2044601440429688, + "logits/rejected": 3.291300058364868, + "logps/chosen": -164.71099853515625, + "logps/rejected": -178.78897094726562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.737053871154785, + "rewards/margins": 1.3911131620407104, + "rewards/rejected": -13.128167152404785, + "step": 3453 + }, + { + "epoch": 2.383646713817492, + "grad_norm": 0.337065726518631, + "learning_rate": 2.5690448791714616e-06, + "logits/chosen": 3.612128734588623, + "logits/rejected": 3.7922868728637695, + "logps/chosen": -174.54400634765625, + "logps/rejected": -181.00401306152344, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.703879356384277, + "rewards/margins": 0.6399902105331421, + "rewards/rejected": -13.34386920928955, + "step": 3454 + }, + { + "epoch": 2.384336725892703, + "grad_norm": 0.4075106084346771, + "learning_rate": 2.5661680092059844e-06, + "logits/chosen": 3.262730598449707, + "logits/rejected": 3.6860439777374268, + "logps/chosen": -149.46658325195312, + "logps/rejected": -175.2186279296875, + "loss": 0.4341, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.126760482788086, + "rewards/margins": 2.5358190536499023, + "rewards/rejected": -12.662579536437988, + "step": 3455 + }, + { + "epoch": 2.385026737967914, + "grad_norm": 0.39599505066871643, + "learning_rate": 2.5632911392405064e-06, + "logits/chosen": 3.2840123176574707, + "logits/rejected": 3.2840123176574707, + "logps/chosen": -200.75253295898438, + "logps/rejected": -200.75253295898438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -15.187854766845703, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -15.187853813171387, + "step": 3456 + }, + { + "epoch": 2.3857167500431258, + "grad_norm": 0.3215695917606354, + "learning_rate": 2.5604142692750287e-06, + "logits/chosen": 3.03760027885437, + "logits/rejected": 3.2088232040405273, + "logps/chosen": -144.5392303466797, + "logps/rejected": -180.25640869140625, + "loss": 0.4334, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.697937965393066, + "rewards/margins": 3.50205135345459, + "rewards/rejected": -13.199989318847656, + "step": 3457 + }, + { + "epoch": 2.386406762118337, + "grad_norm": 0.3279735743999481, + "learning_rate": 2.5575373993095515e-06, + "logits/chosen": 3.4887478351593018, + "logits/rejected": 3.492147207260132, + "logps/chosen": -155.08148193359375, + "logps/rejected": -165.43923950195312, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.806031227111816, + "rewards/margins": 0.9753454327583313, + "rewards/rejected": -11.781376838684082, + "step": 3458 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 0.35780444741249084, + "learning_rate": 2.5546605293440743e-06, + "logits/chosen": 4.04493522644043, + "logits/rejected": 4.238604545593262, + "logps/chosen": -179.15948486328125, + "logps/rejected": -185.41189575195312, + "loss": 0.6074, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.271312713623047, + "rewards/margins": 0.6173855066299438, + "rewards/rejected": -13.888697624206543, + "step": 3459 + }, + { + "epoch": 2.3877867862687596, + "grad_norm": 0.37481793761253357, + "learning_rate": 2.551783659378596e-06, + "logits/chosen": 3.3524281978607178, + "logits/rejected": 3.498631477355957, + "logps/chosen": -178.3646240234375, + "logps/rejected": -186.71115112304688, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.821222305297852, + "rewards/margins": 0.8728940486907959, + "rewards/rejected": -13.694116592407227, + "step": 3460 + }, + { + "epoch": 2.388476798343971, + "grad_norm": 0.37786880135536194, + "learning_rate": 2.5489067894131185e-06, + "logits/chosen": 3.525824546813965, + "logits/rejected": 3.6630468368530273, + "logps/chosen": -166.72695922851562, + "logps/rejected": -186.55450439453125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.990758895874023, + "rewards/margins": 1.8020328283309937, + "rewards/rejected": -13.792791366577148, + "step": 3461 + }, + { + "epoch": 2.3891668104191823, + "grad_norm": 0.35164618492126465, + "learning_rate": 2.5460299194476413e-06, + "logits/chosen": 3.7154722213745117, + "logits/rejected": 3.7154722213745117, + "logps/chosen": -181.6257781982422, + "logps/rejected": -181.6257781982422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.376655578613281, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.376655578613281, + "step": 3462 + }, + { + "epoch": 2.3898568224943935, + "grad_norm": 20.618494033813477, + "learning_rate": 2.5431530494821637e-06, + "logits/chosen": 3.186154842376709, + "logits/rejected": 3.215031147003174, + "logps/chosen": -178.70559692382812, + "logps/rejected": -176.92071533203125, + "loss": 0.8447, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.210990905761719, + "rewards/margins": -0.21808111667633057, + "rewards/rejected": -12.992908477783203, + "step": 3463 + }, + { + "epoch": 2.390546834569605, + "grad_norm": 0.30931755900382996, + "learning_rate": 2.540276179516686e-06, + "logits/chosen": 3.699273109436035, + "logits/rejected": 3.699273109436035, + "logps/chosen": -190.89743041992188, + "logps/rejected": -190.8974151611328, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.154329299926758, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -14.154328346252441, + "step": 3464 + }, + { + "epoch": 2.391236846644816, + "grad_norm": 0.31484630703926086, + "learning_rate": 2.5373993095512084e-06, + "logits/chosen": 3.167994976043701, + "logits/rejected": 3.195740222930908, + "logps/chosen": -167.08409118652344, + "logps/rejected": -182.1692352294922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.849785804748535, + "rewards/margins": 1.5224305391311646, + "rewards/rejected": -13.37221622467041, + "step": 3465 + }, + { + "epoch": 2.3919268587200277, + "grad_norm": 0.3050552308559418, + "learning_rate": 2.534522439585731e-06, + "logits/chosen": 3.6666641235351562, + "logits/rejected": 3.6666641235351562, + "logps/chosen": -183.4903564453125, + "logps/rejected": -183.4903564453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.552406311035156, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.55240535736084, + "step": 3466 + }, + { + "epoch": 2.392616870795239, + "grad_norm": 0.3128288686275482, + "learning_rate": 2.5316455696202535e-06, + "logits/chosen": 3.472107410430908, + "logits/rejected": 3.585235595703125, + "logps/chosen": -181.0586700439453, + "logps/rejected": -187.117919921875, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.299287796020508, + "rewards/margins": 0.6407164335250854, + "rewards/rejected": -13.940004348754883, + "step": 3467 + }, + { + "epoch": 2.3933068828704505, + "grad_norm": 0.27909719944000244, + "learning_rate": 2.5287686996547754e-06, + "logits/chosen": 3.4737532138824463, + "logits/rejected": 3.513091564178467, + "logps/chosen": -193.02279663085938, + "logps/rejected": -206.0377655029297, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.812997817993164, + "rewards/margins": 1.2588229179382324, + "rewards/rejected": -16.071821212768555, + "step": 3468 + }, + { + "epoch": 2.3939968949456616, + "grad_norm": 0.3139829933643341, + "learning_rate": 2.525891829689298e-06, + "logits/chosen": 3.3992772102355957, + "logits/rejected": 3.5256240367889404, + "logps/chosen": -168.531005859375, + "logps/rejected": -186.56443786621094, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.121946334838867, + "rewards/margins": 1.7593023777008057, + "rewards/rejected": -13.881248474121094, + "step": 3469 + }, + { + "epoch": 2.3946869070208727, + "grad_norm": 0.5565690398216248, + "learning_rate": 2.5230149597238205e-06, + "logits/chosen": 3.0739145278930664, + "logits/rejected": 3.1667041778564453, + "logps/chosen": -169.29071044921875, + "logps/rejected": -176.55238342285156, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.351335525512695, + "rewards/margins": 0.6735967397689819, + "rewards/rejected": -13.024932861328125, + "step": 3470 + }, + { + "epoch": 2.3953769190960843, + "grad_norm": 0.3918910622596741, + "learning_rate": 2.5201380897583433e-06, + "logits/chosen": 3.6682705879211426, + "logits/rejected": 3.8388333320617676, + "logps/chosen": -178.24783325195312, + "logps/rejected": -184.4498291015625, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.090703964233398, + "rewards/margins": 0.5857880115509033, + "rewards/rejected": -13.676492691040039, + "step": 3471 + }, + { + "epoch": 2.3960669311712954, + "grad_norm": 0.3425779640674591, + "learning_rate": 2.5172612197928652e-06, + "logits/chosen": 3.476637840270996, + "logits/rejected": 3.476637840270996, + "logps/chosen": -181.66738891601562, + "logps/rejected": -181.66738891601562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.276228904724121, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.276228904724121, + "step": 3472 + }, + { + "epoch": 2.3967569432465066, + "grad_norm": 0.39553266763687134, + "learning_rate": 2.514384349827388e-06, + "logits/chosen": 3.6463632583618164, + "logits/rejected": 3.7623748779296875, + "logps/chosen": -174.83425903320312, + "logps/rejected": -182.2014923095703, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.728033065795898, + "rewards/margins": 0.7312367558479309, + "rewards/rejected": -13.459268569946289, + "step": 3473 + }, + { + "epoch": 2.397446955321718, + "grad_norm": 0.3611784875392914, + "learning_rate": 2.5115074798619104e-06, + "logits/chosen": 3.8318710327148438, + "logits/rejected": 3.871974468231201, + "logps/chosen": -164.26039123535156, + "logps/rejected": -174.20411682128906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.663049697875977, + "rewards/margins": 0.992278516292572, + "rewards/rejected": -12.655327796936035, + "step": 3474 + }, + { + "epoch": 2.3981369673969293, + "grad_norm": 17.175670623779297, + "learning_rate": 2.508630609896433e-06, + "logits/chosen": 3.425156593322754, + "logits/rejected": 3.3774585723876953, + "logps/chosen": -182.59848022460938, + "logps/rejected": -182.74050903320312, + "loss": 0.6588, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.459196090698242, + "rewards/margins": 0.08191323280334473, + "rewards/rejected": -13.541109085083008, + "step": 3475 + }, + { + "epoch": 2.398826979472141, + "grad_norm": 0.24447160959243774, + "learning_rate": 2.505753739930955e-06, + "logits/chosen": 3.922661781311035, + "logits/rejected": 4.075404644012451, + "logps/chosen": -174.73040771484375, + "logps/rejected": -196.544189453125, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.729909896850586, + "rewards/margins": 2.136172294616699, + "rewards/rejected": -14.866081237792969, + "step": 3476 + }, + { + "epoch": 2.399516991547352, + "grad_norm": 0.35575512051582336, + "learning_rate": 2.5028768699654774e-06, + "logits/chosen": 3.433960437774658, + "logits/rejected": 3.462414503097534, + "logps/chosen": -155.57440185546875, + "logps/rejected": -169.66534423828125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.693438529968262, + "rewards/margins": 1.476976752281189, + "rewards/rejected": -12.170415878295898, + "step": 3477 + }, + { + "epoch": 2.4002070036225636, + "grad_norm": 0.4101574122905731, + "learning_rate": 2.5e-06, + "logits/chosen": 3.9009575843811035, + "logits/rejected": 3.9009575843811035, + "logps/chosen": -191.45501708984375, + "logps/rejected": -191.45501708984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.254209518432617, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.254209518432617, + "step": 3478 + }, + { + "epoch": 2.4008970156977747, + "grad_norm": 0.2676275074481964, + "learning_rate": 2.497123130034523e-06, + "logits/chosen": 3.1938533782958984, + "logits/rejected": 3.362330675125122, + "logps/chosen": -126.89695739746094, + "logps/rejected": -160.53562927246094, + "loss": 0.4337, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.02182674407959, + "rewards/margins": 3.368911027908325, + "rewards/rejected": -11.390737533569336, + "step": 3479 + }, + { + "epoch": 2.401587027772986, + "grad_norm": 0.26817792654037476, + "learning_rate": 2.494246260069045e-06, + "logits/chosen": 3.3206534385681152, + "logits/rejected": 3.361280918121338, + "logps/chosen": -177.9087677001953, + "logps/rejected": -194.9931640625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.952972412109375, + "rewards/margins": 1.6892852783203125, + "rewards/rejected": -14.642257690429688, + "step": 3480 + }, + { + "epoch": 2.4022770398481974, + "grad_norm": 0.32280853390693665, + "learning_rate": 2.4913693901035677e-06, + "logits/chosen": 3.3805935382843018, + "logits/rejected": 3.3805935382843018, + "logps/chosen": -182.8860626220703, + "logps/rejected": -182.8860626220703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.33789348602295, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -13.33789348602295, + "step": 3481 + }, + { + "epoch": 2.4029670519234085, + "grad_norm": 0.2898353636264801, + "learning_rate": 2.48849252013809e-06, + "logits/chosen": 3.3306491374969482, + "logits/rejected": 3.333491086959839, + "logps/chosen": -172.5558624267578, + "logps/rejected": -185.6447296142578, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.418067932128906, + "rewards/margins": 1.3591856956481934, + "rewards/rejected": -13.777253150939941, + "step": 3482 + }, + { + "epoch": 2.40365706399862, + "grad_norm": 0.3903382420539856, + "learning_rate": 2.4856156501726124e-06, + "logits/chosen": 3.6162445545196533, + "logits/rejected": 3.6162445545196533, + "logps/chosen": -173.9271240234375, + "logps/rejected": -173.9271240234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.539976119995117, + "rewards/margins": 0.0, + "rewards/rejected": -12.539976119995117, + "step": 3483 + }, + { + "epoch": 2.4043470760738312, + "grad_norm": 0.514056921005249, + "learning_rate": 2.4827387802071347e-06, + "logits/chosen": 3.3281078338623047, + "logits/rejected": 3.4110097885131836, + "logps/chosen": -170.34918212890625, + "logps/rejected": -176.15057373046875, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.141098976135254, + "rewards/margins": 0.5704792737960815, + "rewards/rejected": -12.711578369140625, + "step": 3484 + }, + { + "epoch": 2.405037088149043, + "grad_norm": 0.29790326952934265, + "learning_rate": 2.4798619102416575e-06, + "logits/chosen": 3.0370030403137207, + "logits/rejected": 3.0629734992980957, + "logps/chosen": -167.7961883544922, + "logps/rejected": -181.49270629882812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.880827903747559, + "rewards/margins": 1.4077777862548828, + "rewards/rejected": -13.288605690002441, + "step": 3485 + }, + { + "epoch": 2.405727100224254, + "grad_norm": 0.5647591948509216, + "learning_rate": 2.47698504027618e-06, + "logits/chosen": 3.2259573936462402, + "logits/rejected": 3.3908777236938477, + "logps/chosen": -175.69410705566406, + "logps/rejected": -183.12579345703125, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.764289855957031, + "rewards/margins": 0.722567081451416, + "rewards/rejected": -13.486857414245605, + "step": 3486 + }, + { + "epoch": 2.406417112299465, + "grad_norm": 0.510454535484314, + "learning_rate": 2.4741081703107022e-06, + "logits/chosen": 2.955345869064331, + "logits/rejected": 3.337052345275879, + "logps/chosen": -151.28192138671875, + "logps/rejected": -177.43655395507812, + "loss": 0.4356, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.656986236572266, + "rewards/margins": 2.600947380065918, + "rewards/rejected": -13.257933616638184, + "step": 3487 + }, + { + "epoch": 2.4071071243746767, + "grad_norm": 0.2795129120349884, + "learning_rate": 2.4712313003452246e-06, + "logits/chosen": 3.530869245529175, + "logits/rejected": 3.5687685012817383, + "logps/chosen": -182.7520751953125, + "logps/rejected": -193.616943359375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.749268531799316, + "rewards/margins": 1.0201750993728638, + "rewards/rejected": -14.769444465637207, + "step": 3488 + }, + { + "epoch": 2.407797136449888, + "grad_norm": 0.41764023900032043, + "learning_rate": 2.4683544303797473e-06, + "logits/chosen": 3.3546977043151855, + "logits/rejected": 3.3546977043151855, + "logps/chosen": -166.34451293945312, + "logps/rejected": -166.34449768066406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.938810348510742, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -11.93880844116211, + "step": 3489 + }, + { + "epoch": 2.4084871485250994, + "grad_norm": 16.225460052490234, + "learning_rate": 2.4654775604142693e-06, + "logits/chosen": 3.473281145095825, + "logits/rejected": 3.863225221633911, + "logps/chosen": -144.49583435058594, + "logps/rejected": -188.8448486328125, + "loss": 0.3823, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.71959114074707, + "rewards/margins": 4.479593276977539, + "rewards/rejected": -14.19918441772461, + "step": 3490 + }, + { + "epoch": 2.4091771606003105, + "grad_norm": 0.4655269384384155, + "learning_rate": 2.462600690448792e-06, + "logits/chosen": 3.365135669708252, + "logits/rejected": 3.449381113052368, + "logps/chosen": -140.74246215820312, + "logps/rejected": -147.38525390625, + "loss": 0.6073, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.329282760620117, + "rewards/margins": 0.637953519821167, + "rewards/rejected": -9.967236518859863, + "step": 3491 + }, + { + "epoch": 2.4098671726755216, + "grad_norm": 0.27533236145973206, + "learning_rate": 2.4597238204833144e-06, + "logits/chosen": 3.2831242084503174, + "logits/rejected": 3.457145929336548, + "logps/chosen": -177.56533813476562, + "logps/rejected": -192.5234375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.784948348999023, + "rewards/margins": 1.5306055545806885, + "rewards/rejected": -14.315553665161133, + "step": 3492 + }, + { + "epoch": 2.410557184750733, + "grad_norm": 0.23040595650672913, + "learning_rate": 2.4568469505178367e-06, + "logits/chosen": 3.3855016231536865, + "logits/rejected": 3.487234592437744, + "logps/chosen": -160.78408813476562, + "logps/rejected": -191.32217407226562, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.345064163208008, + "rewards/margins": 3.0974860191345215, + "rewards/rejected": -14.442549705505371, + "step": 3493 + }, + { + "epoch": 2.4112471968259443, + "grad_norm": 0.27266427874565125, + "learning_rate": 2.453970080552359e-06, + "logits/chosen": 3.4396467208862305, + "logits/rejected": 3.6239185333251953, + "logps/chosen": -158.49362182617188, + "logps/rejected": -185.56900024414062, + "loss": 0.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.059008598327637, + "rewards/margins": 2.6674728393554688, + "rewards/rejected": -13.726481437683105, + "step": 3494 + }, + { + "epoch": 2.411937208901156, + "grad_norm": 0.33338066935539246, + "learning_rate": 2.451093210586882e-06, + "logits/chosen": 3.3655648231506348, + "logits/rejected": 3.471484422683716, + "logps/chosen": -153.20889282226562, + "logps/rejected": -168.44651794433594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.668048858642578, + "rewards/margins": 1.5443311929702759, + "rewards/rejected": -12.212381362915039, + "step": 3495 + }, + { + "epoch": 2.412627220976367, + "grad_norm": 0.3171137571334839, + "learning_rate": 2.4482163406214042e-06, + "logits/chosen": 3.3205819129943848, + "logits/rejected": 3.414052724838257, + "logps/chosen": -153.0616455078125, + "logps/rejected": -175.00106811523438, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.489018440246582, + "rewards/margins": 2.2401137351989746, + "rewards/rejected": -12.729131698608398, + "step": 3496 + }, + { + "epoch": 2.413317233051578, + "grad_norm": 0.32325831055641174, + "learning_rate": 2.4453394706559266e-06, + "logits/chosen": 3.6688685417175293, + "logits/rejected": 3.6688685417175293, + "logps/chosen": -184.31076049804688, + "logps/rejected": -184.31076049804688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.733062744140625, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.733062744140625, + "step": 3497 + }, + { + "epoch": 2.4140072451267898, + "grad_norm": 0.641406774520874, + "learning_rate": 2.442462600690449e-06, + "logits/chosen": 3.510511875152588, + "logits/rejected": 3.4269237518310547, + "logps/chosen": -163.31451416015625, + "logps/rejected": -167.68283081054688, + "loss": 0.6098, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.375725746154785, + "rewards/margins": 0.4517906904220581, + "rewards/rejected": -11.827516555786133, + "step": 3498 + }, + { + "epoch": 2.414697257202001, + "grad_norm": 0.37693145871162415, + "learning_rate": 2.4395857307249717e-06, + "logits/chosen": 3.6778855323791504, + "logits/rejected": 3.6778855323791504, + "logps/chosen": -179.17196655273438, + "logps/rejected": -179.17196655273438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.085208892822266, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.085208892822266, + "step": 3499 + }, + { + "epoch": 2.4153872692772125, + "grad_norm": 0.3417743444442749, + "learning_rate": 2.4367088607594936e-06, + "logits/chosen": 3.304642677307129, + "logits/rejected": 3.304642677307129, + "logps/chosen": -168.3299102783203, + "logps/rejected": -168.3299102783203, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.136505126953125, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -12.136505126953125, + "step": 3500 + }, + { + "epoch": 2.4160772813524236, + "grad_norm": 0.3488665223121643, + "learning_rate": 2.4338319907940164e-06, + "logits/chosen": 3.7655892372131348, + "logits/rejected": 3.7335517406463623, + "logps/chosen": -178.10513305664062, + "logps/rejected": -190.32176208496094, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.005655288696289, + "rewards/margins": 1.243390679359436, + "rewards/rejected": -14.249046325683594, + "step": 3501 + }, + { + "epoch": 2.416767293427635, + "grad_norm": 0.35524579882621765, + "learning_rate": 2.4309551208285388e-06, + "logits/chosen": 3.6924564838409424, + "logits/rejected": 3.6924564838409424, + "logps/chosen": -173.16563415527344, + "logps/rejected": -173.16563415527344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.585868835449219, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.585868835449219, + "step": 3502 + }, + { + "epoch": 2.4174573055028463, + "grad_norm": 0.3832785487174988, + "learning_rate": 2.428078250863061e-06, + "logits/chosen": 3.6550397872924805, + "logits/rejected": 3.6550397872924805, + "logps/chosen": -175.03707885742188, + "logps/rejected": -175.03707885742188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.68199348449707, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.68199348449707, + "step": 3503 + }, + { + "epoch": 2.4181473175780575, + "grad_norm": 0.5554924011230469, + "learning_rate": 2.4252013808975835e-06, + "logits/chosen": 3.5930938720703125, + "logits/rejected": 3.6207144260406494, + "logps/chosen": -171.04583740234375, + "logps/rejected": -181.95266723632812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.423545837402344, + "rewards/margins": 1.0608476400375366, + "rewards/rejected": -13.484393119812012, + "step": 3504 + }, + { + "epoch": 2.418837329653269, + "grad_norm": 0.4118953347206116, + "learning_rate": 2.4223245109321062e-06, + "logits/chosen": 3.588824987411499, + "logits/rejected": 3.588824987411499, + "logps/chosen": -168.99085998535156, + "logps/rejected": -168.99085998535156, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.048133850097656, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.048133850097656, + "step": 3505 + }, + { + "epoch": 2.41952734172848, + "grad_norm": 9.626919746398926, + "learning_rate": 2.4194476409666286e-06, + "logits/chosen": 3.549480438232422, + "logits/rejected": 3.542682647705078, + "logps/chosen": -175.1883087158203, + "logps/rejected": -175.3769073486328, + "loss": 0.6991, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.840864181518555, + "rewards/margins": -0.011566221714019775, + "rewards/rejected": -12.82929801940918, + "step": 3506 + }, + { + "epoch": 2.4202173538036917, + "grad_norm": 0.33976975083351135, + "learning_rate": 2.416570771001151e-06, + "logits/chosen": 2.909276008605957, + "logits/rejected": 3.335979461669922, + "logps/chosen": -147.19439697265625, + "logps/rejected": -177.38961791992188, + "loss": 0.4336, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.944311141967773, + "rewards/margins": 3.0811052322387695, + "rewards/rejected": -13.02541732788086, + "step": 3507 + }, + { + "epoch": 2.420907365878903, + "grad_norm": 0.32661667466163635, + "learning_rate": 2.4136939010356733e-06, + "logits/chosen": 3.530677080154419, + "logits/rejected": 3.530677080154419, + "logps/chosen": -183.33401489257812, + "logps/rejected": -183.33399963378906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.529559135437012, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -13.529559135437012, + "step": 3508 + }, + { + "epoch": 2.421597377954114, + "grad_norm": 0.33520954847335815, + "learning_rate": 2.410817031070196e-06, + "logits/chosen": 3.3767590522766113, + "logits/rejected": 3.5772652626037598, + "logps/chosen": -164.59078979492188, + "logps/rejected": -193.07472229003906, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.456474304199219, + "rewards/margins": 2.8579466342926025, + "rewards/rejected": -14.314420700073242, + "step": 3509 + }, + { + "epoch": 2.4222873900293256, + "grad_norm": 0.37409600615501404, + "learning_rate": 2.407940161104718e-06, + "logits/chosen": 3.2165002822875977, + "logits/rejected": 3.2385330200195312, + "logps/chosen": -168.7074737548828, + "logps/rejected": -178.00143432617188, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.008060455322266, + "rewards/margins": 1.006026029586792, + "rewards/rejected": -13.014086723327637, + "step": 3510 + }, + { + "epoch": 2.4229774021045367, + "grad_norm": 0.4946405291557312, + "learning_rate": 2.4050632911392408e-06, + "logits/chosen": 3.2669548988342285, + "logits/rejected": 3.2669548988342285, + "logps/chosen": -179.01564025878906, + "logps/rejected": -179.01564025878906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.137868881225586, + "rewards/margins": 0.0, + "rewards/rejected": -13.137868881225586, + "step": 3511 + }, + { + "epoch": 2.4236674141797483, + "grad_norm": 0.3893367648124695, + "learning_rate": 2.402186421173763e-06, + "logits/chosen": 3.2187604904174805, + "logits/rejected": 3.47878360748291, + "logps/chosen": -148.57815551757812, + "logps/rejected": -176.40414428710938, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.993535995483398, + "rewards/margins": 2.7851600646972656, + "rewards/rejected": -12.778696060180664, + "step": 3512 + }, + { + "epoch": 2.4243574262549594, + "grad_norm": 10.701079368591309, + "learning_rate": 2.3993095512082855e-06, + "logits/chosen": 3.552969217300415, + "logits/rejected": 3.7792959213256836, + "logps/chosen": -170.4774169921875, + "logps/rejected": -176.13958740234375, + "loss": 0.7427, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.185171127319336, + "rewards/margins": 0.4854241609573364, + "rewards/rejected": -12.6705961227417, + "step": 3513 + }, + { + "epoch": 2.4250474383301706, + "grad_norm": 0.33926016092300415, + "learning_rate": 2.396432681242808e-06, + "logits/chosen": 3.536677837371826, + "logits/rejected": 3.536677837371826, + "logps/chosen": -171.47958374023438, + "logps/rejected": -171.47958374023438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.435541152954102, + "rewards/margins": 9.5367431640625e-07, + "rewards/rejected": -12.435542106628418, + "step": 3514 + }, + { + "epoch": 2.425737450405382, + "grad_norm": 0.34992480278015137, + "learning_rate": 2.3935558112773306e-06, + "logits/chosen": 3.249828577041626, + "logits/rejected": 3.333460807800293, + "logps/chosen": -181.04196166992188, + "logps/rejected": -192.29974365234375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.179361343383789, + "rewards/margins": 1.1204230785369873, + "rewards/rejected": -14.299784660339355, + "step": 3515 + }, + { + "epoch": 2.4264274624805933, + "grad_norm": 0.3526644706726074, + "learning_rate": 2.390678941311853e-06, + "logits/chosen": 3.9572012424468994, + "logits/rejected": 3.9572012424468994, + "logps/chosen": -179.19134521484375, + "logps/rejected": -179.19134521484375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.101280212402344, + "rewards/margins": -5.960464477539062e-07, + "rewards/rejected": -13.101279258728027, + "step": 3516 + }, + { + "epoch": 2.427117474555805, + "grad_norm": 5.784792900085449, + "learning_rate": 2.3878020713463753e-06, + "logits/chosen": 3.475339651107788, + "logits/rejected": 3.7831289768218994, + "logps/chosen": -153.70053100585938, + "logps/rejected": -177.33828735351562, + "loss": 0.4547, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.567564964294434, + "rewards/margins": 2.2853212356567383, + "rewards/rejected": -12.852886199951172, + "step": 3517 + }, + { + "epoch": 2.427807486631016, + "grad_norm": 28.95530128479004, + "learning_rate": 2.3849252013808977e-06, + "logits/chosen": 3.377248764038086, + "logits/rejected": 3.400023937225342, + "logps/chosen": -173.771728515625, + "logps/rejected": -181.1243896484375, + "loss": 0.7827, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.43274974822998, + "rewards/margins": 0.7591186165809631, + "rewards/rejected": -13.191868782043457, + "step": 3518 + }, + { + "epoch": 2.4284974987062276, + "grad_norm": 0.33671805262565613, + "learning_rate": 2.3820483314154204e-06, + "logits/chosen": 3.6005897521972656, + "logits/rejected": 3.6005897521972656, + "logps/chosen": -182.28897094726562, + "logps/rejected": -182.28897094726562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.539701461791992, + "rewards/margins": 0.0, + "rewards/rejected": -13.539701461791992, + "step": 3519 + }, + { + "epoch": 2.4291875107814387, + "grad_norm": 0.3278137147426605, + "learning_rate": 2.3791714614499424e-06, + "logits/chosen": 3.7218518257141113, + "logits/rejected": 3.7218518257141113, + "logps/chosen": -175.5172882080078, + "logps/rejected": -175.5172882080078, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.708963394165039, + "rewards/margins": 4.172325134277344e-07, + "rewards/rejected": -12.708963394165039, + "step": 3520 + }, + { + "epoch": 2.42987752285665, + "grad_norm": 0.371146023273468, + "learning_rate": 2.376294591484465e-06, + "logits/chosen": 3.5640268325805664, + "logits/rejected": 3.6927871704101562, + "logps/chosen": -164.60256958007812, + "logps/rejected": -192.11764526367188, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.6785888671875, + "rewards/margins": 2.7632813453674316, + "rewards/rejected": -14.441869735717773, + "step": 3521 + }, + { + "epoch": 2.4305675349318614, + "grad_norm": 0.37224504351615906, + "learning_rate": 2.3734177215189875e-06, + "logits/chosen": 3.742619037628174, + "logits/rejected": 3.7327561378479004, + "logps/chosen": -164.10311889648438, + "logps/rejected": -182.25685119628906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.65224838256836, + "rewards/margins": 1.7985440492630005, + "rewards/rejected": -13.450794219970703, + "step": 3522 + }, + { + "epoch": 2.4312575470070725, + "grad_norm": 0.31800374388694763, + "learning_rate": 2.37054085155351e-06, + "logits/chosen": 3.26540470123291, + "logits/rejected": 3.2870144844055176, + "logps/chosen": -169.31858825683594, + "logps/rejected": -194.7410125732422, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.045138359069824, + "rewards/margins": 2.6165289878845215, + "rewards/rejected": -14.66166877746582, + "step": 3523 + }, + { + "epoch": 2.431947559082284, + "grad_norm": 0.3498968482017517, + "learning_rate": 2.367663981588032e-06, + "logits/chosen": 3.505798816680908, + "logits/rejected": 3.505798816680908, + "logps/chosen": -170.49453735351562, + "logps/rejected": -170.49453735351562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.371819496154785, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.371819496154785, + "step": 3524 + }, + { + "epoch": 2.4326375711574952, + "grad_norm": 0.24706235527992249, + "learning_rate": 2.364787111622555e-06, + "logits/chosen": 3.1692886352539062, + "logits/rejected": 3.51422119140625, + "logps/chosen": -149.67262268066406, + "logps/rejected": -184.70941162109375, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.207757949829102, + "rewards/margins": 3.539834976196289, + "rewards/rejected": -13.74759292602539, + "step": 3525 + }, + { + "epoch": 2.433327583232707, + "grad_norm": 0.6183508634567261, + "learning_rate": 2.3619102416570773e-06, + "logits/chosen": 3.413604974746704, + "logits/rejected": 3.413604974746704, + "logps/chosen": -159.2686004638672, + "logps/rejected": -159.2686004638672, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.264398574829102, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.264398574829102, + "step": 3526 + }, + { + "epoch": 2.434017595307918, + "grad_norm": 0.3615618348121643, + "learning_rate": 2.3590333716915997e-06, + "logits/chosen": 2.9665441513061523, + "logits/rejected": 3.060778856277466, + "logps/chosen": -159.63351440429688, + "logps/rejected": -171.83229064941406, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.199399948120117, + "rewards/margins": 1.2418601512908936, + "rewards/rejected": -12.441261291503906, + "step": 3527 + }, + { + "epoch": 2.434707607383129, + "grad_norm": 0.3137776553630829, + "learning_rate": 2.356156501726122e-06, + "logits/chosen": 3.4454548358917236, + "logits/rejected": 4.03273868560791, + "logps/chosen": -145.14111328125, + "logps/rejected": -186.00465393066406, + "loss": 0.348, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.66723918914795, + "rewards/margins": 4.172010898590088, + "rewards/rejected": -13.839250564575195, + "step": 3528 + }, + { + "epoch": 2.4353976194583407, + "grad_norm": 0.6856857538223267, + "learning_rate": 2.353279631760645e-06, + "logits/chosen": 3.462475061416626, + "logits/rejected": 3.756453037261963, + "logps/chosen": -152.00399780273438, + "logps/rejected": -178.62550354003906, + "loss": 0.4361, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.384220123291016, + "rewards/margins": 2.5967159271240234, + "rewards/rejected": -12.980936050415039, + "step": 3529 + }, + { + "epoch": 2.436087631533552, + "grad_norm": 0.30330708622932434, + "learning_rate": 2.350402761795167e-06, + "logits/chosen": 3.6011815071105957, + "logits/rejected": 3.7665069103240967, + "logps/chosen": -167.15452575683594, + "logps/rejected": -176.76870727539062, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.858019828796387, + "rewards/margins": 0.9447774887084961, + "rewards/rejected": -12.802797317504883, + "step": 3530 + }, + { + "epoch": 2.436777643608763, + "grad_norm": 0.32768791913986206, + "learning_rate": 2.3475258918296895e-06, + "logits/chosen": 3.4604296684265137, + "logits/rejected": 3.6037042140960693, + "logps/chosen": -154.48214721679688, + "logps/rejected": -174.65997314453125, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.015567779541016, + "rewards/margins": 2.0243401527404785, + "rewards/rejected": -13.039907455444336, + "step": 3531 + }, + { + "epoch": 2.4374676556839745, + "grad_norm": 0.4374789297580719, + "learning_rate": 2.344649021864212e-06, + "logits/chosen": 3.3954858779907227, + "logits/rejected": 3.5955562591552734, + "logps/chosen": -163.5312957763672, + "logps/rejected": -170.64236450195312, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.716280937194824, + "rewards/margins": 0.7180145382881165, + "rewards/rejected": -12.434295654296875, + "step": 3532 + }, + { + "epoch": 2.4381576677591856, + "grad_norm": 0.31389716267585754, + "learning_rate": 2.341772151898734e-06, + "logits/chosen": 3.5492749214172363, + "logits/rejected": 3.5614380836486816, + "logps/chosen": -167.1748046875, + "logps/rejected": -178.13853454589844, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.787914276123047, + "rewards/margins": 1.1099685430526733, + "rewards/rejected": -12.897882461547852, + "step": 3533 + }, + { + "epoch": 2.438847679834397, + "grad_norm": 0.5054188370704651, + "learning_rate": 2.338895281933257e-06, + "logits/chosen": 3.1902856826782227, + "logits/rejected": 3.2785558700561523, + "logps/chosen": -146.45962524414062, + "logps/rejected": -151.8078155517578, + "loss": 0.6079, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.691343307495117, + "rewards/margins": 0.5627039670944214, + "rewards/rejected": -10.254047393798828, + "step": 3534 + }, + { + "epoch": 2.4395376919096083, + "grad_norm": 0.3309966027736664, + "learning_rate": 2.3360184119677793e-06, + "logits/chosen": 3.4754507541656494, + "logits/rejected": 3.550590753555298, + "logps/chosen": -157.94876098632812, + "logps/rejected": -168.718017578125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.920042991638184, + "rewards/margins": 1.1047786474227905, + "rewards/rejected": -12.024821281433105, + "step": 3535 + }, + { + "epoch": 2.44022770398482, + "grad_norm": 0.5073686242103577, + "learning_rate": 2.3331415420023017e-06, + "logits/chosen": 3.348566770553589, + "logits/rejected": 3.364494800567627, + "logps/chosen": -162.80088806152344, + "logps/rejected": -167.53768920898438, + "loss": 0.6088, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.368942260742188, + "rewards/margins": 0.4965614080429077, + "rewards/rejected": -11.865503311157227, + "step": 3536 + }, + { + "epoch": 2.440917716060031, + "grad_norm": 0.3026762008666992, + "learning_rate": 2.330264672036824e-06, + "logits/chosen": 3.564148426055908, + "logits/rejected": 3.628664493560791, + "logps/chosen": -178.11279296875, + "logps/rejected": -187.32122802734375, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.963645935058594, + "rewards/margins": 0.9731693863868713, + "rewards/rejected": -13.93681526184082, + "step": 3537 + }, + { + "epoch": 2.441607728135242, + "grad_norm": 0.3955360949039459, + "learning_rate": 2.327387802071347e-06, + "logits/chosen": 3.4554367065429688, + "logits/rejected": 3.4554367065429688, + "logps/chosen": -170.610595703125, + "logps/rejected": -170.610595703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.237958908081055, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.237958908081055, + "step": 3538 + }, + { + "epoch": 2.4422977402104538, + "grad_norm": 0.3725787401199341, + "learning_rate": 2.324510932105869e-06, + "logits/chosen": 3.143251657485962, + "logits/rejected": 3.1757986545562744, + "logps/chosen": -165.79934692382812, + "logps/rejected": -180.2095184326172, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.539576530456543, + "rewards/margins": 1.45975923538208, + "rewards/rejected": -12.999336242675781, + "step": 3539 + }, + { + "epoch": 2.442987752285665, + "grad_norm": 0.3340211808681488, + "learning_rate": 2.3216340621403915e-06, + "logits/chosen": 3.4052376747131348, + "logits/rejected": 3.7727417945861816, + "logps/chosen": -160.4341278076172, + "logps/rejected": -178.5369415283203, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.279052734375, + "rewards/margins": 1.8126287460327148, + "rewards/rejected": -13.091682434082031, + "step": 3540 + }, + { + "epoch": 2.4436777643608765, + "grad_norm": 0.29385730624198914, + "learning_rate": 2.318757192174914e-06, + "logits/chosen": 3.5240373611450195, + "logits/rejected": 3.5240373611450195, + "logps/chosen": -165.04324340820312, + "logps/rejected": -165.04324340820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.587637901306152, + "rewards/margins": 0.0, + "rewards/rejected": -11.587637901306152, + "step": 3541 + }, + { + "epoch": 2.4443677764360876, + "grad_norm": 0.2995578646659851, + "learning_rate": 2.3158803222094366e-06, + "logits/chosen": 3.3110766410827637, + "logits/rejected": 3.3110766410827637, + "logps/chosen": -178.0948028564453, + "logps/rejected": -178.0948028564453, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.910079956054688, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -12.910079956054688, + "step": 3542 + }, + { + "epoch": 2.445057788511299, + "grad_norm": 0.30285537242889404, + "learning_rate": 2.3130034522439586e-06, + "logits/chosen": 3.3679747581481934, + "logits/rejected": 3.3999531269073486, + "logps/chosen": -153.71688842773438, + "logps/rejected": -167.69886779785156, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.633188247680664, + "rewards/margins": 1.4195724725723267, + "rewards/rejected": -12.05276107788086, + "step": 3543 + }, + { + "epoch": 2.4457478005865103, + "grad_norm": 0.27843281626701355, + "learning_rate": 2.3101265822784813e-06, + "logits/chosen": 3.276488780975342, + "logits/rejected": 3.478940010070801, + "logps/chosen": -139.2984161376953, + "logps/rejected": -175.33006286621094, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.222330093383789, + "rewards/margins": 3.6628618240356445, + "rewards/rejected": -12.885191917419434, + "step": 3544 + }, + { + "epoch": 2.4464378126617214, + "grad_norm": 0.3585169017314911, + "learning_rate": 2.3072497123130037e-06, + "logits/chosen": 3.477123260498047, + "logits/rejected": 3.6030077934265137, + "logps/chosen": -161.01727294921875, + "logps/rejected": -182.42572021484375, + "loss": 0.5201, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.309087753295898, + "rewards/margins": 2.208115816116333, + "rewards/rejected": -13.517204284667969, + "step": 3545 + }, + { + "epoch": 2.447127824736933, + "grad_norm": 0.5690658092498779, + "learning_rate": 2.304372842347526e-06, + "logits/chosen": 3.3608365058898926, + "logits/rejected": 3.8291783332824707, + "logps/chosen": -143.10159301757812, + "logps/rejected": -166.09854125976562, + "loss": 0.4386, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.503942489624023, + "rewards/margins": 2.330831289291382, + "rewards/rejected": -11.834773063659668, + "step": 3546 + }, + { + "epoch": 2.447817836812144, + "grad_norm": 0.3628024458885193, + "learning_rate": 2.3014959723820484e-06, + "logits/chosen": 3.5567798614501953, + "logits/rejected": 3.7204995155334473, + "logps/chosen": -172.8255157470703, + "logps/rejected": -181.6531219482422, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.520397186279297, + "rewards/margins": 0.849707305431366, + "rewards/rejected": -13.370104789733887, + "step": 3547 + }, + { + "epoch": 2.4485078488873553, + "grad_norm": 0.4331745207309723, + "learning_rate": 2.298619102416571e-06, + "logits/chosen": 3.5890650749206543, + "logits/rejected": 3.5890650749206543, + "logps/chosen": -183.8997344970703, + "logps/rejected": -183.89971923828125, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.601658821105957, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.601658821105957, + "step": 3548 + }, + { + "epoch": 2.449197860962567, + "grad_norm": 17.20377540588379, + "learning_rate": 2.2957422324510935e-06, + "logits/chosen": 3.1416375637054443, + "logits/rejected": 3.172480344772339, + "logps/chosen": -153.00892639160156, + "logps/rejected": -151.55792236328125, + "loss": 0.7702, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.522823333740234, + "rewards/margins": -0.12435722351074219, + "rewards/rejected": -10.398466110229492, + "step": 3549 + }, + { + "epoch": 2.449887873037778, + "grad_norm": 0.4115380048751831, + "learning_rate": 2.292865362485616e-06, + "logits/chosen": 3.215540885925293, + "logits/rejected": 3.25128173828125, + "logps/chosen": -163.92752075195312, + "logps/rejected": -181.8780059814453, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.64394760131836, + "rewards/margins": 1.7834157943725586, + "rewards/rejected": -13.427363395690918, + "step": 3550 + }, + { + "epoch": 2.4505778851129896, + "grad_norm": 0.32770684361457825, + "learning_rate": 2.2899884925201382e-06, + "logits/chosen": 3.2891507148742676, + "logits/rejected": 3.446019172668457, + "logps/chosen": -151.05526733398438, + "logps/rejected": -161.8558349609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.427725791931152, + "rewards/margins": 1.1149041652679443, + "rewards/rejected": -11.54262924194336, + "step": 3551 + }, + { + "epoch": 2.4512678971882007, + "grad_norm": 0.33185410499572754, + "learning_rate": 2.287111622554661e-06, + "logits/chosen": 3.555793046951294, + "logits/rejected": 3.656067132949829, + "logps/chosen": -166.0790252685547, + "logps/rejected": -179.01690673828125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.930601119995117, + "rewards/margins": 1.3241722583770752, + "rewards/rejected": -13.254773139953613, + "step": 3552 + }, + { + "epoch": 2.4519579092634123, + "grad_norm": 0.4209582507610321, + "learning_rate": 2.284234752589183e-06, + "logits/chosen": 3.0680079460144043, + "logits/rejected": 3.4368362426757812, + "logps/chosen": -141.95144653320312, + "logps/rejected": -162.01576232910156, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.41500473022461, + "rewards/margins": 1.9950848817825317, + "rewards/rejected": -11.410089492797852, + "step": 3553 + }, + { + "epoch": 2.4526479213386234, + "grad_norm": 0.5055276155471802, + "learning_rate": 2.2813578826237057e-06, + "logits/chosen": 3.6499552726745605, + "logits/rejected": 3.630443572998047, + "logps/chosen": -182.59451293945312, + "logps/rejected": -187.76495361328125, + "loss": 0.6077, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.523490905761719, + "rewards/margins": 0.5785776376724243, + "rewards/rejected": -14.102068901062012, + "step": 3554 + }, + { + "epoch": 2.4533379334138345, + "grad_norm": 0.39261719584465027, + "learning_rate": 2.278481012658228e-06, + "logits/chosen": 3.677879810333252, + "logits/rejected": 3.677879810333252, + "logps/chosen": -183.91238403320312, + "logps/rejected": -183.91238403320312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.551517486572266, + "rewards/margins": 0.0, + "rewards/rejected": -13.551517486572266, + "step": 3555 + }, + { + "epoch": 2.454027945489046, + "grad_norm": 0.3636389672756195, + "learning_rate": 2.2756041426927504e-06, + "logits/chosen": 3.350581645965576, + "logits/rejected": 3.4338037967681885, + "logps/chosen": -155.34815979003906, + "logps/rejected": -168.64280700683594, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.915955543518066, + "rewards/margins": 1.3693363666534424, + "rewards/rejected": -12.28529167175293, + "step": 3556 + }, + { + "epoch": 2.4547179575642573, + "grad_norm": 0.36224380135536194, + "learning_rate": 2.2727272727272728e-06, + "logits/chosen": 3.4385921955108643, + "logits/rejected": 3.4984400272369385, + "logps/chosen": -167.7331085205078, + "logps/rejected": -177.7878875732422, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.927114486694336, + "rewards/margins": 1.056217074394226, + "rewards/rejected": -12.983331680297852, + "step": 3557 + }, + { + "epoch": 2.455407969639469, + "grad_norm": 0.24980923533439636, + "learning_rate": 2.2698504027617955e-06, + "logits/chosen": 3.365983009338379, + "logits/rejected": 3.885561227798462, + "logps/chosen": -151.18463134765625, + "logps/rejected": -189.99754333496094, + "loss": 0.3474, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.222358703613281, + "rewards/margins": 3.9336938858032227, + "rewards/rejected": -14.156052589416504, + "step": 3558 + }, + { + "epoch": 2.45609798171468, + "grad_norm": 0.509764552116394, + "learning_rate": 2.266973532796318e-06, + "logits/chosen": 3.33650279045105, + "logits/rejected": 3.33650279045105, + "logps/chosen": -180.20091247558594, + "logps/rejected": -180.200927734375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.282706260681152, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.282707214355469, + "step": 3559 + }, + { + "epoch": 2.4567879937898915, + "grad_norm": 0.4645133316516876, + "learning_rate": 2.2640966628308402e-06, + "logits/chosen": 3.5572500228881836, + "logits/rejected": 3.5572500228881836, + "logps/chosen": -169.78057861328125, + "logps/rejected": -169.78057861328125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.18817138671875, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -12.18817138671875, + "step": 3560 + }, + { + "epoch": 2.4574780058651027, + "grad_norm": 0.4267825484275818, + "learning_rate": 2.2612197928653626e-06, + "logits/chosen": 3.5252747535705566, + "logits/rejected": 3.5992188453674316, + "logps/chosen": -169.02017211914062, + "logps/rejected": -181.68577575683594, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.201244354248047, + "rewards/margins": 1.2744396924972534, + "rewards/rejected": -13.475683212280273, + "step": 3561 + }, + { + "epoch": 2.458168017940314, + "grad_norm": 0.36969655752182007, + "learning_rate": 2.2583429228998854e-06, + "logits/chosen": 3.399911880493164, + "logits/rejected": 3.515395164489746, + "logps/chosen": -158.21180725097656, + "logps/rejected": -179.76516723632812, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.049956321716309, + "rewards/margins": 2.180988073348999, + "rewards/rejected": -13.23094367980957, + "step": 3562 + }, + { + "epoch": 2.4588580300155254, + "grad_norm": 4.055732250213623, + "learning_rate": 2.2554660529344073e-06, + "logits/chosen": 3.5931782722473145, + "logits/rejected": 3.6167445182800293, + "logps/chosen": -163.73199462890625, + "logps/rejected": -165.73818969726562, + "loss": 0.6293, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.453653335571289, + "rewards/margins": 0.20107340812683105, + "rewards/rejected": -11.6547269821167, + "step": 3563 + }, + { + "epoch": 2.4595480420907365, + "grad_norm": 0.7538365721702576, + "learning_rate": 2.25258918296893e-06, + "logits/chosen": 3.3183937072753906, + "logits/rejected": 3.5763094425201416, + "logps/chosen": -173.1263885498047, + "logps/rejected": -189.488037109375, + "loss": 0.5223, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.68757438659668, + "rewards/margins": 1.6854043006896973, + "rewards/rejected": -14.372980117797852, + "step": 3564 + }, + { + "epoch": 2.460238054165948, + "grad_norm": 0.4664669930934906, + "learning_rate": 2.2497123130034524e-06, + "logits/chosen": 3.819528579711914, + "logits/rejected": 3.819528579711914, + "logps/chosen": -184.01797485351562, + "logps/rejected": -184.01797485351562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.84993839263916, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.84993839263916, + "step": 3565 + }, + { + "epoch": 2.4609280662411592, + "grad_norm": 0.45242252945899963, + "learning_rate": 2.2468354430379748e-06, + "logits/chosen": 3.5558314323425293, + "logits/rejected": 3.6808347702026367, + "logps/chosen": -162.17283630371094, + "logps/rejected": -176.65599060058594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.519639015197754, + "rewards/margins": 1.3788071870803833, + "rewards/rejected": -12.898447036743164, + "step": 3566 + }, + { + "epoch": 2.4616180783163704, + "grad_norm": 0.5002351999282837, + "learning_rate": 2.243958573072497e-06, + "logits/chosen": 3.4017953872680664, + "logits/rejected": 3.646855354309082, + "logps/chosen": -173.49984741210938, + "logps/rejected": -191.89205932617188, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.494182586669922, + "rewards/margins": 1.8908449411392212, + "rewards/rejected": -14.385027885437012, + "step": 3567 + }, + { + "epoch": 2.462308090391582, + "grad_norm": 0.32872474193573, + "learning_rate": 2.24108170310702e-06, + "logits/chosen": 3.4276256561279297, + "logits/rejected": 3.627078056335449, + "logps/chosen": -176.00035095214844, + "logps/rejected": -183.65261840820312, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.741682052612305, + "rewards/margins": 0.7957370281219482, + "rewards/rejected": -13.537419319152832, + "step": 3568 + }, + { + "epoch": 2.462998102466793, + "grad_norm": 0.30957189202308655, + "learning_rate": 2.2382048331415422e-06, + "logits/chosen": 3.4516170024871826, + "logits/rejected": 3.4936647415161133, + "logps/chosen": -163.80198669433594, + "logps/rejected": -172.18824768066406, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.690145492553711, + "rewards/margins": 0.8565642833709717, + "rewards/rejected": -12.546710968017578, + "step": 3569 + }, + { + "epoch": 2.4636881145420046, + "grad_norm": 0.6440287828445435, + "learning_rate": 2.2353279631760646e-06, + "logits/chosen": 3.2397098541259766, + "logits/rejected": 3.451470136642456, + "logps/chosen": -137.79595947265625, + "logps/rejected": -169.5130615234375, + "loss": 0.437, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.985352516174316, + "rewards/margins": 3.1171875, + "rewards/rejected": -12.102540969848633, + "step": 3570 + }, + { + "epoch": 2.464378126617216, + "grad_norm": 0.3527195155620575, + "learning_rate": 2.232451093210587e-06, + "logits/chosen": 3.367551565170288, + "logits/rejected": 3.367551565170288, + "logps/chosen": -156.54185485839844, + "logps/rejected": -156.54185485839844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.722017288208008, + "rewards/margins": 0.0, + "rewards/rejected": -10.722017288208008, + "step": 3571 + }, + { + "epoch": 2.465068138692427, + "grad_norm": 0.42990341782569885, + "learning_rate": 2.2295742232451097e-06, + "logits/chosen": 3.5609679222106934, + "logits/rejected": 3.5609679222106934, + "logps/chosen": -171.40670776367188, + "logps/rejected": -171.40670776367188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.373916625976562, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.373916625976562, + "step": 3572 + }, + { + "epoch": 2.4657581507676385, + "grad_norm": 0.4862270951271057, + "learning_rate": 2.2266973532796317e-06, + "logits/chosen": 3.7312159538269043, + "logits/rejected": 3.7312159538269043, + "logps/chosen": -182.42300415039062, + "logps/rejected": -182.42300415039062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.389508247375488, + "rewards/margins": 0.0, + "rewards/rejected": -13.389508247375488, + "step": 3573 + }, + { + "epoch": 2.4664481628428496, + "grad_norm": 0.2795262634754181, + "learning_rate": 2.2238204833141544e-06, + "logits/chosen": 3.715580463409424, + "logits/rejected": 4.073470592498779, + "logps/chosen": -144.9008331298828, + "logps/rejected": -173.6875, + "loss": 0.4341, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.820207595825195, + "rewards/margins": 2.8904030323028564, + "rewards/rejected": -12.710611343383789, + "step": 3574 + }, + { + "epoch": 2.467138174918061, + "grad_norm": 0.4159247875213623, + "learning_rate": 2.2209436133486768e-06, + "logits/chosen": 3.343095302581787, + "logits/rejected": 3.650811195373535, + "logps/chosen": -147.43829345703125, + "logps/rejected": -167.8448486328125, + "loss": 0.5207, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.920402526855469, + "rewards/margins": 1.8989689350128174, + "rewards/rejected": -11.81937026977539, + "step": 3575 + }, + { + "epoch": 2.4678281869932723, + "grad_norm": 0.5128793120384216, + "learning_rate": 2.218066743383199e-06, + "logits/chosen": 3.215388298034668, + "logits/rejected": 3.3341596126556396, + "logps/chosen": -157.784912109375, + "logps/rejected": -163.90174865722656, + "loss": 0.6072, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.015090942382812, + "rewards/margins": 0.6515817046165466, + "rewards/rejected": -11.66667366027832, + "step": 3576 + }, + { + "epoch": 2.468518199068484, + "grad_norm": 0.44664838910102844, + "learning_rate": 2.2151898734177215e-06, + "logits/chosen": 2.9413154125213623, + "logits/rejected": 2.973482131958008, + "logps/chosen": -146.0372314453125, + "logps/rejected": -162.01675415039062, + "loss": 0.5223, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.69765853881836, + "rewards/margins": 1.6669609546661377, + "rewards/rejected": -11.364620208740234, + "step": 3577 + }, + { + "epoch": 2.469208211143695, + "grad_norm": 0.3816380798816681, + "learning_rate": 2.2123130034522443e-06, + "logits/chosen": 3.4112672805786133, + "logits/rejected": 3.5472216606140137, + "logps/chosen": -157.90953063964844, + "logps/rejected": -164.69139099121094, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.932984352111816, + "rewards/margins": 0.6795828342437744, + "rewards/rejected": -11.612566947937012, + "step": 3578 + }, + { + "epoch": 2.469898223218906, + "grad_norm": 3.883777141571045, + "learning_rate": 2.2094361334867666e-06, + "logits/chosen": 3.4488110542297363, + "logits/rejected": 3.5046210289001465, + "logps/chosen": -129.29847717285156, + "logps/rejected": -157.8507537841797, + "loss": 0.368, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.276504516601562, + "rewards/margins": 2.963911294937134, + "rewards/rejected": -11.240416526794434, + "step": 3579 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.4306887090206146, + "learning_rate": 2.206559263521289e-06, + "logits/chosen": 3.101444721221924, + "logits/rejected": 3.262256622314453, + "logps/chosen": -160.15719604492188, + "logps/rejected": -177.59747314453125, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.364215850830078, + "rewards/margins": 1.7882295846939087, + "rewards/rejected": -13.152444839477539, + "step": 3580 + }, + { + "epoch": 2.471278247369329, + "grad_norm": 23.054668426513672, + "learning_rate": 2.2036823935558117e-06, + "logits/chosen": 3.5840072631835938, + "logits/rejected": 3.4960951805114746, + "logps/chosen": -159.61587524414062, + "logps/rejected": -157.60491943359375, + "loss": 0.8126, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.551636695861816, + "rewards/margins": -0.17935872077941895, + "rewards/rejected": -11.372278213500977, + "step": 3581 + }, + { + "epoch": 2.4719682594445405, + "grad_norm": 0.3424520492553711, + "learning_rate": 2.200805523590334e-06, + "logits/chosen": 2.8262386322021484, + "logits/rejected": 3.120976686477661, + "logps/chosen": -142.01995849609375, + "logps/rejected": -164.05935668945312, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.639148712158203, + "rewards/margins": 2.172003984451294, + "rewards/rejected": -11.811153411865234, + "step": 3582 + }, + { + "epoch": 2.4726582715197516, + "grad_norm": 0.2622275650501251, + "learning_rate": 2.1979286536248564e-06, + "logits/chosen": 3.353546619415283, + "logits/rejected": 3.436093807220459, + "logps/chosen": -151.99624633789062, + "logps/rejected": -184.88772583007812, + "loss": 0.4338, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.308712005615234, + "rewards/margins": 3.3824410438537598, + "rewards/rejected": -13.691152572631836, + "step": 3583 + }, + { + "epoch": 2.4733482835949627, + "grad_norm": 0.40626928210258484, + "learning_rate": 2.195051783659379e-06, + "logits/chosen": 3.2612574100494385, + "logits/rejected": 3.2612574100494385, + "logps/chosen": -165.73435974121094, + "logps/rejected": -165.73435974121094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.633983612060547, + "rewards/margins": 0.0, + "rewards/rejected": -11.633983612060547, + "step": 3584 + }, + { + "epoch": 2.4740382956701743, + "grad_norm": 0.48120686411857605, + "learning_rate": 2.192174913693901e-06, + "logits/chosen": 3.485961675643921, + "logits/rejected": 3.485961675643921, + "logps/chosen": -183.09564208984375, + "logps/rejected": -183.09564208984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.527776718139648, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.527776718139648, + "step": 3585 + }, + { + "epoch": 2.4747283077453854, + "grad_norm": 0.3601981997489929, + "learning_rate": 2.1892980437284235e-06, + "logits/chosen": 3.3374457359313965, + "logits/rejected": 3.2992966175079346, + "logps/chosen": -144.93405151367188, + "logps/rejected": -157.11328125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.535178184509277, + "rewards/margins": 1.2617753744125366, + "rewards/rejected": -10.796954154968262, + "step": 3586 + }, + { + "epoch": 2.475418319820597, + "grad_norm": 0.4950183928012848, + "learning_rate": 2.1864211737629463e-06, + "logits/chosen": 3.4726572036743164, + "logits/rejected": 3.4726572036743164, + "logps/chosen": -174.72976684570312, + "logps/rejected": -174.72976684570312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.53934097290039, + "rewards/margins": 0.0, + "rewards/rejected": -12.53934097290039, + "step": 3587 + }, + { + "epoch": 2.476108331895808, + "grad_norm": 0.3665831685066223, + "learning_rate": 2.1835443037974686e-06, + "logits/chosen": 3.107651710510254, + "logits/rejected": 3.21820330619812, + "logps/chosen": -150.77536010742188, + "logps/rejected": -165.6932373046875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.222064018249512, + "rewards/margins": 1.522344946861267, + "rewards/rejected": -11.74440860748291, + "step": 3588 + }, + { + "epoch": 2.4767983439710193, + "grad_norm": 0.3558652698993683, + "learning_rate": 2.180667433831991e-06, + "logits/chosen": 3.0445361137390137, + "logits/rejected": 3.055629253387451, + "logps/chosen": -156.83230590820312, + "logps/rejected": -170.8582000732422, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.874528884887695, + "rewards/margins": 1.4197577238082886, + "rewards/rejected": -12.294286727905273, + "step": 3589 + }, + { + "epoch": 2.477488356046231, + "grad_norm": 0.32703790068626404, + "learning_rate": 2.1777905638665133e-06, + "logits/chosen": 3.533297538757324, + "logits/rejected": 3.533297538757324, + "logps/chosen": -190.60081481933594, + "logps/rejected": -190.60081481933594, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.148597717285156, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.148597717285156, + "step": 3590 + }, + { + "epoch": 2.478178368121442, + "grad_norm": 0.5610954761505127, + "learning_rate": 2.174913693901036e-06, + "logits/chosen": 3.3296518325805664, + "logits/rejected": 3.6509008407592773, + "logps/chosen": -154.50296020507812, + "logps/rejected": -173.33775329589844, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.516802787780762, + "rewards/margins": 1.8743195533752441, + "rewards/rejected": -12.391121864318848, + "step": 3591 + }, + { + "epoch": 2.4788683801966536, + "grad_norm": 0.3294491469860077, + "learning_rate": 2.1720368239355585e-06, + "logits/chosen": 2.5499696731567383, + "logits/rejected": 2.944093704223633, + "logps/chosen": -133.30435180664062, + "logps/rejected": -159.29965209960938, + "loss": 0.434, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.669807434082031, + "rewards/margins": 2.6134324073791504, + "rewards/rejected": -11.283239364624023, + "step": 3592 + }, + { + "epoch": 2.4795583922718647, + "grad_norm": 0.3260849118232727, + "learning_rate": 2.169159953970081e-06, + "logits/chosen": 3.1428985595703125, + "logits/rejected": 3.111219882965088, + "logps/chosen": -151.73641967773438, + "logps/rejected": -168.155029296875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.480867385864258, + "rewards/margins": 1.6242597103118896, + "rewards/rejected": -12.105127334594727, + "step": 3593 + }, + { + "epoch": 2.4802484043470763, + "grad_norm": 0.5514612197875977, + "learning_rate": 2.166283084004603e-06, + "logits/chosen": 3.4075207710266113, + "logits/rejected": 3.417051076889038, + "logps/chosen": -160.84121704101562, + "logps/rejected": -166.18833923339844, + "loss": 0.6079, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.271769523620605, + "rewards/margins": 0.5624467134475708, + "rewards/rejected": -11.834217071533203, + "step": 3594 + }, + { + "epoch": 2.4809384164222874, + "grad_norm": 0.3617801070213318, + "learning_rate": 2.1634062140391255e-06, + "logits/chosen": 3.115834951400757, + "logits/rejected": 3.115834951400757, + "logps/chosen": -182.859130859375, + "logps/rejected": -182.859130859375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.512743949890137, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -13.51274299621582, + "step": 3595 + }, + { + "epoch": 2.4816284284974985, + "grad_norm": 28.2322998046875, + "learning_rate": 2.160529344073648e-06, + "logits/chosen": 3.295403003692627, + "logits/rejected": 3.724946975708008, + "logps/chosen": -155.6126251220703, + "logps/rejected": -173.6148681640625, + "loss": 0.6529, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.819443702697754, + "rewards/margins": 1.8254450559616089, + "rewards/rejected": -12.644887924194336, + "step": 3596 + }, + { + "epoch": 2.48231844057271, + "grad_norm": 0.6616138219833374, + "learning_rate": 2.1576524741081706e-06, + "logits/chosen": 3.5770981311798096, + "logits/rejected": 3.6763193607330322, + "logps/chosen": -149.49588012695312, + "logps/rejected": -154.32615661621094, + "loss": 0.6094, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.245238304138184, + "rewards/margins": 0.46866270899772644, + "rewards/rejected": -10.71390151977539, + "step": 3597 + }, + { + "epoch": 2.4830084526479212, + "grad_norm": 2.509484052658081, + "learning_rate": 2.154775604142693e-06, + "logits/chosen": 3.074782371520996, + "logits/rejected": 3.1358540058135986, + "logps/chosen": -162.6744384765625, + "logps/rejected": -164.69729614257812, + "loss": 0.6285, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.603944778442383, + "rewards/margins": 0.20603251457214355, + "rewards/rejected": -11.809976577758789, + "step": 3598 + }, + { + "epoch": 2.483698464723133, + "grad_norm": 0.7251695990562439, + "learning_rate": 2.1518987341772153e-06, + "logits/chosen": 3.3413963317871094, + "logits/rejected": 3.2932581901550293, + "logps/chosen": -144.66476440429688, + "logps/rejected": -147.63832092285156, + "loss": 0.6282, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.972476959228516, + "rewards/margins": 0.20797166228294373, + "rewards/rejected": -10.180448532104492, + "step": 3599 + }, + { + "epoch": 2.484388476798344, + "grad_norm": 0.3349617123603821, + "learning_rate": 2.1490218642117377e-06, + "logits/chosen": 3.414083957672119, + "logits/rejected": 3.414083957672119, + "logps/chosen": -171.51837158203125, + "logps/rejected": -171.51837158203125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.417759895324707, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.417760848999023, + "step": 3600 + }, + { + "epoch": 2.4850784888735555, + "grad_norm": 30.605257034301758, + "learning_rate": 2.1461449942462605e-06, + "logits/chosen": 3.778897762298584, + "logits/rejected": 3.8431196212768555, + "logps/chosen": -158.02308654785156, + "logps/rejected": -160.7710418701172, + "loss": 0.6664, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.145395278930664, + "rewards/margins": 0.2880839705467224, + "rewards/rejected": -11.433480262756348, + "step": 3601 + }, + { + "epoch": 2.4857685009487667, + "grad_norm": 0.38275691866874695, + "learning_rate": 2.143268124280783e-06, + "logits/chosen": 3.4087975025177, + "logits/rejected": 3.4087975025177, + "logps/chosen": -175.2381591796875, + "logps/rejected": -175.2381591796875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.046445846557617, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.04644775390625, + "step": 3602 + }, + { + "epoch": 2.486458513023978, + "grad_norm": 0.35328033566474915, + "learning_rate": 2.140391254315305e-06, + "logits/chosen": 3.7231738567352295, + "logits/rejected": 3.7231738567352295, + "logps/chosen": -171.2628173828125, + "logps/rejected": -171.2628173828125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.292433738708496, + "rewards/margins": 0.0, + "rewards/rejected": -12.292433738708496, + "step": 3603 + }, + { + "epoch": 2.4871485250991894, + "grad_norm": 0.31253933906555176, + "learning_rate": 2.1375143843498275e-06, + "logits/chosen": 3.310706853866577, + "logits/rejected": 3.5732884407043457, + "logps/chosen": -160.54702758789062, + "logps/rejected": -189.25289916992188, + "loss": 0.4335, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.16824722290039, + "rewards/margins": 2.92374324798584, + "rewards/rejected": -14.091991424560547, + "step": 3604 + }, + { + "epoch": 2.4878385371744005, + "grad_norm": 0.2777039706707001, + "learning_rate": 2.13463751438435e-06, + "logits/chosen": 3.392125129699707, + "logits/rejected": 3.4216084480285645, + "logps/chosen": -167.9466552734375, + "logps/rejected": -198.12252807617188, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.085968017578125, + "rewards/margins": 3.0041658878326416, + "rewards/rejected": -15.090133666992188, + "step": 3605 + }, + { + "epoch": 2.4885285492496116, + "grad_norm": 0.31817787885665894, + "learning_rate": 2.1317606444188722e-06, + "logits/chosen": 2.9921669960021973, + "logits/rejected": 3.2322535514831543, + "logps/chosen": -145.98744201660156, + "logps/rejected": -170.24497985839844, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.862898826599121, + "rewards/margins": 2.44462251663208, + "rewards/rejected": -12.307520866394043, + "step": 3606 + }, + { + "epoch": 2.489218561324823, + "grad_norm": 0.5380116105079651, + "learning_rate": 2.128883774453395e-06, + "logits/chosen": 3.432143449783325, + "logits/rejected": 3.613459825515747, + "logps/chosen": -153.11236572265625, + "logps/rejected": -162.7093963623047, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.598638534545898, + "rewards/margins": 0.970777690410614, + "rewards/rejected": -11.569416046142578, + "step": 3607 + }, + { + "epoch": 2.4899085734000344, + "grad_norm": 0.4137287139892578, + "learning_rate": 2.1260069044879174e-06, + "logits/chosen": 3.1868491172790527, + "logits/rejected": 3.187893867492676, + "logps/chosen": -156.896484375, + "logps/rejected": -168.5932159423828, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.973941802978516, + "rewards/margins": 1.2321864366531372, + "rewards/rejected": -12.20612907409668, + "step": 3608 + }, + { + "epoch": 2.490598585475246, + "grad_norm": 0.43434497714042664, + "learning_rate": 2.1231300345224397e-06, + "logits/chosen": 3.3459692001342773, + "logits/rejected": 3.5391082763671875, + "logps/chosen": -159.6934814453125, + "logps/rejected": -169.16268920898438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.19901180267334, + "rewards/margins": 1.0057369470596313, + "rewards/rejected": -12.20474910736084, + "step": 3609 + }, + { + "epoch": 2.491288597550457, + "grad_norm": 0.30375242233276367, + "learning_rate": 2.120253164556962e-06, + "logits/chosen": 3.508523941040039, + "logits/rejected": 3.7337589263916016, + "logps/chosen": -164.61257934570312, + "logps/rejected": -183.9811553955078, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.741512298583984, + "rewards/margins": 1.8730225563049316, + "rewards/rejected": -13.614534378051758, + "step": 3610 + }, + { + "epoch": 2.4919786096256686, + "grad_norm": 0.3643184006214142, + "learning_rate": 2.117376294591485e-06, + "logits/chosen": 3.6077427864074707, + "logits/rejected": 3.707045555114746, + "logps/chosen": -167.47579956054688, + "logps/rejected": -178.42840576171875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.03067398071289, + "rewards/margins": 1.1034376621246338, + "rewards/rejected": -13.134112358093262, + "step": 3611 + }, + { + "epoch": 2.4926686217008798, + "grad_norm": 0.5189604759216309, + "learning_rate": 2.114499424626007e-06, + "logits/chosen": 3.2461864948272705, + "logits/rejected": 3.2821261882781982, + "logps/chosen": -164.23361206054688, + "logps/rejected": -170.9977264404297, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.712854385375977, + "rewards/margins": 0.6842361688613892, + "rewards/rejected": -12.397089958190918, + "step": 3612 + }, + { + "epoch": 2.493358633776091, + "grad_norm": 0.38174039125442505, + "learning_rate": 2.1116225546605295e-06, + "logits/chosen": 3.6171579360961914, + "logits/rejected": 3.6171579360961914, + "logps/chosen": -164.43502807617188, + "logps/rejected": -164.43502807617188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.675904273986816, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.675904273986816, + "step": 3613 + }, + { + "epoch": 2.4940486458513025, + "grad_norm": 0.31936946511268616, + "learning_rate": 2.108745684695052e-06, + "logits/chosen": 3.564373016357422, + "logits/rejected": 3.5683789253234863, + "logps/chosen": -167.89462280273438, + "logps/rejected": -180.064453125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.128227233886719, + "rewards/margins": 1.1862800121307373, + "rewards/rejected": -13.314506530761719, + "step": 3614 + }, + { + "epoch": 2.4947386579265136, + "grad_norm": 0.4044038653373718, + "learning_rate": 2.1058688147295742e-06, + "logits/chosen": 3.8897719383239746, + "logits/rejected": 3.8897719383239746, + "logps/chosen": -182.04574584960938, + "logps/rejected": -182.04574584960938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.553943634033203, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.553943634033203, + "step": 3615 + }, + { + "epoch": 2.495428670001725, + "grad_norm": 2.066263198852539, + "learning_rate": 2.1029919447640966e-06, + "logits/chosen": 3.3906333446502686, + "logits/rejected": 3.5779526233673096, + "logps/chosen": -147.3997802734375, + "logps/rejected": -158.21710205078125, + "loss": 0.5356, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.094253540039062, + "rewards/margins": 1.0907708406448364, + "rewards/rejected": -11.18502426147461, + "step": 3616 + }, + { + "epoch": 2.4961186820769363, + "grad_norm": 0.44540226459503174, + "learning_rate": 2.1001150747986194e-06, + "logits/chosen": 3.5021259784698486, + "logits/rejected": 3.5021259784698486, + "logps/chosen": -162.10484313964844, + "logps/rejected": -162.10484313964844, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.383247375488281, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.383247375488281, + "step": 3617 + }, + { + "epoch": 2.496808694152148, + "grad_norm": 0.33454829454421997, + "learning_rate": 2.0972382048331417e-06, + "logits/chosen": 3.2704737186431885, + "logits/rejected": 3.397038698196411, + "logps/chosen": -162.98416137695312, + "logps/rejected": -176.44166564941406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.593938827514648, + "rewards/margins": 1.2615684270858765, + "rewards/rejected": -12.855507850646973, + "step": 3618 + }, + { + "epoch": 2.497498706227359, + "grad_norm": 0.36470553278923035, + "learning_rate": 2.094361334867664e-06, + "logits/chosen": 3.2434325218200684, + "logits/rejected": 3.605613946914673, + "logps/chosen": -133.3546142578125, + "logps/rejected": -152.96923828125, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.575349807739258, + "rewards/margins": 1.921709418296814, + "rewards/rejected": -10.49705982208252, + "step": 3619 + }, + { + "epoch": 2.49818871830257, + "grad_norm": 0.4310034215450287, + "learning_rate": 2.0914844649021864e-06, + "logits/chosen": 3.7790939807891846, + "logits/rejected": 3.7790939807891846, + "logps/chosen": -176.96121215820312, + "logps/rejected": -176.96121215820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.671347618103027, + "rewards/margins": 0.0, + "rewards/rejected": -12.671347618103027, + "step": 3620 + }, + { + "epoch": 2.4988787303777817, + "grad_norm": 0.34509265422821045, + "learning_rate": 2.088607594936709e-06, + "logits/chosen": 3.624030113220215, + "logits/rejected": 3.7119081020355225, + "logps/chosen": -164.04449462890625, + "logps/rejected": -170.934814453125, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.78228759765625, + "rewards/margins": 0.7608677744865417, + "rewards/rejected": -12.543155670166016, + "step": 3621 + }, + { + "epoch": 2.499568742452993, + "grad_norm": 0.39087721705436707, + "learning_rate": 2.0857307249712315e-06, + "logits/chosen": 3.5153634548187256, + "logits/rejected": 3.5153634548187256, + "logps/chosen": -168.81198120117188, + "logps/rejected": -168.81198120117188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.083660125732422, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.083660125732422, + "step": 3622 + }, + { + "epoch": 2.500258754528204, + "grad_norm": 0.4542538821697235, + "learning_rate": 2.082853855005754e-06, + "logits/chosen": 3.653440237045288, + "logits/rejected": 3.653440237045288, + "logps/chosen": -166.19064331054688, + "logps/rejected": -166.19064331054688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.801198959350586, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.801199913024902, + "step": 3623 + }, + { + "epoch": 2.5009487666034156, + "grad_norm": 0.33798426389694214, + "learning_rate": 2.0799769850402762e-06, + "logits/chosen": 3.634037494659424, + "logits/rejected": 3.634037494659424, + "logps/chosen": -187.55804443359375, + "logps/rejected": -187.55804443359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.960094451904297, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.960094451904297, + "step": 3624 + }, + { + "epoch": 2.5016387786786267, + "grad_norm": 0.44054409861564636, + "learning_rate": 2.0771001150747986e-06, + "logits/chosen": 3.4321155548095703, + "logits/rejected": 3.5417370796203613, + "logps/chosen": -155.4955291748047, + "logps/rejected": -171.1389923095703, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.789998054504395, + "rewards/margins": 1.5012696981430054, + "rewards/rejected": -12.291267395019531, + "step": 3625 + }, + { + "epoch": 2.5023287907538383, + "grad_norm": 0.4637002646923065, + "learning_rate": 2.074223245109321e-06, + "logits/chosen": 4.0674591064453125, + "logits/rejected": 4.0674591064453125, + "logps/chosen": -178.0364532470703, + "logps/rejected": -178.03643798828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.962940216064453, + "rewards/margins": 0.0, + "rewards/rejected": -12.962940216064453, + "step": 3626 + }, + { + "epoch": 2.5030188028290494, + "grad_norm": 0.34847205877304077, + "learning_rate": 2.0713463751438437e-06, + "logits/chosen": 3.0758919715881348, + "logits/rejected": 3.219324827194214, + "logps/chosen": -144.92019653320312, + "logps/rejected": -164.42642211914062, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.624324798583984, + "rewards/margins": 1.9830641746520996, + "rewards/rejected": -11.60738754272461, + "step": 3627 + }, + { + "epoch": 2.503708814904261, + "grad_norm": 0.3102167546749115, + "learning_rate": 2.068469505178366e-06, + "logits/chosen": 3.3297643661499023, + "logits/rejected": 3.4174957275390625, + "logps/chosen": -142.2976837158203, + "logps/rejected": -180.3601531982422, + "loss": 0.4332, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.52307415008545, + "rewards/margins": 3.7889244556427, + "rewards/rejected": -13.31199836730957, + "step": 3628 + }, + { + "epoch": 2.504398826979472, + "grad_norm": 0.45154091715812683, + "learning_rate": 2.0655926352128884e-06, + "logits/chosen": 3.164971113204956, + "logits/rejected": 3.164971113204956, + "logps/chosen": -174.15379333496094, + "logps/rejected": -174.15379333496094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.682418823242188, + "rewards/margins": 0.0, + "rewards/rejected": -12.682418823242188, + "step": 3629 + }, + { + "epoch": 2.5050888390546833, + "grad_norm": 0.3895204961299896, + "learning_rate": 2.0627157652474108e-06, + "logits/chosen": 3.1365067958831787, + "logits/rejected": 3.1365067958831787, + "logps/chosen": -162.00469970703125, + "logps/rejected": -162.0046844482422, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.316902160644531, + "rewards/margins": 0.0, + "rewards/rejected": -11.316902160644531, + "step": 3630 + }, + { + "epoch": 2.505778851129895, + "grad_norm": 0.49068814516067505, + "learning_rate": 2.0598388952819336e-06, + "logits/chosen": 3.508274555206299, + "logits/rejected": 3.508274555206299, + "logps/chosen": -164.92398071289062, + "logps/rejected": -164.9239959716797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.697547912597656, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.697547912597656, + "step": 3631 + }, + { + "epoch": 2.506468863205106, + "grad_norm": 19.901025772094727, + "learning_rate": 2.056962025316456e-06, + "logits/chosen": 3.625310182571411, + "logits/rejected": 3.5274882316589355, + "logps/chosen": -183.46250915527344, + "logps/rejected": -186.24185180664062, + "loss": 0.7587, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.574320793151855, + "rewards/margins": 0.26099908351898193, + "rewards/rejected": -13.835320472717285, + "step": 3632 + }, + { + "epoch": 2.5071588752803176, + "grad_norm": 0.3862287998199463, + "learning_rate": 2.0540851553509783e-06, + "logits/chosen": 3.2854506969451904, + "logits/rejected": 3.4278719425201416, + "logps/chosen": -166.55531311035156, + "logps/rejected": -186.42031860351562, + "loss": 0.5209, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.898956298828125, + "rewards/margins": 1.9921698570251465, + "rewards/rejected": -13.89112663269043, + "step": 3633 + }, + { + "epoch": 2.5078488873555287, + "grad_norm": 0.3303433656692505, + "learning_rate": 2.051208285385501e-06, + "logits/chosen": 3.201120615005493, + "logits/rejected": 3.4106431007385254, + "logps/chosen": -155.77078247070312, + "logps/rejected": -184.3690185546875, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.77635383605957, + "rewards/margins": 2.879443645477295, + "rewards/rejected": -13.655797004699707, + "step": 3634 + }, + { + "epoch": 2.5085388994307403, + "grad_norm": 0.41330698132514954, + "learning_rate": 2.048331415420023e-06, + "logits/chosen": 3.3168485164642334, + "logits/rejected": 3.406538486480713, + "logps/chosen": -166.2147979736328, + "logps/rejected": -177.7770233154297, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.033040046691895, + "rewards/margins": 1.0896402597427368, + "rewards/rejected": -13.122679710388184, + "step": 3635 + }, + { + "epoch": 2.5092289115059514, + "grad_norm": 0.36509618163108826, + "learning_rate": 2.0454545454545457e-06, + "logits/chosen": 3.1291465759277344, + "logits/rejected": 3.1967966556549072, + "logps/chosen": -138.72845458984375, + "logps/rejected": -150.40121459960938, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.116416931152344, + "rewards/margins": 1.253483533859253, + "rewards/rejected": -10.369900703430176, + "step": 3636 + }, + { + "epoch": 2.5099189235811625, + "grad_norm": 0.7858584523200989, + "learning_rate": 2.042577675489068e-06, + "logits/chosen": 3.2654240131378174, + "logits/rejected": 3.3605127334594727, + "logps/chosen": -152.78359985351562, + "logps/rejected": -181.70449829101562, + "loss": 0.4375, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.589773178100586, + "rewards/margins": 2.9695146083831787, + "rewards/rejected": -13.559288024902344, + "step": 3637 + }, + { + "epoch": 2.510608935656374, + "grad_norm": 0.43213731050491333, + "learning_rate": 2.0397008055235904e-06, + "logits/chosen": 3.2203078269958496, + "logits/rejected": 3.2203078269958496, + "logps/chosen": -155.6029052734375, + "logps/rejected": -155.6029052734375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.763007164001465, + "rewards/margins": 0.0, + "rewards/rejected": -10.763007164001465, + "step": 3638 + }, + { + "epoch": 2.5112989477315852, + "grad_norm": 0.4328063726425171, + "learning_rate": 2.036823935558113e-06, + "logits/chosen": 3.249634265899658, + "logits/rejected": 3.249634265899658, + "logps/chosen": -174.68136596679688, + "logps/rejected": -174.68136596679688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.419393539428711, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -12.419393539428711, + "step": 3639 + }, + { + "epoch": 2.5119889598067964, + "grad_norm": 2.552417516708374, + "learning_rate": 2.0339470655926356e-06, + "logits/chosen": 3.418966293334961, + "logits/rejected": 3.6904048919677734, + "logps/chosen": -176.6422119140625, + "logps/rejected": -184.12948608398438, + "loss": 0.5381, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.780853271484375, + "rewards/margins": 0.6763275861740112, + "rewards/rejected": -13.45718002319336, + "step": 3640 + }, + { + "epoch": 2.512678971882008, + "grad_norm": 30.562908172607422, + "learning_rate": 2.031070195627158e-06, + "logits/chosen": 3.3078978061676025, + "logits/rejected": 3.4213926792144775, + "logps/chosen": -164.3388214111328, + "logps/rejected": -165.32289123535156, + "loss": 1.0632, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.724864959716797, + "rewards/margins": 0.06808274984359741, + "rewards/rejected": -11.792947769165039, + "step": 3641 + }, + { + "epoch": 2.5133689839572195, + "grad_norm": 1.4721417427062988, + "learning_rate": 2.0281933256616803e-06, + "logits/chosen": 3.35760235786438, + "logits/rejected": 3.398792266845703, + "logps/chosen": -158.1513214111328, + "logps/rejected": -161.5640869140625, + "loss": 0.6157, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.106245040893555, + "rewards/margins": 0.3210322856903076, + "rewards/rejected": -11.427278518676758, + "step": 3642 + }, + { + "epoch": 2.5140589960324307, + "grad_norm": 0.28683042526245117, + "learning_rate": 2.0253164556962026e-06, + "logits/chosen": 3.203944683074951, + "logits/rejected": 3.5412657260894775, + "logps/chosen": -146.66567993164062, + "logps/rejected": -172.40475463867188, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.854888916015625, + "rewards/margins": 2.611524820327759, + "rewards/rejected": -12.466413497924805, + "step": 3643 + }, + { + "epoch": 2.514749008107642, + "grad_norm": 0.34103062748908997, + "learning_rate": 2.0224395857307254e-06, + "logits/chosen": 3.610342025756836, + "logits/rejected": 3.610342025756836, + "logps/chosen": -181.45419311523438, + "logps/rejected": -181.45419311523438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.187322616577148, + "rewards/margins": 0.0, + "rewards/rejected": -13.187322616577148, + "step": 3644 + }, + { + "epoch": 2.5154390201828534, + "grad_norm": 16.23097801208496, + "learning_rate": 2.0195627157652477e-06, + "logits/chosen": 3.5167157649993896, + "logits/rejected": 3.530162811279297, + "logps/chosen": -181.94480895996094, + "logps/rejected": -173.51446533203125, + "loss": 1.4844, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.266733169555664, + "rewards/margins": -0.8777356743812561, + "rewards/rejected": -12.388998031616211, + "step": 3645 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.3030661940574646, + "learning_rate": 2.01668584579977e-06, + "logits/chosen": 3.113320827484131, + "logits/rejected": 3.113320827484131, + "logps/chosen": -167.91769409179688, + "logps/rejected": -167.91769409179688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.992875099182129, + "rewards/margins": 0.0, + "rewards/rejected": -11.992875099182129, + "step": 3646 + }, + { + "epoch": 2.5168190443332756, + "grad_norm": 1.9713084697723389, + "learning_rate": 2.0138089758342925e-06, + "logits/chosen": 3.477478265762329, + "logits/rejected": 3.4296226501464844, + "logps/chosen": -178.228515625, + "logps/rejected": -181.79391479492188, + "loss": 0.613, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.973400115966797, + "rewards/margins": 0.36574500799179077, + "rewards/rejected": -13.339143753051758, + "step": 3647 + }, + { + "epoch": 2.517509056408487, + "grad_norm": 0.41294652223587036, + "learning_rate": 2.010932105868815e-06, + "logits/chosen": 3.1132540702819824, + "logits/rejected": 3.1132540702819824, + "logps/chosen": -156.6136474609375, + "logps/rejected": -156.6136474609375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.953075408935547, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -10.95307445526123, + "step": 3648 + }, + { + "epoch": 2.5181990684836983, + "grad_norm": 0.4214085042476654, + "learning_rate": 2.008055235903337e-06, + "logits/chosen": 3.5023441314697266, + "logits/rejected": 3.5107221603393555, + "logps/chosen": -163.4697265625, + "logps/rejected": -174.50619506835938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.46441650390625, + "rewards/margins": 1.1201246976852417, + "rewards/rejected": -12.584540367126465, + "step": 3649 + }, + { + "epoch": 2.51888908055891, + "grad_norm": 0.3182547390460968, + "learning_rate": 2.00517836593786e-06, + "logits/chosen": 3.221949338912964, + "logits/rejected": 3.236757278442383, + "logps/chosen": -165.57431030273438, + "logps/rejected": -175.9942626953125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.821921348571777, + "rewards/margins": 1.053119421005249, + "rewards/rejected": -12.875041007995605, + "step": 3650 + }, + { + "epoch": 2.519579092634121, + "grad_norm": 0.39645877480506897, + "learning_rate": 2.0023014959723823e-06, + "logits/chosen": 3.1635477542877197, + "logits/rejected": 3.1635477542877197, + "logps/chosen": -138.0321044921875, + "logps/rejected": -138.0321044921875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.045587539672852, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -9.045587539672852, + "step": 3651 + }, + { + "epoch": 2.5202691047093326, + "grad_norm": 0.3789900541305542, + "learning_rate": 1.9994246260069046e-06, + "logits/chosen": 3.4128050804138184, + "logits/rejected": 3.4128050804138184, + "logps/chosen": -159.36160278320312, + "logps/rejected": -159.36160278320312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.142641067504883, + "rewards/margins": -4.172325134277344e-07, + "rewards/rejected": -11.142640113830566, + "step": 3652 + }, + { + "epoch": 2.5209591167845438, + "grad_norm": 0.3436361849308014, + "learning_rate": 1.996547756041427e-06, + "logits/chosen": 3.2891077995300293, + "logits/rejected": 3.361271381378174, + "logps/chosen": -143.70823669433594, + "logps/rejected": -162.6341094970703, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.49114990234375, + "rewards/margins": 2.0234484672546387, + "rewards/rejected": -11.514598846435547, + "step": 3653 + }, + { + "epoch": 2.521649128859755, + "grad_norm": 1.951573133468628, + "learning_rate": 1.9936708860759498e-06, + "logits/chosen": 3.5004382133483887, + "logits/rejected": 3.4355556964874268, + "logps/chosen": -151.0673370361328, + "logps/rejected": -160.99822998046875, + "loss": 0.5303, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.297091484069824, + "rewards/margins": 0.9310055375099182, + "rewards/rejected": -11.228096961975098, + "step": 3654 + }, + { + "epoch": 2.5223391409349665, + "grad_norm": 2.2534990310668945, + "learning_rate": 1.990794016110472e-06, + "logits/chosen": 3.4669270515441895, + "logits/rejected": 3.4845480918884277, + "logps/chosen": -174.8544921875, + "logps/rejected": -176.44949340820312, + "loss": 0.6294, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.670920372009277, + "rewards/margins": 0.20028209686279297, + "rewards/rejected": -12.871201515197754, + "step": 3655 + }, + { + "epoch": 2.5230291530101776, + "grad_norm": 20.627017974853516, + "learning_rate": 1.9879171461449945e-06, + "logits/chosen": 3.435704469680786, + "logits/rejected": 3.3912346363067627, + "logps/chosen": -178.82257080078125, + "logps/rejected": -184.6630859375, + "loss": 1.0167, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.994071006774902, + "rewards/margins": 0.5857242345809937, + "rewards/rejected": -13.579795837402344, + "step": 3656 + }, + { + "epoch": 2.5237191650853887, + "grad_norm": 0.33047133684158325, + "learning_rate": 1.985040276179517e-06, + "logits/chosen": 3.0280041694641113, + "logits/rejected": 3.258709669113159, + "logps/chosen": -158.7242889404297, + "logps/rejected": -190.57696533203125, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.963798522949219, + "rewards/margins": 3.254000425338745, + "rewards/rejected": -14.217798233032227, + "step": 3657 + }, + { + "epoch": 2.5244091771606003, + "grad_norm": 0.37253817915916443, + "learning_rate": 1.982163406214039e-06, + "logits/chosen": 3.589102029800415, + "logits/rejected": 3.589102029800415, + "logps/chosen": -163.12754821777344, + "logps/rejected": -163.12754821777344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.427131652832031, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -11.427131652832031, + "step": 3658 + }, + { + "epoch": 2.525099189235812, + "grad_norm": 1.6487313508987427, + "learning_rate": 1.9792865362485615e-06, + "logits/chosen": 3.4482126235961914, + "logits/rejected": 3.480437755584717, + "logps/chosen": -164.15167236328125, + "logps/rejected": -178.8935546875, + "loss": 0.529, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.591561317443848, + "rewards/margins": 1.5907585620880127, + "rewards/rejected": -13.182319641113281, + "step": 3659 + }, + { + "epoch": 2.525789201311023, + "grad_norm": 0.3551265299320221, + "learning_rate": 1.9764096662830843e-06, + "logits/chosen": 3.0674057006835938, + "logits/rejected": 3.272716522216797, + "logps/chosen": -163.19332885742188, + "logps/rejected": -172.0429229736328, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.788593292236328, + "rewards/margins": 0.8417320251464844, + "rewards/rejected": -12.630325317382812, + "step": 3660 + }, + { + "epoch": 2.526479213386234, + "grad_norm": 0.4203450381755829, + "learning_rate": 1.9735327963176066e-06, + "logits/chosen": 3.1273467540740967, + "logits/rejected": 3.1273467540740967, + "logps/chosen": -176.1241455078125, + "logps/rejected": -176.1241455078125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.836645126342773, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.836645126342773, + "step": 3661 + }, + { + "epoch": 2.5271692254614457, + "grad_norm": 0.3208460509777069, + "learning_rate": 1.970655926352129e-06, + "logits/chosen": 3.444988250732422, + "logits/rejected": 3.6209630966186523, + "logps/chosen": -150.33560180664062, + "logps/rejected": -167.81146240234375, + "loss": 0.5205, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.133872985839844, + "rewards/margins": 1.7119377851486206, + "rewards/rejected": -11.845809936523438, + "step": 3662 + }, + { + "epoch": 2.527859237536657, + "grad_norm": 0.2838817536830902, + "learning_rate": 1.9677790563866513e-06, + "logits/chosen": 3.4732892513275146, + "logits/rejected": 3.7841413021087646, + "logps/chosen": -157.94729614257812, + "logps/rejected": -187.05307006835938, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.919760704040527, + "rewards/margins": 2.8115530014038086, + "rewards/rejected": -13.731313705444336, + "step": 3663 + }, + { + "epoch": 2.528549249611868, + "grad_norm": 0.3462006747722626, + "learning_rate": 1.964902186421174e-06, + "logits/chosen": 3.4437875747680664, + "logits/rejected": 3.644454002380371, + "logps/chosen": -176.6556396484375, + "logps/rejected": -192.375, + "loss": 0.5206, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.663000106811523, + "rewards/margins": 1.6628098487854004, + "rewards/rejected": -14.325811386108398, + "step": 3664 + }, + { + "epoch": 2.5292392616870796, + "grad_norm": 0.3281342685222626, + "learning_rate": 1.9620253164556965e-06, + "logits/chosen": 3.2648026943206787, + "logits/rejected": 3.2648026943206787, + "logps/chosen": -172.0288543701172, + "logps/rejected": -172.0288543701172, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.503034591674805, + "rewards/margins": 0.0, + "rewards/rejected": -12.503034591674805, + "step": 3665 + }, + { + "epoch": 2.5299292737622907, + "grad_norm": 1.0909175872802734, + "learning_rate": 1.959148446490219e-06, + "logits/chosen": 2.8845582008361816, + "logits/rejected": 2.8721742630004883, + "logps/chosen": -158.2958984375, + "logps/rejected": -161.3181610107422, + "loss": 0.6166, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.150720596313477, + "rewards/margins": 0.3091139793395996, + "rewards/rejected": -11.459834098815918, + "step": 3666 + }, + { + "epoch": 2.5306192858375023, + "grad_norm": 8.782440185546875, + "learning_rate": 1.956271576524741e-06, + "logits/chosen": 3.211768627166748, + "logits/rejected": 3.168363571166992, + "logps/chosen": -199.24903869628906, + "logps/rejected": -196.85305786132812, + "loss": 0.8778, + "rewards/accuracies": 0.0, + "rewards/chosen": -15.100025177001953, + "rewards/margins": -0.25615251064300537, + "rewards/rejected": -14.843873023986816, + "step": 3667 + }, + { + "epoch": 2.5313092979127134, + "grad_norm": 0.3492814600467682, + "learning_rate": 1.9533947065592635e-06, + "logits/chosen": 3.3119852542877197, + "logits/rejected": 3.3962411880493164, + "logps/chosen": -152.74415588378906, + "logps/rejected": -163.55935668945312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.638468742370605, + "rewards/margins": 1.0647857189178467, + "rewards/rejected": -11.703253746032715, + "step": 3668 + }, + { + "epoch": 2.531999309987925, + "grad_norm": 0.42657920718193054, + "learning_rate": 1.950517836593786e-06, + "logits/chosen": 3.212862730026245, + "logits/rejected": 3.212862730026245, + "logps/chosen": -191.24392700195312, + "logps/rejected": -191.24392700195312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.32409381866455, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.324092864990234, + "step": 3669 + }, + { + "epoch": 2.532689322063136, + "grad_norm": 0.3741990923881531, + "learning_rate": 1.9476409666283087e-06, + "logits/chosen": 3.635312080383301, + "logits/rejected": 3.555975914001465, + "logps/chosen": -170.02969360351562, + "logps/rejected": -186.72019958496094, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.245063781738281, + "rewards/margins": 1.613576889038086, + "rewards/rejected": -13.858641624450684, + "step": 3670 + }, + { + "epoch": 2.5333793341383473, + "grad_norm": 0.35060998797416687, + "learning_rate": 1.944764096662831e-06, + "logits/chosen": 3.256434202194214, + "logits/rejected": 3.256434202194214, + "logps/chosen": -178.56710815429688, + "logps/rejected": -178.56710815429688, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.173200607299805, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -13.173200607299805, + "step": 3671 + }, + { + "epoch": 2.534069346213559, + "grad_norm": 1.289057970046997, + "learning_rate": 1.9418872266973534e-06, + "logits/chosen": 3.3372750282287598, + "logits/rejected": 3.3076624870300293, + "logps/chosen": -140.25244140625, + "logps/rejected": -171.13592529296875, + "loss": 0.4415, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.064979553222656, + "rewards/margins": 3.1348211765289307, + "rewards/rejected": -12.199801445007324, + "step": 3672 + }, + { + "epoch": 2.53475935828877, + "grad_norm": 0.4979020655155182, + "learning_rate": 1.9390103567318757e-06, + "logits/chosen": 3.401480197906494, + "logits/rejected": 3.401480197906494, + "logps/chosen": -165.32684326171875, + "logps/rejected": -165.32684326171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.811605453491211, + "rewards/margins": -8.940696716308594e-08, + "rewards/rejected": -11.811605453491211, + "step": 3673 + }, + { + "epoch": 2.535449370363981, + "grad_norm": 0.3862718939781189, + "learning_rate": 1.9361334867663985e-06, + "logits/chosen": 3.641927480697632, + "logits/rejected": 3.641927480697632, + "logps/chosen": -171.4876708984375, + "logps/rejected": -171.4876708984375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.375197410583496, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.375198364257812, + "step": 3674 + }, + { + "epoch": 2.5361393824391927, + "grad_norm": 0.3833481967449188, + "learning_rate": 1.933256616800921e-06, + "logits/chosen": 3.6810271739959717, + "logits/rejected": 3.6810271739959717, + "logps/chosen": -164.31982421875, + "logps/rejected": -164.31982421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.819307327270508, + "rewards/margins": 0.0, + "rewards/rejected": -11.819307327270508, + "step": 3675 + }, + { + "epoch": 2.5368293945144043, + "grad_norm": 0.7220263481140137, + "learning_rate": 1.930379746835443e-06, + "logits/chosen": 3.365841865539551, + "logits/rejected": 3.3203721046447754, + "logps/chosen": -160.1427764892578, + "logps/rejected": -170.51341247558594, + "loss": 0.5244, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.371350288391113, + "rewards/margins": 1.001351237297058, + "rewards/rejected": -12.372702598571777, + "step": 3676 + }, + { + "epoch": 2.5375194065896154, + "grad_norm": 0.39641109108924866, + "learning_rate": 1.9275028768699655e-06, + "logits/chosen": 3.181772232055664, + "logits/rejected": 3.181772232055664, + "logps/chosen": -185.20257568359375, + "logps/rejected": -185.20257568359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.837738037109375, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.837738037109375, + "step": 3677 + }, + { + "epoch": 2.5382094186648265, + "grad_norm": 0.35387319326400757, + "learning_rate": 1.924626006904488e-06, + "logits/chosen": 3.2011899948120117, + "logits/rejected": 3.4215149879455566, + "logps/chosen": -146.49729919433594, + "logps/rejected": -156.0562744140625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.863428115844727, + "rewards/margins": 0.9947404861450195, + "rewards/rejected": -10.858168601989746, + "step": 3678 + }, + { + "epoch": 2.538899430740038, + "grad_norm": 0.28213390707969666, + "learning_rate": 1.9217491369390102e-06, + "logits/chosen": 3.5917694568634033, + "logits/rejected": 3.6065361499786377, + "logps/chosen": -173.49639892578125, + "logps/rejected": -182.49264526367188, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.534334182739258, + "rewards/margins": 0.9395091533660889, + "rewards/rejected": -13.47384262084961, + "step": 3679 + }, + { + "epoch": 2.5395894428152492, + "grad_norm": 0.39312222599983215, + "learning_rate": 1.918872266973533e-06, + "logits/chosen": 3.7465572357177734, + "logits/rejected": 3.7051806449890137, + "logps/chosen": -171.40357971191406, + "logps/rejected": -181.12335205078125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.244013786315918, + "rewards/margins": 1.0189847946166992, + "rewards/rejected": -13.262998580932617, + "step": 3680 + }, + { + "epoch": 2.5402794548904604, + "grad_norm": 5.6913604736328125, + "learning_rate": 1.9159953970080554e-06, + "logits/chosen": 3.2764806747436523, + "logits/rejected": 3.425586223602295, + "logps/chosen": -177.57174682617188, + "logps/rejected": -179.09942626953125, + "loss": 0.649, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.137947082519531, + "rewards/margins": 0.11289322376251221, + "rewards/rejected": -13.25084114074707, + "step": 3681 + }, + { + "epoch": 2.540969466965672, + "grad_norm": 4.108907222747803, + "learning_rate": 1.9131185270425777e-06, + "logits/chosen": 3.7638862133026123, + "logits/rejected": 3.532501220703125, + "logps/chosen": -175.48297119140625, + "logps/rejected": -176.14248657226562, + "loss": 0.6568, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.62084674835205, + "rewards/margins": 0.08789360523223877, + "rewards/rejected": -12.708741188049316, + "step": 3682 + }, + { + "epoch": 2.541659479040883, + "grad_norm": 0.2860874831676483, + "learning_rate": 1.9102416570771005e-06, + "logits/chosen": 3.704467535018921, + "logits/rejected": 3.6962432861328125, + "logps/chosen": -152.4315948486328, + "logps/rejected": -166.1490478515625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.549806594848633, + "rewards/margins": 1.3163937330245972, + "rewards/rejected": -11.866199493408203, + "step": 3683 + }, + { + "epoch": 2.5423494911160947, + "grad_norm": 0.4297160506248474, + "learning_rate": 1.907364787111623e-06, + "logits/chosen": 3.577690362930298, + "logits/rejected": 3.7537624835968018, + "logps/chosen": -173.27786254882812, + "logps/rejected": -181.79837036132812, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.535198211669922, + "rewards/margins": 0.8288196325302124, + "rewards/rejected": -13.364017486572266, + "step": 3684 + }, + { + "epoch": 2.543039503191306, + "grad_norm": 0.35767942667007446, + "learning_rate": 1.9044879171461452e-06, + "logits/chosen": 3.752113103866577, + "logits/rejected": 3.7993922233581543, + "logps/chosen": -163.5525665283203, + "logps/rejected": -173.98228454589844, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.732433319091797, + "rewards/margins": 1.051090121269226, + "rewards/rejected": -12.783523559570312, + "step": 3685 + }, + { + "epoch": 2.5437295152665174, + "grad_norm": 0.5743569731712341, + "learning_rate": 1.9016110471806676e-06, + "logits/chosen": 3.111290693283081, + "logits/rejected": 3.205623149871826, + "logps/chosen": -144.3866424560547, + "logps/rejected": -156.67523193359375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.686251640319824, + "rewards/margins": 1.188633918762207, + "rewards/rejected": -10.874885559082031, + "step": 3686 + }, + { + "epoch": 2.5444195273417285, + "grad_norm": 0.3501124978065491, + "learning_rate": 1.8987341772151901e-06, + "logits/chosen": 3.5162479877471924, + "logits/rejected": 3.767782688140869, + "logps/chosen": -180.59750366210938, + "logps/rejected": -186.57855224609375, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.350643157958984, + "rewards/margins": 0.608244776725769, + "rewards/rejected": -13.958888053894043, + "step": 3687 + }, + { + "epoch": 2.5451095394169396, + "grad_norm": 0.42926403880119324, + "learning_rate": 1.8958573072497125e-06, + "logits/chosen": 3.419139862060547, + "logits/rejected": 3.613771915435791, + "logps/chosen": -162.8734130859375, + "logps/rejected": -179.93896484375, + "loss": 0.4368, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.439083099365234, + "rewards/margins": 1.7625977993011475, + "rewards/rejected": -13.201682090759277, + "step": 3688 + }, + { + "epoch": 2.545799551492151, + "grad_norm": 0.334235280752182, + "learning_rate": 1.892980437284235e-06, + "logits/chosen": 3.426772117614746, + "logits/rejected": 3.478837251663208, + "logps/chosen": -171.84304809570312, + "logps/rejected": -186.83514404296875, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.41278076171875, + "rewards/margins": 1.473866581916809, + "rewards/rejected": -13.886646270751953, + "step": 3689 + }, + { + "epoch": 2.5464895635673623, + "grad_norm": 0.370835542678833, + "learning_rate": 1.8901035673187574e-06, + "logits/chosen": 3.394568920135498, + "logits/rejected": 3.4010229110717773, + "logps/chosen": -155.09719848632812, + "logps/rejected": -162.97491455078125, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.930891036987305, + "rewards/margins": 0.7686830163002014, + "rewards/rejected": -11.69957447052002, + "step": 3690 + }, + { + "epoch": 2.5471795756425735, + "grad_norm": 0.47515353560447693, + "learning_rate": 1.88722669735328e-06, + "logits/chosen": 3.388273239135742, + "logits/rejected": 3.388273239135742, + "logps/chosen": -169.1940460205078, + "logps/rejected": -169.1940460205078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.220874786376953, + "rewards/margins": 0.0, + "rewards/rejected": -12.220874786376953, + "step": 3691 + }, + { + "epoch": 2.547869587717785, + "grad_norm": 0.32346341013908386, + "learning_rate": 1.8843498273878023e-06, + "logits/chosen": 3.492431163787842, + "logits/rejected": 3.611694812774658, + "logps/chosen": -176.58377075195312, + "logps/rejected": -184.70132446289062, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.853775978088379, + "rewards/margins": 0.7974064350128174, + "rewards/rejected": -13.6511812210083, + "step": 3692 + }, + { + "epoch": 2.5485595997929966, + "grad_norm": 0.3526836633682251, + "learning_rate": 1.8814729574223249e-06, + "logits/chosen": 3.355520248413086, + "logits/rejected": 3.541440725326538, + "logps/chosen": -161.87628173828125, + "logps/rejected": -170.05691528320312, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.45116138458252, + "rewards/margins": 0.8163110017776489, + "rewards/rejected": -12.267473220825195, + "step": 3693 + }, + { + "epoch": 2.5492496118682078, + "grad_norm": 0.34337061643600464, + "learning_rate": 1.878596087456847e-06, + "logits/chosen": 3.468125343322754, + "logits/rejected": 3.6815438270568848, + "logps/chosen": -164.83782958984375, + "logps/rejected": -175.49974060058594, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.744091033935547, + "rewards/margins": 1.0553853511810303, + "rewards/rejected": -12.799476623535156, + "step": 3694 + }, + { + "epoch": 2.549939623943419, + "grad_norm": 0.44756659865379333, + "learning_rate": 1.8757192174913696e-06, + "logits/chosen": 3.4862422943115234, + "logits/rejected": 3.4862422943115234, + "logps/chosen": -173.00836181640625, + "logps/rejected": -173.00836181640625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.507259368896484, + "rewards/margins": 0.0, + "rewards/rejected": -12.507259368896484, + "step": 3695 + }, + { + "epoch": 2.5506296360186305, + "grad_norm": 70.04923248291016, + "learning_rate": 1.872842347525892e-06, + "logits/chosen": 3.4547502994537354, + "logits/rejected": 3.3853020668029785, + "logps/chosen": -151.68893432617188, + "logps/rejected": -145.77084350585938, + "loss": 1.199, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.306384086608887, + "rewards/margins": -0.5914140343666077, + "rewards/rejected": -9.714969635009766, + "step": 3696 + }, + { + "epoch": 2.5513196480938416, + "grad_norm": 0.339942067861557, + "learning_rate": 1.8699654775604145e-06, + "logits/chosen": 3.6303279399871826, + "logits/rejected": 3.7662055492401123, + "logps/chosen": -170.13784790039062, + "logps/rejected": -186.3814697265625, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.271727561950684, + "rewards/margins": 1.66645085811615, + "rewards/rejected": -13.938178062438965, + "step": 3697 + }, + { + "epoch": 2.5520096601690527, + "grad_norm": 0.43189722299575806, + "learning_rate": 1.8670886075949368e-06, + "logits/chosen": 3.5638270378112793, + "logits/rejected": 3.5476298332214355, + "logps/chosen": -171.8782958984375, + "logps/rejected": -184.94985961914062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.3988618850708, + "rewards/margins": 1.321097731590271, + "rewards/rejected": -13.719959259033203, + "step": 3698 + }, + { + "epoch": 2.5526996722442643, + "grad_norm": 0.3778727948665619, + "learning_rate": 1.8642117376294594e-06, + "logits/chosen": 3.2329206466674805, + "logits/rejected": 3.5500998497009277, + "logps/chosen": -164.13394165039062, + "logps/rejected": -186.3525390625, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.488882064819336, + "rewards/margins": 2.22156023979187, + "rewards/rejected": -13.710441589355469, + "step": 3699 + }, + { + "epoch": 2.5533896843194754, + "grad_norm": 0.38875317573547363, + "learning_rate": 1.8613348676639817e-06, + "logits/chosen": 3.2211380004882812, + "logits/rejected": 3.251394748687744, + "logps/chosen": -162.16651916503906, + "logps/rejected": -175.37594604492188, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.605772018432617, + "rewards/margins": 1.2814453840255737, + "rewards/rejected": -12.887216567993164, + "step": 3700 + }, + { + "epoch": 2.554079696394687, + "grad_norm": 0.43335863947868347, + "learning_rate": 1.8584579976985043e-06, + "logits/chosen": 3.4063735008239746, + "logits/rejected": 3.4063735008239746, + "logps/chosen": -171.91455078125, + "logps/rejected": -171.91455078125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.468596458435059, + "rewards/margins": 0.0, + "rewards/rejected": -12.468596458435059, + "step": 3701 + }, + { + "epoch": 2.554769708469898, + "grad_norm": 0.29123759269714355, + "learning_rate": 1.8555811277330267e-06, + "logits/chosen": 3.318110466003418, + "logits/rejected": 3.5384511947631836, + "logps/chosen": -150.4760284423828, + "logps/rejected": -178.9071807861328, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.226410865783691, + "rewards/margins": 2.8513734340667725, + "rewards/rejected": -13.077784538269043, + "step": 3702 + }, + { + "epoch": 2.5554597205451097, + "grad_norm": 0.3458091616630554, + "learning_rate": 1.8527042577675492e-06, + "logits/chosen": 3.486940860748291, + "logits/rejected": 3.6210548877716064, + "logps/chosen": -169.16159057617188, + "logps/rejected": -178.84890747070312, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.218537330627441, + "rewards/margins": 1.051831603050232, + "rewards/rejected": -13.270368576049805, + "step": 3703 + }, + { + "epoch": 2.556149732620321, + "grad_norm": 0.29905998706817627, + "learning_rate": 1.8498273878020714e-06, + "logits/chosen": 3.780590534210205, + "logits/rejected": 4.025979042053223, + "logps/chosen": -178.77052307128906, + "logps/rejected": -187.95237731933594, + "loss": 0.6066, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.023719787597656, + "rewards/margins": 0.8838592171669006, + "rewards/rejected": -13.907577514648438, + "step": 3704 + }, + { + "epoch": 2.556839744695532, + "grad_norm": 0.34481281042099, + "learning_rate": 1.846950517836594e-06, + "logits/chosen": 3.376272439956665, + "logits/rejected": 3.5707767009735107, + "logps/chosen": -148.171142578125, + "logps/rejected": -174.33718872070312, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.016210556030273, + "rewards/margins": 2.511754274368286, + "rewards/rejected": -12.52796459197998, + "step": 3705 + }, + { + "epoch": 2.5575297567707436, + "grad_norm": 0.3381449580192566, + "learning_rate": 1.8440736478711163e-06, + "logits/chosen": 4.007003307342529, + "logits/rejected": 3.989525318145752, + "logps/chosen": -176.57386779785156, + "logps/rejected": -186.1647491455078, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.758920669555664, + "rewards/margins": 0.9773062467575073, + "rewards/rejected": -13.736227035522461, + "step": 3706 + }, + { + "epoch": 2.5582197688459547, + "grad_norm": 0.6321740746498108, + "learning_rate": 1.8411967779056388e-06, + "logits/chosen": 3.603797674179077, + "logits/rejected": 3.603797674179077, + "logps/chosen": -175.9532928466797, + "logps/rejected": -175.95327758789062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.744190216064453, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.744190216064453, + "step": 3707 + }, + { + "epoch": 2.5589097809211663, + "grad_norm": 1.4382692575454712, + "learning_rate": 1.8383199079401612e-06, + "logits/chosen": 3.2006235122680664, + "logits/rejected": 3.2544007301330566, + "logps/chosen": -158.7970428466797, + "logps/rejected": -162.00514221191406, + "loss": 0.6149, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.172407150268555, + "rewards/margins": 0.33360886573791504, + "rewards/rejected": -11.506014823913574, + "step": 3708 + }, + { + "epoch": 2.5595997929963774, + "grad_norm": 0.40571799874305725, + "learning_rate": 1.8354430379746838e-06, + "logits/chosen": 3.602510929107666, + "logits/rejected": 3.602510929107666, + "logps/chosen": -156.52621459960938, + "logps/rejected": -156.52621459960938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.992450714111328, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -10.992450714111328, + "step": 3709 + }, + { + "epoch": 2.560289805071589, + "grad_norm": 0.24338014423847198, + "learning_rate": 1.8325661680092061e-06, + "logits/chosen": 3.7437682151794434, + "logits/rejected": 3.9090499877929688, + "logps/chosen": -142.88388061523438, + "logps/rejected": -183.3235626220703, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.70015811920166, + "rewards/margins": 3.8977091312408447, + "rewards/rejected": -13.59786605834961, + "step": 3710 + }, + { + "epoch": 2.5609798171468, + "grad_norm": 0.31184718012809753, + "learning_rate": 1.8296892980437287e-06, + "logits/chosen": 3.717630386352539, + "logits/rejected": 3.717630386352539, + "logps/chosen": -184.96221923828125, + "logps/rejected": -184.96221923828125, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.655957221984863, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.65595817565918, + "step": 3711 + }, + { + "epoch": 2.5616698292220113, + "grad_norm": 0.33598870038986206, + "learning_rate": 1.826812428078251e-06, + "logits/chosen": 3.4611291885375977, + "logits/rejected": 3.4611291885375977, + "logps/chosen": -174.51519775390625, + "logps/rejected": -174.5151824951172, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.588663101196289, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.588663101196289, + "step": 3712 + }, + { + "epoch": 2.562359841297223, + "grad_norm": 0.41209113597869873, + "learning_rate": 1.8239355581127736e-06, + "logits/chosen": 3.344834089279175, + "logits/rejected": 3.3888368606567383, + "logps/chosen": -165.76611328125, + "logps/rejected": -175.735595703125, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.704559326171875, + "rewards/margins": 1.0266296863555908, + "rewards/rejected": -12.73118782043457, + "step": 3713 + }, + { + "epoch": 2.563049853372434, + "grad_norm": 0.3713444769382477, + "learning_rate": 1.8210586881472957e-06, + "logits/chosen": 4.105559825897217, + "logits/rejected": 4.105559825897217, + "logps/chosen": -183.53831481933594, + "logps/rejected": -183.53831481933594, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.444708824157715, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.444708824157715, + "step": 3714 + }, + { + "epoch": 2.563739865447645, + "grad_norm": 0.4890138506889343, + "learning_rate": 1.8181818181818183e-06, + "logits/chosen": 3.2901110649108887, + "logits/rejected": 3.4889323711395264, + "logps/chosen": -145.01193237304688, + "logps/rejected": -185.01931762695312, + "loss": 0.3505, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.089057922363281, + "rewards/margins": 3.759723424911499, + "rewards/rejected": -13.848779678344727, + "step": 3715 + }, + { + "epoch": 2.5644298775228567, + "grad_norm": 0.3167511522769928, + "learning_rate": 1.8153049482163406e-06, + "logits/chosen": 3.50213885307312, + "logits/rejected": 3.5815441608428955, + "logps/chosen": -188.51828002929688, + "logps/rejected": -194.7897491455078, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.778242111206055, + "rewards/margins": 0.6285711526870728, + "rewards/rejected": -14.406813621520996, + "step": 3716 + }, + { + "epoch": 2.565119889598068, + "grad_norm": 0.3135661780834198, + "learning_rate": 1.8124280782508632e-06, + "logits/chosen": 3.6661953926086426, + "logits/rejected": 3.9039173126220703, + "logps/chosen": -167.12319946289062, + "logps/rejected": -174.9788055419922, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.993086814880371, + "rewards/margins": 0.805582582950592, + "rewards/rejected": -12.79866886138916, + "step": 3717 + }, + { + "epoch": 2.5658099016732794, + "grad_norm": 0.4513089954853058, + "learning_rate": 1.8095512082853856e-06, + "logits/chosen": 2.9465079307556152, + "logits/rejected": 3.0652103424072266, + "logps/chosen": -136.61758422851562, + "logps/rejected": -158.8054962158203, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.898979187011719, + "rewards/margins": 2.2043042182922363, + "rewards/rejected": -11.103282928466797, + "step": 3718 + }, + { + "epoch": 2.5664999137484905, + "grad_norm": 0.40078437328338623, + "learning_rate": 1.8066743383199081e-06, + "logits/chosen": 3.7443180084228516, + "logits/rejected": 3.878518581390381, + "logps/chosen": -157.69747924804688, + "logps/rejected": -185.2332305908203, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.986385345458984, + "rewards/margins": 2.7088046073913574, + "rewards/rejected": -13.6951904296875, + "step": 3719 + }, + { + "epoch": 2.567189925823702, + "grad_norm": 0.3302474617958069, + "learning_rate": 1.8037974683544305e-06, + "logits/chosen": 3.918571949005127, + "logits/rejected": 3.918571949005127, + "logps/chosen": -189.15411376953125, + "logps/rejected": -189.15411376953125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.999482154846191, + "rewards/margins": 0.0, + "rewards/rejected": -13.999482154846191, + "step": 3720 + }, + { + "epoch": 2.5678799378989132, + "grad_norm": 0.41239047050476074, + "learning_rate": 1.800920598388953e-06, + "logits/chosen": 3.880333662033081, + "logits/rejected": 3.880333662033081, + "logps/chosen": -170.00682067871094, + "logps/rejected": -170.00682067871094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.27199935913086, + "rewards/margins": 0.0, + "rewards/rejected": -12.27199935913086, + "step": 3721 + }, + { + "epoch": 2.5685699499741244, + "grad_norm": 0.3413873612880707, + "learning_rate": 1.7980437284234754e-06, + "logits/chosen": 3.5826480388641357, + "logits/rejected": 3.5826480388641357, + "logps/chosen": -181.1867218017578, + "logps/rejected": -181.1867218017578, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.499923706054688, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.499923706054688, + "step": 3722 + }, + { + "epoch": 2.569259962049336, + "grad_norm": 0.45764338970184326, + "learning_rate": 1.795166858457998e-06, + "logits/chosen": 3.1420257091522217, + "logits/rejected": 3.3214268684387207, + "logps/chosen": -144.9394989013672, + "logps/rejected": -163.98143005371094, + "loss": 0.5219, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.669297218322754, + "rewards/margins": 1.8265125751495361, + "rewards/rejected": -11.495810508728027, + "step": 3723 + }, + { + "epoch": 2.569949974124547, + "grad_norm": 0.4241916537284851, + "learning_rate": 1.79228998849252e-06, + "logits/chosen": 3.526183605194092, + "logits/rejected": 3.595874309539795, + "logps/chosen": -174.06805419921875, + "logps/rejected": -180.8015594482422, + "loss": 0.607, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.480670928955078, + "rewards/margins": 0.6898589730262756, + "rewards/rejected": -13.170530319213867, + "step": 3724 + }, + { + "epoch": 2.5706399861997586, + "grad_norm": 0.44209200143814087, + "learning_rate": 1.7894131185270427e-06, + "logits/chosen": 3.625218391418457, + "logits/rejected": 3.5760397911071777, + "logps/chosen": -172.76170349121094, + "logps/rejected": -185.7747344970703, + "loss": 0.5222, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.542366027832031, + "rewards/margins": 1.2667022943496704, + "rewards/rejected": -13.809067726135254, + "step": 3725 + }, + { + "epoch": 2.5713299982749698, + "grad_norm": 0.42358189821243286, + "learning_rate": 1.786536248561565e-06, + "logits/chosen": 3.1075565814971924, + "logits/rejected": 3.178925037384033, + "logps/chosen": -175.0888671875, + "logps/rejected": -184.62167358398438, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.840641021728516, + "rewards/margins": 0.9428789019584656, + "rewards/rejected": -13.783519744873047, + "step": 3726 + }, + { + "epoch": 2.5720200103501814, + "grad_norm": 0.3202028274536133, + "learning_rate": 1.7836593785960876e-06, + "logits/chosen": 3.1870598793029785, + "logits/rejected": 3.4390416145324707, + "logps/chosen": -146.6125030517578, + "logps/rejected": -169.1889190673828, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.832575798034668, + "rewards/margins": 2.1897706985473633, + "rewards/rejected": -12.022346496582031, + "step": 3727 + }, + { + "epoch": 2.5727100224253925, + "grad_norm": 0.2924342751502991, + "learning_rate": 1.78078250863061e-06, + "logits/chosen": 4.1423869132995605, + "logits/rejected": 4.13308048248291, + "logps/chosen": -168.62229919433594, + "logps/rejected": -178.4266357421875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.966148376464844, + "rewards/margins": 1.004528284072876, + "rewards/rejected": -12.970677375793457, + "step": 3728 + }, + { + "epoch": 2.5734000345006036, + "grad_norm": 0.3703171908855438, + "learning_rate": 1.7779056386651325e-06, + "logits/chosen": 3.598376750946045, + "logits/rejected": 3.6609549522399902, + "logps/chosen": -147.83547973632812, + "logps/rejected": -174.40850830078125, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.052834510803223, + "rewards/margins": 2.6282691955566406, + "rewards/rejected": -12.681103706359863, + "step": 3729 + }, + { + "epoch": 2.574090046575815, + "grad_norm": 0.30577531456947327, + "learning_rate": 1.7750287686996548e-06, + "logits/chosen": 3.3184897899627686, + "logits/rejected": 3.3923068046569824, + "logps/chosen": -161.0838623046875, + "logps/rejected": -170.47535705566406, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.425036430358887, + "rewards/margins": 0.9447731971740723, + "rewards/rejected": -12.3698091506958, + "step": 3730 + }, + { + "epoch": 2.5747800586510263, + "grad_norm": 0.333660364151001, + "learning_rate": 1.7721518987341774e-06, + "logits/chosen": 3.6234536170959473, + "logits/rejected": 3.655186653137207, + "logps/chosen": -152.50926208496094, + "logps/rejected": -160.7034149169922, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.599580764770508, + "rewards/margins": 0.870187520980835, + "rewards/rejected": -11.469768524169922, + "step": 3731 + }, + { + "epoch": 2.5754700707262375, + "grad_norm": 0.3711493909358978, + "learning_rate": 1.7692750287686998e-06, + "logits/chosen": 3.1653571128845215, + "logits/rejected": 3.508633613586426, + "logps/chosen": -139.16925048828125, + "logps/rejected": -175.6619415283203, + "loss": 0.4333, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.133346557617188, + "rewards/margins": 3.6359801292419434, + "rewards/rejected": -12.769325256347656, + "step": 3732 + }, + { + "epoch": 2.576160082801449, + "grad_norm": 0.33009305596351624, + "learning_rate": 1.7663981588032223e-06, + "logits/chosen": 3.2612152099609375, + "logits/rejected": 3.5204381942749023, + "logps/chosen": -154.88246154785156, + "logps/rejected": -171.1324920654297, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.730535507202148, + "rewards/margins": 1.6673364639282227, + "rewards/rejected": -12.397872924804688, + "step": 3733 + }, + { + "epoch": 2.5768500948766606, + "grad_norm": 0.4302142858505249, + "learning_rate": 1.7635212888377449e-06, + "logits/chosen": 4.006315231323242, + "logits/rejected": 4.006315231323242, + "logps/chosen": -183.74392700195312, + "logps/rejected": -183.74392700195312, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.632315635681152, + "rewards/margins": 0.0, + "rewards/rejected": -13.632315635681152, + "step": 3734 + }, + { + "epoch": 2.5775401069518717, + "grad_norm": 0.4128783047199249, + "learning_rate": 1.760644418872267e-06, + "logits/chosen": 3.500319480895996, + "logits/rejected": 3.500319480895996, + "logps/chosen": -165.36605834960938, + "logps/rejected": -165.36605834960938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.840165138244629, + "rewards/margins": 0.0, + "rewards/rejected": -11.840165138244629, + "step": 3735 + }, + { + "epoch": 2.578230119027083, + "grad_norm": 0.3958815634250641, + "learning_rate": 1.7577675489067898e-06, + "logits/chosen": 3.693814277648926, + "logits/rejected": 3.693814277648926, + "logps/chosen": -165.81179809570312, + "logps/rejected": -165.81179809570312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.73077392578125, + "rewards/margins": 0.0, + "rewards/rejected": -11.73077392578125, + "step": 3736 + }, + { + "epoch": 2.5789201311022945, + "grad_norm": 0.2919794023036957, + "learning_rate": 1.754890678941312e-06, + "logits/chosen": 2.8650262355804443, + "logits/rejected": 3.015664577484131, + "logps/chosen": -142.46131896972656, + "logps/rejected": -151.56423950195312, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.368396759033203, + "rewards/margins": 0.8858280777931213, + "rewards/rejected": -10.25422477722168, + "step": 3737 + }, + { + "epoch": 2.5796101431775056, + "grad_norm": 0.3889884948730469, + "learning_rate": 1.7520138089758345e-06, + "logits/chosen": 3.357213258743286, + "logits/rejected": 3.4347984790802, + "logps/chosen": -173.33349609375, + "logps/rejected": -185.70315551757812, + "loss": 0.5216, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.682039260864258, + "rewards/margins": 1.2637211084365845, + "rewards/rejected": -13.945759773254395, + "step": 3738 + }, + { + "epoch": 2.5803001552527167, + "grad_norm": 0.3103839159011841, + "learning_rate": 1.7491369390103568e-06, + "logits/chosen": 3.5395348072052, + "logits/rejected": 3.660851001739502, + "logps/chosen": -179.30636596679688, + "logps/rejected": -192.63418579101562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.17819881439209, + "rewards/margins": 1.342063546180725, + "rewards/rejected": -14.520261764526367, + "step": 3739 + }, + { + "epoch": 2.5809901673279283, + "grad_norm": 0.43035972118377686, + "learning_rate": 1.7462600690448794e-06, + "logits/chosen": 3.684183359146118, + "logits/rejected": 3.6918108463287354, + "logps/chosen": -164.42630004882812, + "logps/rejected": -175.73391723632812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.598551750183105, + "rewards/margins": 1.139047622680664, + "rewards/rejected": -12.73759937286377, + "step": 3740 + }, + { + "epoch": 2.5816801794031394, + "grad_norm": 0.3771646320819855, + "learning_rate": 1.7433831990794018e-06, + "logits/chosen": 3.3788833618164062, + "logits/rejected": 3.3412160873413086, + "logps/chosen": -181.78704833984375, + "logps/rejected": -189.03204345703125, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.384787559509277, + "rewards/margins": 0.8290205001831055, + "rewards/rejected": -14.213808059692383, + "step": 3741 + }, + { + "epoch": 2.582370191478351, + "grad_norm": 0.4558018445968628, + "learning_rate": 1.7405063291139243e-06, + "logits/chosen": 3.6259098052978516, + "logits/rejected": 3.6259098052978516, + "logps/chosen": -167.83758544921875, + "logps/rejected": -167.83758544921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.123384475708008, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -12.123384475708008, + "step": 3742 + }, + { + "epoch": 2.583060203553562, + "grad_norm": 0.4288727641105652, + "learning_rate": 1.7376294591484467e-06, + "logits/chosen": 4.261684417724609, + "logits/rejected": 4.261684417724609, + "logps/chosen": -170.53419494628906, + "logps/rejected": -170.53419494628906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.460809707641602, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.460809707641602, + "step": 3743 + }, + { + "epoch": 2.5837502156287737, + "grad_norm": 0.43110421299934387, + "learning_rate": 1.7347525891829692e-06, + "logits/chosen": 3.337287187576294, + "logits/rejected": 3.475262403488159, + "logps/chosen": -155.342529296875, + "logps/rejected": -169.79537963867188, + "loss": 0.5217, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.644424438476562, + "rewards/margins": 1.4930849075317383, + "rewards/rejected": -12.137508392333984, + "step": 3744 + }, + { + "epoch": 2.584440227703985, + "grad_norm": 0.3458033800125122, + "learning_rate": 1.7318757192174914e-06, + "logits/chosen": 3.738816261291504, + "logits/rejected": 3.738816261291504, + "logps/chosen": -187.67498779296875, + "logps/rejected": -187.67498779296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.959114074707031, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.959112167358398, + "step": 3745 + }, + { + "epoch": 2.585130239779196, + "grad_norm": 0.39342454075813293, + "learning_rate": 1.7289988492520142e-06, + "logits/chosen": 3.408215045928955, + "logits/rejected": 3.6957740783691406, + "logps/chosen": -152.32583618164062, + "logps/rejected": -177.33302307128906, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.473377227783203, + "rewards/margins": 2.4719982147216797, + "rewards/rejected": -12.945374488830566, + "step": 3746 + }, + { + "epoch": 2.5858202518544076, + "grad_norm": 0.32892197370529175, + "learning_rate": 1.7261219792865363e-06, + "logits/chosen": 3.4843459129333496, + "logits/rejected": 3.5515761375427246, + "logps/chosen": -162.72702026367188, + "logps/rejected": -174.21224975585938, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.62634563446045, + "rewards/margins": 1.0930677652359009, + "rewards/rejected": -12.719413757324219, + "step": 3747 + }, + { + "epoch": 2.5865102639296187, + "grad_norm": 0.3933846354484558, + "learning_rate": 1.7232451093210589e-06, + "logits/chosen": 3.4467759132385254, + "logits/rejected": 3.4467759132385254, + "logps/chosen": -163.46673583984375, + "logps/rejected": -163.46673583984375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.472238540649414, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.472237586975098, + "step": 3748 + }, + { + "epoch": 2.58720027600483, + "grad_norm": 0.41590261459350586, + "learning_rate": 1.7203682393555812e-06, + "logits/chosen": 3.400387763977051, + "logits/rejected": 3.403076171875, + "logps/chosen": -152.22470092773438, + "logps/rejected": -157.7667236328125, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.56583023071289, + "rewards/margins": 0.5990906953811646, + "rewards/rejected": -11.164920806884766, + "step": 3749 + }, + { + "epoch": 2.5878902880800414, + "grad_norm": 0.34012889862060547, + "learning_rate": 1.7174913693901038e-06, + "logits/chosen": 3.0152647495269775, + "logits/rejected": 3.2103309631347656, + "logps/chosen": -153.70327758789062, + "logps/rejected": -173.80918884277344, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.66629409790039, + "rewards/margins": 2.080162286758423, + "rewards/rejected": -12.746455192565918, + "step": 3750 + }, + { + "epoch": 2.588580300155253, + "grad_norm": 0.37787145376205444, + "learning_rate": 1.7146144994246261e-06, + "logits/chosen": 3.3936216831207275, + "logits/rejected": 3.5681161880493164, + "logps/chosen": -149.8321990966797, + "logps/rejected": -163.34193420410156, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.350876808166504, + "rewards/margins": 1.2617017030715942, + "rewards/rejected": -11.612577438354492, + "step": 3751 + }, + { + "epoch": 2.589270312230464, + "grad_norm": 13.264731407165527, + "learning_rate": 1.7117376294591487e-06, + "logits/chosen": 3.477699041366577, + "logits/rejected": 3.512408494949341, + "logps/chosen": -161.53659057617188, + "logps/rejected": -155.85330200195312, + "loss": 1.0849, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.429874420166016, + "rewards/margins": -0.47566625475883484, + "rewards/rejected": -10.954208374023438, + "step": 3752 + }, + { + "epoch": 2.5899603243056752, + "grad_norm": 0.6861639618873596, + "learning_rate": 1.708860759493671e-06, + "logits/chosen": 3.3748698234558105, + "logits/rejected": 3.5658106803894043, + "logps/chosen": -159.26339721679688, + "logps/rejected": -186.068603515625, + "loss": 0.4381, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.042330741882324, + "rewards/margins": 2.71244478225708, + "rewards/rejected": -13.754776000976562, + "step": 3753 + }, + { + "epoch": 2.590650336380887, + "grad_norm": 0.3147505819797516, + "learning_rate": 1.7059838895281936e-06, + "logits/chosen": 3.261349678039551, + "logits/rejected": 3.2638134956359863, + "logps/chosen": -155.05699157714844, + "logps/rejected": -172.2489013671875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.713257789611816, + "rewards/margins": 1.6624870300292969, + "rewards/rejected": -12.37574577331543, + "step": 3754 + }, + { + "epoch": 2.591340348456098, + "grad_norm": 0.37827447056770325, + "learning_rate": 1.7031070195627157e-06, + "logits/chosen": 3.5173726081848145, + "logits/rejected": 3.5173726081848145, + "logps/chosen": -170.0132598876953, + "logps/rejected": -170.01327514648438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.217123985290527, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.217124938964844, + "step": 3755 + }, + { + "epoch": 2.592030360531309, + "grad_norm": 0.4487982988357544, + "learning_rate": 1.7002301495972385e-06, + "logits/chosen": 3.501847505569458, + "logits/rejected": 3.501847505569458, + "logps/chosen": -174.90989685058594, + "logps/rejected": -174.90989685058594, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.864959716796875, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.864959716796875, + "step": 3756 + }, + { + "epoch": 2.5927203726065207, + "grad_norm": 0.3145388066768646, + "learning_rate": 1.6973532796317607e-06, + "logits/chosen": 3.7004263401031494, + "logits/rejected": 3.7004263401031494, + "logps/chosen": -192.2542724609375, + "logps/rejected": -192.2542724609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.493307113647461, + "rewards/margins": 0.0, + "rewards/rejected": -14.493307113647461, + "step": 3757 + }, + { + "epoch": 2.593410384681732, + "grad_norm": 0.37784963846206665, + "learning_rate": 1.6944764096662832e-06, + "logits/chosen": 3.6161422729492188, + "logits/rejected": 3.6161422729492188, + "logps/chosen": -168.46444702148438, + "logps/rejected": -168.46444702148438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.953695297241211, + "rewards/margins": 0.0, + "rewards/rejected": -11.953695297241211, + "step": 3758 + }, + { + "epoch": 2.5941003967569434, + "grad_norm": 3.7860050201416016, + "learning_rate": 1.6915995397008056e-06, + "logits/chosen": 3.5451951026916504, + "logits/rejected": 3.5680370330810547, + "logps/chosen": -162.83145141601562, + "logps/rejected": -174.02444458007812, + "loss": 0.5526, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.668038368225098, + "rewards/margins": 1.0980496406555176, + "rewards/rejected": -12.766088485717773, + "step": 3759 + }, + { + "epoch": 2.5947904088321545, + "grad_norm": 0.3800159990787506, + "learning_rate": 1.6887226697353281e-06, + "logits/chosen": 3.154304265975952, + "logits/rejected": 3.154304265975952, + "logps/chosen": -161.68711853027344, + "logps/rejected": -161.68711853027344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.334264755249023, + "rewards/margins": 0.0, + "rewards/rejected": -11.334264755249023, + "step": 3760 + }, + { + "epoch": 2.595480420907366, + "grad_norm": 0.6607725620269775, + "learning_rate": 1.6858457997698505e-06, + "logits/chosen": 3.7282447814941406, + "logits/rejected": 3.6943161487579346, + "logps/chosen": -173.79425048828125, + "logps/rejected": -178.3437042236328, + "loss": 0.6091, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.558637619018555, + "rewards/margins": 0.4841940999031067, + "rewards/rejected": -13.042831420898438, + "step": 3761 + }, + { + "epoch": 2.596170432982577, + "grad_norm": 0.39641591906547546, + "learning_rate": 1.682968929804373e-06, + "logits/chosen": 3.3767802715301514, + "logits/rejected": 3.3767802715301514, + "logps/chosen": -164.86251831054688, + "logps/rejected": -164.86251831054688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.682878494262695, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -11.682878494262695, + "step": 3762 + }, + { + "epoch": 2.5968604450577883, + "grad_norm": 0.35207000374794006, + "learning_rate": 1.6800920598388954e-06, + "logits/chosen": 3.193988800048828, + "logits/rejected": 3.193988800048828, + "logps/chosen": -152.39407348632812, + "logps/rejected": -152.39407348632812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.514201164245605, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -10.514202117919922, + "step": 3763 + }, + { + "epoch": 2.597550457133, + "grad_norm": 0.3801426291465759, + "learning_rate": 1.677215189873418e-06, + "logits/chosen": 3.333324909210205, + "logits/rejected": 3.4711415767669678, + "logps/chosen": -166.1175537109375, + "logps/rejected": -177.5266876220703, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.852804183959961, + "rewards/margins": 1.1289738416671753, + "rewards/rejected": -12.981779098510742, + "step": 3764 + }, + { + "epoch": 2.598240469208211, + "grad_norm": 0.2922021448612213, + "learning_rate": 1.6743383199079401e-06, + "logits/chosen": 3.020190954208374, + "logits/rejected": 3.2973146438598633, + "logps/chosen": -162.58212280273438, + "logps/rejected": -193.7962188720703, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.609891891479492, + "rewards/margins": 3.1138288974761963, + "rewards/rejected": -14.723722457885742, + "step": 3765 + }, + { + "epoch": 2.598930481283422, + "grad_norm": 0.4021306037902832, + "learning_rate": 1.6714614499424629e-06, + "logits/chosen": 3.5642244815826416, + "logits/rejected": 3.5642244815826416, + "logps/chosen": -176.91287231445312, + "logps/rejected": -176.91287231445312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.90713882446289, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.90713882446289, + "step": 3766 + }, + { + "epoch": 2.5996204933586338, + "grad_norm": 0.38710564374923706, + "learning_rate": 1.668584579976985e-06, + "logits/chosen": 3.6295478343963623, + "logits/rejected": 3.6295478343963623, + "logps/chosen": -175.11973571777344, + "logps/rejected": -175.11973571777344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.732887268066406, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.732887268066406, + "step": 3767 + }, + { + "epoch": 2.6003105054338453, + "grad_norm": 0.33239293098449707, + "learning_rate": 1.6657077100115076e-06, + "logits/chosen": 3.5842833518981934, + "logits/rejected": 3.5842833518981934, + "logps/chosen": -180.7200164794922, + "logps/rejected": -180.7200164794922, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.377094268798828, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.377094268798828, + "step": 3768 + }, + { + "epoch": 2.6010005175090565, + "grad_norm": 0.36947575211524963, + "learning_rate": 1.66283084004603e-06, + "logits/chosen": 2.8990650177001953, + "logits/rejected": 3.120173454284668, + "logps/chosen": -149.3485107421875, + "logps/rejected": -167.5885772705078, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.045023918151855, + "rewards/margins": 1.937254548072815, + "rewards/rejected": -11.982278823852539, + "step": 3769 + }, + { + "epoch": 2.6016905295842676, + "grad_norm": 0.278579980134964, + "learning_rate": 1.6599539700805525e-06, + "logits/chosen": 3.773338794708252, + "logits/rejected": 3.8744778633117676, + "logps/chosen": -159.69554138183594, + "logps/rejected": -172.51397705078125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.081273078918457, + "rewards/margins": 1.2672271728515625, + "rewards/rejected": -12.34850025177002, + "step": 3770 + }, + { + "epoch": 2.602380541659479, + "grad_norm": 0.2792533338069916, + "learning_rate": 1.6570771001150749e-06, + "logits/chosen": 3.4778430461883545, + "logits/rejected": 3.5720884799957275, + "logps/chosen": -171.61087036132812, + "logps/rejected": -195.13552856445312, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.310433387756348, + "rewards/margins": 2.34195613861084, + "rewards/rejected": -14.652388572692871, + "step": 3771 + }, + { + "epoch": 2.6030705537346903, + "grad_norm": 0.7490223050117493, + "learning_rate": 1.6542002301495974e-06, + "logits/chosen": 3.4433398246765137, + "logits/rejected": 3.4433398246765137, + "logps/chosen": -165.63890075683594, + "logps/rejected": -165.63890075683594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.845269203186035, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.845268249511719, + "step": 3772 + }, + { + "epoch": 2.6037605658099015, + "grad_norm": 0.4256206452846527, + "learning_rate": 1.6513233601841198e-06, + "logits/chosen": 3.7077791690826416, + "logits/rejected": 3.7077791690826416, + "logps/chosen": -193.00491333007812, + "logps/rejected": -193.00491333007812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.736417770385742, + "rewards/margins": 0.0, + "rewards/rejected": -14.736417770385742, + "step": 3773 + }, + { + "epoch": 2.604450577885113, + "grad_norm": 0.36814233660697937, + "learning_rate": 1.6484464902186423e-06, + "logits/chosen": 3.6302032470703125, + "logits/rejected": 3.6302032470703125, + "logps/chosen": -184.71878051757812, + "logps/rejected": -184.71878051757812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.828023910522461, + "rewards/margins": 0.0, + "rewards/rejected": -13.828023910522461, + "step": 3774 + }, + { + "epoch": 2.605140589960324, + "grad_norm": 0.37264329195022583, + "learning_rate": 1.6455696202531647e-06, + "logits/chosen": 3.154263734817505, + "logits/rejected": 3.2451229095458984, + "logps/chosen": -174.99993896484375, + "logps/rejected": -189.26284790039062, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.759397506713867, + "rewards/margins": 1.4519646167755127, + "rewards/rejected": -14.211360931396484, + "step": 3775 + }, + { + "epoch": 2.6058306020355357, + "grad_norm": 10.535778045654297, + "learning_rate": 1.6426927502876872e-06, + "logits/chosen": 3.2408688068389893, + "logits/rejected": 3.187288999557495, + "logps/chosen": -142.9632110595703, + "logps/rejected": -157.73602294921875, + "loss": 0.5914, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.605844497680664, + "rewards/margins": 1.4627723693847656, + "rewards/rejected": -11.06861686706543, + "step": 3776 + }, + { + "epoch": 2.606520614110747, + "grad_norm": 0.3504539132118225, + "learning_rate": 1.6398158803222094e-06, + "logits/chosen": 3.4339027404785156, + "logits/rejected": 3.473160743713379, + "logps/chosen": -180.90451049804688, + "logps/rejected": -189.59564208984375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.237482070922852, + "rewards/margins": 0.8690577149391174, + "rewards/rejected": -14.106539726257324, + "step": 3777 + }, + { + "epoch": 2.6072106261859584, + "grad_norm": 0.391558974981308, + "learning_rate": 1.636939010356732e-06, + "logits/chosen": 3.716718912124634, + "logits/rejected": 3.716718912124634, + "logps/chosen": -186.7213134765625, + "logps/rejected": -186.72129821777344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.906469345092773, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.906469345092773, + "step": 3778 + }, + { + "epoch": 2.6079006382611696, + "grad_norm": 0.4361339509487152, + "learning_rate": 1.6340621403912543e-06, + "logits/chosen": 3.328221559524536, + "logits/rejected": 3.4016897678375244, + "logps/chosen": -169.30819702148438, + "logps/rejected": -176.09640502929688, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.014628410339355, + "rewards/margins": 0.7558932900428772, + "rewards/rejected": -12.770522117614746, + "step": 3779 + }, + { + "epoch": 2.6085906503363807, + "grad_norm": 2.465116262435913, + "learning_rate": 1.6311852704257769e-06, + "logits/chosen": 3.4138970375061035, + "logits/rejected": 3.4780211448669434, + "logps/chosen": -166.19921875, + "logps/rejected": -181.65414428710938, + "loss": 0.5283, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.957071304321289, + "rewards/margins": 1.5264774560928345, + "rewards/rejected": -13.483548164367676, + "step": 3780 + }, + { + "epoch": 2.6092806624115923, + "grad_norm": 0.3632013499736786, + "learning_rate": 1.6283084004602992e-06, + "logits/chosen": 3.148669958114624, + "logits/rejected": 3.148669958114624, + "logps/chosen": -170.31228637695312, + "logps/rejected": -170.31227111816406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.273167610168457, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -12.273167610168457, + "step": 3781 + }, + { + "epoch": 2.6099706744868034, + "grad_norm": 0.43475112318992615, + "learning_rate": 1.6254315304948218e-06, + "logits/chosen": 3.098048210144043, + "logits/rejected": 2.9920547008514404, + "logps/chosen": -159.524658203125, + "logps/rejected": -167.51507568359375, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.166245460510254, + "rewards/margins": 0.7744254469871521, + "rewards/rejected": -11.940671920776367, + "step": 3782 + }, + { + "epoch": 2.610660686562015, + "grad_norm": 0.3517889380455017, + "learning_rate": 1.6225546605293441e-06, + "logits/chosen": 3.8142030239105225, + "logits/rejected": 3.8142030239105225, + "logps/chosen": -179.76962280273438, + "logps/rejected": -179.76962280273438, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.150575637817383, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.150575637817383, + "step": 3783 + }, + { + "epoch": 2.611350698637226, + "grad_norm": 0.2682652771472931, + "learning_rate": 1.6196777905638667e-06, + "logits/chosen": 3.010171413421631, + "logits/rejected": 2.9742002487182617, + "logps/chosen": -156.2632598876953, + "logps/rejected": -169.7837677001953, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.845820426940918, + "rewards/margins": 1.3645803928375244, + "rewards/rejected": -12.21040153503418, + "step": 3784 + }, + { + "epoch": 2.6120407107124377, + "grad_norm": 0.35293862223625183, + "learning_rate": 1.6168009205983893e-06, + "logits/chosen": 3.1036882400512695, + "logits/rejected": 3.1721315383911133, + "logps/chosen": -156.63470458984375, + "logps/rejected": -183.70875549316406, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.92636775970459, + "rewards/margins": 2.6459059715270996, + "rewards/rejected": -13.572274208068848, + "step": 3785 + }, + { + "epoch": 2.612730722787649, + "grad_norm": 0.371233195066452, + "learning_rate": 1.6139240506329116e-06, + "logits/chosen": 3.0268898010253906, + "logits/rejected": 3.3196821212768555, + "logps/chosen": -151.52938842773438, + "logps/rejected": -175.61471557617188, + "loss": 0.4338, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.418298721313477, + "rewards/margins": 2.4983787536621094, + "rewards/rejected": -12.916678428649902, + "step": 3786 + }, + { + "epoch": 2.61342073486286, + "grad_norm": 1.5206886529922485, + "learning_rate": 1.6110471806674342e-06, + "logits/chosen": 3.3704614639282227, + "logits/rejected": 3.4823498725891113, + "logps/chosen": -166.46087646484375, + "logps/rejected": -178.02444458007812, + "loss": 0.5281, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.748025894165039, + "rewards/margins": 1.1591376066207886, + "rewards/rejected": -12.907163619995117, + "step": 3787 + }, + { + "epoch": 2.6141107469380716, + "grad_norm": 0.33492511510849, + "learning_rate": 1.6081703107019563e-06, + "logits/chosen": 3.5400776863098145, + "logits/rejected": 3.5400776863098145, + "logps/chosen": -163.3721923828125, + "logps/rejected": -163.3721923828125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.598752975463867, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.598752975463867, + "step": 3788 + }, + { + "epoch": 2.6148007590132827, + "grad_norm": 0.6112927198410034, + "learning_rate": 1.6052934407364789e-06, + "logits/chosen": 3.290543556213379, + "logits/rejected": 3.4356815814971924, + "logps/chosen": -145.155029296875, + "logps/rejected": -154.9691162109375, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.624368667602539, + "rewards/margins": 1.0065274238586426, + "rewards/rejected": -10.630895614624023, + "step": 3789 + }, + { + "epoch": 2.615490771088494, + "grad_norm": 0.37072432041168213, + "learning_rate": 1.6024165707710012e-06, + "logits/chosen": 3.149981737136841, + "logits/rejected": 3.1561648845672607, + "logps/chosen": -178.56854248046875, + "logps/rejected": -187.089111328125, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.126938819885254, + "rewards/margins": 0.9180395603179932, + "rewards/rejected": -14.044978141784668, + "step": 3790 + }, + { + "epoch": 2.6161807831637054, + "grad_norm": 0.2628788948059082, + "learning_rate": 1.5995397008055238e-06, + "logits/chosen": 2.743602752685547, + "logits/rejected": 2.9515366554260254, + "logps/chosen": -173.59579467773438, + "logps/rejected": -195.48623657226562, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.69950008392334, + "rewards/margins": 2.2143161296844482, + "rewards/rejected": -14.91381549835205, + "step": 3791 + }, + { + "epoch": 2.6168707952389165, + "grad_norm": 24.521642684936523, + "learning_rate": 1.5966628308400461e-06, + "logits/chosen": 3.3870677947998047, + "logits/rejected": 3.355790615081787, + "logps/chosen": -178.4652557373047, + "logps/rejected": -174.30845642089844, + "loss": 1.0507, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.081535339355469, + "rewards/margins": -0.4405708909034729, + "rewards/rejected": -12.640965461730957, + "step": 3792 + }, + { + "epoch": 2.617560807314128, + "grad_norm": 0.3877668082714081, + "learning_rate": 1.5937859608745687e-06, + "logits/chosen": 3.476236343383789, + "logits/rejected": 3.476236343383789, + "logps/chosen": -184.8900604248047, + "logps/rejected": -184.8900604248047, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.49238395690918, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -13.492384910583496, + "step": 3793 + }, + { + "epoch": 2.6182508193893392, + "grad_norm": 21.86929702758789, + "learning_rate": 1.590909090909091e-06, + "logits/chosen": 3.197626829147339, + "logits/rejected": 3.218024730682373, + "logps/chosen": -158.98419189453125, + "logps/rejected": -158.0897216796875, + "loss": 0.7556, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.095057487487793, + "rewards/margins": -0.10397160053253174, + "rewards/rejected": -10.99108600616455, + "step": 3794 + }, + { + "epoch": 2.618940831464551, + "grad_norm": 0.4368889629840851, + "learning_rate": 1.5880322209436136e-06, + "logits/chosen": 3.3699581623077393, + "logits/rejected": 3.5170278549194336, + "logps/chosen": -172.09059143066406, + "logps/rejected": -180.54957580566406, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.366620063781738, + "rewards/margins": 0.8146978616714478, + "rewards/rejected": -13.181318283081055, + "step": 3795 + }, + { + "epoch": 2.619630843539762, + "grad_norm": 15.28219985961914, + "learning_rate": 1.585155350978136e-06, + "logits/chosen": 2.895395278930664, + "logits/rejected": 2.8443963527679443, + "logps/chosen": -162.938232421875, + "logps/rejected": -162.3812713623047, + "loss": 0.7434, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.596623420715332, + "rewards/margins": -0.0859639048576355, + "rewards/rejected": -11.510659217834473, + "step": 3796 + }, + { + "epoch": 2.620320855614973, + "grad_norm": 0.4450856149196625, + "learning_rate": 1.5822784810126585e-06, + "logits/chosen": 3.2772610187530518, + "logits/rejected": 3.4036433696746826, + "logps/chosen": -152.63021850585938, + "logps/rejected": -167.1178741455078, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.40872859954834, + "rewards/margins": 1.357344150543213, + "rewards/rejected": -11.766071319580078, + "step": 3797 + }, + { + "epoch": 2.6210108676901847, + "grad_norm": 0.38828691840171814, + "learning_rate": 1.5794016110471807e-06, + "logits/chosen": 3.3310117721557617, + "logits/rejected": 3.430561065673828, + "logps/chosen": -157.24075317382812, + "logps/rejected": -167.99539184570312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.844620704650879, + "rewards/margins": 1.105225920677185, + "rewards/rejected": -11.949846267700195, + "step": 3798 + }, + { + "epoch": 2.621700879765396, + "grad_norm": 0.31730562448501587, + "learning_rate": 1.5765247410817032e-06, + "logits/chosen": 3.3375747203826904, + "logits/rejected": 3.491323471069336, + "logps/chosen": -160.20777893066406, + "logps/rejected": -183.74380493164062, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.222188949584961, + "rewards/margins": 2.3453028202056885, + "rewards/rejected": -13.56749153137207, + "step": 3799 + }, + { + "epoch": 2.6223908918406074, + "grad_norm": 0.4041832387447357, + "learning_rate": 1.5736478711162256e-06, + "logits/chosen": 3.1618900299072266, + "logits/rejected": 3.3330376148223877, + "logps/chosen": -140.55442810058594, + "logps/rejected": -154.87501525878906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.354228973388672, + "rewards/margins": 1.4349050521850586, + "rewards/rejected": -10.78913402557373, + "step": 3800 + }, + { + "epoch": 2.6230809039158185, + "grad_norm": 0.6896607875823975, + "learning_rate": 1.5707710011507482e-06, + "logits/chosen": 2.910890817642212, + "logits/rejected": 3.1485822200775146, + "logps/chosen": -166.6427459716797, + "logps/rejected": -179.86265563964844, + "loss": 0.5219, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.797595977783203, + "rewards/margins": 1.3291579484939575, + "rewards/rejected": -13.126754760742188, + "step": 3801 + }, + { + "epoch": 2.62377091599103, + "grad_norm": 1.055799961090088, + "learning_rate": 1.5678941311852705e-06, + "logits/chosen": 3.3168022632598877, + "logits/rejected": 3.3692173957824707, + "logps/chosen": -155.77786254882812, + "logps/rejected": -170.5924530029297, + "loss": 0.5231, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.722232818603516, + "rewards/margins": 1.4996082782745361, + "rewards/rejected": -12.221841812133789, + "step": 3802 + }, + { + "epoch": 2.624460928066241, + "grad_norm": 0.36995038390159607, + "learning_rate": 1.565017261219793e-06, + "logits/chosen": 3.587383508682251, + "logits/rejected": 3.587383508682251, + "logps/chosen": -176.0194854736328, + "logps/rejected": -176.01950073242188, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.677834510803223, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.677835464477539, + "step": 3803 + }, + { + "epoch": 2.6251509401414523, + "grad_norm": 0.5266299843788147, + "learning_rate": 1.5621403912543154e-06, + "logits/chosen": 3.3847434520721436, + "logits/rejected": 3.3847434520721436, + "logps/chosen": -177.35110473632812, + "logps/rejected": -177.35110473632812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.858424186706543, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.858424186706543, + "step": 3804 + }, + { + "epoch": 2.625840952216664, + "grad_norm": 0.3256146013736725, + "learning_rate": 1.559263521288838e-06, + "logits/chosen": 3.509406328201294, + "logits/rejected": 3.648050546646118, + "logps/chosen": -169.6291961669922, + "logps/rejected": -178.17242431640625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.130020141601562, + "rewards/margins": 0.8897292017936707, + "rewards/rejected": -13.019749641418457, + "step": 3805 + }, + { + "epoch": 2.626530964291875, + "grad_norm": 0.4967210292816162, + "learning_rate": 1.5563866513233603e-06, + "logits/chosen": 3.158409357070923, + "logits/rejected": 3.2282609939575195, + "logps/chosen": -153.60353088378906, + "logps/rejected": -159.36598205566406, + "loss": 0.608, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.71722412109375, + "rewards/margins": 0.5552454590797424, + "rewards/rejected": -11.272470474243164, + "step": 3806 + }, + { + "epoch": 2.627220976367086, + "grad_norm": 0.3755486309528351, + "learning_rate": 1.553509781357883e-06, + "logits/chosen": 3.1516268253326416, + "logits/rejected": 3.2995808124542236, + "logps/chosen": -155.74154663085938, + "logps/rejected": -165.55136108398438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.592687606811523, + "rewards/margins": 1.0245667695999146, + "rewards/rejected": -11.617254257202148, + "step": 3807 + }, + { + "epoch": 2.6279109884422978, + "grad_norm": 0.3548702895641327, + "learning_rate": 1.550632911392405e-06, + "logits/chosen": 3.3339970111846924, + "logits/rejected": 3.344949960708618, + "logps/chosen": -159.03005981445312, + "logps/rejected": -170.94068908691406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.187690734863281, + "rewards/margins": 1.1382076740264893, + "rewards/rejected": -12.325899124145508, + "step": 3808 + }, + { + "epoch": 2.628601000517509, + "grad_norm": 0.8286681771278381, + "learning_rate": 1.5477560414269276e-06, + "logits/chosen": 3.185547351837158, + "logits/rejected": 3.2336127758026123, + "logps/chosen": -157.54052734375, + "logps/rejected": -161.029296875, + "loss": 0.611, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.870944023132324, + "rewards/margins": 0.41207289695739746, + "rewards/rejected": -11.283016204833984, + "step": 3809 + }, + { + "epoch": 2.6292910125927205, + "grad_norm": 0.3242150545120239, + "learning_rate": 1.54487917146145e-06, + "logits/chosen": 3.1544439792633057, + "logits/rejected": 3.1544439792633057, + "logps/chosen": -164.5758819580078, + "logps/rejected": -164.57589721679688, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.647470474243164, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -11.647470474243164, + "step": 3810 + }, + { + "epoch": 2.6299810246679316, + "grad_norm": 0.34858518838882446, + "learning_rate": 1.5420023014959725e-06, + "logits/chosen": 3.341113805770874, + "logits/rejected": 3.412921905517578, + "logps/chosen": -174.48712158203125, + "logps/rejected": -184.3818817138672, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.577942848205566, + "rewards/margins": 1.0200467109680176, + "rewards/rejected": -13.597990036010742, + "step": 3811 + }, + { + "epoch": 2.630671036743143, + "grad_norm": 0.42254704236984253, + "learning_rate": 1.5391254315304949e-06, + "logits/chosen": 3.452493190765381, + "logits/rejected": 3.452493190765381, + "logps/chosen": -161.20745849609375, + "logps/rejected": -161.20745849609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.302070617675781, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -11.302070617675781, + "step": 3812 + }, + { + "epoch": 2.6313610488183543, + "grad_norm": 0.33922284841537476, + "learning_rate": 1.5362485615650174e-06, + "logits/chosen": 3.1197421550750732, + "logits/rejected": 3.17655348777771, + "logps/chosen": -143.79054260253906, + "logps/rejected": -155.3454132080078, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.578730583190918, + "rewards/margins": 1.198601245880127, + "rewards/rejected": -10.777332305908203, + "step": 3813 + }, + { + "epoch": 2.6320510608935654, + "grad_norm": 0.31207695603370667, + "learning_rate": 1.5333716915995398e-06, + "logits/chosen": 3.6453514099121094, + "logits/rejected": 3.6453514099121094, + "logps/chosen": -183.6467742919922, + "logps/rejected": -183.6467742919922, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.519464492797852, + "rewards/margins": 0.0, + "rewards/rejected": -13.519464492797852, + "step": 3814 + }, + { + "epoch": 2.632741072968777, + "grad_norm": 0.5456629395484924, + "learning_rate": 1.5304948216340623e-06, + "logits/chosen": 3.165621280670166, + "logits/rejected": 3.212700843811035, + "logps/chosen": -138.1647491455078, + "logps/rejected": -145.6026611328125, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.177160263061523, + "rewards/margins": 0.7076303958892822, + "rewards/rejected": -9.884790420532227, + "step": 3815 + }, + { + "epoch": 2.633431085043988, + "grad_norm": 0.4253145754337311, + "learning_rate": 1.5276179516685847e-06, + "logits/chosen": 3.0387027263641357, + "logits/rejected": 3.0420000553131104, + "logps/chosen": -170.33920288085938, + "logps/rejected": -178.6376495361328, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.23218822479248, + "rewards/margins": 0.7835451364517212, + "rewards/rejected": -13.01573371887207, + "step": 3816 + }, + { + "epoch": 2.6341210971191997, + "grad_norm": 0.29075291752815247, + "learning_rate": 1.5247410817031073e-06, + "logits/chosen": 3.2783610820770264, + "logits/rejected": 3.411459445953369, + "logps/chosen": -161.68145751953125, + "logps/rejected": -173.2527618408203, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.264095306396484, + "rewards/margins": 1.1533373594284058, + "rewards/rejected": -12.417430877685547, + "step": 3817 + }, + { + "epoch": 2.634811109194411, + "grad_norm": 0.35292044281959534, + "learning_rate": 1.5218642117376294e-06, + "logits/chosen": 3.632455825805664, + "logits/rejected": 3.632455825805664, + "logps/chosen": -171.93124389648438, + "logps/rejected": -171.93124389648438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.410881042480469, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.410881042480469, + "step": 3818 + }, + { + "epoch": 2.6355011212696224, + "grad_norm": 0.38155925273895264, + "learning_rate": 1.518987341772152e-06, + "logits/chosen": 3.317382335662842, + "logits/rejected": 3.4824604988098145, + "logps/chosen": -183.47988891601562, + "logps/rejected": -193.41317749023438, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.559761047363281, + "rewards/margins": 0.965859591960907, + "rewards/rejected": -14.52562141418457, + "step": 3819 + }, + { + "epoch": 2.6361911333448336, + "grad_norm": 0.42066818475723267, + "learning_rate": 1.5161104718066743e-06, + "logits/chosen": 3.4022350311279297, + "logits/rejected": 3.4022350311279297, + "logps/chosen": -158.63162231445312, + "logps/rejected": -158.63162231445312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.928916931152344, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -10.928916931152344, + "step": 3820 + }, + { + "epoch": 2.6368811454200447, + "grad_norm": 0.2926231622695923, + "learning_rate": 1.5132336018411969e-06, + "logits/chosen": 3.3756489753723145, + "logits/rejected": 3.3998475074768066, + "logps/chosen": -163.96519470214844, + "logps/rejected": -178.34152221679688, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.47723388671875, + "rewards/margins": 1.4626803398132324, + "rewards/rejected": -12.93991470336914, + "step": 3821 + }, + { + "epoch": 2.6375711574952563, + "grad_norm": 1.6324595212936401, + "learning_rate": 1.5103567318757192e-06, + "logits/chosen": 3.154188871383667, + "logits/rejected": 3.3356354236602783, + "logps/chosen": -143.04469299316406, + "logps/rejected": -154.13365173339844, + "loss": 0.5278, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.592329025268555, + "rewards/margins": 1.1419681310653687, + "rewards/rejected": -10.734297752380371, + "step": 3822 + }, + { + "epoch": 2.6382611695704674, + "grad_norm": 0.44418880343437195, + "learning_rate": 1.5074798619102418e-06, + "logits/chosen": 3.239469051361084, + "logits/rejected": 3.3154611587524414, + "logps/chosen": -160.36329650878906, + "logps/rejected": -168.53692626953125, + "loss": 0.6067, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.263398170471191, + "rewards/margins": 0.8012238144874573, + "rewards/rejected": -12.064621925354004, + "step": 3823 + }, + { + "epoch": 2.6389511816456785, + "grad_norm": 0.35244449973106384, + "learning_rate": 1.5046029919447641e-06, + "logits/chosen": 2.6974148750305176, + "logits/rejected": 2.6974148750305176, + "logps/chosen": -157.55389404296875, + "logps/rejected": -157.55389404296875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.081247329711914, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -11.081247329711914, + "step": 3824 + }, + { + "epoch": 2.63964119372089, + "grad_norm": 0.46517643332481384, + "learning_rate": 1.5017261219792867e-06, + "logits/chosen": 3.3759472370147705, + "logits/rejected": 3.3759472370147705, + "logps/chosen": -161.79534912109375, + "logps/rejected": -161.79534912109375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.368518829345703, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -11.368518829345703, + "step": 3825 + }, + { + "epoch": 2.6403312057961017, + "grad_norm": 0.42281144857406616, + "learning_rate": 1.498849252013809e-06, + "logits/chosen": 3.458205223083496, + "logits/rejected": 3.614476203918457, + "logps/chosen": -151.98385620117188, + "logps/rejected": -169.96417236328125, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.486281394958496, + "rewards/margins": 1.7796739339828491, + "rewards/rejected": -12.265955924987793, + "step": 3826 + }, + { + "epoch": 2.641021217871313, + "grad_norm": 0.4256799519062042, + "learning_rate": 1.4959723820483316e-06, + "logits/chosen": 3.221186399459839, + "logits/rejected": 3.375265121459961, + "logps/chosen": -135.9557342529297, + "logps/rejected": -166.76470947265625, + "loss": 0.4346, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.909980773925781, + "rewards/margins": 2.982271909713745, + "rewards/rejected": -11.892252922058105, + "step": 3827 + }, + { + "epoch": 2.641711229946524, + "grad_norm": 0.3874044716358185, + "learning_rate": 1.4930955120828538e-06, + "logits/chosen": 3.361558675765991, + "logits/rejected": 3.5477166175842285, + "logps/chosen": -147.35772705078125, + "logps/rejected": -173.94456481933594, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.172063827514648, + "rewards/margins": 2.6337499618530273, + "rewards/rejected": -12.805813789367676, + "step": 3828 + }, + { + "epoch": 2.6424012420217355, + "grad_norm": 24.290950775146484, + "learning_rate": 1.4902186421173763e-06, + "logits/chosen": 3.1884469985961914, + "logits/rejected": 3.0599405765533447, + "logps/chosen": -160.00857543945312, + "logps/rejected": -169.37010192871094, + "loss": 1.1848, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.134763717651367, + "rewards/margins": 0.9200911521911621, + "rewards/rejected": -12.054855346679688, + "step": 3829 + }, + { + "epoch": 2.6430912540969467, + "grad_norm": 0.45206859707832336, + "learning_rate": 1.4873417721518987e-06, + "logits/chosen": 3.191375732421875, + "logits/rejected": 3.200197219848633, + "logps/chosen": -159.4685821533203, + "logps/rejected": -173.41224670410156, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.253690719604492, + "rewards/margins": 1.4072481393814087, + "rewards/rejected": -12.66093921661377, + "step": 3830 + }, + { + "epoch": 2.643781266172158, + "grad_norm": 0.357023686170578, + "learning_rate": 1.4844649021864212e-06, + "logits/chosen": 3.182978868484497, + "logits/rejected": 3.644514560699463, + "logps/chosen": -150.27334594726562, + "logps/rejected": -185.06240844726562, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.33137321472168, + "rewards/margins": 3.517373561859131, + "rewards/rejected": -13.848746299743652, + "step": 3831 + }, + { + "epoch": 2.6444712782473694, + "grad_norm": 0.6371244788169861, + "learning_rate": 1.4815880322209436e-06, + "logits/chosen": 3.1294195652008057, + "logits/rejected": 3.1294195652008057, + "logps/chosen": -168.15469360351562, + "logps/rejected": -168.15469360351562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.000505447387695, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.000505447387695, + "step": 3832 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 0.4783386290073395, + "learning_rate": 1.4787111622554662e-06, + "logits/chosen": 3.35475754737854, + "logits/rejected": 3.38571834564209, + "logps/chosen": -166.30276489257812, + "logps/rejected": -177.90240478515625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.934526443481445, + "rewards/margins": 1.1768062114715576, + "rewards/rejected": -13.111333847045898, + "step": 3833 + }, + { + "epoch": 2.645851302397792, + "grad_norm": 2.78977370262146, + "learning_rate": 1.4758342922899885e-06, + "logits/chosen": 3.1999053955078125, + "logits/rejected": 3.2774181365966797, + "logps/chosen": -142.80206298828125, + "logps/rejected": -153.16978454589844, + "loss": 0.5364, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.596508026123047, + "rewards/margins": 1.0728172063827515, + "rewards/rejected": -10.669326782226562, + "step": 3834 + }, + { + "epoch": 2.6465413144730032, + "grad_norm": 0.5313112139701843, + "learning_rate": 1.472957422324511e-06, + "logits/chosen": 3.172048568725586, + "logits/rejected": 3.172048568725586, + "logps/chosen": -173.3069610595703, + "logps/rejected": -173.3069610595703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.528550148010254, + "rewards/margins": 0.0, + "rewards/rejected": -12.528550148010254, + "step": 3835 + }, + { + "epoch": 2.647231326548215, + "grad_norm": 0.4186846613883972, + "learning_rate": 1.4700805523590336e-06, + "logits/chosen": 3.0598809719085693, + "logits/rejected": 3.2567787170410156, + "logps/chosen": -150.4049072265625, + "logps/rejected": -159.687744140625, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.233839988708496, + "rewards/margins": 0.8972445726394653, + "rewards/rejected": -11.131084442138672, + "step": 3836 + }, + { + "epoch": 2.647921338623426, + "grad_norm": 0.3359779119491577, + "learning_rate": 1.467203682393556e-06, + "logits/chosen": 3.1530966758728027, + "logits/rejected": 3.1530966758728027, + "logps/chosen": -180.8491973876953, + "logps/rejected": -180.8491973876953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.281349182128906, + "rewards/margins": 0.0, + "rewards/rejected": -13.281349182128906, + "step": 3837 + }, + { + "epoch": 2.648611350698637, + "grad_norm": 0.34725579619407654, + "learning_rate": 1.4643268124280786e-06, + "logits/chosen": 3.4644081592559814, + "logits/rejected": 3.5072224140167236, + "logps/chosen": -165.2187042236328, + "logps/rejected": -178.64849853515625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.882283210754395, + "rewards/margins": 1.3490350246429443, + "rewards/rejected": -13.231317520141602, + "step": 3838 + }, + { + "epoch": 2.6493013627738486, + "grad_norm": 0.40996789932250977, + "learning_rate": 1.461449942462601e-06, + "logits/chosen": 3.026620388031006, + "logits/rejected": 3.1944756507873535, + "logps/chosen": -137.34719848632812, + "logps/rejected": -159.1427764892578, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.139942169189453, + "rewards/margins": 2.1585209369659424, + "rewards/rejected": -11.2984619140625, + "step": 3839 + }, + { + "epoch": 2.6499913748490598, + "grad_norm": 0.74229896068573, + "learning_rate": 1.4585730724971235e-06, + "logits/chosen": 3.354193925857544, + "logits/rejected": 3.3215348720550537, + "logps/chosen": -159.22427368164062, + "logps/rejected": -181.96734619140625, + "loss": 0.4383, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.926860809326172, + "rewards/margins": 2.388059377670288, + "rewards/rejected": -13.314920425415039, + "step": 3840 + }, + { + "epoch": 2.650681386924271, + "grad_norm": 0.33764517307281494, + "learning_rate": 1.4556962025316456e-06, + "logits/chosen": 3.0946364402770996, + "logits/rejected": 3.284183979034424, + "logps/chosen": -158.5812225341797, + "logps/rejected": -168.51681518554688, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.111530303955078, + "rewards/margins": 0.9694182276725769, + "rewards/rejected": -12.080947875976562, + "step": 3841 + }, + { + "epoch": 2.6513713989994825, + "grad_norm": 0.40507447719573975, + "learning_rate": 1.4528193325661682e-06, + "logits/chosen": 2.8638157844543457, + "logits/rejected": 3.1531596183776855, + "logps/chosen": -140.9998321533203, + "logps/rejected": -162.16383361816406, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.299571990966797, + "rewards/margins": 2.1275808811187744, + "rewards/rejected": -11.427152633666992, + "step": 3842 + }, + { + "epoch": 2.652061411074694, + "grad_norm": 0.3467468321323395, + "learning_rate": 1.4499424626006905e-06, + "logits/chosen": 3.2455861568450928, + "logits/rejected": 3.2896931171417236, + "logps/chosen": -166.75462341308594, + "logps/rejected": -174.3582763671875, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.8794527053833, + "rewards/margins": 0.7927567958831787, + "rewards/rejected": -12.672209739685059, + "step": 3843 + }, + { + "epoch": 2.652751423149905, + "grad_norm": 0.5602220892906189, + "learning_rate": 1.447065592635213e-06, + "logits/chosen": 2.8502111434936523, + "logits/rejected": 2.8502111434936523, + "logps/chosen": -168.47987365722656, + "logps/rejected": -168.47987365722656, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.097461700439453, + "rewards/margins": 0.0, + "rewards/rejected": -12.097461700439453, + "step": 3844 + }, + { + "epoch": 2.6534414352251163, + "grad_norm": 1.2875672578811646, + "learning_rate": 1.4441887226697354e-06, + "logits/chosen": 2.9633047580718994, + "logits/rejected": 3.020402193069458, + "logps/chosen": -157.77500915527344, + "logps/rejected": -173.72500610351562, + "loss": 0.5299, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.101237297058105, + "rewards/margins": 1.6235312223434448, + "rewards/rejected": -12.724767684936523, + "step": 3845 + }, + { + "epoch": 2.654131447300328, + "grad_norm": 5.002281188964844, + "learning_rate": 1.441311852704258e-06, + "logits/chosen": 3.0069217681884766, + "logits/rejected": 3.1513383388519287, + "logps/chosen": -145.66183471679688, + "logps/rejected": -156.26121520996094, + "loss": 0.5459, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.735572814941406, + "rewards/margins": 1.0714689493179321, + "rewards/rejected": -10.80704116821289, + "step": 3846 + }, + { + "epoch": 2.654821459375539, + "grad_norm": 0.40724870562553406, + "learning_rate": 1.4384349827387804e-06, + "logits/chosen": 3.261204719543457, + "logits/rejected": 3.261204719543457, + "logps/chosen": -153.50514221191406, + "logps/rejected": -153.50514221191406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -10.468238830566406, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -10.468238830566406, + "step": 3847 + }, + { + "epoch": 2.65551147145075, + "grad_norm": 0.31206104159355164, + "learning_rate": 1.435558112773303e-06, + "logits/chosen": 3.4121286869049072, + "logits/rejected": 3.6309866905212402, + "logps/chosen": -154.1595458984375, + "logps/rejected": -179.11883544921875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.559732437133789, + "rewards/margins": 2.4582009315490723, + "rewards/rejected": -13.017932891845703, + "step": 3848 + }, + { + "epoch": 2.6562014835259617, + "grad_norm": 6.221340656280518, + "learning_rate": 1.4326812428078253e-06, + "logits/chosen": 3.2586159706115723, + "logits/rejected": 3.384141445159912, + "logps/chosen": -150.56088256835938, + "logps/rejected": -160.44287109375, + "loss": 0.5431, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.36015510559082, + "rewards/margins": 1.0699741840362549, + "rewards/rejected": -11.430130004882812, + "step": 3849 + }, + { + "epoch": 2.656891495601173, + "grad_norm": 0.5330758094787598, + "learning_rate": 1.4298043728423478e-06, + "logits/chosen": 3.32057785987854, + "logits/rejected": 3.399592399597168, + "logps/chosen": -160.74298095703125, + "logps/rejected": -171.34365844726562, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.311487197875977, + "rewards/margins": 1.0659382343292236, + "rewards/rejected": -12.377426147460938, + "step": 3850 + }, + { + "epoch": 2.6575815076763845, + "grad_norm": 0.42121532559394836, + "learning_rate": 1.42692750287687e-06, + "logits/chosen": 3.081875801086426, + "logits/rejected": 3.1180202960968018, + "logps/chosen": -147.13845825195312, + "logps/rejected": -160.66880798339844, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.944931030273438, + "rewards/margins": 1.3682385683059692, + "rewards/rejected": -11.313169479370117, + "step": 3851 + }, + { + "epoch": 2.6582715197515956, + "grad_norm": 0.42805659770965576, + "learning_rate": 1.4240506329113925e-06, + "logits/chosen": 3.2838802337646484, + "logits/rejected": 3.2838802337646484, + "logps/chosen": -168.9343719482422, + "logps/rejected": -168.9343719482422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.262182235717773, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.262182235717773, + "step": 3852 + }, + { + "epoch": 2.658961531826807, + "grad_norm": 0.3924486041069031, + "learning_rate": 1.4211737629459149e-06, + "logits/chosen": 3.274256706237793, + "logits/rejected": 3.4503417015075684, + "logps/chosen": -161.84205627441406, + "logps/rejected": -172.3287811279297, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.473997116088867, + "rewards/margins": 1.0382964611053467, + "rewards/rejected": -12.512293815612793, + "step": 3853 + }, + { + "epoch": 2.6596515439020183, + "grad_norm": 0.4191887676715851, + "learning_rate": 1.4182968929804374e-06, + "logits/chosen": 3.3755831718444824, + "logits/rejected": 3.6228675842285156, + "logps/chosen": -163.17066955566406, + "logps/rejected": -173.4225311279297, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.447220802307129, + "rewards/margins": 1.0607585906982422, + "rewards/rejected": -12.507979393005371, + "step": 3854 + }, + { + "epoch": 2.6603415559772294, + "grad_norm": 0.306684285402298, + "learning_rate": 1.4154200230149598e-06, + "logits/chosen": 3.70927095413208, + "logits/rejected": 3.732017993927002, + "logps/chosen": -149.64138793945312, + "logps/rejected": -160.66888427734375, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.370990753173828, + "rewards/margins": 1.1244391202926636, + "rewards/rejected": -11.495429992675781, + "step": 3855 + }, + { + "epoch": 2.661031568052441, + "grad_norm": 0.37185996770858765, + "learning_rate": 1.4125431530494824e-06, + "logits/chosen": 3.120255947113037, + "logits/rejected": 3.3161938190460205, + "logps/chosen": -167.61886596679688, + "logps/rejected": -186.60208129882812, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.834602355957031, + "rewards/margins": 1.9942436218261719, + "rewards/rejected": -13.828845977783203, + "step": 3856 + }, + { + "epoch": 2.661721580127652, + "grad_norm": 0.39954236149787903, + "learning_rate": 1.4096662830840047e-06, + "logits/chosen": 3.064695358276367, + "logits/rejected": 3.147533893585205, + "logps/chosen": -134.88632202148438, + "logps/rejected": -144.34213256835938, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -8.626537322998047, + "rewards/margins": 0.9104446768760681, + "rewards/rejected": -9.536981582641602, + "step": 3857 + }, + { + "epoch": 2.6624115922028633, + "grad_norm": 0.4158859848976135, + "learning_rate": 1.4067894131185273e-06, + "logits/chosen": 3.0501933097839355, + "logits/rejected": 3.0812442302703857, + "logps/chosen": -136.6190948486328, + "logps/rejected": -158.33636474609375, + "loss": 0.4344, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.873344421386719, + "rewards/margins": 2.2355334758758545, + "rewards/rejected": -11.108879089355469, + "step": 3858 + }, + { + "epoch": 2.663101604278075, + "grad_norm": 0.3614151179790497, + "learning_rate": 1.4039125431530496e-06, + "logits/chosen": 2.9062485694885254, + "logits/rejected": 3.1355838775634766, + "logps/chosen": -155.45736694335938, + "logps/rejected": -167.19688415527344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.836945533752441, + "rewards/margins": 1.1620886325836182, + "rewards/rejected": -11.999034881591797, + "step": 3859 + }, + { + "epoch": 2.6637916163532864, + "grad_norm": 0.36780956387519836, + "learning_rate": 1.4010356731875722e-06, + "logits/chosen": 3.4380836486816406, + "logits/rejected": 3.371356725692749, + "logps/chosen": -170.0495147705078, + "logps/rejected": -178.19432067871094, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.097208023071289, + "rewards/margins": 0.7856813073158264, + "rewards/rejected": -12.882888793945312, + "step": 3860 + }, + { + "epoch": 2.6644816284284976, + "grad_norm": 0.2710409462451935, + "learning_rate": 1.3981588032220943e-06, + "logits/chosen": 3.2812819480895996, + "logits/rejected": 3.5272560119628906, + "logps/chosen": -146.1025390625, + "logps/rejected": -172.77049255371094, + "loss": 0.4338, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.978846549987793, + "rewards/margins": 2.643674612045288, + "rewards/rejected": -12.622520446777344, + "step": 3861 + }, + { + "epoch": 2.6651716405037087, + "grad_norm": 0.3287646770477295, + "learning_rate": 1.395281933256617e-06, + "logits/chosen": 3.142240285873413, + "logits/rejected": 3.2898707389831543, + "logps/chosen": -150.966796875, + "logps/rejected": -158.80545043945312, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.40111255645752, + "rewards/margins": 0.7395926117897034, + "rewards/rejected": -11.140705108642578, + "step": 3862 + }, + { + "epoch": 2.6658616525789203, + "grad_norm": 0.24585527181625366, + "learning_rate": 1.3924050632911392e-06, + "logits/chosen": 3.3358161449432373, + "logits/rejected": 3.475977897644043, + "logps/chosen": -136.53872680664062, + "logps/rejected": -182.26853942871094, + "loss": 0.4332, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.825017929077148, + "rewards/margins": 4.511743545532227, + "rewards/rejected": -13.336761474609375, + "step": 3863 + }, + { + "epoch": 2.6665516646541314, + "grad_norm": 0.30146411061286926, + "learning_rate": 1.3895281933256618e-06, + "logits/chosen": 3.4008426666259766, + "logits/rejected": 3.4557948112487793, + "logps/chosen": -160.8258056640625, + "logps/rejected": -169.9530029296875, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.259422302246094, + "rewards/margins": 0.9704421758651733, + "rewards/rejected": -12.229865074157715, + "step": 3864 + }, + { + "epoch": 2.6672416767293425, + "grad_norm": 0.343412309885025, + "learning_rate": 1.3866513233601842e-06, + "logits/chosen": 3.1893749237060547, + "logits/rejected": 3.147218704223633, + "logps/chosen": -172.33839416503906, + "logps/rejected": -184.4264373779297, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.48601245880127, + "rewards/margins": 1.2145966291427612, + "rewards/rejected": -13.700610160827637, + "step": 3865 + }, + { + "epoch": 2.667931688804554, + "grad_norm": 0.4797021448612213, + "learning_rate": 1.3837744533947067e-06, + "logits/chosen": 3.3801381587982178, + "logits/rejected": 3.4239208698272705, + "logps/chosen": -149.19317626953125, + "logps/rejected": -154.32386779785156, + "loss": 0.6087, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.169474601745605, + "rewards/margins": 0.5057472586631775, + "rewards/rejected": -10.675222396850586, + "step": 3866 + }, + { + "epoch": 2.6686217008797652, + "grad_norm": 0.9253336787223816, + "learning_rate": 1.380897583429229e-06, + "logits/chosen": 3.0303053855895996, + "logits/rejected": 3.034571647644043, + "logps/chosen": -128.42074584960938, + "logps/rejected": -155.10328674316406, + "loss": 0.438, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.938729286193848, + "rewards/margins": 2.580277442932129, + "rewards/rejected": -10.519006729125977, + "step": 3867 + }, + { + "epoch": 2.669311712954977, + "grad_norm": 0.36083948612213135, + "learning_rate": 1.3780207134637516e-06, + "logits/chosen": 3.123544454574585, + "logits/rejected": 3.4208216667175293, + "logps/chosen": -132.3797149658203, + "logps/rejected": -158.16845703125, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -8.624665260314941, + "rewards/margins": 2.6207313537597656, + "rewards/rejected": -11.245396614074707, + "step": 3868 + }, + { + "epoch": 2.670001725030188, + "grad_norm": 0.3104192316532135, + "learning_rate": 1.375143843498274e-06, + "logits/chosen": 3.4599769115448, + "logits/rejected": 3.4599769115448, + "logps/chosen": -162.2971649169922, + "logps/rejected": -162.2971649169922, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.717489242553711, + "rewards/margins": 0.0, + "rewards/rejected": -11.717489242553711, + "step": 3869 + }, + { + "epoch": 2.6706917371053995, + "grad_norm": 0.32804399728775024, + "learning_rate": 1.3722669735327966e-06, + "logits/chosen": 3.213921308517456, + "logits/rejected": 3.390021324157715, + "logps/chosen": -174.49343872070312, + "logps/rejected": -183.39981079101562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.632222175598145, + "rewards/margins": 0.9120927453041077, + "rewards/rejected": -13.54431438446045, + "step": 3870 + }, + { + "epoch": 2.6713817491806107, + "grad_norm": 1.4754774570465088, + "learning_rate": 1.3693901035673187e-06, + "logits/chosen": 3.7050557136535645, + "logits/rejected": 3.744950532913208, + "logps/chosen": -173.6848907470703, + "logps/rejected": -177.4563446044922, + "loss": 0.6122, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.598441123962402, + "rewards/margins": 0.38350051641464233, + "rewards/rejected": -12.981942176818848, + "step": 3871 + }, + { + "epoch": 2.672071761255822, + "grad_norm": 0.3645482659339905, + "learning_rate": 1.3665132336018413e-06, + "logits/chosen": 3.4766058921813965, + "logits/rejected": 3.660452365875244, + "logps/chosen": -167.466552734375, + "logps/rejected": -188.1095428466797, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.880255699157715, + "rewards/margins": 1.9744467735290527, + "rewards/rejected": -13.854702949523926, + "step": 3872 + }, + { + "epoch": 2.6727617733310334, + "grad_norm": 0.6630595922470093, + "learning_rate": 1.3636363636363636e-06, + "logits/chosen": 3.155773162841797, + "logits/rejected": 3.1421241760253906, + "logps/chosen": -124.25462341308594, + "logps/rejected": -152.28807067871094, + "loss": 0.4358, + "rewards/accuracies": 0.375, + "rewards/chosen": -7.793721675872803, + "rewards/margins": 2.6989150047302246, + "rewards/rejected": -10.492636680603027, + "step": 3873 + }, + { + "epoch": 2.6734517854062445, + "grad_norm": 0.38057464361190796, + "learning_rate": 1.3607594936708862e-06, + "logits/chosen": 3.216862201690674, + "logits/rejected": 3.4432358741760254, + "logps/chosen": -160.12515258789062, + "logps/rejected": -173.2017059326172, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.293018341064453, + "rewards/margins": 1.3255248069763184, + "rewards/rejected": -12.618542671203613, + "step": 3874 + }, + { + "epoch": 2.674141797481456, + "grad_norm": 0.4234451353549957, + "learning_rate": 1.3578826237054085e-06, + "logits/chosen": 2.929595470428467, + "logits/rejected": 3.041830539703369, + "logps/chosen": -155.97569274902344, + "logps/rejected": -177.0124053955078, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.865121841430664, + "rewards/margins": 2.123924970626831, + "rewards/rejected": -12.989046096801758, + "step": 3875 + }, + { + "epoch": 2.674831809556667, + "grad_norm": 18.35798454284668, + "learning_rate": 1.355005753739931e-06, + "logits/chosen": 3.175778388977051, + "logits/rejected": 3.1769587993621826, + "logps/chosen": -161.73455810546875, + "logps/rejected": -168.69097900390625, + "loss": 0.7354, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.325641632080078, + "rewards/margins": 0.6901328563690186, + "rewards/rejected": -12.015775680541992, + "step": 3876 + }, + { + "epoch": 2.675521821631879, + "grad_norm": 0.42028290033340454, + "learning_rate": 1.3521288837744534e-06, + "logits/chosen": 3.447157144546509, + "logits/rejected": 3.447157144546509, + "logps/chosen": -162.22659301757812, + "logps/rejected": -162.22659301757812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.63546085357666, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -11.63546085357666, + "step": 3877 + }, + { + "epoch": 2.67621183370709, + "grad_norm": 0.4328041076660156, + "learning_rate": 1.349252013808976e-06, + "logits/chosen": 3.515744686126709, + "logits/rejected": 3.665771484375, + "logps/chosen": -159.9287109375, + "logps/rejected": -171.19281005859375, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.093334197998047, + "rewards/margins": 1.1556358337402344, + "rewards/rejected": -12.248969078063965, + "step": 3878 + }, + { + "epoch": 2.676901845782301, + "grad_norm": 2.173647880554199, + "learning_rate": 1.3463751438434984e-06, + "logits/chosen": 3.4530434608459473, + "logits/rejected": 3.565427780151367, + "logps/chosen": -161.761962890625, + "logps/rejected": -163.71237182617188, + "loss": 0.6346, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.37896728515625, + "rewards/margins": 0.17244935035705566, + "rewards/rejected": -11.551416397094727, + "step": 3879 + }, + { + "epoch": 2.6775918578575126, + "grad_norm": 0.28992512822151184, + "learning_rate": 1.343498273878021e-06, + "logits/chosen": 3.3177597522735596, + "logits/rejected": 3.4736361503601074, + "logps/chosen": -156.0288848876953, + "logps/rejected": -174.222412109375, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.92526912689209, + "rewards/margins": 1.859618902206421, + "rewards/rejected": -12.784887313842773, + "step": 3880 + }, + { + "epoch": 2.6782818699327238, + "grad_norm": 0.46437376737594604, + "learning_rate": 1.340621403912543e-06, + "logits/chosen": 3.671164035797119, + "logits/rejected": 3.671164035797119, + "logps/chosen": -171.9635772705078, + "logps/rejected": -171.9635772705078, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.299545288085938, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.299545288085938, + "step": 3881 + }, + { + "epoch": 2.678971882007935, + "grad_norm": 0.3453662097454071, + "learning_rate": 1.3377445339470656e-06, + "logits/chosen": 3.4515252113342285, + "logits/rejected": 3.3979477882385254, + "logps/chosen": -155.32374572753906, + "logps/rejected": -183.15765380859375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.712271690368652, + "rewards/margins": 2.738494873046875, + "rewards/rejected": -13.450766563415527, + "step": 3882 + }, + { + "epoch": 2.6796618940831465, + "grad_norm": 0.30818971991539, + "learning_rate": 1.334867663981588e-06, + "logits/chosen": 3.4883053302764893, + "logits/rejected": 3.469778060913086, + "logps/chosen": -157.83497619628906, + "logps/rejected": -168.1353302001953, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.921297073364258, + "rewards/margins": 1.0625678300857544, + "rewards/rejected": -11.983864784240723, + "step": 3883 + }, + { + "epoch": 2.6803519061583576, + "grad_norm": 0.3229856491088867, + "learning_rate": 1.3319907940161105e-06, + "logits/chosen": 3.482438564300537, + "logits/rejected": 3.4879567623138428, + "logps/chosen": -157.5822296142578, + "logps/rejected": -174.90579223632812, + "loss": 0.5201, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.157425880432129, + "rewards/margins": 1.809499979019165, + "rewards/rejected": -12.966925621032715, + "step": 3884 + }, + { + "epoch": 2.681041918233569, + "grad_norm": 22.288156509399414, + "learning_rate": 1.3291139240506329e-06, + "logits/chosen": 3.4633572101593018, + "logits/rejected": 3.643583059310913, + "logps/chosen": -171.6236572265625, + "logps/rejected": -182.06614685058594, + "loss": 0.7608, + "rewards/accuracies": 0.625, + "rewards/chosen": -12.405763626098633, + "rewards/margins": 1.0906093120574951, + "rewards/rejected": -13.49637222290039, + "step": 3885 + }, + { + "epoch": 2.6817319303087803, + "grad_norm": 0.25552934408187866, + "learning_rate": 1.3262370540851555e-06, + "logits/chosen": 3.389253854751587, + "logits/rejected": 3.781672477722168, + "logps/chosen": -145.14227294921875, + "logps/rejected": -182.19314575195312, + "loss": 0.347, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.805511474609375, + "rewards/margins": 3.741485595703125, + "rewards/rejected": -13.5469970703125, + "step": 3886 + }, + { + "epoch": 2.682421942383992, + "grad_norm": 0.33583420515060425, + "learning_rate": 1.323360184119678e-06, + "logits/chosen": 3.6993296146392822, + "logits/rejected": 3.6993296146392822, + "logps/chosen": -176.75381469726562, + "logps/rejected": -176.75381469726562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.85563850402832, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.85563850402832, + "step": 3887 + }, + { + "epoch": 2.683111954459203, + "grad_norm": 0.3902665674686432, + "learning_rate": 1.3204833141542004e-06, + "logits/chosen": 3.0221328735351562, + "logits/rejected": 3.296208381652832, + "logps/chosen": -140.98898315429688, + "logps/rejected": -176.7961883544922, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.476728439331055, + "rewards/margins": 3.4470527172088623, + "rewards/rejected": -12.923782348632812, + "step": 3888 + }, + { + "epoch": 2.683801966534414, + "grad_norm": 0.4101158082485199, + "learning_rate": 1.317606444188723e-06, + "logits/chosen": 3.5845632553100586, + "logits/rejected": 3.6352577209472656, + "logps/chosen": -159.62803649902344, + "logps/rejected": -168.69471740722656, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.254576683044434, + "rewards/margins": 0.9321840405464172, + "rewards/rejected": -12.186760902404785, + "step": 3889 + }, + { + "epoch": 2.6844919786096257, + "grad_norm": 0.5755507349967957, + "learning_rate": 1.3147295742232453e-06, + "logits/chosen": 3.58721923828125, + "logits/rejected": 3.7375025749206543, + "logps/chosen": -151.69871520996094, + "logps/rejected": -168.2216339111328, + "loss": 0.5246, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.477636337280273, + "rewards/margins": 1.5330824851989746, + "rewards/rejected": -12.010719299316406, + "step": 3890 + }, + { + "epoch": 2.685181990684837, + "grad_norm": 0.3939227759838104, + "learning_rate": 1.3118527042577678e-06, + "logits/chosen": 3.687119722366333, + "logits/rejected": 3.687119722366333, + "logps/chosen": -187.1453094482422, + "logps/rejected": -187.1453094482422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.004623413085938, + "rewards/margins": 0.0, + "rewards/rejected": -14.004623413085938, + "step": 3891 + }, + { + "epoch": 2.6858720027600484, + "grad_norm": 0.3382118046283722, + "learning_rate": 1.30897583429229e-06, + "logits/chosen": 3.5104434490203857, + "logits/rejected": 3.587958812713623, + "logps/chosen": -182.96279907226562, + "logps/rejected": -191.6241912841797, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.5095853805542, + "rewards/margins": 0.8679572939872742, + "rewards/rejected": -14.377543449401855, + "step": 3892 + }, + { + "epoch": 2.6865620148352596, + "grad_norm": 0.31691449880599976, + "learning_rate": 1.3060989643268126e-06, + "logits/chosen": 3.657041311264038, + "logits/rejected": 3.818990707397461, + "logps/chosen": -170.73834228515625, + "logps/rejected": -185.55657958984375, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.438730239868164, + "rewards/margins": 1.5149049758911133, + "rewards/rejected": -13.953636169433594, + "step": 3893 + }, + { + "epoch": 2.687252026910471, + "grad_norm": 0.4394901394844055, + "learning_rate": 1.303222094361335e-06, + "logits/chosen": 3.8093037605285645, + "logits/rejected": 3.8093037605285645, + "logps/chosen": -180.50051879882812, + "logps/rejected": -180.50051879882812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.226325988769531, + "rewards/margins": 0.0, + "rewards/rejected": -13.226325988769531, + "step": 3894 + }, + { + "epoch": 2.6879420389856823, + "grad_norm": 0.3357067406177521, + "learning_rate": 1.3003452243958575e-06, + "logits/chosen": 3.901564836502075, + "logits/rejected": 3.9849750995635986, + "logps/chosen": -165.54368591308594, + "logps/rejected": -179.66891479492188, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.790397644042969, + "rewards/margins": 1.362139344215393, + "rewards/rejected": -13.152538299560547, + "step": 3895 + }, + { + "epoch": 2.6886320510608934, + "grad_norm": 0.4050954282283783, + "learning_rate": 1.2974683544303798e-06, + "logits/chosen": 3.670241594314575, + "logits/rejected": 3.677727699279785, + "logps/chosen": -168.58535766601562, + "logps/rejected": -180.663818359375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.09678840637207, + "rewards/margins": 1.246419906616211, + "rewards/rejected": -13.343208312988281, + "step": 3896 + }, + { + "epoch": 2.689322063136105, + "grad_norm": 0.35557669401168823, + "learning_rate": 1.2945914844649024e-06, + "logits/chosen": 4.048340320587158, + "logits/rejected": 4.162870407104492, + "logps/chosen": -165.80642700195312, + "logps/rejected": -173.69534301757812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.76426887512207, + "rewards/margins": 0.8539937734603882, + "rewards/rejected": -12.618263244628906, + "step": 3897 + }, + { + "epoch": 2.690012075211316, + "grad_norm": 0.35124555230140686, + "learning_rate": 1.2917146144994247e-06, + "logits/chosen": 3.413278102874756, + "logits/rejected": 3.446471691131592, + "logps/chosen": -174.72088623046875, + "logps/rejected": -187.07919311523438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.649038314819336, + "rewards/margins": 1.270481824874878, + "rewards/rejected": -13.919520378112793, + "step": 3898 + }, + { + "epoch": 2.6907020872865273, + "grad_norm": 0.468019038438797, + "learning_rate": 1.2888377445339473e-06, + "logits/chosen": 3.3807034492492676, + "logits/rejected": 3.6170220375061035, + "logps/chosen": -155.22154235839844, + "logps/rejected": -176.4200439453125, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.667482376098633, + "rewards/margins": 2.125619411468506, + "rewards/rejected": -12.793102264404297, + "step": 3899 + }, + { + "epoch": 2.691392099361739, + "grad_norm": 0.3398546576499939, + "learning_rate": 1.2859608745684696e-06, + "logits/chosen": 3.5586540699005127, + "logits/rejected": 3.873940944671631, + "logps/chosen": -151.3817138671875, + "logps/rejected": -179.60540771484375, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.356110572814941, + "rewards/margins": 2.8708078861236572, + "rewards/rejected": -13.226919174194336, + "step": 3900 + }, + { + "epoch": 2.6920821114369504, + "grad_norm": 0.3418499529361725, + "learning_rate": 1.2830840046029922e-06, + "logits/chosen": 3.529691219329834, + "logits/rejected": 3.702552318572998, + "logps/chosen": -167.59396362304688, + "logps/rejected": -175.98587036132812, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.945938110351562, + "rewards/margins": 0.7744461297988892, + "rewards/rejected": -12.72038459777832, + "step": 3901 + }, + { + "epoch": 2.6927721235121616, + "grad_norm": 0.36849385499954224, + "learning_rate": 1.2802071346375144e-06, + "logits/chosen": 3.638582706451416, + "logits/rejected": 3.603527545928955, + "logps/chosen": -163.16868591308594, + "logps/rejected": -185.753662109375, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.60726547241211, + "rewards/margins": 2.300598382949829, + "rewards/rejected": -13.90786361694336, + "step": 3902 + }, + { + "epoch": 2.6934621355873727, + "grad_norm": 2.3924708366394043, + "learning_rate": 1.2773302646720371e-06, + "logits/chosen": 3.892427921295166, + "logits/rejected": 3.9260497093200684, + "logps/chosen": -178.4915008544922, + "logps/rejected": -181.44888305664062, + "loss": 0.6202, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.086933135986328, + "rewards/margins": 0.26910853385925293, + "rewards/rejected": -13.356040954589844, + "step": 3903 + }, + { + "epoch": 2.6941521476625843, + "grad_norm": 0.43148335814476013, + "learning_rate": 1.2744533947065593e-06, + "logits/chosen": 3.704049587249756, + "logits/rejected": 3.7358639240264893, + "logps/chosen": -159.86878967285156, + "logps/rejected": -169.69686889648438, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.36878776550293, + "rewards/margins": 0.9627803564071655, + "rewards/rejected": -12.331568717956543, + "step": 3904 + }, + { + "epoch": 2.6948421597377954, + "grad_norm": 0.409181147813797, + "learning_rate": 1.2715765247410818e-06, + "logits/chosen": 3.9048821926116943, + "logits/rejected": 3.9048821926116943, + "logps/chosen": -187.60826110839844, + "logps/rejected": -187.60826110839844, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.945621490478516, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.945621490478516, + "step": 3905 + }, + { + "epoch": 2.6955321718130065, + "grad_norm": 0.41431909799575806, + "learning_rate": 1.2686996547756042e-06, + "logits/chosen": 3.3035483360290527, + "logits/rejected": 3.4578537940979004, + "logps/chosen": -160.25119018554688, + "logps/rejected": -174.72518920898438, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.270698547363281, + "rewards/margins": 1.4637330770492554, + "rewards/rejected": -12.734432220458984, + "step": 3906 + }, + { + "epoch": 2.696222183888218, + "grad_norm": 0.3611016571521759, + "learning_rate": 1.2658227848101267e-06, + "logits/chosen": 3.429443597793579, + "logits/rejected": 3.638340473175049, + "logps/chosen": -171.91375732421875, + "logps/rejected": -186.11520385742188, + "loss": 0.6065, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.514991760253906, + "rewards/margins": 1.4325439929962158, + "rewards/rejected": -13.947535514831543, + "step": 3907 + }, + { + "epoch": 2.6969121959634292, + "grad_norm": 19.34990882873535, + "learning_rate": 1.262945914844649e-06, + "logits/chosen": 3.414033889770508, + "logits/rejected": 3.311851978302002, + "logps/chosen": -173.21612548828125, + "logps/rejected": -180.10394287109375, + "loss": 0.8732, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.260295867919922, + "rewards/margins": 0.737106204032898, + "rewards/rejected": -12.99740219116211, + "step": 3908 + }, + { + "epoch": 2.697602208038641, + "grad_norm": 0.3640545606613159, + "learning_rate": 1.2600690448791717e-06, + "logits/chosen": 3.7483315467834473, + "logits/rejected": 3.7483315467834473, + "logps/chosen": -186.1839599609375, + "logps/rejected": -186.18392944335938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.951251983642578, + "rewards/margins": 0.0, + "rewards/rejected": -13.951251983642578, + "step": 3909 + }, + { + "epoch": 2.698292220113852, + "grad_norm": 0.31217125058174133, + "learning_rate": 1.257192174913694e-06, + "logits/chosen": 3.648545980453491, + "logits/rejected": 3.648545980453491, + "logps/chosen": -195.7801513671875, + "logps/rejected": -195.7801513671875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.765392303466797, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.76539134979248, + "step": 3910 + }, + { + "epoch": 2.6989822321890635, + "grad_norm": 0.3288620412349701, + "learning_rate": 1.2543153049482166e-06, + "logits/chosen": 3.487947702407837, + "logits/rejected": 3.6036758422851562, + "logps/chosen": -170.65859985351562, + "logps/rejected": -177.7511444091797, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.265522003173828, + "rewards/margins": 0.7910221219062805, + "rewards/rejected": -13.05654525756836, + "step": 3911 + }, + { + "epoch": 2.6996722442642747, + "grad_norm": 0.42173463106155396, + "learning_rate": 1.2514384349827387e-06, + "logits/chosen": 3.5287113189697266, + "logits/rejected": 3.6959822177886963, + "logps/chosen": -159.75601196289062, + "logps/rejected": -187.21188354492188, + "loss": 0.4363, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.315673828125, + "rewards/margins": 2.695611000061035, + "rewards/rejected": -14.011284828186035, + "step": 3912 + }, + { + "epoch": 2.700362256339486, + "grad_norm": 0.44918093085289, + "learning_rate": 1.2485615650172615e-06, + "logits/chosen": 3.4133899211883545, + "logits/rejected": 3.4458560943603516, + "logps/chosen": -171.82688903808594, + "logps/rejected": -180.15419006347656, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.395162582397461, + "rewards/margins": 0.8522851467132568, + "rewards/rejected": -13.247447967529297, + "step": 3913 + }, + { + "epoch": 2.7010522684146974, + "grad_norm": 0.346167653799057, + "learning_rate": 1.2456846950517838e-06, + "logits/chosen": 3.448903799057007, + "logits/rejected": 3.456477403640747, + "logps/chosen": -166.00880432128906, + "logps/rejected": -174.17123413085938, + "loss": 0.6068, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.0039644241333, + "rewards/margins": 0.7695825695991516, + "rewards/rejected": -12.773547172546387, + "step": 3914 + }, + { + "epoch": 2.7017422804899085, + "grad_norm": 0.33304980397224426, + "learning_rate": 1.2428078250863062e-06, + "logits/chosen": 3.7068099975585938, + "logits/rejected": 3.7068099975585938, + "logps/chosen": -175.96633911132812, + "logps/rejected": -175.96633911132812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.734272003173828, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.734272003173828, + "step": 3915 + }, + { + "epoch": 2.7024322925651196, + "grad_norm": 0.3499557673931122, + "learning_rate": 1.2399309551208288e-06, + "logits/chosen": 3.880096435546875, + "logits/rejected": 3.880096435546875, + "logps/chosen": -173.79205322265625, + "logps/rejected": -173.79205322265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.56362247467041, + "rewards/margins": 0.0, + "rewards/rejected": -12.56362247467041, + "step": 3916 + }, + { + "epoch": 2.703122304640331, + "grad_norm": 0.355336457490921, + "learning_rate": 1.2370540851553511e-06, + "logits/chosen": 3.700998544692993, + "logits/rejected": 3.6204872131347656, + "logps/chosen": -175.00035095214844, + "logps/rejected": -182.15542602539062, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.606534957885742, + "rewards/margins": 0.7525852918624878, + "rewards/rejected": -13.35912036895752, + "step": 3917 + }, + { + "epoch": 2.703812316715543, + "grad_norm": 0.37311112880706787, + "learning_rate": 1.2341772151898737e-06, + "logits/chosen": 3.893342971801758, + "logits/rejected": 3.893342971801758, + "logps/chosen": -190.89712524414062, + "logps/rejected": -190.89712524414062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.316338539123535, + "rewards/margins": 0.0, + "rewards/rejected": -14.316337585449219, + "step": 3918 + }, + { + "epoch": 2.704502328790754, + "grad_norm": 0.4234566390514374, + "learning_rate": 1.231300345224396e-06, + "logits/chosen": 3.1932997703552246, + "logits/rejected": 3.2149555683135986, + "logps/chosen": -154.2861328125, + "logps/rejected": -174.09786987304688, + "loss": 0.5205, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.715364456176758, + "rewards/margins": 1.767686128616333, + "rewards/rejected": -12.483050346374512, + "step": 3919 + }, + { + "epoch": 2.705192340865965, + "grad_norm": 0.38674819469451904, + "learning_rate": 1.2284234752589184e-06, + "logits/chosen": 3.637700080871582, + "logits/rejected": 3.659247398376465, + "logps/chosen": -155.28831481933594, + "logps/rejected": -167.4893798828125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.80742359161377, + "rewards/margins": 1.2634172439575195, + "rewards/rejected": -12.070839881896973, + "step": 3920 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.37292343378067017, + "learning_rate": 1.225546605293441e-06, + "logits/chosen": 3.4756689071655273, + "logits/rejected": 3.502655506134033, + "logps/chosen": -153.24667358398438, + "logps/rejected": -164.35861206054688, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.52738094329834, + "rewards/margins": 1.1407625675201416, + "rewards/rejected": -11.668143272399902, + "step": 3921 + }, + { + "epoch": 2.7065723650163878, + "grad_norm": 0.38708430528640747, + "learning_rate": 1.2226697353279633e-06, + "logits/chosen": 4.100330352783203, + "logits/rejected": 4.100330352783203, + "logps/chosen": -172.07017517089844, + "logps/rejected": -172.07015991210938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.368675231933594, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.368675231933594, + "step": 3922 + }, + { + "epoch": 2.707262377091599, + "grad_norm": 0.3257746696472168, + "learning_rate": 1.2197928653624859e-06, + "logits/chosen": 3.2139639854431152, + "logits/rejected": 3.2823405265808105, + "logps/chosen": -178.17388916015625, + "logps/rejected": -191.8662109375, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.0164213180542, + "rewards/margins": 1.479559302330017, + "rewards/rejected": -14.495980262756348, + "step": 3923 + }, + { + "epoch": 2.7079523891668105, + "grad_norm": 0.38621214032173157, + "learning_rate": 1.2169159953970082e-06, + "logits/chosen": 3.797856330871582, + "logits/rejected": 3.8520851135253906, + "logps/chosen": -175.27403259277344, + "logps/rejected": -188.2625274658203, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.797435760498047, + "rewards/margins": 1.3254568576812744, + "rewards/rejected": -14.122892379760742, + "step": 3924 + }, + { + "epoch": 2.7086424012420216, + "grad_norm": 0.3756365478038788, + "learning_rate": 1.2140391254315306e-06, + "logits/chosen": 3.6814968585968018, + "logits/rejected": 3.6808478832244873, + "logps/chosen": -174.6993865966797, + "logps/rejected": -183.45938110351562, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.7354154586792, + "rewards/margins": 0.846078634262085, + "rewards/rejected": -13.581494331359863, + "step": 3925 + }, + { + "epoch": 2.709332413317233, + "grad_norm": 0.38301870226860046, + "learning_rate": 1.2111622554660531e-06, + "logits/chosen": 3.5243396759033203, + "logits/rejected": 3.7887802124023438, + "logps/chosen": -164.1024627685547, + "logps/rejected": -173.4598388671875, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.675603866577148, + "rewards/margins": 0.8695911765098572, + "rewards/rejected": -12.545194625854492, + "step": 3926 + }, + { + "epoch": 2.7100224253924443, + "grad_norm": 0.31561997532844543, + "learning_rate": 1.2082853855005755e-06, + "logits/chosen": 3.391772747039795, + "logits/rejected": 3.4524073600769043, + "logps/chosen": -162.30361938476562, + "logps/rejected": -171.4203338623047, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.664241790771484, + "rewards/margins": 0.9024491310119629, + "rewards/rejected": -12.566690444946289, + "step": 3927 + }, + { + "epoch": 2.710712437467656, + "grad_norm": 0.39249059557914734, + "learning_rate": 1.205408515535098e-06, + "logits/chosen": 3.585097312927246, + "logits/rejected": 3.585097312927246, + "logps/chosen": -181.35934448242188, + "logps/rejected": -181.35934448242188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.468645095825195, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.468645095825195, + "step": 3928 + }, + { + "epoch": 2.711402449542867, + "grad_norm": 0.3101739287376404, + "learning_rate": 1.2025316455696204e-06, + "logits/chosen": 3.480426788330078, + "logits/rejected": 3.8030924797058105, + "logps/chosen": -155.47299194335938, + "logps/rejected": -179.16555786132812, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.672243118286133, + "rewards/margins": 2.4002654552459717, + "rewards/rejected": -13.072507858276367, + "step": 3929 + }, + { + "epoch": 2.712092461618078, + "grad_norm": 0.3359091281890869, + "learning_rate": 1.1996547756041427e-06, + "logits/chosen": 3.157808780670166, + "logits/rejected": 3.1914172172546387, + "logps/chosen": -161.79931640625, + "logps/rejected": -195.06068420410156, + "loss": 0.4333, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.561975479125977, + "rewards/margins": 3.362443208694458, + "rewards/rejected": -14.924418449401855, + "step": 3930 + }, + { + "epoch": 2.7127824736932897, + "grad_norm": 0.578292727470398, + "learning_rate": 1.1967779056386653e-06, + "logits/chosen": 3.261279821395874, + "logits/rejected": 3.505807876586914, + "logps/chosen": -167.5668182373047, + "logps/rejected": -180.37893676757812, + "loss": 0.5231, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.925939559936523, + "rewards/margins": 1.3243029117584229, + "rewards/rejected": -13.250243186950684, + "step": 3931 + }, + { + "epoch": 2.713472485768501, + "grad_norm": 0.3558076024055481, + "learning_rate": 1.1939010356731877e-06, + "logits/chosen": 3.659363269805908, + "logits/rejected": 3.8453783988952637, + "logps/chosen": -169.37429809570312, + "logps/rejected": -179.9912872314453, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.987592697143555, + "rewards/margins": 1.0600513219833374, + "rewards/rejected": -13.04764461517334, + "step": 3932 + }, + { + "epoch": 2.714162497843712, + "grad_norm": 0.6026251912117004, + "learning_rate": 1.1910241657077102e-06, + "logits/chosen": 3.663910388946533, + "logits/rejected": 3.704176664352417, + "logps/chosen": -178.73898315429688, + "logps/rejected": -183.26409912109375, + "loss": 0.6099, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.986597061157227, + "rewards/margins": 0.44764673709869385, + "rewards/rejected": -13.434243202209473, + "step": 3933 + }, + { + "epoch": 2.7148525099189236, + "grad_norm": 0.3264009952545166, + "learning_rate": 1.1881472957422326e-06, + "logits/chosen": 3.8041491508483887, + "logits/rejected": 4.117443084716797, + "logps/chosen": -140.89035034179688, + "logps/rejected": -175.51364135742188, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.321493148803711, + "rewards/margins": 3.476416826248169, + "rewards/rejected": -12.797908782958984, + "step": 3934 + }, + { + "epoch": 2.715542521994135, + "grad_norm": 0.47800326347351074, + "learning_rate": 1.185270425776755e-06, + "logits/chosen": 3.4395787715911865, + "logits/rejected": 3.4395787715911865, + "logps/chosen": -167.98452758789062, + "logps/rejected": -167.98452758789062, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.936600685119629, + "rewards/margins": 1.7881393432617188e-07, + "rewards/rejected": -11.936600685119629, + "step": 3935 + }, + { + "epoch": 2.7162325340693463, + "grad_norm": 3.5955867767333984, + "learning_rate": 1.1823935558112775e-06, + "logits/chosen": 3.5802292823791504, + "logits/rejected": 3.6718108654022217, + "logps/chosen": -147.57281494140625, + "logps/rejected": -150.1752166748047, + "loss": 0.6234, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.176803588867188, + "rewards/margins": 0.24189698696136475, + "rewards/rejected": -10.418700218200684, + "step": 3936 + }, + { + "epoch": 2.7169225461445574, + "grad_norm": 0.39503365755081177, + "learning_rate": 1.1795166858457998e-06, + "logits/chosen": 3.6400599479675293, + "logits/rejected": 3.7956180572509766, + "logps/chosen": -172.21054077148438, + "logps/rejected": -187.64572143554688, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.475772857666016, + "rewards/margins": 1.474120020866394, + "rewards/rejected": -13.9498929977417, + "step": 3937 + }, + { + "epoch": 2.717612558219769, + "grad_norm": 0.3207153379917145, + "learning_rate": 1.1766398158803224e-06, + "logits/chosen": 3.5625672340393066, + "logits/rejected": 3.540395736694336, + "logps/chosen": -182.7466583251953, + "logps/rejected": -194.21243286132812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.39511489868164, + "rewards/margins": 1.1977297067642212, + "rewards/rejected": -14.59284496307373, + "step": 3938 + }, + { + "epoch": 2.71830257029498, + "grad_norm": 0.3545495867729187, + "learning_rate": 1.1737629459148447e-06, + "logits/chosen": 3.19808030128479, + "logits/rejected": 3.19808030128479, + "logps/chosen": -166.280517578125, + "logps/rejected": -166.280517578125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.977442741394043, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -11.977441787719727, + "step": 3939 + }, + { + "epoch": 2.7189925823701913, + "grad_norm": 0.5262154936790466, + "learning_rate": 1.170886075949367e-06, + "logits/chosen": 3.702723979949951, + "logits/rejected": 3.705933094024658, + "logps/chosen": -192.53843688964844, + "logps/rejected": -197.88070678710938, + "loss": 0.6086, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.50389289855957, + "rewards/margins": 0.50916588306427, + "rewards/rejected": -15.013059616088867, + "step": 3940 + }, + { + "epoch": 2.719682594445403, + "grad_norm": 0.4603697955608368, + "learning_rate": 1.1680092059838897e-06, + "logits/chosen": 3.2841756343841553, + "logits/rejected": 3.329465866088867, + "logps/chosen": -144.20294189453125, + "logps/rejected": -150.20449829101562, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.072962760925293, + "rewards/margins": 0.5855611562728882, + "rewards/rejected": -10.658523559570312, + "step": 3941 + }, + { + "epoch": 2.720372606520614, + "grad_norm": 0.3398266136646271, + "learning_rate": 1.165132336018412e-06, + "logits/chosen": 3.9022467136383057, + "logits/rejected": 3.9022467136383057, + "logps/chosen": -180.32432556152344, + "logps/rejected": -180.32432556152344, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.262679100036621, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.262679100036621, + "step": 3942 + }, + { + "epoch": 2.7210626185958255, + "grad_norm": 0.34524229168891907, + "learning_rate": 1.1622554660529346e-06, + "logits/chosen": 3.878741979598999, + "logits/rejected": 3.9871878623962402, + "logps/chosen": -177.6089324951172, + "logps/rejected": -189.52645874023438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.83133602142334, + "rewards/margins": 1.2248198986053467, + "rewards/rejected": -14.056156158447266, + "step": 3943 + }, + { + "epoch": 2.7217526306710367, + "grad_norm": 0.3545430898666382, + "learning_rate": 1.159378596087457e-06, + "logits/chosen": 3.8092141151428223, + "logits/rejected": 3.8092141151428223, + "logps/chosen": -190.60467529296875, + "logps/rejected": -190.6046600341797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.233001708984375, + "rewards/margins": -1.1920928955078125e-06, + "rewards/rejected": -14.233000755310059, + "step": 3944 + }, + { + "epoch": 2.7224426427462483, + "grad_norm": 0.2639114558696747, + "learning_rate": 1.1565017261219793e-06, + "logits/chosen": 3.25076961517334, + "logits/rejected": 3.78226375579834, + "logps/chosen": -164.87498474121094, + "logps/rejected": -196.30091857910156, + "loss": 0.4334, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.722061157226562, + "rewards/margins": 3.268134117126465, + "rewards/rejected": -14.990194320678711, + "step": 3945 + }, + { + "epoch": 2.7231326548214594, + "grad_norm": 0.3549414873123169, + "learning_rate": 1.1536248561565018e-06, + "logits/chosen": 3.5060739517211914, + "logits/rejected": 3.706149101257324, + "logps/chosen": -166.2123565673828, + "logps/rejected": -185.5546875, + "loss": 0.521, + "rewards/accuracies": 0.625, + "rewards/chosen": -11.900638580322266, + "rewards/margins": 1.919084906578064, + "rewards/rejected": -13.819723129272461, + "step": 3946 + }, + { + "epoch": 2.7238226668966705, + "grad_norm": 0.8880373239517212, + "learning_rate": 1.1507479861910242e-06, + "logits/chosen": 3.4556121826171875, + "logits/rejected": 3.6413211822509766, + "logps/chosen": -156.89942932128906, + "logps/rejected": -172.233642578125, + "loss": 0.5235, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.023009300231934, + "rewards/margins": 1.5183144807815552, + "rewards/rejected": -12.541324615478516, + "step": 3947 + }, + { + "epoch": 2.724512678971882, + "grad_norm": 0.45417875051498413, + "learning_rate": 1.1478711162255468e-06, + "logits/chosen": 3.6334633827209473, + "logits/rejected": 3.6334633827209473, + "logps/chosen": -173.42872619628906, + "logps/rejected": -173.42872619628906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.640628814697266, + "rewards/margins": -2.980232238769531e-07, + "rewards/rejected": -12.640628814697266, + "step": 3948 + }, + { + "epoch": 2.7252026910470932, + "grad_norm": 0.3587210476398468, + "learning_rate": 1.1449942462600691e-06, + "logits/chosen": 3.687471389770508, + "logits/rejected": 3.687471389770508, + "logps/chosen": -186.1378631591797, + "logps/rejected": -186.1378631591797, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.787040710449219, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.787042617797852, + "step": 3949 + }, + { + "epoch": 2.725892703122305, + "grad_norm": 0.29994213581085205, + "learning_rate": 1.1421173762945915e-06, + "logits/chosen": 4.059582233428955, + "logits/rejected": 4.059582233428955, + "logps/chosen": -186.36036682128906, + "logps/rejected": -186.36036682128906, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.81741714477539, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.81741714477539, + "step": 3950 + }, + { + "epoch": 2.726582715197516, + "grad_norm": 0.3929007351398468, + "learning_rate": 1.139240506329114e-06, + "logits/chosen": 3.2833070755004883, + "logits/rejected": 3.5835494995117188, + "logps/chosen": -147.73297119140625, + "logps/rejected": -177.47116088867188, + "loss": 0.4336, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.140829086303711, + "rewards/margins": 3.028273105621338, + "rewards/rejected": -13.169102668762207, + "step": 3951 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.4571923315525055, + "learning_rate": 1.1363636363636364e-06, + "logits/chosen": 3.246969699859619, + "logits/rejected": 3.3214290142059326, + "logps/chosen": -141.4445037841797, + "logps/rejected": -146.42626953125, + "loss": 0.6083, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.359128952026367, + "rewards/margins": 0.5318575501441956, + "rewards/rejected": -9.890987396240234, + "step": 3952 + }, + { + "epoch": 2.7279627393479386, + "grad_norm": 0.5128844976425171, + "learning_rate": 1.133486766398159e-06, + "logits/chosen": 3.290776252746582, + "logits/rejected": 3.4170217514038086, + "logps/chosen": -153.92213439941406, + "logps/rejected": -161.05853271484375, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.482213973999023, + "rewards/margins": 0.7218979001045227, + "rewards/rejected": -11.204111099243164, + "step": 3953 + }, + { + "epoch": 2.72865275142315, + "grad_norm": 0.7029299139976501, + "learning_rate": 1.1306098964326813e-06, + "logits/chosen": 3.771122694015503, + "logits/rejected": 4.017608642578125, + "logps/chosen": -156.8789825439453, + "logps/rejected": -177.87216186523438, + "loss": 0.4386, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.766680717468262, + "rewards/margins": 2.14711856842041, + "rewards/rejected": -12.913799285888672, + "step": 3954 + }, + { + "epoch": 2.7293427634983614, + "grad_norm": 0.616105854511261, + "learning_rate": 1.1277330264672036e-06, + "logits/chosen": 3.628756523132324, + "logits/rejected": 3.8420205116271973, + "logps/chosen": -170.8701934814453, + "logps/rejected": -195.16912841796875, + "loss": 0.4365, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.354876518249512, + "rewards/margins": 2.4075241088867188, + "rewards/rejected": -14.76240062713623, + "step": 3955 + }, + { + "epoch": 2.7300327755735725, + "grad_norm": 11.411883354187012, + "learning_rate": 1.1248561565017262e-06, + "logits/chosen": 3.899664878845215, + "logits/rejected": 3.899445056915283, + "logps/chosen": -165.51678466796875, + "logps/rejected": -172.82656860351562, + "loss": 0.6386, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.891849517822266, + "rewards/margins": 0.7247461080551147, + "rewards/rejected": -12.616596221923828, + "step": 3956 + }, + { + "epoch": 2.7307227876487836, + "grad_norm": 0.3769899904727936, + "learning_rate": 1.1219792865362486e-06, + "logits/chosen": 3.679988145828247, + "logits/rejected": 3.8485724925994873, + "logps/chosen": -179.819580078125, + "logps/rejected": -186.75521850585938, + "loss": 0.6071, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.151690483093262, + "rewards/margins": 0.6585737466812134, + "rewards/rejected": -13.810264587402344, + "step": 3957 + }, + { + "epoch": 2.731412799723995, + "grad_norm": 14.498950004577637, + "learning_rate": 1.1191024165707711e-06, + "logits/chosen": 3.7117085456848145, + "logits/rejected": 3.793158531188965, + "logps/chosen": -166.3447265625, + "logps/rejected": -172.4354705810547, + "loss": 1.164, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.817317008972168, + "rewards/margins": 0.6471840143203735, + "rewards/rejected": -12.464500427246094, + "step": 3958 + }, + { + "epoch": 2.7321028117992063, + "grad_norm": 0.36930951476097107, + "learning_rate": 1.1162255466052935e-06, + "logits/chosen": 2.9945101737976074, + "logits/rejected": 3.234423875808716, + "logps/chosen": -165.14541625976562, + "logps/rejected": -186.68795776367188, + "loss": 0.5202, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.943059921264648, + "rewards/margins": 2.101820707321167, + "rewards/rejected": -14.044881820678711, + "step": 3959 + }, + { + "epoch": 2.732792823874418, + "grad_norm": 0.4357692301273346, + "learning_rate": 1.1133486766398158e-06, + "logits/chosen": 3.4391727447509766, + "logits/rejected": 3.553401470184326, + "logps/chosen": -155.99627685546875, + "logps/rejected": -167.8359375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.84250259399414, + "rewards/margins": 1.1526210308074951, + "rewards/rejected": -11.995123863220215, + "step": 3960 + }, + { + "epoch": 2.733482835949629, + "grad_norm": 0.6053608059883118, + "learning_rate": 1.1104718066743384e-06, + "logits/chosen": 3.567354202270508, + "logits/rejected": 3.948206901550293, + "logps/chosen": -160.78369140625, + "logps/rejected": -166.42037963867188, + "loss": 0.6084, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.334870338439941, + "rewards/margins": 0.521598219871521, + "rewards/rejected": -11.85646915435791, + "step": 3961 + }, + { + "epoch": 2.7341728480248406, + "grad_norm": 0.34014713764190674, + "learning_rate": 1.1075949367088607e-06, + "logits/chosen": 3.828533172607422, + "logits/rejected": 4.1038498878479, + "logps/chosen": -150.9237823486328, + "logps/rejected": -176.81163024902344, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.264153480529785, + "rewards/margins": 2.6303815841674805, + "rewards/rejected": -12.894535064697266, + "step": 3962 + }, + { + "epoch": 2.7348628601000518, + "grad_norm": 0.2913403809070587, + "learning_rate": 1.1047180667433833e-06, + "logits/chosen": 3.671250343322754, + "logits/rejected": 3.8355960845947266, + "logps/chosen": -160.29800415039062, + "logps/rejected": -187.95281982421875, + "loss": 0.4335, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.253774642944336, + "rewards/margins": 2.75754451751709, + "rewards/rejected": -14.011320114135742, + "step": 3963 + }, + { + "epoch": 2.735552872175263, + "grad_norm": 0.36708498001098633, + "learning_rate": 1.1018411967779059e-06, + "logits/chosen": 3.792191982269287, + "logits/rejected": 3.792191982269287, + "logps/chosen": -176.12295532226562, + "logps/rejected": -176.12295532226562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.689678192138672, + "rewards/margins": 0.0, + "rewards/rejected": -12.689678192138672, + "step": 3964 + }, + { + "epoch": 2.7362428842504745, + "grad_norm": 0.4455726146697998, + "learning_rate": 1.0989643268124282e-06, + "logits/chosen": 3.438347816467285, + "logits/rejected": 3.438347816467285, + "logps/chosen": -155.10679626464844, + "logps/rejected": -155.1068115234375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.807812690734863, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -10.80781364440918, + "step": 3965 + }, + { + "epoch": 2.7369328963256856, + "grad_norm": 0.3220333158969879, + "learning_rate": 1.0960874568469506e-06, + "logits/chosen": 3.6074483394622803, + "logits/rejected": 3.776035785675049, + "logps/chosen": -189.13389587402344, + "logps/rejected": -198.08074951171875, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -14.131379127502441, + "rewards/margins": 0.8544492721557617, + "rewards/rejected": -14.985828399658203, + "step": 3966 + }, + { + "epoch": 2.737622908400897, + "grad_norm": 0.32399383187294006, + "learning_rate": 1.0932105868814731e-06, + "logits/chosen": 3.684798240661621, + "logits/rejected": 3.7596635818481445, + "logps/chosen": -178.12591552734375, + "logps/rejected": -188.45826721191406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.00036907196045, + "rewards/margins": 1.068791151046753, + "rewards/rejected": -14.069160461425781, + "step": 3967 + }, + { + "epoch": 2.7383129204761083, + "grad_norm": 36.55746841430664, + "learning_rate": 1.0903337169159955e-06, + "logits/chosen": 3.385601282119751, + "logits/rejected": 3.2336153984069824, + "logps/chosen": -159.80764770507812, + "logps/rejected": -167.6143035888672, + "loss": 0.852, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.141727447509766, + "rewards/margins": 0.6735078692436218, + "rewards/rejected": -11.815235137939453, + "step": 3968 + }, + { + "epoch": 2.73900293255132, + "grad_norm": 0.2622986137866974, + "learning_rate": 1.087456846950518e-06, + "logits/chosen": 3.3826904296875, + "logits/rejected": 3.6079750061035156, + "logps/chosen": -152.5569610595703, + "logps/rejected": -183.95814514160156, + "loss": 0.4335, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.370384216308594, + "rewards/margins": 3.149444580078125, + "rewards/rejected": -13.519828796386719, + "step": 3969 + }, + { + "epoch": 2.739692944626531, + "grad_norm": 0.3105000853538513, + "learning_rate": 1.0845799769850404e-06, + "logits/chosen": 3.219315528869629, + "logits/rejected": 3.481515645980835, + "logps/chosen": -154.37496948242188, + "logps/rejected": -178.09201049804688, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.508321762084961, + "rewards/margins": 2.395437002182007, + "rewards/rejected": -12.903759002685547, + "step": 3970 + }, + { + "epoch": 2.740382956701742, + "grad_norm": 1.1186538934707642, + "learning_rate": 1.0817031070195628e-06, + "logits/chosen": 3.9654574394226074, + "logits/rejected": 3.9728779792785645, + "logps/chosen": -185.67298889160156, + "logps/rejected": -189.83041381835938, + "loss": 0.6174, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.753486633300781, + "rewards/margins": 0.2999333143234253, + "rewards/rejected": -14.05341911315918, + "step": 3971 + }, + { + "epoch": 2.7410729687769537, + "grad_norm": 0.42827823758125305, + "learning_rate": 1.0788262370540853e-06, + "logits/chosen": 3.8644356727600098, + "logits/rejected": 3.8644356727600098, + "logps/chosen": -168.64541625976562, + "logps/rejected": -168.64541625976562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.05798053741455, + "rewards/margins": 0.0, + "rewards/rejected": -12.057981491088867, + "step": 3972 + }, + { + "epoch": 2.741762980852165, + "grad_norm": 0.3172937333583832, + "learning_rate": 1.0759493670886077e-06, + "logits/chosen": 3.531782388687134, + "logits/rejected": 3.635613441467285, + "logps/chosen": -177.98416137695312, + "logps/rejected": -193.18804931640625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.200395584106445, + "rewards/margins": 1.4909138679504395, + "rewards/rejected": -14.691309928894043, + "step": 3973 + }, + { + "epoch": 2.742452992927376, + "grad_norm": 1.1393572092056274, + "learning_rate": 1.0730724971231302e-06, + "logits/chosen": 3.695063591003418, + "logits/rejected": 3.6569366455078125, + "logps/chosen": -171.13938903808594, + "logps/rejected": -175.36923217773438, + "loss": 0.6104, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.410456657409668, + "rewards/margins": 0.430678129196167, + "rewards/rejected": -12.841135025024414, + "step": 3974 + }, + { + "epoch": 2.7431430050025876, + "grad_norm": 0.45641085505485535, + "learning_rate": 1.0701956271576526e-06, + "logits/chosen": 3.481353282928467, + "logits/rejected": 3.5509462356567383, + "logps/chosen": -169.06983947753906, + "logps/rejected": -180.56961059570312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.129547119140625, + "rewards/margins": 1.1632694005966187, + "rewards/rejected": -13.292816162109375, + "step": 3975 + }, + { + "epoch": 2.7438330170777987, + "grad_norm": 0.3482382297515869, + "learning_rate": 1.067318757192175e-06, + "logits/chosen": 3.4904446601867676, + "logits/rejected": 3.748093843460083, + "logps/chosen": -153.4759979248047, + "logps/rejected": -179.06573486328125, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.540377616882324, + "rewards/margins": 2.565687656402588, + "rewards/rejected": -13.106064796447754, + "step": 3976 + }, + { + "epoch": 2.7445230291530103, + "grad_norm": 0.36415645480155945, + "learning_rate": 1.0644418872266975e-06, + "logits/chosen": 3.4190523624420166, + "logits/rejected": 3.6062941551208496, + "logps/chosen": -152.95352172851562, + "logps/rejected": -175.4853973388672, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.283529281616211, + "rewards/margins": 2.2930727005004883, + "rewards/rejected": -12.576601028442383, + "step": 3977 + }, + { + "epoch": 2.7452130412282214, + "grad_norm": 0.3475237488746643, + "learning_rate": 1.0615650172612199e-06, + "logits/chosen": 3.9547369480133057, + "logits/rejected": 4.041049003601074, + "logps/chosen": -178.09938049316406, + "logps/rejected": -190.59640502929688, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.853914260864258, + "rewards/margins": 1.2491000890731812, + "rewards/rejected": -14.103014945983887, + "step": 3978 + }, + { + "epoch": 2.745903053303433, + "grad_norm": 0.38845476508140564, + "learning_rate": 1.0586881472957424e-06, + "logits/chosen": 3.1934127807617188, + "logits/rejected": 3.1934127807617188, + "logps/chosen": -163.8715057373047, + "logps/rejected": -163.8715057373047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.761116027832031, + "rewards/margins": 0.0, + "rewards/rejected": -11.761116027832031, + "step": 3979 + }, + { + "epoch": 2.746593065378644, + "grad_norm": 0.42861679196357727, + "learning_rate": 1.0558112773302648e-06, + "logits/chosen": 3.3049464225769043, + "logits/rejected": 3.5470123291015625, + "logps/chosen": -149.38453674316406, + "logps/rejected": -169.8514404296875, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.477998733520508, + "rewards/margins": 1.9948880672454834, + "rewards/rejected": -12.47288703918457, + "step": 3980 + }, + { + "epoch": 2.7472830774538552, + "grad_norm": 0.3206661641597748, + "learning_rate": 1.0529344073647871e-06, + "logits/chosen": 3.4329843521118164, + "logits/rejected": 3.4638657569885254, + "logps/chosen": -166.97596740722656, + "logps/rejected": -189.5663299560547, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.934670448303223, + "rewards/margins": 2.2653744220733643, + "rewards/rejected": -14.200044631958008, + "step": 3981 + }, + { + "epoch": 2.747973089529067, + "grad_norm": 0.5041112899780273, + "learning_rate": 1.0500575373993097e-06, + "logits/chosen": 3.266526222229004, + "logits/rejected": 3.330465078353882, + "logps/chosen": -163.23123168945312, + "logps/rejected": -172.3702850341797, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.632493019104004, + "rewards/margins": 0.9503393769264221, + "rewards/rejected": -12.582832336425781, + "step": 3982 + }, + { + "epoch": 2.748663101604278, + "grad_norm": 0.33489155769348145, + "learning_rate": 1.047180667433832e-06, + "logits/chosen": 3.3087873458862305, + "logits/rejected": 3.43983793258667, + "logps/chosen": -158.59556579589844, + "logps/rejected": -173.6845703125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.995771408081055, + "rewards/margins": 1.437026023864746, + "rewards/rejected": -12.432798385620117, + "step": 3983 + }, + { + "epoch": 2.7493531136794895, + "grad_norm": 0.34840285778045654, + "learning_rate": 1.0443037974683546e-06, + "logits/chosen": 3.9585061073303223, + "logits/rejected": 4.026430130004883, + "logps/chosen": -160.59664916992188, + "logps/rejected": -174.01980590820312, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.213233947753906, + "rewards/margins": 1.2312631607055664, + "rewards/rejected": -12.444496154785156, + "step": 3984 + }, + { + "epoch": 2.7500431257547007, + "grad_norm": 0.6316930055618286, + "learning_rate": 1.041426927502877e-06, + "logits/chosen": 3.6570839881896973, + "logits/rejected": 3.697383403778076, + "logps/chosen": -160.2235565185547, + "logps/rejected": -164.46522521972656, + "loss": 0.6105, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.365118980407715, + "rewards/margins": 0.42831897735595703, + "rewards/rejected": -11.793437004089355, + "step": 3985 + }, + { + "epoch": 2.7507331378299122, + "grad_norm": 0.38591131567955017, + "learning_rate": 1.0385500575373993e-06, + "logits/chosen": 3.7189571857452393, + "logits/rejected": 3.844329595565796, + "logps/chosen": -171.20980834960938, + "logps/rejected": -183.6669158935547, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.294480323791504, + "rewards/margins": 1.2195351123809814, + "rewards/rejected": -13.514015197753906, + "step": 3986 + }, + { + "epoch": 2.7514231499051234, + "grad_norm": 0.4235614836215973, + "learning_rate": 1.0356731875719219e-06, + "logits/chosen": 3.4287471771240234, + "logits/rejected": 3.5678510665893555, + "logps/chosen": -160.90444946289062, + "logps/rejected": -171.54910278320312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.331578254699707, + "rewards/margins": 1.0729682445526123, + "rewards/rejected": -12.404545783996582, + "step": 3987 + }, + { + "epoch": 2.7521131619803345, + "grad_norm": 0.34275388717651367, + "learning_rate": 1.0327963176064442e-06, + "logits/chosen": 3.9221062660217285, + "logits/rejected": 3.9221062660217285, + "logps/chosen": -187.51400756835938, + "logps/rejected": -187.51400756835938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.739715576171875, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.739715576171875, + "step": 3988 + }, + { + "epoch": 2.752803174055546, + "grad_norm": 0.4362207055091858, + "learning_rate": 1.0299194476409668e-06, + "logits/chosen": 2.9426281452178955, + "logits/rejected": 3.0265955924987793, + "logps/chosen": -170.63037109375, + "logps/rejected": -178.97677612304688, + "loss": 0.6066, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.199726104736328, + "rewards/margins": 0.9324530363082886, + "rewards/rejected": -13.132179260253906, + "step": 3989 + }, + { + "epoch": 2.753493186130757, + "grad_norm": 0.3524904251098633, + "learning_rate": 1.0270425776754891e-06, + "logits/chosen": 3.5414657592773438, + "logits/rejected": 3.6847078800201416, + "logps/chosen": -159.5350799560547, + "logps/rejected": -174.9646759033203, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.056021690368652, + "rewards/margins": 1.500135898590088, + "rewards/rejected": -12.556158065795898, + "step": 3990 + }, + { + "epoch": 2.7541831982059684, + "grad_norm": 0.28752660751342773, + "learning_rate": 1.0241657077100115e-06, + "logits/chosen": 3.5494673252105713, + "logits/rejected": 3.689351797103882, + "logps/chosen": -156.05271911621094, + "logps/rejected": -182.93853759765625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.7775297164917, + "rewards/margins": 2.661597728729248, + "rewards/rejected": -13.439126968383789, + "step": 3991 + }, + { + "epoch": 2.75487321028118, + "grad_norm": 0.28266441822052, + "learning_rate": 1.021288837744534e-06, + "logits/chosen": 3.2477312088012695, + "logits/rejected": 3.2477312088012695, + "logps/chosen": -174.5040283203125, + "logps/rejected": -174.5040283203125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.602266311645508, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.602265357971191, + "step": 3992 + }, + { + "epoch": 2.7555632223563915, + "grad_norm": 0.34743446111679077, + "learning_rate": 1.0184119677790564e-06, + "logits/chosen": 3.529569625854492, + "logits/rejected": 3.529569625854492, + "logps/chosen": -165.6231689453125, + "logps/rejected": -165.6231689453125, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.571746826171875, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -11.571746826171875, + "step": 3993 + }, + { + "epoch": 2.7562532344316026, + "grad_norm": 0.37830644845962524, + "learning_rate": 1.015535097813579e-06, + "logits/chosen": 3.103496551513672, + "logits/rejected": 3.201785087585449, + "logps/chosen": -155.3573455810547, + "logps/rejected": -163.9837646484375, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.779329299926758, + "rewards/margins": 0.8502559065818787, + "rewards/rejected": -11.629585266113281, + "step": 3994 + }, + { + "epoch": 2.7569432465068138, + "grad_norm": 0.3801576793193817, + "learning_rate": 1.0126582278481013e-06, + "logits/chosen": 3.621117115020752, + "logits/rejected": 3.694256544113159, + "logps/chosen": -175.51097106933594, + "logps/rejected": -194.75161743164062, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.821959495544434, + "rewards/margins": 1.8372223377227783, + "rewards/rejected": -14.659181594848633, + "step": 3995 + }, + { + "epoch": 2.7576332585820253, + "grad_norm": 0.35583436489105225, + "learning_rate": 1.0097813578826239e-06, + "logits/chosen": 3.6328344345092773, + "logits/rejected": 3.6328344345092773, + "logps/chosen": -161.5211639404297, + "logps/rejected": -161.5211639404297, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.36758041381836, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -11.36758041381836, + "step": 3996 + }, + { + "epoch": 2.7583232706572365, + "grad_norm": 0.3261374533176422, + "learning_rate": 1.0069044879171462e-06, + "logits/chosen": 3.4510254859924316, + "logits/rejected": 3.626605987548828, + "logps/chosen": -180.90185546875, + "logps/rejected": -190.54397583007812, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.127715110778809, + "rewards/margins": 0.9613510370254517, + "rewards/rejected": -14.089066505432129, + "step": 3997 + }, + { + "epoch": 2.7590132827324476, + "grad_norm": 0.5219392776489258, + "learning_rate": 1.0040276179516686e-06, + "logits/chosen": 3.4149928092956543, + "logits/rejected": 3.33428955078125, + "logps/chosen": -173.44654846191406, + "logps/rejected": -179.19210815429688, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.581764221191406, + "rewards/margins": 0.5978074073791504, + "rewards/rejected": -13.179571151733398, + "step": 3998 + }, + { + "epoch": 2.759703294807659, + "grad_norm": 0.41924527287483215, + "learning_rate": 1.0011507479861911e-06, + "logits/chosen": 3.519906997680664, + "logits/rejected": 3.519906997680664, + "logps/chosen": -163.03268432617188, + "logps/rejected": -163.03268432617188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.556838989257812, + "rewards/margins": 0.0, + "rewards/rejected": -11.556838989257812, + "step": 3999 + }, + { + "epoch": 2.7603933068828703, + "grad_norm": 0.42984211444854736, + "learning_rate": 9.982738780207135e-07, + "logits/chosen": 3.2882370948791504, + "logits/rejected": 3.3002281188964844, + "logps/chosen": -165.37918090820312, + "logps/rejected": -183.07000732421875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.83033275604248, + "rewards/margins": 1.7121305465698242, + "rewards/rejected": -13.542463302612305, + "step": 4000 + }, + { + "epoch": 2.761083318958082, + "grad_norm": 0.3415074348449707, + "learning_rate": 9.95397008055236e-07, + "logits/chosen": 3.5869314670562744, + "logits/rejected": 3.5869314670562744, + "logps/chosen": -179.5561065673828, + "logps/rejected": -179.5561065673828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.161588668823242, + "rewards/margins": -5.960464477539063e-08, + "rewards/rejected": -13.161588668823242, + "step": 4001 + }, + { + "epoch": 2.761773331033293, + "grad_norm": 0.36706265807151794, + "learning_rate": 9.925201380897584e-07, + "logits/chosen": 3.5857295989990234, + "logits/rejected": 3.5857295989990234, + "logps/chosen": -190.11679077148438, + "logps/rejected": -190.11679077148438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.148456573486328, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.148456573486328, + "step": 4002 + }, + { + "epoch": 2.7624633431085046, + "grad_norm": 0.8040185570716858, + "learning_rate": 9.896432681242808e-07, + "logits/chosen": 3.26823091506958, + "logits/rejected": 3.3605268001556396, + "logps/chosen": -172.31326293945312, + "logps/rejected": -187.96029663085938, + "loss": 0.5227, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.629608154296875, + "rewards/margins": 1.5868468284606934, + "rewards/rejected": -14.216455459594727, + "step": 4003 + }, + { + "epoch": 2.7631533551837157, + "grad_norm": 0.3005835711956024, + "learning_rate": 9.867663981588033e-07, + "logits/chosen": 2.9528090953826904, + "logits/rejected": 3.4453318119049072, + "logps/chosen": -139.54554748535156, + "logps/rejected": -177.0312957763672, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.225227355957031, + "rewards/margins": 3.7619271278381348, + "rewards/rejected": -12.987154006958008, + "step": 4004 + }, + { + "epoch": 2.763843367258927, + "grad_norm": 0.37652111053466797, + "learning_rate": 9.838895281933257e-07, + "logits/chosen": 3.3434643745422363, + "logits/rejected": 3.485252857208252, + "logps/chosen": -145.60206604003906, + "logps/rejected": -167.93722534179688, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.733168601989746, + "rewards/margins": 2.2366538047790527, + "rewards/rejected": -11.96982192993164, + "step": 4005 + }, + { + "epoch": 2.7645333793341385, + "grad_norm": 0.39530709385871887, + "learning_rate": 9.810126582278482e-07, + "logits/chosen": 3.7076282501220703, + "logits/rejected": 3.7076282501220703, + "logps/chosen": -163.61892700195312, + "logps/rejected": -163.61892700195312, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.586143493652344, + "rewards/margins": 2.9802322387695312e-08, + "rewards/rejected": -11.586143493652344, + "step": 4006 + }, + { + "epoch": 2.7652233914093496, + "grad_norm": 0.4714692533016205, + "learning_rate": 9.781357882623706e-07, + "logits/chosen": 3.385955333709717, + "logits/rejected": 3.75061297416687, + "logps/chosen": -162.61033630371094, + "logps/rejected": -183.88003540039062, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.675371170043945, + "rewards/margins": 2.110521078109741, + "rewards/rejected": -13.785892486572266, + "step": 4007 + }, + { + "epoch": 2.7659134034845607, + "grad_norm": 1.3128732442855835, + "learning_rate": 9.75258918296893e-07, + "logits/chosen": 3.426690101623535, + "logits/rejected": 3.5598926544189453, + "logps/chosen": -166.74679565429688, + "logps/rejected": -179.49673461914062, + "loss": 0.5444, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.796799659729004, + "rewards/margins": 1.2544194459915161, + "rewards/rejected": -13.05121898651123, + "step": 4008 + }, + { + "epoch": 2.7666034155597723, + "grad_norm": 0.4288850426673889, + "learning_rate": 9.723820483314155e-07, + "logits/chosen": 3.5619425773620605, + "logits/rejected": 3.6878716945648193, + "logps/chosen": -172.85951232910156, + "logps/rejected": -189.18875122070312, + "loss": 0.5204, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.471338272094727, + "rewards/margins": 1.6758227348327637, + "rewards/rejected": -14.147161483764648, + "step": 4009 + }, + { + "epoch": 2.767293427634984, + "grad_norm": 0.3195129632949829, + "learning_rate": 9.695051783659379e-07, + "logits/chosen": 2.9375271797180176, + "logits/rejected": 3.037763833999634, + "logps/chosen": -141.40347290039062, + "logps/rejected": -160.2593231201172, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.402840614318848, + "rewards/margins": 1.8989129066467285, + "rewards/rejected": -11.301753997802734, + "step": 4010 + }, + { + "epoch": 2.767983439710195, + "grad_norm": 0.6503312587738037, + "learning_rate": 9.666283084004604e-07, + "logits/chosen": 3.006513833999634, + "logits/rejected": 3.189926862716675, + "logps/chosen": -158.0213623046875, + "logps/rejected": -179.41183471679688, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.128725051879883, + "rewards/margins": 2.1227846145629883, + "rewards/rejected": -13.251510620117188, + "step": 4011 + }, + { + "epoch": 2.768673451785406, + "grad_norm": 6.899331092834473, + "learning_rate": 9.637514384349828e-07, + "logits/chosen": 3.993837594985962, + "logits/rejected": 3.985016345977783, + "logps/chosen": -175.40826416015625, + "logps/rejected": -176.72006225585938, + "loss": 0.6399, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.727751731872559, + "rewards/margins": 0.1480352282524109, + "rewards/rejected": -12.875786781311035, + "step": 4012 + }, + { + "epoch": 2.7693634638606177, + "grad_norm": 0.34903398156166077, + "learning_rate": 9.608745684695051e-07, + "logits/chosen": 3.356862783432007, + "logits/rejected": 3.5633084774017334, + "logps/chosen": -148.56039428710938, + "logps/rejected": -165.36965942382812, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.253340721130371, + "rewards/margins": 1.6895713806152344, + "rewards/rejected": -11.942912101745605, + "step": 4013 + }, + { + "epoch": 2.770053475935829, + "grad_norm": 0.3889206051826477, + "learning_rate": 9.579976985040277e-07, + "logits/chosen": 3.2448079586029053, + "logits/rejected": 3.2448079586029053, + "logps/chosen": -150.7832794189453, + "logps/rejected": -150.78326416015625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.339653015136719, + "rewards/margins": -4.470348358154297e-07, + "rewards/rejected": -10.339653015136719, + "step": 4014 + }, + { + "epoch": 2.77074348801104, + "grad_norm": 0.3235945701599121, + "learning_rate": 9.551208285385502e-07, + "logits/chosen": 3.196138381958008, + "logits/rejected": 3.467325210571289, + "logps/chosen": -119.51868438720703, + "logps/rejected": -141.5361328125, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -7.23665714263916, + "rewards/margins": 2.191965103149414, + "rewards/rejected": -9.428622245788574, + "step": 4015 + }, + { + "epoch": 2.7714335000862516, + "grad_norm": 0.42028021812438965, + "learning_rate": 9.522439585730726e-07, + "logits/chosen": 3.3243250846862793, + "logits/rejected": 3.5725626945495605, + "logps/chosen": -152.21176147460938, + "logps/rejected": -173.6865997314453, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.258796691894531, + "rewards/margins": 2.1659445762634277, + "rewards/rejected": -12.4247407913208, + "step": 4016 + }, + { + "epoch": 2.7721235121614627, + "grad_norm": 0.31647929549217224, + "learning_rate": 9.493670886075951e-07, + "logits/chosen": 3.190779685974121, + "logits/rejected": 3.4684572219848633, + "logps/chosen": -140.7572784423828, + "logps/rejected": -168.84083557128906, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.20537281036377, + "rewards/margins": 2.818566083908081, + "rewards/rejected": -12.023938179016113, + "step": 4017 + }, + { + "epoch": 2.7728135242366743, + "grad_norm": 0.4208238124847412, + "learning_rate": 9.464902186421175e-07, + "logits/chosen": 3.679366111755371, + "logits/rejected": 3.900179862976074, + "logps/chosen": -162.8445587158203, + "logps/rejected": -168.57733154296875, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.388724327087402, + "rewards/margins": 0.5958457589149475, + "rewards/rejected": -11.984569549560547, + "step": 4018 + }, + { + "epoch": 2.7735035363118854, + "grad_norm": 0.33347398042678833, + "learning_rate": 9.4361334867664e-07, + "logits/chosen": 3.411975860595703, + "logits/rejected": 3.686764717102051, + "logps/chosen": -166.1841583251953, + "logps/rejected": -186.4752197265625, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.852054595947266, + "rewards/margins": 1.9459024667739868, + "rewards/rejected": -13.797957420349121, + "step": 4019 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.33262357115745544, + "learning_rate": 9.407364787111624e-07, + "logits/chosen": 4.000857353210449, + "logits/rejected": 4.1559247970581055, + "logps/chosen": -180.34396362304688, + "logps/rejected": -186.65298461914062, + "loss": 0.6075, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.259613990783691, + "rewards/margins": 0.6064839363098145, + "rewards/rejected": -13.866097450256348, + "step": 4020 + }, + { + "epoch": 2.774883560462308, + "grad_norm": 0.3607003092765808, + "learning_rate": 9.378596087456848e-07, + "logits/chosen": 3.4130492210388184, + "logits/rejected": 3.447575569152832, + "logps/chosen": -163.48294067382812, + "logps/rejected": -180.60638427734375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.697225570678711, + "rewards/margins": 1.6372493505477905, + "rewards/rejected": -13.334476470947266, + "step": 4021 + }, + { + "epoch": 2.7755735725375192, + "grad_norm": 5.458478927612305, + "learning_rate": 9.349827387802072e-07, + "logits/chosen": 3.4287428855895996, + "logits/rejected": 3.4431097507476807, + "logps/chosen": -160.1990966796875, + "logps/rejected": -162.18963623046875, + "loss": 0.6245, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.491785049438477, + "rewards/margins": 0.23307573795318604, + "rewards/rejected": -11.724861145019531, + "step": 4022 + }, + { + "epoch": 2.776263584612731, + "grad_norm": 0.3362561762332916, + "learning_rate": 9.321058688147297e-07, + "logits/chosen": 3.388026475906372, + "logits/rejected": 3.388026475906372, + "logps/chosen": -170.8595428466797, + "logps/rejected": -170.8595428466797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.32206916809082, + "rewards/margins": 0.0, + "rewards/rejected": -12.32206916809082, + "step": 4023 + }, + { + "epoch": 2.776953596687942, + "grad_norm": 2.626312494277954, + "learning_rate": 9.292289988492522e-07, + "logits/chosen": 3.2926318645477295, + "logits/rejected": 3.5085620880126953, + "logps/chosen": -152.8571319580078, + "logps/rejected": -170.30030822753906, + "loss": 0.5054, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.613685607910156, + "rewards/margins": 1.794797420501709, + "rewards/rejected": -12.40848159790039, + "step": 4024 + }, + { + "epoch": 2.777643608763153, + "grad_norm": 0.351632297039032, + "learning_rate": 9.263521288837746e-07, + "logits/chosen": 3.521602153778076, + "logits/rejected": 3.521602153778076, + "logps/chosen": -193.8405303955078, + "logps/rejected": -193.8405303955078, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.675817489624023, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.67581844329834, + "step": 4025 + }, + { + "epoch": 2.7783336208383647, + "grad_norm": 0.41003862023353577, + "learning_rate": 9.23475258918297e-07, + "logits/chosen": 3.4424057006835938, + "logits/rejected": 3.4424057006835938, + "logps/chosen": -181.94973754882812, + "logps/rejected": -181.94973754882812, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.619826316833496, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -13.619827270507812, + "step": 4026 + }, + { + "epoch": 2.7790236329135762, + "grad_norm": 0.36621665954589844, + "learning_rate": 9.205983889528194e-07, + "logits/chosen": 3.778444528579712, + "logits/rejected": 3.778444528579712, + "logps/chosen": -189.43185424804688, + "logps/rejected": -189.43186950683594, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.186004638671875, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.186004638671875, + "step": 4027 + }, + { + "epoch": 2.7797136449887874, + "grad_norm": 4.9982476234436035, + "learning_rate": 9.177215189873419e-07, + "logits/chosen": 3.394087314605713, + "logits/rejected": 3.516921043395996, + "logps/chosen": -156.43844604492188, + "logps/rejected": -176.25765991210938, + "loss": 0.4832, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.926522254943848, + "rewards/margins": 2.0881991386413574, + "rewards/rejected": -13.014721870422363, + "step": 4028 + }, + { + "epoch": 2.7804036570639985, + "grad_norm": 0.29607805609703064, + "learning_rate": 9.148446490218643e-07, + "logits/chosen": 3.7954938411712646, + "logits/rejected": 3.7352800369262695, + "logps/chosen": -172.2083740234375, + "logps/rejected": -182.06353759765625, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.471033096313477, + "rewards/margins": 0.9661422371864319, + "rewards/rejected": -13.437175750732422, + "step": 4029 + }, + { + "epoch": 2.78109366913921, + "grad_norm": 0.3710397481918335, + "learning_rate": 9.119677790563868e-07, + "logits/chosen": 3.0928750038146973, + "logits/rejected": 3.2067527770996094, + "logps/chosen": -135.54649353027344, + "logps/rejected": -151.63755798339844, + "loss": 0.5205, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.83195972442627, + "rewards/margins": 1.611018419265747, + "rewards/rejected": -10.442977905273438, + "step": 4030 + }, + { + "epoch": 2.781783681214421, + "grad_norm": 0.40094268321990967, + "learning_rate": 9.090909090909091e-07, + "logits/chosen": 3.6261072158813477, + "logits/rejected": 3.5994009971618652, + "logps/chosen": -156.55258178710938, + "logps/rejected": -165.52267456054688, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.943087577819824, + "rewards/margins": 0.9281584620475769, + "rewards/rejected": -11.871246337890625, + "step": 4031 + }, + { + "epoch": 2.7824736932896323, + "grad_norm": 0.3710143268108368, + "learning_rate": 9.062140391254316e-07, + "logits/chosen": 3.2343478202819824, + "logits/rejected": 3.3612678050994873, + "logps/chosen": -181.39971923828125, + "logps/rejected": -191.83926391601562, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -13.562066078186035, + "rewards/margins": 1.0771007537841797, + "rewards/rejected": -14.639166831970215, + "step": 4032 + }, + { + "epoch": 2.783163705364844, + "grad_norm": 0.4108707308769226, + "learning_rate": 9.033371691599541e-07, + "logits/chosen": 3.463991641998291, + "logits/rejected": 3.5828776359558105, + "logps/chosen": -173.0588836669922, + "logps/rejected": -182.03794860839844, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.535758018493652, + "rewards/margins": 0.8810112476348877, + "rewards/rejected": -13.416769027709961, + "step": 4033 + }, + { + "epoch": 2.783853717440055, + "grad_norm": 0.38439005613327026, + "learning_rate": 9.004602991944765e-07, + "logits/chosen": 3.717442035675049, + "logits/rejected": 3.816039800643921, + "logps/chosen": -182.71890258789062, + "logps/rejected": -191.42649841308594, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.566570281982422, + "rewards/margins": 0.8491653203964233, + "rewards/rejected": -14.415735244750977, + "step": 4034 + }, + { + "epoch": 2.7845437295152666, + "grad_norm": 0.3945044279098511, + "learning_rate": 8.97583429228999e-07, + "logits/chosen": 3.246687173843384, + "logits/rejected": 3.4664621353149414, + "logps/chosen": -157.60552978515625, + "logps/rejected": -170.7342529296875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.999903678894043, + "rewards/margins": 1.329201579093933, + "rewards/rejected": -12.329105377197266, + "step": 4035 + }, + { + "epoch": 2.7852337415904778, + "grad_norm": 0.3222460150718689, + "learning_rate": 8.947065592635213e-07, + "logits/chosen": 3.473680257797241, + "logits/rejected": 3.7751879692077637, + "logps/chosen": -144.08541870117188, + "logps/rejected": -165.24256896972656, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.73414134979248, + "rewards/margins": 2.164045810699463, + "rewards/rejected": -11.898186683654785, + "step": 4036 + }, + { + "epoch": 2.7859237536656893, + "grad_norm": 0.9380764365196228, + "learning_rate": 8.918296892980438e-07, + "logits/chosen": 3.5386340618133545, + "logits/rejected": 3.4741766452789307, + "logps/chosen": -159.0740966796875, + "logps/rejected": -163.67552185058594, + "loss": 0.6093, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.289737701416016, + "rewards/margins": 0.47347038984298706, + "rewards/rejected": -11.763208389282227, + "step": 4037 + }, + { + "epoch": 2.7866137657409005, + "grad_norm": 0.36824658513069153, + "learning_rate": 8.889528193325662e-07, + "logits/chosen": 3.277982473373413, + "logits/rejected": 3.326388359069824, + "logps/chosen": -161.8535919189453, + "logps/rejected": -171.78733825683594, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.281591415405273, + "rewards/margins": 0.9560654759407043, + "rewards/rejected": -12.23765754699707, + "step": 4038 + }, + { + "epoch": 2.7873037778161116, + "grad_norm": 0.3356563150882721, + "learning_rate": 8.860759493670887e-07, + "logits/chosen": 3.1840507984161377, + "logits/rejected": 3.391986608505249, + "logps/chosen": -153.56727600097656, + "logps/rejected": -175.76773071289062, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.695465087890625, + "rewards/margins": 2.206601858139038, + "rewards/rejected": -12.902067184448242, + "step": 4039 + }, + { + "epoch": 2.787993789891323, + "grad_norm": 7.156869888305664, + "learning_rate": 8.831990794016112e-07, + "logits/chosen": 3.3631885051727295, + "logits/rejected": 3.331326961517334, + "logps/chosen": -168.94546508789062, + "logps/rejected": -170.6446990966797, + "loss": 0.6477, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.103864669799805, + "rewards/margins": 0.11748439073562622, + "rewards/rejected": -12.221348762512207, + "step": 4040 + }, + { + "epoch": 2.7886838019665343, + "grad_norm": 0.44312742352485657, + "learning_rate": 8.803222094361335e-07, + "logits/chosen": 3.4562158584594727, + "logits/rejected": 3.4562158584594727, + "logps/chosen": -180.58792114257812, + "logps/rejected": -180.58792114257812, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.292266845703125, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.292266845703125, + "step": 4041 + }, + { + "epoch": 2.789373814041746, + "grad_norm": 4.474632740020752, + "learning_rate": 8.77445339470656e-07, + "logits/chosen": 3.272526741027832, + "logits/rejected": 3.800355911254883, + "logps/chosen": -157.76239013671875, + "logps/rejected": -176.51690673828125, + "loss": 0.4549, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.855499267578125, + "rewards/margins": 1.9569711685180664, + "rewards/rejected": -12.812469482421875, + "step": 4042 + }, + { + "epoch": 2.790063826116957, + "grad_norm": 0.41578277945518494, + "learning_rate": 8.745684695051784e-07, + "logits/chosen": 3.2840960025787354, + "logits/rejected": 3.395838737487793, + "logps/chosen": -149.11083984375, + "logps/rejected": -169.7556915283203, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.148557662963867, + "rewards/margins": 2.0062363147735596, + "rewards/rejected": -12.154794692993164, + "step": 4043 + }, + { + "epoch": 2.7907538381921686, + "grad_norm": 0.3669945299625397, + "learning_rate": 8.716915995397009e-07, + "logits/chosen": 3.1807148456573486, + "logits/rejected": 3.376084804534912, + "logps/chosen": -165.01158142089844, + "logps/rejected": -180.999755859375, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.635029792785645, + "rewards/margins": 1.648407220840454, + "rewards/rejected": -13.283435821533203, + "step": 4044 + }, + { + "epoch": 2.7914438502673797, + "grad_norm": 1.0767815113067627, + "learning_rate": 8.688147295742233e-07, + "logits/chosen": 3.663454055786133, + "logits/rejected": 3.5870518684387207, + "logps/chosen": -166.91326904296875, + "logps/rejected": -171.09693908691406, + "loss": 0.6102, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.790942192077637, + "rewards/margins": 0.4399777054786682, + "rewards/rejected": -12.230918884277344, + "step": 4045 + }, + { + "epoch": 2.792133862342591, + "grad_norm": 0.4238970875740051, + "learning_rate": 8.659378596087457e-07, + "logits/chosen": 3.7020668983459473, + "logits/rejected": 3.7020668983459473, + "logps/chosen": -179.7720489501953, + "logps/rejected": -179.7720489501953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.042218208312988, + "rewards/margins": 0.0, + "rewards/rejected": -13.042218208312988, + "step": 4046 + }, + { + "epoch": 2.7928238744178024, + "grad_norm": 0.3752707242965698, + "learning_rate": 8.630609896432681e-07, + "logits/chosen": 3.130268096923828, + "logits/rejected": 3.2473483085632324, + "logps/chosen": -157.4873504638672, + "logps/rejected": -181.73077392578125, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.643899917602539, + "rewards/margins": 2.516273021697998, + "rewards/rejected": -13.160172462463379, + "step": 4047 + }, + { + "epoch": 2.7935138864930136, + "grad_norm": 0.3108433783054352, + "learning_rate": 8.601841196777906e-07, + "logits/chosen": 3.337712049484253, + "logits/rejected": 3.32383131980896, + "logps/chosen": -171.35797119140625, + "logps/rejected": -182.49667358398438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.396150588989258, + "rewards/margins": 1.143756628036499, + "rewards/rejected": -13.539907455444336, + "step": 4048 + }, + { + "epoch": 2.7942038985682247, + "grad_norm": 0.47095248103141785, + "learning_rate": 8.573072497123131e-07, + "logits/chosen": 3.1352410316467285, + "logits/rejected": 3.2559092044830322, + "logps/chosen": -149.84136962890625, + "logps/rejected": -155.86160278320312, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.384076118469238, + "rewards/margins": 0.5857457518577576, + "rewards/rejected": -10.96982192993164, + "step": 4049 + }, + { + "epoch": 2.7948939106434363, + "grad_norm": 0.4810543954372406, + "learning_rate": 8.544303797468355e-07, + "logits/chosen": 2.727027654647827, + "logits/rejected": 3.0999674797058105, + "logps/chosen": -146.3103485107422, + "logps/rejected": -179.92059326171875, + "loss": 0.4337, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.787924766540527, + "rewards/margins": 3.3839831352233887, + "rewards/rejected": -13.171907424926758, + "step": 4050 + }, + { + "epoch": 2.7955839227186474, + "grad_norm": 0.46676746010780334, + "learning_rate": 8.515535097813579e-07, + "logits/chosen": 3.484262704849243, + "logits/rejected": 3.484262704849243, + "logps/chosen": -188.31903076171875, + "logps/rejected": -188.31903076171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.119224548339844, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -14.11922550201416, + "step": 4051 + }, + { + "epoch": 2.796273934793859, + "grad_norm": 2.070005178451538, + "learning_rate": 8.486766398158803e-07, + "logits/chosen": 3.63425350189209, + "logits/rejected": 3.6481809616088867, + "logps/chosen": -158.705078125, + "logps/rejected": -161.38088989257812, + "loss": 0.6191, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.056413650512695, + "rewards/margins": 0.2809741497039795, + "rewards/rejected": -11.337388038635254, + "step": 4052 + }, + { + "epoch": 2.79696394686907, + "grad_norm": 0.463051438331604, + "learning_rate": 8.457997698504028e-07, + "logits/chosen": 3.128669261932373, + "logits/rejected": 3.128669261932373, + "logps/chosen": -185.3380126953125, + "logps/rejected": -185.3380126953125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.961241722106934, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.961241722106934, + "step": 4053 + }, + { + "epoch": 2.7976539589442817, + "grad_norm": 0.49963438510894775, + "learning_rate": 8.429228998849252e-07, + "logits/chosen": 3.2238144874572754, + "logits/rejected": 3.2706801891326904, + "logps/chosen": -151.7700653076172, + "logps/rejected": -163.01058959960938, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.377994537353516, + "rewards/margins": 1.1645636558532715, + "rewards/rejected": -11.542558670043945, + "step": 4054 + }, + { + "epoch": 2.798343971019493, + "grad_norm": 0.3170202076435089, + "learning_rate": 8.400460299194477e-07, + "logits/chosen": 3.2392935752868652, + "logits/rejected": 3.530503511428833, + "logps/chosen": -153.66299438476562, + "logps/rejected": -185.90341186523438, + "loss": 0.4333, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.404390335083008, + "rewards/margins": 3.3334474563598633, + "rewards/rejected": -13.737836837768555, + "step": 4055 + }, + { + "epoch": 2.799033983094704, + "grad_norm": 15.991719245910645, + "learning_rate": 8.371691599539701e-07, + "logits/chosen": 3.359683036804199, + "logits/rejected": 3.285337448120117, + "logps/chosen": -172.1590118408203, + "logps/rejected": -186.4127655029297, + "loss": 0.5861, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.47923469543457, + "rewards/margins": 1.4526596069335938, + "rewards/rejected": -13.931894302368164, + "step": 4056 + }, + { + "epoch": 2.7997239951699155, + "grad_norm": 2.7282421588897705, + "learning_rate": 8.342922899884925e-07, + "logits/chosen": 3.115292549133301, + "logits/rejected": 3.071542263031006, + "logps/chosen": -147.36648559570312, + "logps/rejected": -163.76197814941406, + "loss": 0.5347, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.162687301635742, + "rewards/margins": 1.551102876663208, + "rewards/rejected": -11.713790893554688, + "step": 4057 + }, + { + "epoch": 2.8004140072451267, + "grad_norm": 0.33720025420188904, + "learning_rate": 8.31415420023015e-07, + "logits/chosen": 3.5839385986328125, + "logits/rejected": 3.5839385986328125, + "logps/chosen": -175.4354705810547, + "logps/rejected": -175.43545532226562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.833502769470215, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -12.833501815795898, + "step": 4058 + }, + { + "epoch": 2.8011040193203383, + "grad_norm": 0.37290218472480774, + "learning_rate": 8.285385500575374e-07, + "logits/chosen": 3.8429312705993652, + "logits/rejected": 3.8429312705993652, + "logps/chosen": -181.2440185546875, + "logps/rejected": -181.2440185546875, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.39286994934082, + "rewards/margins": 4.172325134277344e-07, + "rewards/rejected": -13.392870903015137, + "step": 4059 + }, + { + "epoch": 2.8017940313955494, + "grad_norm": 0.42542946338653564, + "learning_rate": 8.256616800920599e-07, + "logits/chosen": 3.409054756164551, + "logits/rejected": 3.409054756164551, + "logps/chosen": -175.277099609375, + "logps/rejected": -175.277099609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.596233367919922, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.596233367919922, + "step": 4060 + }, + { + "epoch": 2.802484043470761, + "grad_norm": 0.32297420501708984, + "learning_rate": 8.227848101265823e-07, + "logits/chosen": 3.4141783714294434, + "logits/rejected": 3.5715317726135254, + "logps/chosen": -168.5171661376953, + "logps/rejected": -189.30996704101562, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.050220489501953, + "rewards/margins": 2.1426210403442383, + "rewards/rejected": -14.192842483520508, + "step": 4061 + }, + { + "epoch": 2.803174055545972, + "grad_norm": 0.3846355080604553, + "learning_rate": 8.199079401611047e-07, + "logits/chosen": 3.1156816482543945, + "logits/rejected": 3.1542344093322754, + "logps/chosen": -159.25320434570312, + "logps/rejected": -169.86692810058594, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.16553020477295, + "rewards/margins": 1.0639959573745728, + "rewards/rejected": -12.22952651977539, + "step": 4062 + }, + { + "epoch": 2.8038640676211832, + "grad_norm": 0.36812612414360046, + "learning_rate": 8.170310701956272e-07, + "logits/chosen": 3.7241077423095703, + "logits/rejected": 3.7241077423095703, + "logps/chosen": -184.28016662597656, + "logps/rejected": -184.28016662597656, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.658685684204102, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.658685684204102, + "step": 4063 + }, + { + "epoch": 2.804554079696395, + "grad_norm": 0.30229005217552185, + "learning_rate": 8.141542002301496e-07, + "logits/chosen": 3.0545177459716797, + "logits/rejected": 3.333336353302002, + "logps/chosen": -165.76486206054688, + "logps/rejected": -188.00027465820312, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.002459526062012, + "rewards/margins": 2.1580281257629395, + "rewards/rejected": -14.16048812866211, + "step": 4064 + }, + { + "epoch": 2.805244091771606, + "grad_norm": 0.6218369007110596, + "learning_rate": 8.112773302646721e-07, + "logits/chosen": 3.4419102668762207, + "logits/rejected": 3.5763800144195557, + "logps/chosen": -148.04763793945312, + "logps/rejected": -176.65911865234375, + "loss": 0.4358, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.268342971801758, + "rewards/margins": 2.883679151535034, + "rewards/rejected": -13.152022361755371, + "step": 4065 + }, + { + "epoch": 2.805934103846817, + "grad_norm": 0.3568793833255768, + "learning_rate": 8.084004602991946e-07, + "logits/chosen": 3.454162836074829, + "logits/rejected": 3.454162836074829, + "logps/chosen": -163.1058807373047, + "logps/rejected": -163.1058807373047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.483163833618164, + "rewards/margins": 0.0, + "rewards/rejected": -11.483163833618164, + "step": 4066 + }, + { + "epoch": 2.8066241159220287, + "grad_norm": 0.6366183161735535, + "learning_rate": 8.055235903337171e-07, + "logits/chosen": 3.5995802879333496, + "logits/rejected": 3.6199002265930176, + "logps/chosen": -169.37432861328125, + "logps/rejected": -174.30426025390625, + "loss": 0.6083, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.07487678527832, + "rewards/margins": 0.531926155090332, + "rewards/rejected": -12.606802940368652, + "step": 4067 + }, + { + "epoch": 2.8073141279972402, + "grad_norm": 0.33004888892173767, + "learning_rate": 8.026467203682394e-07, + "logits/chosen": 3.2116622924804688, + "logits/rejected": 3.4468934535980225, + "logps/chosen": -148.7843475341797, + "logps/rejected": -171.9381561279297, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.153871536254883, + "rewards/margins": 2.243056535720825, + "rewards/rejected": -12.396926879882812, + "step": 4068 + }, + { + "epoch": 2.8080041400724514, + "grad_norm": 0.3829663395881653, + "learning_rate": 7.997698504027619e-07, + "logits/chosen": 3.724884033203125, + "logits/rejected": 3.724884033203125, + "logps/chosen": -183.85182189941406, + "logps/rejected": -183.85182189941406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.793341636657715, + "rewards/margins": 0.0, + "rewards/rejected": -13.793341636657715, + "step": 4069 + }, + { + "epoch": 2.8086941521476625, + "grad_norm": 13.284163475036621, + "learning_rate": 7.968929804372844e-07, + "logits/chosen": 3.494143009185791, + "logits/rejected": 3.405353546142578, + "logps/chosen": -189.78955078125, + "logps/rejected": -185.98995971679688, + "loss": 0.9891, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.208555221557617, + "rewards/margins": -0.3766212463378906, + "rewards/rejected": -13.831933975219727, + "step": 4070 + }, + { + "epoch": 2.809384164222874, + "grad_norm": 0.2806526720523834, + "learning_rate": 7.940161104718068e-07, + "logits/chosen": 3.0886197090148926, + "logits/rejected": 3.3507204055786133, + "logps/chosen": -166.82778930664062, + "logps/rejected": -188.06642150878906, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.753171920776367, + "rewards/margins": 2.095059394836426, + "rewards/rejected": -13.848230361938477, + "step": 4071 + }, + { + "epoch": 2.810074176298085, + "grad_norm": 0.46388307213783264, + "learning_rate": 7.911392405063293e-07, + "logits/chosen": 3.6348485946655273, + "logits/rejected": 3.6348485946655273, + "logps/chosen": -179.76885986328125, + "logps/rejected": -179.76885986328125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.030123710632324, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.03012466430664, + "step": 4072 + }, + { + "epoch": 2.8107641883732963, + "grad_norm": 0.381840318441391, + "learning_rate": 7.882623705408516e-07, + "logits/chosen": 3.315730094909668, + "logits/rejected": 3.315730094909668, + "logps/chosen": -193.38467407226562, + "logps/rejected": -193.38467407226562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.544906616210938, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.544906616210938, + "step": 4073 + }, + { + "epoch": 2.811454200448508, + "grad_norm": 2.7103195190429688, + "learning_rate": 7.853855005753741e-07, + "logits/chosen": 3.4763035774230957, + "logits/rejected": 3.517000198364258, + "logps/chosen": -175.0425262451172, + "logps/rejected": -177.63221740722656, + "loss": 0.6233, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.684133529663086, + "rewards/margins": 0.24229973554611206, + "rewards/rejected": -12.926433563232422, + "step": 4074 + }, + { + "epoch": 2.812144212523719, + "grad_norm": 0.3756246268749237, + "learning_rate": 7.825086306098965e-07, + "logits/chosen": 3.3150079250335693, + "logits/rejected": 3.348013401031494, + "logps/chosen": -163.07852172851562, + "logps/rejected": -173.85174560546875, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.758609771728516, + "rewards/margins": 1.078693151473999, + "rewards/rejected": -12.837303161621094, + "step": 4075 + }, + { + "epoch": 2.8128342245989306, + "grad_norm": 0.3323361873626709, + "learning_rate": 7.79631760644419e-07, + "logits/chosen": 3.2407095432281494, + "logits/rejected": 3.4040184020996094, + "logps/chosen": -148.29098510742188, + "logps/rejected": -168.4970703125, + "loss": 0.5202, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.958749771118164, + "rewards/margins": 2.1206846237182617, + "rewards/rejected": -12.079434394836426, + "step": 4076 + }, + { + "epoch": 2.8135242366741418, + "grad_norm": 0.35044488310813904, + "learning_rate": 7.767548906789415e-07, + "logits/chosen": 3.6448397636413574, + "logits/rejected": 3.666673183441162, + "logps/chosen": -178.16004943847656, + "logps/rejected": -191.24415588378906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.965097427368164, + "rewards/margins": 1.3288204669952393, + "rewards/rejected": -14.293916702270508, + "step": 4077 + }, + { + "epoch": 2.8142142487493533, + "grad_norm": 0.3779222071170807, + "learning_rate": 7.738780207134638e-07, + "logits/chosen": 3.6304476261138916, + "logits/rejected": 3.6745026111602783, + "logps/chosen": -162.6986083984375, + "logps/rejected": -175.4581298828125, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.564178466796875, + "rewards/margins": 1.317819595336914, + "rewards/rejected": -12.881999015808105, + "step": 4078 + }, + { + "epoch": 2.8149042608245645, + "grad_norm": 0.46041733026504517, + "learning_rate": 7.710011507479863e-07, + "logits/chosen": 3.2951855659484863, + "logits/rejected": 3.2951855659484863, + "logps/chosen": -173.78468322753906, + "logps/rejected": -173.78468322753906, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.713844299316406, + "rewards/margins": 0.0, + "rewards/rejected": -12.713844299316406, + "step": 4079 + }, + { + "epoch": 2.8155942728997756, + "grad_norm": 0.3045227825641632, + "learning_rate": 7.681242807825087e-07, + "logits/chosen": 3.337728500366211, + "logits/rejected": 3.525177001953125, + "logps/chosen": -182.50448608398438, + "logps/rejected": -190.90953063964844, + "loss": 0.6067, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.414852142333984, + "rewards/margins": 0.8169548511505127, + "rewards/rejected": -14.231805801391602, + "step": 4080 + }, + { + "epoch": 2.816284284974987, + "grad_norm": 0.4155093729496002, + "learning_rate": 7.652474108170312e-07, + "logits/chosen": 3.530910015106201, + "logits/rejected": 3.641695976257324, + "logps/chosen": -166.81736755371094, + "logps/rejected": -175.67298889160156, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.881897926330566, + "rewards/margins": 0.9353828430175781, + "rewards/rejected": -12.817280769348145, + "step": 4081 + }, + { + "epoch": 2.8169742970501983, + "grad_norm": 0.3611961603164673, + "learning_rate": 7.623705408515536e-07, + "logits/chosen": 3.73048734664917, + "logits/rejected": 3.73048734664917, + "logps/chosen": -187.91433715820312, + "logps/rejected": -187.91433715820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.967266082763672, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.967266082763672, + "step": 4082 + }, + { + "epoch": 2.8176643091254094, + "grad_norm": 0.5249338150024414, + "learning_rate": 7.59493670886076e-07, + "logits/chosen": 3.720353603363037, + "logits/rejected": 3.720353603363037, + "logps/chosen": -184.39132690429688, + "logps/rejected": -184.39132690429688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.742023468017578, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -13.742023468017578, + "step": 4083 + }, + { + "epoch": 2.818354321200621, + "grad_norm": 0.3363034427165985, + "learning_rate": 7.566168009205984e-07, + "logits/chosen": 3.6660282611846924, + "logits/rejected": 3.700942277908325, + "logps/chosen": -148.32894897460938, + "logps/rejected": -157.82382202148438, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.216617584228516, + "rewards/margins": 0.9587824940681458, + "rewards/rejected": -11.175400733947754, + "step": 4084 + }, + { + "epoch": 2.8190443332758326, + "grad_norm": 0.32967090606689453, + "learning_rate": 7.537399309551209e-07, + "logits/chosen": 4.028450965881348, + "logits/rejected": 4.24321174621582, + "logps/chosen": -177.4451446533203, + "logps/rejected": -187.80181884765625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.979862213134766, + "rewards/margins": 1.0358692407608032, + "rewards/rejected": -14.015731811523438, + "step": 4085 + }, + { + "epoch": 2.8197343453510437, + "grad_norm": 0.4177880585193634, + "learning_rate": 7.508630609896434e-07, + "logits/chosen": 3.5738584995269775, + "logits/rejected": 3.6139492988586426, + "logps/chosen": -170.3194580078125, + "logps/rejected": -183.32838439941406, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.071782112121582, + "rewards/margins": 1.3774502277374268, + "rewards/rejected": -13.44923210144043, + "step": 4086 + }, + { + "epoch": 2.820424357426255, + "grad_norm": 0.36631613969802856, + "learning_rate": 7.479861910241658e-07, + "logits/chosen": 3.4444544315338135, + "logits/rejected": 3.4728150367736816, + "logps/chosen": -175.46078491210938, + "logps/rejected": -186.7831573486328, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.882993698120117, + "rewards/margins": 1.1660782098770142, + "rewards/rejected": -14.049072265625, + "step": 4087 + }, + { + "epoch": 2.8211143695014664, + "grad_norm": 0.37741774320602417, + "learning_rate": 7.451093210586882e-07, + "logits/chosen": 3.550550937652588, + "logits/rejected": 3.550550937652588, + "logps/chosen": -191.5061798095703, + "logps/rejected": -191.50619506835938, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.216008186340332, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -14.216009140014648, + "step": 4088 + }, + { + "epoch": 2.8218043815766776, + "grad_norm": 0.3758692145347595, + "learning_rate": 7.422324510932106e-07, + "logits/chosen": 3.44221568107605, + "logits/rejected": 3.53462553024292, + "logps/chosen": -161.8714141845703, + "logps/rejected": -179.0675048828125, + "loss": 0.5208, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.536115646362305, + "rewards/margins": 1.727311611175537, + "rewards/rejected": -13.263426780700684, + "step": 4089 + }, + { + "epoch": 2.8224943936518887, + "grad_norm": 0.2966727614402771, + "learning_rate": 7.393555811277331e-07, + "logits/chosen": 3.5023248195648193, + "logits/rejected": 3.747011184692383, + "logps/chosen": -159.30343627929688, + "logps/rejected": -185.29306030273438, + "loss": 0.4338, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.989520072937012, + "rewards/margins": 2.586639642715454, + "rewards/rejected": -13.57615852355957, + "step": 4090 + }, + { + "epoch": 2.8231844057271003, + "grad_norm": 0.42508062720298767, + "learning_rate": 7.364787111622555e-07, + "logits/chosen": 3.5822055339813232, + "logits/rejected": 3.606876850128174, + "logps/chosen": -159.9797821044922, + "logps/rejected": -170.2224884033203, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.288286209106445, + "rewards/margins": 1.0514655113220215, + "rewards/rejected": -12.339752197265625, + "step": 4091 + }, + { + "epoch": 2.8238744178023114, + "grad_norm": 0.4995023310184479, + "learning_rate": 7.33601841196778e-07, + "logits/chosen": 3.3460745811462402, + "logits/rejected": 3.462500810623169, + "logps/chosen": -171.85153198242188, + "logps/rejected": -177.46868896484375, + "loss": 0.6077, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.32872486114502, + "rewards/margins": 0.5775813460350037, + "rewards/rejected": -12.906305313110352, + "step": 4092 + }, + { + "epoch": 2.824564429877523, + "grad_norm": 0.34199830889701843, + "learning_rate": 7.307249712313005e-07, + "logits/chosen": 3.264374256134033, + "logits/rejected": 3.316540241241455, + "logps/chosen": -179.2590789794922, + "logps/rejected": -187.12640380859375, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.997591018676758, + "rewards/margins": 0.8009669184684753, + "rewards/rejected": -13.798559188842773, + "step": 4093 + }, + { + "epoch": 2.825254441952734, + "grad_norm": 0.3152529299259186, + "learning_rate": 7.278481012658228e-07, + "logits/chosen": 3.4066176414489746, + "logits/rejected": 3.6991472244262695, + "logps/chosen": -163.12191772460938, + "logps/rejected": -182.47360229492188, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.545165061950684, + "rewards/margins": 1.9248161315917969, + "rewards/rejected": -13.46998119354248, + "step": 4094 + }, + { + "epoch": 2.8259444540279457, + "grad_norm": 0.5292600393295288, + "learning_rate": 7.249712313003453e-07, + "logits/chosen": 3.681786060333252, + "logits/rejected": 3.681786060333252, + "logps/chosen": -178.5773162841797, + "logps/rejected": -178.5773162841797, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.103630065917969, + "rewards/margins": 0.0, + "rewards/rejected": -13.103630065917969, + "step": 4095 + }, + { + "epoch": 2.826634466103157, + "grad_norm": 0.4105485677719116, + "learning_rate": 7.220943613348677e-07, + "logits/chosen": 3.3957834243774414, + "logits/rejected": 3.3832638263702393, + "logps/chosen": -162.90045166015625, + "logps/rejected": -171.80023193359375, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.66740608215332, + "rewards/margins": 0.6990697979927063, + "rewards/rejected": -12.366475105285645, + "step": 4096 + }, + { + "epoch": 2.827324478178368, + "grad_norm": 0.48213064670562744, + "learning_rate": 7.192174913693902e-07, + "logits/chosen": 3.3044052124023438, + "logits/rejected": 3.3725011348724365, + "logps/chosen": -167.9824676513672, + "logps/rejected": -174.1017608642578, + "loss": 0.6077, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.104616165161133, + "rewards/margins": 0.584279477596283, + "rewards/rejected": -12.688896179199219, + "step": 4097 + }, + { + "epoch": 2.8280144902535795, + "grad_norm": 0.38196587562561035, + "learning_rate": 7.163406214039126e-07, + "logits/chosen": 3.3879897594451904, + "logits/rejected": 3.3879897594451904, + "logps/chosen": -158.79881286621094, + "logps/rejected": -158.79881286621094, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.041125297546387, + "rewards/margins": 0.0, + "rewards/rejected": -11.041125297546387, + "step": 4098 + }, + { + "epoch": 2.8287045023287907, + "grad_norm": 0.3462895452976227, + "learning_rate": 7.13463751438435e-07, + "logits/chosen": 3.4881432056427, + "logits/rejected": 3.4881432056427, + "logps/chosen": -169.56687927246094, + "logps/rejected": -169.56687927246094, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.92999267578125, + "rewards/margins": 0.0, + "rewards/rejected": -11.92999267578125, + "step": 4099 + }, + { + "epoch": 2.829394514404002, + "grad_norm": 0.5410663485527039, + "learning_rate": 7.105868814729574e-07, + "logits/chosen": 3.698787212371826, + "logits/rejected": 3.698787212371826, + "logps/chosen": -180.66632080078125, + "logps/rejected": -180.66632080078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.21768856048584, + "rewards/margins": 0.0, + "rewards/rejected": -13.21768856048584, + "step": 4100 + }, + { + "epoch": 2.8300845264792134, + "grad_norm": 0.3835890591144562, + "learning_rate": 7.077100115074799e-07, + "logits/chosen": 3.597792625427246, + "logits/rejected": 3.597792625427246, + "logps/chosen": -191.36788940429688, + "logps/rejected": -191.36788940429688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.362796783447266, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.362794876098633, + "step": 4101 + }, + { + "epoch": 2.830774538554425, + "grad_norm": 0.517208993434906, + "learning_rate": 7.048331415420024e-07, + "logits/chosen": 3.9696898460388184, + "logits/rejected": 3.9691243171691895, + "logps/chosen": -173.77798461914062, + "logps/rejected": -180.02203369140625, + "loss": 0.6078, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.593038558959961, + "rewards/margins": 0.5749009847640991, + "rewards/rejected": -13.167938232421875, + "step": 4102 + }, + { + "epoch": 2.831464550629636, + "grad_norm": 30.322284698486328, + "learning_rate": 7.019562715765248e-07, + "logits/chosen": 3.320726156234741, + "logits/rejected": 3.369248151779175, + "logps/chosen": -140.3172149658203, + "logps/rejected": -158.79830932617188, + "loss": 0.5809, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.05676555633545, + "rewards/margins": 1.926423192024231, + "rewards/rejected": -10.983189582824707, + "step": 4103 + }, + { + "epoch": 2.8321545627048472, + "grad_norm": 0.43192288279533386, + "learning_rate": 6.990794016110472e-07, + "logits/chosen": 3.5882129669189453, + "logits/rejected": 3.5825157165527344, + "logps/chosen": -165.53746032714844, + "logps/rejected": -178.9387664794922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.693232536315918, + "rewards/margins": 1.3457132577896118, + "rewards/rejected": -13.038946151733398, + "step": 4104 + }, + { + "epoch": 2.832844574780059, + "grad_norm": 0.8613746166229248, + "learning_rate": 6.962025316455696e-07, + "logits/chosen": 3.639357805252075, + "logits/rejected": 3.613961696624756, + "logps/chosen": -198.16522216796875, + "logps/rejected": -201.7259521484375, + "loss": 0.6133, + "rewards/accuracies": 0.125, + "rewards/chosen": -15.196174621582031, + "rewards/margins": 0.3612405061721802, + "rewards/rejected": -15.557415008544922, + "step": 4105 + }, + { + "epoch": 2.83353458685527, + "grad_norm": 0.3595247268676758, + "learning_rate": 6.933256616800921e-07, + "logits/chosen": 3.3405532836914062, + "logits/rejected": 3.3405532836914062, + "logps/chosen": -191.7630615234375, + "logps/rejected": -191.7630615234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.451223373413086, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.451221466064453, + "step": 4106 + }, + { + "epoch": 2.834224598930481, + "grad_norm": 0.4270484447479248, + "learning_rate": 6.904487917146145e-07, + "logits/chosen": 3.742587089538574, + "logits/rejected": 3.742587089538574, + "logps/chosen": -173.85678100585938, + "logps/rejected": -173.85679626464844, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.41700267791748, + "rewards/margins": 4.76837158203125e-07, + "rewards/rejected": -12.417003631591797, + "step": 4107 + }, + { + "epoch": 2.8349146110056926, + "grad_norm": 0.46339139342308044, + "learning_rate": 6.87571921749137e-07, + "logits/chosen": 3.7642292976379395, + "logits/rejected": 3.7642292976379395, + "logps/chosen": -185.15115356445312, + "logps/rejected": -185.15115356445312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.675012588500977, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.675012588500977, + "step": 4108 + }, + { + "epoch": 2.8356046230809038, + "grad_norm": 0.3792385458946228, + "learning_rate": 6.846950517836593e-07, + "logits/chosen": 3.298760414123535, + "logits/rejected": 3.36042857170105, + "logps/chosen": -163.50624084472656, + "logps/rejected": -188.0623321533203, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.66690444946289, + "rewards/margins": 2.485903263092041, + "rewards/rejected": -14.152807235717773, + "step": 4109 + }, + { + "epoch": 2.8362946351561154, + "grad_norm": 0.5159512162208557, + "learning_rate": 6.818181818181818e-07, + "logits/chosen": 3.673879623413086, + "logits/rejected": 3.7242627143859863, + "logps/chosen": -180.406005859375, + "logps/rejected": -186.93502807617188, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.172344207763672, + "rewards/margins": 0.7435277700424194, + "rewards/rejected": -13.915872573852539, + "step": 4110 + }, + { + "epoch": 2.8369846472313265, + "grad_norm": 0.34883344173431396, + "learning_rate": 6.789413118527043e-07, + "logits/chosen": 3.468393564224243, + "logits/rejected": 3.3692452907562256, + "logps/chosen": -169.19134521484375, + "logps/rejected": -176.11383056640625, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.981403350830078, + "rewards/margins": 0.715889036655426, + "rewards/rejected": -12.69729232788086, + "step": 4111 + }, + { + "epoch": 2.837674659306538, + "grad_norm": 30.08440589904785, + "learning_rate": 6.760644418872267e-07, + "logits/chosen": 3.7389111518859863, + "logits/rejected": 3.8348512649536133, + "logps/chosen": -160.36041259765625, + "logps/rejected": -157.1778564453125, + "loss": 0.9585, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.208200454711914, + "rewards/margins": -0.34429633617401123, + "rewards/rejected": -10.86390495300293, + "step": 4112 + }, + { + "epoch": 2.838364671381749, + "grad_norm": 0.39032381772994995, + "learning_rate": 6.731875719217492e-07, + "logits/chosen": 3.988128185272217, + "logits/rejected": 3.988128185272217, + "logps/chosen": -181.90586853027344, + "logps/rejected": -181.90586853027344, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.567830085754395, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.567829132080078, + "step": 4113 + }, + { + "epoch": 2.8390546834569603, + "grad_norm": 0.40562987327575684, + "learning_rate": 6.703107019562715e-07, + "logits/chosen": 3.86474347114563, + "logits/rejected": 3.86474347114563, + "logps/chosen": -188.5671844482422, + "logps/rejected": -188.5671844482422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.005054473876953, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -14.005054473876953, + "step": 4114 + }, + { + "epoch": 2.839744695532172, + "grad_norm": 0.3482613265514374, + "learning_rate": 6.67433831990794e-07, + "logits/chosen": 3.642888307571411, + "logits/rejected": 3.6293277740478516, + "logps/chosen": -163.65919494628906, + "logps/rejected": -175.97862243652344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.789649963378906, + "rewards/margins": 1.2368906736373901, + "rewards/rejected": -13.026540756225586, + "step": 4115 + }, + { + "epoch": 2.840434707607383, + "grad_norm": 0.44906893372535706, + "learning_rate": 6.645569620253164e-07, + "logits/chosen": 3.179677963256836, + "logits/rejected": 3.3944339752197266, + "logps/chosen": -160.48109436035156, + "logps/rejected": -174.04071044921875, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.308615684509277, + "rewards/margins": 1.3684896230697632, + "rewards/rejected": -12.677104949951172, + "step": 4116 + }, + { + "epoch": 2.841124719682594, + "grad_norm": 0.32860323786735535, + "learning_rate": 6.61680092059839e-07, + "logits/chosen": 3.5682260990142822, + "logits/rejected": 3.5682260990142822, + "logps/chosen": -159.70018005371094, + "logps/rejected": -159.70016479492188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.252323150634766, + "rewards/margins": -1.4901161193847656e-07, + "rewards/rejected": -11.252323150634766, + "step": 4117 + }, + { + "epoch": 2.8418147317578057, + "grad_norm": 0.8057641386985779, + "learning_rate": 6.588032220943615e-07, + "logits/chosen": 3.3785786628723145, + "logits/rejected": 3.5079963207244873, + "logps/chosen": -159.80657958984375, + "logps/rejected": -172.489501953125, + "loss": 0.5239, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.991597175598145, + "rewards/margins": 1.3329365253448486, + "rewards/rejected": -12.324533462524414, + "step": 4118 + }, + { + "epoch": 2.8425047438330173, + "grad_norm": 8.894006729125977, + "learning_rate": 6.559263521288839e-07, + "logits/chosen": 3.603850841522217, + "logits/rejected": 3.734381675720215, + "logps/chosen": -158.927490234375, + "logps/rejected": -173.4649200439453, + "loss": 0.6604, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.34010124206543, + "rewards/margins": 1.370417833328247, + "rewards/rejected": -12.710519790649414, + "step": 4119 + }, + { + "epoch": 2.8431947559082285, + "grad_norm": 0.4230699837207794, + "learning_rate": 6.530494821634063e-07, + "logits/chosen": 3.308803081512451, + "logits/rejected": 3.38101863861084, + "logps/chosen": -172.9119873046875, + "logps/rejected": -185.3238983154297, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.702589988708496, + "rewards/margins": 1.2019035816192627, + "rewards/rejected": -13.904494285583496, + "step": 4120 + }, + { + "epoch": 2.8438847679834396, + "grad_norm": 0.4372813105583191, + "learning_rate": 6.501726121979287e-07, + "logits/chosen": 3.321793556213379, + "logits/rejected": 3.321793556213379, + "logps/chosen": -166.31292724609375, + "logps/rejected": -166.3129425048828, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.823773384094238, + "rewards/margins": 1.0728836059570312e-06, + "rewards/rejected": -11.823774337768555, + "step": 4121 + }, + { + "epoch": 2.844574780058651, + "grad_norm": 1.1642507314682007, + "learning_rate": 6.472957422324512e-07, + "logits/chosen": 3.3344216346740723, + "logits/rejected": 3.513963222503662, + "logps/chosen": -162.1271514892578, + "logps/rejected": -174.26434326171875, + "loss": 0.524, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.55626392364502, + "rewards/margins": 1.232029676437378, + "rewards/rejected": -12.788293838500977, + "step": 4122 + }, + { + "epoch": 2.8452647921338623, + "grad_norm": 0.373729407787323, + "learning_rate": 6.444188722669736e-07, + "logits/chosen": 3.85211181640625, + "logits/rejected": 3.85211181640625, + "logps/chosen": -190.54403686523438, + "logps/rejected": -190.54403686523438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.50624942779541, + "rewards/margins": 0.0, + "rewards/rejected": -14.50624942779541, + "step": 4123 + }, + { + "epoch": 2.8459548042090734, + "grad_norm": 0.3104390501976013, + "learning_rate": 6.415420023014961e-07, + "logits/chosen": 3.3663501739501953, + "logits/rejected": 3.4006094932556152, + "logps/chosen": -171.69923400878906, + "logps/rejected": -186.181884765625, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.346384048461914, + "rewards/margins": 1.4713189601898193, + "rewards/rejected": -13.81770133972168, + "step": 4124 + }, + { + "epoch": 2.846644816284285, + "grad_norm": 0.43851879239082336, + "learning_rate": 6.386651323360186e-07, + "logits/chosen": 3.4622654914855957, + "logits/rejected": 3.6176910400390625, + "logps/chosen": -147.79776000976562, + "logps/rejected": -164.32080078125, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -9.957416534423828, + "rewards/margins": 1.5890464782714844, + "rewards/rejected": -11.546463012695312, + "step": 4125 + }, + { + "epoch": 2.847334828359496, + "grad_norm": 0.4636618494987488, + "learning_rate": 6.357882623705409e-07, + "logits/chosen": 3.6212263107299805, + "logits/rejected": 3.5150578022003174, + "logps/chosen": -167.80752563476562, + "logps/rejected": -176.89630126953125, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.191125869750977, + "rewards/margins": 0.6807895302772522, + "rewards/rejected": -12.871915817260742, + "step": 4126 + }, + { + "epoch": 2.8480248404347077, + "grad_norm": 0.41701215505599976, + "learning_rate": 6.329113924050634e-07, + "logits/chosen": 3.625291347503662, + "logits/rejected": 3.57366943359375, + "logps/chosen": -164.610595703125, + "logps/rejected": -175.14353942871094, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.545821189880371, + "rewards/margins": 1.0863326787948608, + "rewards/rejected": -12.632153511047363, + "step": 4127 + }, + { + "epoch": 2.848714852509919, + "grad_norm": 0.3879746198654175, + "learning_rate": 6.300345224395858e-07, + "logits/chosen": 3.4985311031341553, + "logits/rejected": 3.4985311031341553, + "logps/chosen": -188.03347778320312, + "logps/rejected": -188.03346252441406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.036497116088867, + "rewards/margins": -7.152557373046875e-07, + "rewards/rejected": -14.036495208740234, + "step": 4128 + }, + { + "epoch": 2.8494048645851304, + "grad_norm": 0.3100801706314087, + "learning_rate": 6.271576524741083e-07, + "logits/chosen": 3.4371843338012695, + "logits/rejected": 3.5140323638916016, + "logps/chosen": -164.29415893554688, + "logps/rejected": -175.6047821044922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.581182479858398, + "rewards/margins": 1.1633331775665283, + "rewards/rejected": -12.744515419006348, + "step": 4129 + }, + { + "epoch": 2.8500948766603416, + "grad_norm": 0.9341532588005066, + "learning_rate": 6.242807825086307e-07, + "logits/chosen": 3.39357328414917, + "logits/rejected": 3.5024900436401367, + "logps/chosen": -151.6978759765625, + "logps/rejected": -192.79132080078125, + "loss": 0.3512, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.428716659545898, + "rewards/margins": 4.109824180603027, + "rewards/rejected": -14.538540840148926, + "step": 4130 + }, + { + "epoch": 2.8507848887355527, + "grad_norm": 0.33149269223213196, + "learning_rate": 6.214039125431531e-07, + "logits/chosen": 3.7808761596679688, + "logits/rejected": 3.817850112915039, + "logps/chosen": -180.844482421875, + "logps/rejected": -187.9510498046875, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.32390308380127, + "rewards/margins": 0.7361660003662109, + "rewards/rejected": -14.06006908416748, + "step": 4131 + }, + { + "epoch": 2.8514749008107643, + "grad_norm": 0.3088870048522949, + "learning_rate": 6.185270425776756e-07, + "logits/chosen": 3.3882317543029785, + "logits/rejected": 3.596094846725464, + "logps/chosen": -168.7634735107422, + "logps/rejected": -188.86825561523438, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.103218078613281, + "rewards/margins": 2.0397567749023438, + "rewards/rejected": -14.142974853515625, + "step": 4132 + }, + { + "epoch": 2.8521649128859754, + "grad_norm": 0.3913312554359436, + "learning_rate": 6.15650172612198e-07, + "logits/chosen": 3.583184003829956, + "logits/rejected": 3.743133544921875, + "logps/chosen": -172.2255859375, + "logps/rejected": -180.14474487304688, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.446712493896484, + "rewards/margins": 0.7533106803894043, + "rewards/rejected": -13.200023651123047, + "step": 4133 + }, + { + "epoch": 2.852854924961187, + "grad_norm": 0.4394676983356476, + "learning_rate": 6.127733026467205e-07, + "logits/chosen": 3.8870859146118164, + "logits/rejected": 3.8870859146118164, + "logps/chosen": -183.47479248046875, + "logps/rejected": -183.47479248046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.332914352416992, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.332914352416992, + "step": 4134 + }, + { + "epoch": 2.853544937036398, + "grad_norm": 0.3867483139038086, + "learning_rate": 6.098964326812429e-07, + "logits/chosen": 3.504930019378662, + "logits/rejected": 3.7259159088134766, + "logps/chosen": -175.80084228515625, + "logps/rejected": -190.96669006347656, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.983506202697754, + "rewards/margins": 1.4354177713394165, + "rewards/rejected": -14.418923377990723, + "step": 4135 + }, + { + "epoch": 2.8542349491116097, + "grad_norm": 0.3839922249317169, + "learning_rate": 6.070195627157653e-07, + "logits/chosen": 3.4903693199157715, + "logits/rejected": 3.6993062496185303, + "logps/chosen": -152.0557403564453, + "logps/rejected": -169.56875610351562, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.502036094665527, + "rewards/margins": 1.8271799087524414, + "rewards/rejected": -12.329216003417969, + "step": 4136 + }, + { + "epoch": 2.854924961186821, + "grad_norm": 0.35602304339408875, + "learning_rate": 6.041426927502877e-07, + "logits/chosen": 3.603303909301758, + "logits/rejected": 3.6028642654418945, + "logps/chosen": -172.93101501464844, + "logps/rejected": -181.33255004882812, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.652986526489258, + "rewards/margins": 0.8230453729629517, + "rewards/rejected": -13.476032257080078, + "step": 4137 + }, + { + "epoch": 2.855614973262032, + "grad_norm": 0.3064110279083252, + "learning_rate": 6.012658227848102e-07, + "logits/chosen": 3.7285315990448, + "logits/rejected": 3.753344774246216, + "logps/chosen": -181.24334716796875, + "logps/rejected": -187.98434448242188, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.355525016784668, + "rewards/margins": 0.6927905678749084, + "rewards/rejected": -14.048315048217773, + "step": 4138 + }, + { + "epoch": 2.8563049853372435, + "grad_norm": 0.312886506319046, + "learning_rate": 5.983889528193327e-07, + "logits/chosen": 3.719229221343994, + "logits/rejected": 3.7444262504577637, + "logps/chosen": -187.44931030273438, + "logps/rejected": -194.79367065429688, + "loss": 0.6068, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.881345748901367, + "rewards/margins": 0.7419100999832153, + "rewards/rejected": -14.62325668334961, + "step": 4139 + }, + { + "epoch": 2.8569949974124547, + "grad_norm": 0.42453140020370483, + "learning_rate": 5.955120828538551e-07, + "logits/chosen": 3.816990375518799, + "logits/rejected": 3.816990375518799, + "logps/chosen": -181.23019409179688, + "logps/rejected": -181.23019409179688, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.375077247619629, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.375076293945312, + "step": 4140 + }, + { + "epoch": 2.857685009487666, + "grad_norm": 0.328934907913208, + "learning_rate": 5.926352128883775e-07, + "logits/chosen": 3.6687536239624023, + "logits/rejected": 3.6687536239624023, + "logps/chosen": -184.86309814453125, + "logps/rejected": -184.86309814453125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.515981674194336, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.515981674194336, + "step": 4141 + }, + { + "epoch": 2.8583750215628774, + "grad_norm": 0.43352845311164856, + "learning_rate": 5.897583429228999e-07, + "logits/chosen": 3.149362564086914, + "logits/rejected": 3.2167246341705322, + "logps/chosen": -155.06961059570312, + "logps/rejected": -173.80392456054688, + "loss": 0.5223, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.546731948852539, + "rewards/margins": 1.9440371990203857, + "rewards/rejected": -12.490768432617188, + "step": 4142 + }, + { + "epoch": 2.8590650336380885, + "grad_norm": 0.367141991853714, + "learning_rate": 5.868814729574224e-07, + "logits/chosen": 3.845545768737793, + "logits/rejected": 3.8780734539031982, + "logps/chosen": -174.39402770996094, + "logps/rejected": -188.0455322265625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.835281372070312, + "rewards/margins": 1.3320608139038086, + "rewards/rejected": -14.167343139648438, + "step": 4143 + }, + { + "epoch": 2.8597550457133, + "grad_norm": 0.3651314377784729, + "learning_rate": 5.840046029919448e-07, + "logits/chosen": 3.3307242393493652, + "logits/rejected": 3.7008137702941895, + "logps/chosen": -169.9036407470703, + "logps/rejected": -198.4808349609375, + "loss": 0.4338, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.098522186279297, + "rewards/margins": 2.890852928161621, + "rewards/rejected": -14.989376068115234, + "step": 4144 + }, + { + "epoch": 2.860445057788511, + "grad_norm": 0.41234928369522095, + "learning_rate": 5.811277330264673e-07, + "logits/chosen": 3.4081809520721436, + "logits/rejected": 3.478276014328003, + "logps/chosen": -180.43539428710938, + "logps/rejected": -187.52191162109375, + "loss": 0.607, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.227554321289062, + "rewards/margins": 0.6889240145683289, + "rewards/rejected": -13.916478157043457, + "step": 4145 + }, + { + "epoch": 2.861135069863723, + "grad_norm": 0.42111557722091675, + "learning_rate": 5.782508630609896e-07, + "logits/chosen": 3.3651516437530518, + "logits/rejected": 3.3651516437530518, + "logps/chosen": -191.84078979492188, + "logps/rejected": -191.84078979492188, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.427584648132324, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.427584648132324, + "step": 4146 + }, + { + "epoch": 2.861825081938934, + "grad_norm": 0.49981486797332764, + "learning_rate": 5.753739930955121e-07, + "logits/chosen": 3.5380537509918213, + "logits/rejected": 3.5380537509918213, + "logps/chosen": -175.51821899414062, + "logps/rejected": -175.51821899414062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.720534324645996, + "rewards/margins": 0.0, + "rewards/rejected": -12.720534324645996, + "step": 4147 + }, + { + "epoch": 2.862515094014145, + "grad_norm": 0.4597293436527252, + "learning_rate": 5.724971231300346e-07, + "logits/chosen": 3.592329978942871, + "logits/rejected": 3.592329978942871, + "logps/chosen": -184.88258361816406, + "logps/rejected": -184.88258361816406, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.664291381835938, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.664291381835938, + "step": 4148 + }, + { + "epoch": 2.8632051060893566, + "grad_norm": 0.4910745620727539, + "learning_rate": 5.69620253164557e-07, + "logits/chosen": 3.608081102371216, + "logits/rejected": 3.608081102371216, + "logps/chosen": -175.565185546875, + "logps/rejected": -175.565185546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.835203170776367, + "rewards/margins": 0.0, + "rewards/rejected": -12.835203170776367, + "step": 4149 + }, + { + "epoch": 2.8638951181645678, + "grad_norm": 0.4233107566833496, + "learning_rate": 5.667433831990795e-07, + "logits/chosen": 3.6974804401397705, + "logits/rejected": 3.6974804401397705, + "logps/chosen": -179.69766235351562, + "logps/rejected": -179.69766235351562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.302963256835938, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.302961349487305, + "step": 4150 + }, + { + "epoch": 2.8645851302397793, + "grad_norm": 0.521625280380249, + "learning_rate": 5.638665132336018e-07, + "logits/chosen": 2.998831272125244, + "logits/rejected": 2.998831272125244, + "logps/chosen": -182.42584228515625, + "logps/rejected": -182.42584228515625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.636876106262207, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.636876106262207, + "step": 4151 + }, + { + "epoch": 2.8652751423149905, + "grad_norm": 2.747161865234375, + "learning_rate": 5.609896432681243e-07, + "logits/chosen": 3.4059200286865234, + "logits/rejected": 3.577944040298462, + "logps/chosen": -147.17970275878906, + "logps/rejected": -174.68280029296875, + "loss": 0.4423, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.897915840148926, + "rewards/margins": 2.855865001678467, + "rewards/rejected": -12.753780364990234, + "step": 4152 + }, + { + "epoch": 2.865965154390202, + "grad_norm": 0.3134511709213257, + "learning_rate": 5.581127733026467e-07, + "logits/chosen": 3.4033377170562744, + "logits/rejected": 3.517324209213257, + "logps/chosen": -173.8056182861328, + "logps/rejected": -194.50880432128906, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.518089294433594, + "rewards/margins": 2.101283073425293, + "rewards/rejected": -14.619373321533203, + "step": 4153 + }, + { + "epoch": 2.866655166465413, + "grad_norm": 1.198642611503601, + "learning_rate": 5.552359033371692e-07, + "logits/chosen": 3.133727550506592, + "logits/rejected": 3.2467198371887207, + "logps/chosen": -168.38607788085938, + "logps/rejected": -176.84515380859375, + "loss": 0.5294, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.947371482849121, + "rewards/margins": 0.8739485740661621, + "rewards/rejected": -12.821320533752441, + "step": 4154 + }, + { + "epoch": 2.8673451785406243, + "grad_norm": 0.49480152130126953, + "learning_rate": 5.523590333716917e-07, + "logits/chosen": 3.676018238067627, + "logits/rejected": 3.747281551361084, + "logps/chosen": -177.422119140625, + "logps/rejected": -182.0740966796875, + "loss": 0.6097, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.849624633789062, + "rewards/margins": 0.45844602584838867, + "rewards/rejected": -13.30807113647461, + "step": 4155 + }, + { + "epoch": 2.868035190615836, + "grad_norm": 0.3405141532421112, + "learning_rate": 5.494821634062141e-07, + "logits/chosen": 3.8552846908569336, + "logits/rejected": 3.8552846908569336, + "logps/chosen": -163.1634979248047, + "logps/rejected": -163.1634979248047, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.571463584899902, + "rewards/margins": -2.086162567138672e-07, + "rewards/rejected": -11.571463584899902, + "step": 4156 + }, + { + "epoch": 2.868725202691047, + "grad_norm": 0.32602179050445557, + "learning_rate": 5.466052934407366e-07, + "logits/chosen": 3.5007271766662598, + "logits/rejected": 3.7707667350769043, + "logps/chosen": -169.3417205810547, + "logps/rejected": -196.9251708984375, + "loss": 0.4337, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.104320526123047, + "rewards/margins": 2.9084506034851074, + "rewards/rejected": -15.012771606445312, + "step": 4157 + }, + { + "epoch": 2.869415214766258, + "grad_norm": 0.35195526480674744, + "learning_rate": 5.43728423475259e-07, + "logits/chosen": 3.7136952877044678, + "logits/rejected": 3.7136952877044678, + "logps/chosen": -189.27249145507812, + "logps/rejected": -189.27249145507812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.237343788146973, + "rewards/margins": 0.0, + "rewards/rejected": -14.237343788146973, + "step": 4158 + }, + { + "epoch": 2.8701052268414697, + "grad_norm": 1.4968838691711426, + "learning_rate": 5.408515535097814e-07, + "logits/chosen": 3.268378257751465, + "logits/rejected": 3.336979389190674, + "logps/chosen": -155.8896942138672, + "logps/rejected": -159.3898468017578, + "loss": 0.6149, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.927419662475586, + "rewards/margins": 0.3333049416542053, + "rewards/rejected": -11.260724067687988, + "step": 4159 + }, + { + "epoch": 2.8707952389166813, + "grad_norm": 0.3480892479419708, + "learning_rate": 5.379746835443038e-07, + "logits/chosen": 3.682338237762451, + "logits/rejected": 3.682338237762451, + "logps/chosen": -196.24502563476562, + "logps/rejected": -196.24502563476562, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.748229026794434, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.748229026794434, + "step": 4160 + }, + { + "epoch": 2.8714852509918924, + "grad_norm": 0.37381845712661743, + "learning_rate": 5.350978135788263e-07, + "logits/chosen": 3.619340181350708, + "logits/rejected": 3.619340181350708, + "logps/chosen": -179.5501251220703, + "logps/rejected": -179.5501251220703, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.093660354614258, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.093660354614258, + "step": 4161 + }, + { + "epoch": 2.8721752630671036, + "grad_norm": 0.4612249732017517, + "learning_rate": 5.322209436133487e-07, + "logits/chosen": 3.6863691806793213, + "logits/rejected": 3.6863691806793213, + "logps/chosen": -178.81321716308594, + "logps/rejected": -178.81321716308594, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.093342781066895, + "rewards/margins": 0.0, + "rewards/rejected": -13.093342781066895, + "step": 4162 + }, + { + "epoch": 2.872865275142315, + "grad_norm": 0.4230221211910248, + "learning_rate": 5.293440736478712e-07, + "logits/chosen": 3.6651675701141357, + "logits/rejected": 3.7914326190948486, + "logps/chosen": -166.718017578125, + "logps/rejected": -182.5207977294922, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.819421768188477, + "rewards/margins": 1.52895188331604, + "rewards/rejected": -13.348373413085938, + "step": 4163 + }, + { + "epoch": 2.8735552872175263, + "grad_norm": 0.41343602538108826, + "learning_rate": 5.264672036823936e-07, + "logits/chosen": 3.785857915878296, + "logits/rejected": 3.785857915878296, + "logps/chosen": -191.19049072265625, + "logps/rejected": -191.19049072265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.24787712097168, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.24787712097168, + "step": 4164 + }, + { + "epoch": 2.8742452992927374, + "grad_norm": 0.38168442249298096, + "learning_rate": 5.23590333716916e-07, + "logits/chosen": 3.412156581878662, + "logits/rejected": 3.449449300765991, + "logps/chosen": -176.04559326171875, + "logps/rejected": -182.79800415039062, + "loss": 0.607, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.02476692199707, + "rewards/margins": 0.6839653253555298, + "rewards/rejected": -13.708731651306152, + "step": 4165 + }, + { + "epoch": 2.874935311367949, + "grad_norm": 19.954010009765625, + "learning_rate": 5.207134637514385e-07, + "logits/chosen": 3.6849918365478516, + "logits/rejected": 3.7322473526000977, + "logps/chosen": -169.87515258789062, + "logps/rejected": -179.98623657226562, + "loss": 0.656, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.175764083862305, + "rewards/margins": 1.028018593788147, + "rewards/rejected": -13.203782081604004, + "step": 4166 + }, + { + "epoch": 2.87562532344316, + "grad_norm": 0.30515241622924805, + "learning_rate": 5.178365937859609e-07, + "logits/chosen": 3.0748772621154785, + "logits/rejected": 3.3278892040252686, + "logps/chosen": -171.40838623046875, + "logps/rejected": -190.94642639160156, + "loss": 0.52, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.39594841003418, + "rewards/margins": 1.922282099723816, + "rewards/rejected": -14.318231582641602, + "step": 4167 + }, + { + "epoch": 2.8763153355183717, + "grad_norm": 0.46526509523391724, + "learning_rate": 5.149597238204834e-07, + "logits/chosen": 3.3121864795684814, + "logits/rejected": 3.6807427406311035, + "logps/chosen": -153.67430114746094, + "logps/rejected": -183.29832458496094, + "loss": 0.4349, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.450528144836426, + "rewards/margins": 2.9514503479003906, + "rewards/rejected": -13.401978492736816, + "step": 4168 + }, + { + "epoch": 2.877005347593583, + "grad_norm": 19.17107391357422, + "learning_rate": 5.120828538550057e-07, + "logits/chosen": 3.101097583770752, + "logits/rejected": 3.346409559249878, + "logps/chosen": -149.39959716796875, + "logps/rejected": -188.15725708007812, + "loss": 0.4229, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.167179107666016, + "rewards/margins": 3.8860368728637695, + "rewards/rejected": -14.053216934204102, + "step": 4169 + }, + { + "epoch": 2.8776953596687944, + "grad_norm": 0.3937528431415558, + "learning_rate": 5.092059838895282e-07, + "logits/chosen": 3.417038917541504, + "logits/rejected": 3.4322309494018555, + "logps/chosen": -160.55828857421875, + "logps/rejected": -174.83798217773438, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.230927467346191, + "rewards/margins": 1.4419963359832764, + "rewards/rejected": -12.67292308807373, + "step": 4170 + }, + { + "epoch": 2.8783853717440055, + "grad_norm": 0.39462146162986755, + "learning_rate": 5.063291139240507e-07, + "logits/chosen": 3.755345344543457, + "logits/rejected": 3.755345344543457, + "logps/chosen": -198.05874633789062, + "logps/rejected": -198.05874633789062, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.870842933654785, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.870842933654785, + "step": 4171 + }, + { + "epoch": 2.8790753838192167, + "grad_norm": 0.39808863401412964, + "learning_rate": 5.034522439585731e-07, + "logits/chosen": 3.0539498329162598, + "logits/rejected": 3.2157630920410156, + "logps/chosen": -167.64727783203125, + "logps/rejected": -188.47634887695312, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.1071195602417, + "rewards/margins": 2.092599391937256, + "rewards/rejected": -14.199718475341797, + "step": 4172 + }, + { + "epoch": 2.8797653958944283, + "grad_norm": 0.31300514936447144, + "learning_rate": 5.005753739930956e-07, + "logits/chosen": 3.6486587524414062, + "logits/rejected": 3.7850122451782227, + "logps/chosen": -172.3521728515625, + "logps/rejected": -186.69573974609375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.499320030212402, + "rewards/margins": 1.4565232992172241, + "rewards/rejected": -13.955843925476074, + "step": 4173 + }, + { + "epoch": 2.8804554079696394, + "grad_norm": 0.42921507358551025, + "learning_rate": 4.97698504027618e-07, + "logits/chosen": 3.865095376968384, + "logits/rejected": 3.865095376968384, + "logps/chosen": -176.95852661132812, + "logps/rejected": -176.95852661132812, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.853021621704102, + "rewards/margins": -1.7881393432617188e-07, + "rewards/rejected": -12.853021621704102, + "step": 4174 + }, + { + "epoch": 2.8811454200448505, + "grad_norm": 0.3669321835041046, + "learning_rate": 4.948216340621404e-07, + "logits/chosen": 3.3829150199890137, + "logits/rejected": 3.3792459964752197, + "logps/chosen": -171.2514190673828, + "logps/rejected": -182.66122436523438, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.39861011505127, + "rewards/margins": 1.1364585161209106, + "rewards/rejected": -13.53506851196289, + "step": 4175 + }, + { + "epoch": 2.881835432120062, + "grad_norm": 0.35035037994384766, + "learning_rate": 4.919447640966628e-07, + "logits/chosen": 3.546210289001465, + "logits/rejected": 3.61629319190979, + "logps/chosen": -171.14219665527344, + "logps/rejected": -179.36801147460938, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.207777976989746, + "rewards/margins": 0.8517420291900635, + "rewards/rejected": -13.05951976776123, + "step": 4176 + }, + { + "epoch": 2.8825254441952737, + "grad_norm": 0.4514801800251007, + "learning_rate": 4.890678941311853e-07, + "logits/chosen": 3.599885940551758, + "logits/rejected": 3.599885940551758, + "logps/chosen": -192.00949096679688, + "logps/rejected": -192.00949096679688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.468194961547852, + "rewards/margins": 0.0, + "rewards/rejected": -14.468194961547852, + "step": 4177 + }, + { + "epoch": 2.883215456270485, + "grad_norm": 0.29404714703559875, + "learning_rate": 4.861910241657078e-07, + "logits/chosen": 3.591676712036133, + "logits/rejected": 3.5618720054626465, + "logps/chosen": -156.34913635253906, + "logps/rejected": -176.99212646484375, + "loss": 0.5202, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.769991874694824, + "rewards/margins": 2.104387044906616, + "rewards/rejected": -12.874378204345703, + "step": 4178 + }, + { + "epoch": 2.883905468345696, + "grad_norm": 0.3797314763069153, + "learning_rate": 4.833141542002302e-07, + "logits/chosen": 3.634268283843994, + "logits/rejected": 3.7572755813598633, + "logps/chosen": -170.79856872558594, + "logps/rejected": -179.613525390625, + "loss": 0.6067, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.498430252075195, + "rewards/margins": 0.8344161510467529, + "rewards/rejected": -13.332847595214844, + "step": 4179 + }, + { + "epoch": 2.8845954804209075, + "grad_norm": 0.3797697424888611, + "learning_rate": 4.804372842347526e-07, + "logits/chosen": 3.773031711578369, + "logits/rejected": 3.9030041694641113, + "logps/chosen": -173.7334442138672, + "logps/rejected": -180.76168823242188, + "loss": 0.6072, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.32843017578125, + "rewards/margins": 0.6502668857574463, + "rewards/rejected": -12.978696823120117, + "step": 4180 + }, + { + "epoch": 2.8852854924961187, + "grad_norm": 0.3523382842540741, + "learning_rate": 4.775604142692751e-07, + "logits/chosen": 3.3915181159973145, + "logits/rejected": 3.5804343223571777, + "logps/chosen": -149.514892578125, + "logps/rejected": -159.37074279785156, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.006572723388672, + "rewards/margins": 0.931606113910675, + "rewards/rejected": -10.938179016113281, + "step": 4181 + }, + { + "epoch": 2.88597550457133, + "grad_norm": 0.4734683334827423, + "learning_rate": 4.7468354430379753e-07, + "logits/chosen": 4.016009330749512, + "logits/rejected": 4.016009330749512, + "logps/chosen": -174.7001190185547, + "logps/rejected": -174.7001190185547, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.722637176513672, + "rewards/margins": 0.0, + "rewards/rejected": -12.722637176513672, + "step": 4182 + }, + { + "epoch": 2.8866655166465414, + "grad_norm": 0.4463370144367218, + "learning_rate": 4.7180667433832e-07, + "logits/chosen": 3.5081167221069336, + "logits/rejected": 3.494961738586426, + "logps/chosen": -171.1953582763672, + "logps/rejected": -177.99127197265625, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.339639663696289, + "rewards/margins": 0.630515456199646, + "rewards/rejected": -12.970155715942383, + "step": 4183 + }, + { + "epoch": 2.8873555287217525, + "grad_norm": 0.4115760624408722, + "learning_rate": 4.689298043728424e-07, + "logits/chosen": 4.171863079071045, + "logits/rejected": 4.171863079071045, + "logps/chosen": -179.855712890625, + "logps/rejected": -179.855712890625, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.084019660949707, + "rewards/margins": 7.748603820800781e-07, + "rewards/rejected": -13.084020614624023, + "step": 4184 + }, + { + "epoch": 2.888045540796964, + "grad_norm": 0.37216460704803467, + "learning_rate": 4.6605293440736485e-07, + "logits/chosen": 3.451193332672119, + "logits/rejected": 3.451193332672119, + "logps/chosen": -190.105224609375, + "logps/rejected": -190.105224609375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.420848846435547, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -14.420848846435547, + "step": 4185 + }, + { + "epoch": 2.888735552872175, + "grad_norm": 1.792559266090393, + "learning_rate": 4.631760644418873e-07, + "logits/chosen": 3.6109297275543213, + "logits/rejected": 3.7381751537323, + "logps/chosen": -179.6470947265625, + "logps/rejected": -196.33775329589844, + "loss": 0.5437, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.111984252929688, + "rewards/margins": 1.6456109285354614, + "rewards/rejected": -14.75759506225586, + "step": 4186 + }, + { + "epoch": 2.889425564947387, + "grad_norm": 0.38352078199386597, + "learning_rate": 4.602991944764097e-07, + "logits/chosen": 3.607435941696167, + "logits/rejected": 3.607435941696167, + "logps/chosen": -175.97210693359375, + "logps/rejected": -175.97210693359375, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.933006286621094, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -12.933006286621094, + "step": 4187 + }, + { + "epoch": 2.890115577022598, + "grad_norm": 0.31916916370391846, + "learning_rate": 4.5742232451093217e-07, + "logits/chosen": 3.3323864936828613, + "logits/rejected": 3.510544776916504, + "logps/chosen": -172.03598022460938, + "logps/rejected": -187.3303680419922, + "loss": 0.5207, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.502676010131836, + "rewards/margins": 1.5598063468933105, + "rewards/rejected": -14.062480926513672, + "step": 4188 + }, + { + "epoch": 2.890805589097809, + "grad_norm": 0.36854374408721924, + "learning_rate": 4.5454545454545457e-07, + "logits/chosen": 3.4161787033081055, + "logits/rejected": 3.491677761077881, + "logps/chosen": -173.9534912109375, + "logps/rejected": -192.51858520507812, + "loss": 0.52, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.611400604248047, + "rewards/margins": 1.881978988647461, + "rewards/rejected": -14.493380546569824, + "step": 4189 + }, + { + "epoch": 2.8914956011730206, + "grad_norm": 20.3362979888916, + "learning_rate": 4.5166858457997703e-07, + "logits/chosen": 3.613401174545288, + "logits/rejected": 3.7506275177001953, + "logps/chosen": -180.73992919921875, + "logps/rejected": -188.96798706054688, + "loss": 1.2322, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.405143737792969, + "rewards/margins": 0.858659029006958, + "rewards/rejected": -14.263803482055664, + "step": 4190 + }, + { + "epoch": 2.8921856132482318, + "grad_norm": 0.4798313081264496, + "learning_rate": 4.487917146144995e-07, + "logits/chosen": 3.1151022911071777, + "logits/rejected": 3.055417537689209, + "logps/chosen": -151.56155395507812, + "logps/rejected": -172.18997192382812, + "loss": 0.52, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.216596603393555, + "rewards/margins": 2.159104585647583, + "rewards/rejected": -12.375699996948242, + "step": 4191 + }, + { + "epoch": 2.892875625323443, + "grad_norm": 0.4033330976963043, + "learning_rate": 4.459148446490219e-07, + "logits/chosen": 3.4297807216644287, + "logits/rejected": 3.528088331222534, + "logps/chosen": -172.5845489501953, + "logps/rejected": -181.68048095703125, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.319414138793945, + "rewards/margins": 0.9513242244720459, + "rewards/rejected": -13.27073860168457, + "step": 4192 + }, + { + "epoch": 2.8935656373986545, + "grad_norm": 0.3845350742340088, + "learning_rate": 4.4303797468354435e-07, + "logits/chosen": 3.174579620361328, + "logits/rejected": 3.3210387229919434, + "logps/chosen": -130.68408203125, + "logps/rejected": -159.59683227539062, + "loss": 0.4352, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.33251953125, + "rewards/margins": 2.9421491622924805, + "rewards/rejected": -11.274667739868164, + "step": 4193 + }, + { + "epoch": 2.894255649473866, + "grad_norm": 0.4288361370563507, + "learning_rate": 4.4016110471806675e-07, + "logits/chosen": 3.389308452606201, + "logits/rejected": 3.389308452606201, + "logps/chosen": -168.65338134765625, + "logps/rejected": -168.65338134765625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.11220932006836, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -12.11220932006836, + "step": 4194 + }, + { + "epoch": 2.894945661549077, + "grad_norm": 0.4249904751777649, + "learning_rate": 4.372842347525892e-07, + "logits/chosen": 3.522077798843384, + "logits/rejected": 3.522077798843384, + "logps/chosen": -163.57574462890625, + "logps/rejected": -163.57574462890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.708342552185059, + "rewards/margins": 0.0, + "rewards/rejected": -11.708342552185059, + "step": 4195 + }, + { + "epoch": 2.8956356736242883, + "grad_norm": 0.35516926646232605, + "learning_rate": 4.3440736478711167e-07, + "logits/chosen": 3.5233614444732666, + "logits/rejected": 3.7057571411132812, + "logps/chosen": -152.42982482910156, + "logps/rejected": -173.85287475585938, + "loss": 0.5201, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.46561050415039, + "rewards/margins": 2.0540037155151367, + "rewards/rejected": -12.519613265991211, + "step": 4196 + }, + { + "epoch": 2.8963256856995, + "grad_norm": 0.40762102603912354, + "learning_rate": 4.315304948216341e-07, + "logits/chosen": 3.353184461593628, + "logits/rejected": 3.3724570274353027, + "logps/chosen": -146.32174682617188, + "logps/rejected": -161.08316040039062, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.119190216064453, + "rewards/margins": 1.2485978603363037, + "rewards/rejected": -11.36778736114502, + "step": 4197 + }, + { + "epoch": 2.897015697774711, + "grad_norm": 0.40373149514198303, + "learning_rate": 4.2865362485615653e-07, + "logits/chosen": 3.6415135860443115, + "logits/rejected": 3.6415135860443115, + "logps/chosen": -174.63641357421875, + "logps/rejected": -174.63641357421875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.702966690063477, + "rewards/margins": -4.76837158203125e-07, + "rewards/rejected": -12.70296573638916, + "step": 4198 + }, + { + "epoch": 2.897705709849922, + "grad_norm": 0.40443432331085205, + "learning_rate": 4.2577675489067894e-07, + "logits/chosen": 3.3407132625579834, + "logits/rejected": 3.272672414779663, + "logps/chosen": -173.45602416992188, + "logps/rejected": -181.97207641601562, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.624391555786133, + "rewards/margins": 0.8233208656311035, + "rewards/rejected": -13.447712898254395, + "step": 4199 + }, + { + "epoch": 2.8983957219251337, + "grad_norm": 0.3445543348789215, + "learning_rate": 4.228998849252014e-07, + "logits/chosen": 3.4711525440216064, + "logits/rejected": 3.5936572551727295, + "logps/chosen": -167.61956787109375, + "logps/rejected": -184.73562622070312, + "loss": 0.5204, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.988410949707031, + "rewards/margins": 1.6103019714355469, + "rewards/rejected": -13.598712921142578, + "step": 4200 + }, + { + "epoch": 2.899085734000345, + "grad_norm": 0.3326055407524109, + "learning_rate": 4.2002301495972385e-07, + "logits/chosen": 3.714416980743408, + "logits/rejected": 3.714416980743408, + "logps/chosen": -188.06637573242188, + "logps/rejected": -188.06637573242188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.988521575927734, + "rewards/margins": 0.0, + "rewards/rejected": -13.988521575927734, + "step": 4201 + }, + { + "epoch": 2.8997757460755564, + "grad_norm": 0.49800899624824524, + "learning_rate": 4.1714614499424626e-07, + "logits/chosen": 3.595944404602051, + "logits/rejected": 3.595944404602051, + "logps/chosen": -185.88418579101562, + "logps/rejected": -185.88418579101562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.602191925048828, + "rewards/margins": 0.0, + "rewards/rejected": -13.602191925048828, + "step": 4202 + }, + { + "epoch": 2.9004657581507676, + "grad_norm": 0.3453908860683441, + "learning_rate": 4.142692750287687e-07, + "logits/chosen": 3.3270034790039062, + "logits/rejected": 3.3822379112243652, + "logps/chosen": -157.41162109375, + "logps/rejected": -163.70066833496094, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.804264068603516, + "rewards/margins": 0.6583237648010254, + "rewards/rejected": -11.4625883102417, + "step": 4203 + }, + { + "epoch": 2.901155770225979, + "grad_norm": 0.394268661737442, + "learning_rate": 4.1139240506329117e-07, + "logits/chosen": 3.6740565299987793, + "logits/rejected": 3.6740565299987793, + "logps/chosen": -190.02407836914062, + "logps/rejected": -190.02407836914062, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.248676300048828, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -14.248676300048828, + "step": 4204 + }, + { + "epoch": 2.9018457823011903, + "grad_norm": 0.36422234773635864, + "learning_rate": 4.085155350978136e-07, + "logits/chosen": 3.595003604888916, + "logits/rejected": 3.7113330364227295, + "logps/chosen": -167.21299743652344, + "logps/rejected": -184.2791748046875, + "loss": 0.5206, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.055046081542969, + "rewards/margins": 1.539053201675415, + "rewards/rejected": -13.594099044799805, + "step": 4205 + }, + { + "epoch": 2.9025357943764014, + "grad_norm": 0.3875141441822052, + "learning_rate": 4.0563866513233603e-07, + "logits/chosen": 3.7231807708740234, + "logits/rejected": 3.7231807708740234, + "logps/chosen": -167.52520751953125, + "logps/rejected": -167.52520751953125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.270679473876953, + "rewards/margins": 2.980232238769531e-07, + "rewards/rejected": -12.270679473876953, + "step": 4206 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.35641762614250183, + "learning_rate": 4.0276179516685854e-07, + "logits/chosen": 3.412121295928955, + "logits/rejected": 3.412121295928955, + "logps/chosen": -183.37832641601562, + "logps/rejected": -183.37832641601562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.73415756225586, + "rewards/margins": 0.0, + "rewards/rejected": -13.73415756225586, + "step": 4207 + }, + { + "epoch": 2.903915818526824, + "grad_norm": 0.33472132682800293, + "learning_rate": 3.9988492520138095e-07, + "logits/chosen": 3.616989850997925, + "logits/rejected": 3.7246556282043457, + "logps/chosen": -179.67095947265625, + "logps/rejected": -196.73391723632812, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.851943016052246, + "rewards/margins": 1.8083945512771606, + "rewards/rejected": -14.660337448120117, + "step": 4208 + }, + { + "epoch": 2.9046058306020357, + "grad_norm": 0.3623875379562378, + "learning_rate": 3.970080552359034e-07, + "logits/chosen": 3.800856113433838, + "logits/rejected": 3.8133726119995117, + "logps/chosen": -176.39431762695312, + "logps/rejected": -191.176025390625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.57866096496582, + "rewards/margins": 1.5192521810531616, + "rewards/rejected": -14.09791374206543, + "step": 4209 + }, + { + "epoch": 2.905295842677247, + "grad_norm": 0.34926408529281616, + "learning_rate": 3.941311852704258e-07, + "logits/chosen": 3.4155054092407227, + "logits/rejected": 3.455961227416992, + "logps/chosen": -163.90777587890625, + "logps/rejected": -179.27041625976562, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.755809783935547, + "rewards/margins": 1.5044115781784058, + "rewards/rejected": -13.260222434997559, + "step": 4210 + }, + { + "epoch": 2.9059858547524584, + "grad_norm": 0.43545544147491455, + "learning_rate": 3.9125431530494827e-07, + "logits/chosen": 3.2304983139038086, + "logits/rejected": 3.2788455486297607, + "logps/chosen": -166.79196166992188, + "logps/rejected": -180.44406127929688, + "loss": 0.5214, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.753804206848145, + "rewards/margins": 1.4223837852478027, + "rewards/rejected": -13.176187515258789, + "step": 4211 + }, + { + "epoch": 2.9066758668276695, + "grad_norm": 7.9140448570251465, + "learning_rate": 3.883774453394707e-07, + "logits/chosen": 3.1857714653015137, + "logits/rejected": 3.2731237411499023, + "logps/chosen": -164.0279998779297, + "logps/rejected": -174.26788330078125, + "loss": 0.5731, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.827611923217773, + "rewards/margins": 0.9382661581039429, + "rewards/rejected": -12.765878677368164, + "step": 4212 + }, + { + "epoch": 2.9073658789028807, + "grad_norm": 0.4668515920639038, + "learning_rate": 3.8550057537399313e-07, + "logits/chosen": 3.5567970275878906, + "logits/rejected": 3.7916388511657715, + "logps/chosen": -176.22933959960938, + "logps/rejected": -183.28213500976562, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.79633617401123, + "rewards/margins": 0.6787991523742676, + "rewards/rejected": -13.475135803222656, + "step": 4213 + }, + { + "epoch": 2.9080558909780923, + "grad_norm": 0.3360441029071808, + "learning_rate": 3.826237054085156e-07, + "logits/chosen": 3.846571445465088, + "logits/rejected": 3.9711971282958984, + "logps/chosen": -170.0235137939453, + "logps/rejected": -182.13072204589844, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.17968463897705, + "rewards/margins": 1.2214974164962769, + "rewards/rejected": -13.401182174682617, + "step": 4214 + }, + { + "epoch": 2.9087459030533034, + "grad_norm": 0.4025779962539673, + "learning_rate": 3.79746835443038e-07, + "logits/chosen": 3.9557816982269287, + "logits/rejected": 3.9557816982269287, + "logps/chosen": -188.10885620117188, + "logps/rejected": -188.10885620117188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.14729118347168, + "rewards/margins": 5.960464477539062e-07, + "rewards/rejected": -14.147293090820312, + "step": 4215 + }, + { + "epoch": 2.9094359151285145, + "grad_norm": 0.31294792890548706, + "learning_rate": 3.7686996547756045e-07, + "logits/chosen": 3.7497777938842773, + "logits/rejected": 3.7497777938842773, + "logps/chosen": -169.76046752929688, + "logps/rejected": -169.76046752929688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.02560043334961, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.025598526000977, + "step": 4216 + }, + { + "epoch": 2.910125927203726, + "grad_norm": 0.4264557957649231, + "learning_rate": 3.739930955120829e-07, + "logits/chosen": 3.3190178871154785, + "logits/rejected": 3.3190178871154785, + "logps/chosen": -195.66964721679688, + "logps/rejected": -195.6696319580078, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -14.775890350341797, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.775890350341797, + "step": 4217 + }, + { + "epoch": 2.9108159392789372, + "grad_norm": 0.37412068247795105, + "learning_rate": 3.711162255466053e-07, + "logits/chosen": 3.752023220062256, + "logits/rejected": 3.752023220062256, + "logps/chosen": -195.0183563232422, + "logps/rejected": -195.0183563232422, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.77419662475586, + "rewards/margins": 0.0, + "rewards/rejected": -14.77419662475586, + "step": 4218 + }, + { + "epoch": 2.911505951354149, + "grad_norm": 3.007509469985962, + "learning_rate": 3.6823935558112777e-07, + "logits/chosen": 3.5827932357788086, + "logits/rejected": 3.8401646614074707, + "logps/chosen": -162.9820556640625, + "logps/rejected": -178.9545135498047, + "loss": 0.5344, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.79132080078125, + "rewards/margins": 1.5430116653442383, + "rewards/rejected": -13.334333419799805, + "step": 4219 + }, + { + "epoch": 2.91219596342936, + "grad_norm": 0.31640625, + "learning_rate": 3.653624856156502e-07, + "logits/chosen": 3.6267199516296387, + "logits/rejected": 3.729613780975342, + "logps/chosen": -171.56361389160156, + "logps/rejected": -184.10548400878906, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.332893371582031, + "rewards/margins": 1.2410824298858643, + "rewards/rejected": -13.573974609375, + "step": 4220 + }, + { + "epoch": 2.9128859755045715, + "grad_norm": 0.38495972752571106, + "learning_rate": 3.6248561565017263e-07, + "logits/chosen": 3.5080041885375977, + "logits/rejected": 3.5080041885375977, + "logps/chosen": -198.8612060546875, + "logps/rejected": -198.8612060546875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -15.034567832946777, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -15.034567832946777, + "step": 4221 + }, + { + "epoch": 2.9135759875797826, + "grad_norm": 0.374055951833725, + "learning_rate": 3.596087456846951e-07, + "logits/chosen": 3.9009461402893066, + "logits/rejected": 4.094765663146973, + "logps/chosen": -178.70449829101562, + "logps/rejected": -184.95263671875, + "loss": 0.6075, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.182754516601562, + "rewards/margins": 0.6020197868347168, + "rewards/rejected": -13.784774780273438, + "step": 4222 + }, + { + "epoch": 2.9142659996549938, + "grad_norm": 19.044740676879883, + "learning_rate": 3.567318757192175e-07, + "logits/chosen": 3.8621249198913574, + "logits/rejected": 3.8406715393066406, + "logps/chosen": -164.59527587890625, + "logps/rejected": -187.35787963867188, + "loss": 0.8036, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.665732383728027, + "rewards/margins": 2.3010895252227783, + "rewards/rejected": -13.966822624206543, + "step": 4223 + }, + { + "epoch": 2.9149560117302054, + "grad_norm": 0.44026651978492737, + "learning_rate": 3.5385500575373995e-07, + "logits/chosen": 3.735963821411133, + "logits/rejected": 3.816854953765869, + "logps/chosen": -174.37997436523438, + "logps/rejected": -189.06602478027344, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.633148193359375, + "rewards/margins": 1.5091391801834106, + "rewards/rejected": -14.14228630065918, + "step": 4224 + }, + { + "epoch": 2.9156460238054165, + "grad_norm": 0.2947924733161926, + "learning_rate": 3.509781357882624e-07, + "logits/chosen": 3.6484827995300293, + "logits/rejected": 3.735283613204956, + "logps/chosen": -177.41122436523438, + "logps/rejected": -189.24411010742188, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.030311584472656, + "rewards/margins": 1.244083046913147, + "rewards/rejected": -14.274393081665039, + "step": 4225 + }, + { + "epoch": 2.916336035880628, + "grad_norm": 0.3444617986679077, + "learning_rate": 3.481012658227848e-07, + "logits/chosen": 3.471675157546997, + "logits/rejected": 3.698687791824341, + "logps/chosen": -166.73915100097656, + "logps/rejected": -173.82135009765625, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.91757869720459, + "rewards/margins": 0.7386443018913269, + "rewards/rejected": -12.656222343444824, + "step": 4226 + }, + { + "epoch": 2.917026047955839, + "grad_norm": 0.48141053318977356, + "learning_rate": 3.4522439585730727e-07, + "logits/chosen": 3.6866345405578613, + "logits/rejected": 3.6866345405578613, + "logps/chosen": -182.00221252441406, + "logps/rejected": -182.00221252441406, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.436641693115234, + "rewards/margins": 8.344650268554688e-07, + "rewards/rejected": -13.436641693115234, + "step": 4227 + }, + { + "epoch": 2.9177160600310508, + "grad_norm": 0.3126585781574249, + "learning_rate": 3.423475258918297e-07, + "logits/chosen": 3.5478720664978027, + "logits/rejected": 3.7367053031921387, + "logps/chosen": -154.263671875, + "logps/rejected": -165.4939727783203, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.571023941040039, + "rewards/margins": 1.1641747951507568, + "rewards/rejected": -11.735198974609375, + "step": 4228 + }, + { + "epoch": 2.918406072106262, + "grad_norm": 0.3543045222759247, + "learning_rate": 3.3947065592635213e-07, + "logits/chosen": 3.795560359954834, + "logits/rejected": 3.795560359954834, + "logps/chosen": -175.3643341064453, + "logps/rejected": -175.3643341064453, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.590564727783203, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -12.590564727783203, + "step": 4229 + }, + { + "epoch": 2.919096084181473, + "grad_norm": 0.3274024724960327, + "learning_rate": 3.365937859608746e-07, + "logits/chosen": 3.779965400695801, + "logits/rejected": 3.839385509490967, + "logps/chosen": -174.0459442138672, + "logps/rejected": -186.56253051757812, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.496835708618164, + "rewards/margins": 1.2699674367904663, + "rewards/rejected": -13.766802787780762, + "step": 4230 + }, + { + "epoch": 2.9197860962566846, + "grad_norm": 0.4024048149585724, + "learning_rate": 3.33716915995397e-07, + "logits/chosen": 3.3413102626800537, + "logits/rejected": 3.5265183448791504, + "logps/chosen": -172.5736083984375, + "logps/rejected": -182.04440307617188, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.474762916564941, + "rewards/margins": 0.9835658669471741, + "rewards/rejected": -13.458328247070312, + "step": 4231 + }, + { + "epoch": 2.9204761083318957, + "grad_norm": 18.33167839050293, + "learning_rate": 3.308400460299195e-07, + "logits/chosen": 3.165606737136841, + "logits/rejected": 3.3703413009643555, + "logps/chosen": -154.5361328125, + "logps/rejected": -174.5713653564453, + "loss": 0.6211, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.563191413879395, + "rewards/margins": 1.9997905492782593, + "rewards/rejected": -12.562982559204102, + "step": 4232 + }, + { + "epoch": 2.921166120407107, + "grad_norm": 0.39861947298049927, + "learning_rate": 3.2796317606444196e-07, + "logits/chosen": 3.147329568862915, + "logits/rejected": 3.147329568862915, + "logps/chosen": -189.11611938476562, + "logps/rejected": -189.11611938476562, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.170907974243164, + "rewards/margins": 0.0, + "rewards/rejected": -14.170907974243164, + "step": 4233 + }, + { + "epoch": 2.9218561324823185, + "grad_norm": 0.37203529477119446, + "learning_rate": 3.2508630609896437e-07, + "logits/chosen": 3.5685319900512695, + "logits/rejected": 3.6484804153442383, + "logps/chosen": -159.04986572265625, + "logps/rejected": -168.2745819091797, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.241547584533691, + "rewards/margins": 0.9422942996025085, + "rewards/rejected": -12.18384075164795, + "step": 4234 + }, + { + "epoch": 2.9225461445575296, + "grad_norm": 0.395467609167099, + "learning_rate": 3.222094361334868e-07, + "logits/chosen": 3.4718172550201416, + "logits/rejected": 3.553351879119873, + "logps/chosen": -165.078857421875, + "logps/rejected": -173.11721801757812, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.670394897460938, + "rewards/margins": 0.8547317385673523, + "rewards/rejected": -12.525125503540039, + "step": 4235 + }, + { + "epoch": 2.923236156632741, + "grad_norm": 5.224883079528809, + "learning_rate": 3.193325661680093e-07, + "logits/chosen": 3.4566667079925537, + "logits/rejected": 3.4992434978485107, + "logps/chosen": -162.15115356445312, + "logps/rejected": -173.17709350585938, + "loss": 0.6014, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.35035514831543, + "rewards/margins": 1.0486012697219849, + "rewards/rejected": -12.398956298828125, + "step": 4236 + }, + { + "epoch": 2.9239261687079523, + "grad_norm": 0.3244951665401459, + "learning_rate": 3.164556962025317e-07, + "logits/chosen": 3.5803704261779785, + "logits/rejected": 3.6024014949798584, + "logps/chosen": -146.62066650390625, + "logps/rejected": -178.69699096679688, + "loss": 0.4334, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.826065063476562, + "rewards/margins": 3.1777288913726807, + "rewards/rejected": -13.003793716430664, + "step": 4237 + }, + { + "epoch": 2.924616180783164, + "grad_norm": 0.3224639892578125, + "learning_rate": 3.1357882623705414e-07, + "logits/chosen": 3.8484654426574707, + "logits/rejected": 3.9483604431152344, + "logps/chosen": -170.67335510253906, + "logps/rejected": -181.85400390625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.107246398925781, + "rewards/margins": 1.146817922592163, + "rewards/rejected": -13.254064559936523, + "step": 4238 + }, + { + "epoch": 2.925306192858375, + "grad_norm": 0.3505931794643402, + "learning_rate": 3.1070195627157655e-07, + "logits/chosen": 3.8180744647979736, + "logits/rejected": 3.800319194793701, + "logps/chosen": -175.92239379882812, + "logps/rejected": -187.0870819091797, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.671606063842773, + "rewards/margins": 1.1354994773864746, + "rewards/rejected": -13.80710506439209, + "step": 4239 + }, + { + "epoch": 2.925996204933586, + "grad_norm": 0.2988438308238983, + "learning_rate": 3.07825086306099e-07, + "logits/chosen": 3.49857497215271, + "logits/rejected": 3.5752763748168945, + "logps/chosen": -179.9456329345703, + "logps/rejected": -191.6068115234375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.248611450195312, + "rewards/margins": 1.1920918226242065, + "rewards/rejected": -14.440703392028809, + "step": 4240 + }, + { + "epoch": 2.9266862170087977, + "grad_norm": 0.3577800393104553, + "learning_rate": 3.0494821634062146e-07, + "logits/chosen": 3.346634864807129, + "logits/rejected": 3.2983903884887695, + "logps/chosen": -176.33004760742188, + "logps/rejected": -184.3780517578125, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.694473266601562, + "rewards/margins": 0.8380063772201538, + "rewards/rejected": -13.532480239868164, + "step": 4241 + }, + { + "epoch": 2.927376229084009, + "grad_norm": 0.32094669342041016, + "learning_rate": 3.0207134637514387e-07, + "logits/chosen": 3.9851770401000977, + "logits/rejected": 3.9851770401000977, + "logps/chosen": -187.04356384277344, + "logps/rejected": -187.04356384277344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.903402328491211, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.903402328491211, + "step": 4242 + }, + { + "epoch": 2.9280662411592204, + "grad_norm": 0.3881905972957611, + "learning_rate": 2.991944764096663e-07, + "logits/chosen": 3.3836400508880615, + "logits/rejected": 3.49666166305542, + "logps/chosen": -157.5004119873047, + "logps/rejected": -179.35830688476562, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.012953758239746, + "rewards/margins": 2.1634273529052734, + "rewards/rejected": -13.176382064819336, + "step": 4243 + }, + { + "epoch": 2.9287562532344316, + "grad_norm": 0.9848654866218567, + "learning_rate": 2.9631760644418873e-07, + "logits/chosen": 2.822164535522461, + "logits/rejected": 3.0635781288146973, + "logps/chosen": -155.97035217285156, + "logps/rejected": -172.76171875, + "loss": 0.5252, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.750629425048828, + "rewards/margins": 1.6830253601074219, + "rewards/rejected": -12.43365478515625, + "step": 4244 + }, + { + "epoch": 2.929446265309643, + "grad_norm": 0.3543437719345093, + "learning_rate": 2.934407364787112e-07, + "logits/chosen": 3.672881603240967, + "logits/rejected": 3.672881603240967, + "logps/chosen": -182.2783660888672, + "logps/rejected": -182.2783660888672, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.563365936279297, + "rewards/margins": 0.0, + "rewards/rejected": -13.563365936279297, + "step": 4245 + }, + { + "epoch": 2.9301362773848543, + "grad_norm": 30.466154098510742, + "learning_rate": 2.9056386651323364e-07, + "logits/chosen": 3.7469608783721924, + "logits/rejected": 3.575589418411255, + "logps/chosen": -166.9027557373047, + "logps/rejected": -185.85235595703125, + "loss": 1.2463, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.856821060180664, + "rewards/margins": 1.8698093891143799, + "rewards/rejected": -13.726631164550781, + "step": 4246 + }, + { + "epoch": 2.9308262894600654, + "grad_norm": 0.48204389214515686, + "learning_rate": 2.8768699654775605e-07, + "logits/chosen": 3.406719446182251, + "logits/rejected": 3.705604076385498, + "logps/chosen": -142.58529663085938, + "logps/rejected": -178.3399658203125, + "loss": 0.348, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.54409408569336, + "rewards/margins": 3.743762254714966, + "rewards/rejected": -13.28785514831543, + "step": 4247 + }, + { + "epoch": 2.931516301535277, + "grad_norm": 0.4632953107357025, + "learning_rate": 2.848101265822785e-07, + "logits/chosen": 3.60001277923584, + "logits/rejected": 3.60001277923584, + "logps/chosen": -167.23117065429688, + "logps/rejected": -167.23117065429688, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -11.8916654586792, + "rewards/margins": 0.0, + "rewards/rejected": -11.8916654586792, + "step": 4248 + }, + { + "epoch": 2.932206313610488, + "grad_norm": 0.6579151153564453, + "learning_rate": 2.819332566168009e-07, + "logits/chosen": 3.4957258701324463, + "logits/rejected": 3.4842703342437744, + "logps/chosen": -152.3874053955078, + "logps/rejected": -180.57156372070312, + "loss": 0.3489, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.590816497802734, + "rewards/margins": 2.994884490966797, + "rewards/rejected": -13.585700988769531, + "step": 4249 + }, + { + "epoch": 2.9328963256856992, + "grad_norm": 0.3674022853374481, + "learning_rate": 2.7905638665132337e-07, + "logits/chosen": 3.3977885246276855, + "logits/rejected": 3.5769412517547607, + "logps/chosen": -146.0874786376953, + "logps/rejected": -164.54269409179688, + "loss": 0.5209, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.639687538146973, + "rewards/margins": 1.8886287212371826, + "rewards/rejected": -11.528316497802734, + "step": 4250 + }, + { + "epoch": 2.933586337760911, + "grad_norm": 0.39287829399108887, + "learning_rate": 2.7617951668584583e-07, + "logits/chosen": 4.076351165771484, + "logits/rejected": 4.076351165771484, + "logps/chosen": -181.54763793945312, + "logps/rejected": -181.54763793945312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.242944717407227, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -13.242944717407227, + "step": 4251 + }, + { + "epoch": 2.9342763498361224, + "grad_norm": 0.4721458852291107, + "learning_rate": 2.733026467203683e-07, + "logits/chosen": 3.3878724575042725, + "logits/rejected": 3.4599907398223877, + "logps/chosen": -140.60101318359375, + "logps/rejected": -172.9891357421875, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.324440956115723, + "rewards/margins": 3.2872695922851562, + "rewards/rejected": -12.611710548400879, + "step": 4252 + }, + { + "epoch": 2.9349663619113335, + "grad_norm": 0.41781941056251526, + "learning_rate": 2.704257767548907e-07, + "logits/chosen": 3.381145477294922, + "logits/rejected": 3.381145477294922, + "logps/chosen": -179.73983764648438, + "logps/rejected": -179.73983764648438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.162318229675293, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.162318229675293, + "step": 4253 + }, + { + "epoch": 2.9356563739865447, + "grad_norm": 0.4576009511947632, + "learning_rate": 2.6754890678941315e-07, + "logits/chosen": 3.3041622638702393, + "logits/rejected": 3.519645929336548, + "logps/chosen": -184.2747039794922, + "logps/rejected": -190.5198974609375, + "loss": 0.6073, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.501152992248535, + "rewards/margins": 0.6367491483688354, + "rewards/rejected": -14.13790225982666, + "step": 4254 + }, + { + "epoch": 2.9363463860617562, + "grad_norm": 0.39555177092552185, + "learning_rate": 2.646720368239356e-07, + "logits/chosen": 3.410133123397827, + "logits/rejected": 3.410133123397827, + "logps/chosen": -174.0036163330078, + "logps/rejected": -174.0036163330078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.58245849609375, + "rewards/margins": 0.0, + "rewards/rejected": -12.58245849609375, + "step": 4255 + }, + { + "epoch": 2.9370363981369674, + "grad_norm": 0.2911883592605591, + "learning_rate": 2.61795166858458e-07, + "logits/chosen": 3.1888484954833984, + "logits/rejected": 3.4253573417663574, + "logps/chosen": -159.395263671875, + "logps/rejected": -199.5117645263672, + "loss": 0.4332, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.221597671508789, + "rewards/margins": 4.034659385681152, + "rewards/rejected": -15.256258010864258, + "step": 4256 + }, + { + "epoch": 2.9377264102121785, + "grad_norm": 0.8698664903640747, + "learning_rate": 2.5891829689298047e-07, + "logits/chosen": 3.6952269077301025, + "logits/rejected": 3.669820785522461, + "logps/chosen": -172.16476440429688, + "logps/rejected": -182.62930297851562, + "loss": 0.5243, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.4249267578125, + "rewards/margins": 1.1220802068710327, + "rewards/rejected": -13.547006607055664, + "step": 4257 + }, + { + "epoch": 2.93841642228739, + "grad_norm": 0.2855982780456543, + "learning_rate": 2.5604142692750287e-07, + "logits/chosen": 3.5922348499298096, + "logits/rejected": 3.6664209365844727, + "logps/chosen": -179.45298767089844, + "logps/rejected": -190.1790771484375, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.178876876831055, + "rewards/margins": 1.0694248676300049, + "rewards/rejected": -14.24830150604248, + "step": 4258 + }, + { + "epoch": 2.939106434362601, + "grad_norm": 0.46080470085144043, + "learning_rate": 2.5316455696202533e-07, + "logits/chosen": 3.4354560375213623, + "logits/rejected": 3.591403007507324, + "logps/chosen": -159.11741638183594, + "logps/rejected": -169.17286682128906, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.891236305236816, + "rewards/margins": 1.0070199966430664, + "rewards/rejected": -11.898256301879883, + "step": 4259 + }, + { + "epoch": 2.939796446437813, + "grad_norm": 0.34889280796051025, + "learning_rate": 2.502876869965478e-07, + "logits/chosen": 3.614901542663574, + "logits/rejected": 3.6114392280578613, + "logps/chosen": -168.37376403808594, + "logps/rejected": -182.46910095214844, + "loss": 0.5213, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.008504867553711, + "rewards/margins": 1.3746110200881958, + "rewards/rejected": -13.383115768432617, + "step": 4260 + }, + { + "epoch": 2.940486458513024, + "grad_norm": 8.93016529083252, + "learning_rate": 2.474108170310702e-07, + "logits/chosen": 3.199619770050049, + "logits/rejected": 3.4157886505126953, + "logps/chosen": -167.6609649658203, + "logps/rejected": -172.57493591308594, + "loss": 0.6279, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.920098304748535, + "rewards/margins": 0.5073094367980957, + "rewards/rejected": -12.427408218383789, + "step": 4261 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.45381537079811096, + "learning_rate": 2.4453394706559265e-07, + "logits/chosen": 3.517566442489624, + "logits/rejected": 3.517566442489624, + "logps/chosen": -188.33746337890625, + "logps/rejected": -188.33746337890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -14.006664276123047, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.006664276123047, + "step": 4262 + }, + { + "epoch": 2.9418664826634466, + "grad_norm": 0.37276896834373474, + "learning_rate": 2.416570771001151e-07, + "logits/chosen": 3.6563334465026855, + "logits/rejected": 3.791205406188965, + "logps/chosen": -170.14051818847656, + "logps/rejected": -184.4879913330078, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.338482856750488, + "rewards/margins": 1.3902171850204468, + "rewards/rejected": -13.728699684143066, + "step": 4263 + }, + { + "epoch": 2.9425564947386578, + "grad_norm": 0.39639490842819214, + "learning_rate": 2.3878020713463756e-07, + "logits/chosen": 3.388495922088623, + "logits/rejected": 3.388495922088623, + "logps/chosen": -174.12020874023438, + "logps/rejected": -174.12020874023438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.49513053894043, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -12.49513053894043, + "step": 4264 + }, + { + "epoch": 2.9432465068138693, + "grad_norm": 0.3200145363807678, + "learning_rate": 2.3590333716916e-07, + "logits/chosen": 3.2618582248687744, + "logits/rejected": 3.3050460815429688, + "logps/chosen": -165.89837646484375, + "logps/rejected": -192.32797241210938, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.908997535705566, + "rewards/margins": 2.6393492221832275, + "rewards/rejected": -14.548346519470215, + "step": 4265 + }, + { + "epoch": 2.9439365188890805, + "grad_norm": 0.33640849590301514, + "learning_rate": 2.3302646720368242e-07, + "logits/chosen": 3.810431718826294, + "logits/rejected": 3.871429681777954, + "logps/chosen": -160.61709594726562, + "logps/rejected": -170.57020568847656, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.249163627624512, + "rewards/margins": 1.0237295627593994, + "rewards/rejected": -12.272892951965332, + "step": 4266 + }, + { + "epoch": 2.9446265309642916, + "grad_norm": 30.46946907043457, + "learning_rate": 2.3014959723820486e-07, + "logits/chosen": 3.721118688583374, + "logits/rejected": 3.6506266593933105, + "logps/chosen": -175.8800048828125, + "logps/rejected": -172.21859741210938, + "loss": 1.0248, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.861711502075195, + "rewards/margins": -0.4138529300689697, + "rewards/rejected": -12.447857856750488, + "step": 4267 + }, + { + "epoch": 2.945316543039503, + "grad_norm": 14.57688045501709, + "learning_rate": 2.2727272727272729e-07, + "logits/chosen": 3.3116660118103027, + "logits/rejected": 3.252845048904419, + "logps/chosen": -173.05894470214844, + "logps/rejected": -183.5712127685547, + "loss": 0.904, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.512755393981934, + "rewards/margins": 1.1455734968185425, + "rewards/rejected": -13.658329010009766, + "step": 4268 + }, + { + "epoch": 2.9460065551147148, + "grad_norm": 12.929847717285156, + "learning_rate": 2.2439585730724974e-07, + "logits/chosen": 3.294402599334717, + "logits/rejected": 3.3724188804626465, + "logps/chosen": -157.17584228515625, + "logps/rejected": -162.65652465820312, + "loss": 0.5694, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.837836265563965, + "rewards/margins": 0.5393208861351013, + "rewards/rejected": -11.377157211303711, + "step": 4269 + }, + { + "epoch": 2.946696567189926, + "grad_norm": 0.26839709281921387, + "learning_rate": 2.2151898734177218e-07, + "logits/chosen": 3.781813859939575, + "logits/rejected": 4.001158237457275, + "logps/chosen": -170.1009521484375, + "logps/rejected": -192.34291076660156, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.257641792297363, + "rewards/margins": 2.2387967109680176, + "rewards/rejected": -14.496438026428223, + "step": 4270 + }, + { + "epoch": 2.947386579265137, + "grad_norm": 0.30713099241256714, + "learning_rate": 2.186421173762946e-07, + "logits/chosen": 3.7596871852874756, + "logits/rejected": 3.82159161567688, + "logps/chosen": -168.55059814453125, + "logps/rejected": -181.17361450195312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.097841262817383, + "rewards/margins": 1.2741268873214722, + "rewards/rejected": -13.371967315673828, + "step": 4271 + }, + { + "epoch": 2.9480765913403486, + "grad_norm": 0.44775867462158203, + "learning_rate": 2.1576524741081704e-07, + "logits/chosen": 3.634777069091797, + "logits/rejected": 3.634777069091797, + "logps/chosen": -182.58973693847656, + "logps/rejected": -182.58975219726562, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.378933906555176, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -13.37893295288086, + "step": 4272 + }, + { + "epoch": 2.9487666034155597, + "grad_norm": 0.3871248960494995, + "learning_rate": 2.1288837744533947e-07, + "logits/chosen": 3.0781095027923584, + "logits/rejected": 3.27557635307312, + "logps/chosen": -144.07200622558594, + "logps/rejected": -185.43215942382812, + "loss": 0.4332, + "rewards/accuracies": 0.375, + "rewards/chosen": -9.503921508789062, + "rewards/margins": 4.176668167114258, + "rewards/rejected": -13.68058967590332, + "step": 4273 + }, + { + "epoch": 2.949456615490771, + "grad_norm": 0.4014122188091278, + "learning_rate": 2.1001150747986193e-07, + "logits/chosen": 3.587322235107422, + "logits/rejected": 3.587322235107422, + "logps/chosen": -170.05953979492188, + "logps/rejected": -170.05953979492188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.28969669342041, + "rewards/margins": 0.0, + "rewards/rejected": -12.28969669342041, + "step": 4274 + }, + { + "epoch": 2.9501466275659824, + "grad_norm": 12.422956466674805, + "learning_rate": 2.0713463751438436e-07, + "logits/chosen": 3.6477365493774414, + "logits/rejected": 3.6147561073303223, + "logps/chosen": -172.132080078125, + "logps/rejected": -177.4769287109375, + "loss": 0.6313, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.419153213500977, + "rewards/margins": 0.5005013942718506, + "rewards/rejected": -12.919654846191406, + "step": 4275 + }, + { + "epoch": 2.9508366396411936, + "grad_norm": 0.4538765847682953, + "learning_rate": 2.042577675489068e-07, + "logits/chosen": 3.441887378692627, + "logits/rejected": 3.513148784637451, + "logps/chosen": -167.3863525390625, + "logps/rejected": -173.7162322998047, + "loss": 0.6076, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.014182090759277, + "rewards/margins": 0.5940772891044617, + "rewards/rejected": -12.608259201049805, + "step": 4276 + }, + { + "epoch": 2.951526651716405, + "grad_norm": 20.17192840576172, + "learning_rate": 2.0138089758342927e-07, + "logits/chosen": 3.586951971054077, + "logits/rejected": 3.5172905921936035, + "logps/chosen": -166.6197052001953, + "logps/rejected": -176.23089599609375, + "loss": 0.7205, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.011478424072266, + "rewards/margins": 0.8997818231582642, + "rewards/rejected": -12.911260604858398, + "step": 4277 + }, + { + "epoch": 2.9522166637916163, + "grad_norm": 0.493383526802063, + "learning_rate": 1.985040276179517e-07, + "logits/chosen": 4.128995895385742, + "logits/rejected": 4.128489017486572, + "logps/chosen": -174.23321533203125, + "logps/rejected": -179.24868774414062, + "loss": 0.6084, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.716550827026367, + "rewards/margins": 0.5230121612548828, + "rewards/rejected": -13.23956298828125, + "step": 4278 + }, + { + "epoch": 2.952906675866828, + "grad_norm": 0.25684431195259094, + "learning_rate": 1.9562715765247413e-07, + "logits/chosen": 3.3351211547851562, + "logits/rejected": 3.4024524688720703, + "logps/chosen": -172.0750274658203, + "logps/rejected": -202.63729858398438, + "loss": 0.4339, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.271390914916992, + "rewards/margins": 3.0730912685394287, + "rewards/rejected": -15.344482421875, + "step": 4279 + }, + { + "epoch": 2.953596687942039, + "grad_norm": 0.4175972640514374, + "learning_rate": 1.9275028768699656e-07, + "logits/chosen": 3.5690488815307617, + "logits/rejected": 3.6599416732788086, + "logps/chosen": -142.84359741210938, + "logps/rejected": -159.0776824951172, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.461026191711426, + "rewards/margins": 1.5669511556625366, + "rewards/rejected": -11.02797794342041, + "step": 4280 + }, + { + "epoch": 2.95428670001725, + "grad_norm": 0.4279506504535675, + "learning_rate": 1.89873417721519e-07, + "logits/chosen": 3.162921190261841, + "logits/rejected": 3.305732011795044, + "logps/chosen": -169.02455139160156, + "logps/rejected": -179.3835906982422, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.248368263244629, + "rewards/margins": 1.0613975524902344, + "rewards/rejected": -13.309765815734863, + "step": 4281 + }, + { + "epoch": 2.9549767120924617, + "grad_norm": 0.7892491221427917, + "learning_rate": 1.8699654775604145e-07, + "logits/chosen": 3.1822359561920166, + "logits/rejected": 3.358924388885498, + "logps/chosen": -115.80687713623047, + "logps/rejected": -150.03848266601562, + "loss": 0.3606, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.99680233001709, + "rewards/margins": 3.4161033630371094, + "rewards/rejected": -10.4129056930542, + "step": 4282 + }, + { + "epoch": 2.955666724167673, + "grad_norm": 1.818038821220398, + "learning_rate": 1.8411967779056388e-07, + "logits/chosen": 3.3817973136901855, + "logits/rejected": 3.7140181064605713, + "logps/chosen": -164.7511749267578, + "logps/rejected": -188.98159790039062, + "loss": 0.4452, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.543971061706543, + "rewards/margins": 2.3993020057678223, + "rewards/rejected": -13.943273544311523, + "step": 4283 + }, + { + "epoch": 2.956356736242884, + "grad_norm": 0.35576754808425903, + "learning_rate": 1.8124280782508632e-07, + "logits/chosen": 3.4329993724823, + "logits/rejected": 3.507159471511841, + "logps/chosen": -178.088623046875, + "logps/rejected": -189.63523864746094, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.071556091308594, + "rewards/margins": 1.1769427061080933, + "rewards/rejected": -14.248498916625977, + "step": 4284 + }, + { + "epoch": 2.9570467483180956, + "grad_norm": 0.3526248037815094, + "learning_rate": 1.7836593785960875e-07, + "logits/chosen": 3.3297388553619385, + "logits/rejected": 3.457200527191162, + "logps/chosen": -153.4473876953125, + "logps/rejected": -163.83181762695312, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.692859649658203, + "rewards/margins": 1.0775073766708374, + "rewards/rejected": -11.770366668701172, + "step": 4285 + }, + { + "epoch": 2.957736760393307, + "grad_norm": 0.3400050699710846, + "learning_rate": 1.754890678941312e-07, + "logits/chosen": 3.588804006576538, + "logits/rejected": 3.588804006576538, + "logps/chosen": -190.94760131835938, + "logps/rejected": -190.94760131835938, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.507610321044922, + "rewards/margins": -1.1920928955078125e-07, + "rewards/rejected": -14.507610321044922, + "step": 4286 + }, + { + "epoch": 2.9584267724685183, + "grad_norm": 0.3461601138114929, + "learning_rate": 1.7261219792865363e-07, + "logits/chosen": 2.850477695465088, + "logits/rejected": 3.407485246658325, + "logps/chosen": -159.1339874267578, + "logps/rejected": -184.79791259765625, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.24575424194336, + "rewards/margins": 2.577442169189453, + "rewards/rejected": -13.823196411132812, + "step": 4287 + }, + { + "epoch": 2.9591167845437294, + "grad_norm": 0.3256668746471405, + "learning_rate": 1.6973532796317607e-07, + "logits/chosen": 3.303537368774414, + "logits/rejected": 3.323751449584961, + "logps/chosen": -152.18881225585938, + "logps/rejected": -165.93206787109375, + "loss": 0.5211, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.38400650024414, + "rewards/margins": 1.3829474449157715, + "rewards/rejected": -11.76695442199707, + "step": 4288 + }, + { + "epoch": 2.959806796618941, + "grad_norm": 0.6132733225822449, + "learning_rate": 1.668584579976985e-07, + "logits/chosen": 3.5287160873413086, + "logits/rejected": 3.5287160873413086, + "logps/chosen": -182.90536499023438, + "logps/rejected": -182.90536499023438, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.392601013183594, + "rewards/margins": 0.0, + "rewards/rejected": -13.392601013183594, + "step": 4289 + }, + { + "epoch": 2.960496808694152, + "grad_norm": 0.4025234878063202, + "learning_rate": 1.6398158803222098e-07, + "logits/chosen": 3.5856900215148926, + "logits/rejected": 3.5856900215148926, + "logps/chosen": -179.51568603515625, + "logps/rejected": -179.51568603515625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.239513397216797, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.239513397216797, + "step": 4290 + }, + { + "epoch": 2.9611868207693632, + "grad_norm": 0.3382314443588257, + "learning_rate": 1.611047180667434e-07, + "logits/chosen": 3.549700975418091, + "logits/rejected": 3.5747666358947754, + "logps/chosen": -160.740478515625, + "logps/rejected": -176.89785766601562, + "loss": 0.5203, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.39631462097168, + "rewards/margins": 1.6328234672546387, + "rewards/rejected": -13.029138565063477, + "step": 4291 + }, + { + "epoch": 2.961876832844575, + "grad_norm": 0.3861912786960602, + "learning_rate": 1.5822784810126584e-07, + "logits/chosen": 3.8323163986206055, + "logits/rejected": 3.8323163986206055, + "logps/chosen": -180.71902465820312, + "logps/rejected": -180.71902465820312, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.381431579589844, + "rewards/margins": 0.0, + "rewards/rejected": -13.381431579589844, + "step": 4292 + }, + { + "epoch": 2.962566844919786, + "grad_norm": 0.4870743751525879, + "learning_rate": 1.5535097813578827e-07, + "logits/chosen": 3.8336524963378906, + "logits/rejected": 3.8336524963378906, + "logps/chosen": -187.64566040039062, + "logps/rejected": -187.64566040039062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.045063972473145, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -14.045063018798828, + "step": 4293 + }, + { + "epoch": 2.9632568569949975, + "grad_norm": 0.5258663892745972, + "learning_rate": 1.5247410817031073e-07, + "logits/chosen": 3.548550605773926, + "logits/rejected": 3.8381590843200684, + "logps/chosen": -167.63565063476562, + "logps/rejected": -179.07647705078125, + "loss": 0.522, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.878595352172852, + "rewards/margins": 1.1881715059280396, + "rewards/rejected": -13.066767692565918, + "step": 4294 + }, + { + "epoch": 2.9639468690702087, + "grad_norm": 39.35256576538086, + "learning_rate": 1.4959723820483316e-07, + "logits/chosen": 3.450171709060669, + "logits/rejected": 3.4395928382873535, + "logps/chosen": -182.71414184570312, + "logps/rejected": -179.92919921875, + "loss": 0.9166, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.61381721496582, + "rewards/margins": -0.2992061376571655, + "rewards/rejected": -13.314611434936523, + "step": 4295 + }, + { + "epoch": 2.9646368811454202, + "grad_norm": 0.24636082351207733, + "learning_rate": 1.467203682393556e-07, + "logits/chosen": 3.1074740886688232, + "logits/rejected": 3.2114460468292236, + "logps/chosen": -169.44302368164062, + "logps/rejected": -188.93154907226562, + "loss": 0.52, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.195237159729004, + "rewards/margins": 1.9900875091552734, + "rewards/rejected": -14.185324668884277, + "step": 4296 + }, + { + "epoch": 2.9653268932206314, + "grad_norm": 0.46989864110946655, + "learning_rate": 1.4384349827387802e-07, + "logits/chosen": 3.7882730960845947, + "logits/rejected": 3.8205981254577637, + "logps/chosen": -157.3135223388672, + "logps/rejected": -170.97779846191406, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -10.96725082397461, + "rewards/margins": 1.2746260166168213, + "rewards/rejected": -12.241876602172852, + "step": 4297 + }, + { + "epoch": 2.9660169052958425, + "grad_norm": 0.4001195728778839, + "learning_rate": 1.4096662830840046e-07, + "logits/chosen": 3.798032760620117, + "logits/rejected": 3.798032760620117, + "logps/chosen": -188.13795471191406, + "logps/rejected": -188.13795471191406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.955392837524414, + "rewards/margins": 0.0, + "rewards/rejected": -13.955392837524414, + "step": 4298 + }, + { + "epoch": 2.966706917371054, + "grad_norm": 0.9842440485954285, + "learning_rate": 1.3808975834292291e-07, + "logits/chosen": 3.504481077194214, + "logits/rejected": 3.502955198287964, + "logps/chosen": -177.1500701904297, + "logps/rejected": -181.97137451171875, + "loss": 0.6087, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.86030101776123, + "rewards/margins": 0.5025016069412231, + "rewards/rejected": -13.362802505493164, + "step": 4299 + }, + { + "epoch": 2.967396929446265, + "grad_norm": 0.4373147487640381, + "learning_rate": 1.3521288837744534e-07, + "logits/chosen": 3.9093058109283447, + "logits/rejected": 3.9093058109283447, + "logps/chosen": -185.62969970703125, + "logps/rejected": -185.62969970703125, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.804386138916016, + "rewards/margins": -3.5762786865234375e-07, + "rewards/rejected": -13.804386138916016, + "step": 4300 + }, + { + "epoch": 2.968086941521477, + "grad_norm": 0.35367339849472046, + "learning_rate": 1.323360184119678e-07, + "logits/chosen": 3.4371047019958496, + "logits/rejected": 3.73388409614563, + "logps/chosen": -170.5494842529297, + "logps/rejected": -187.18649291992188, + "loss": 0.5203, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.275225639343262, + "rewards/margins": 1.62661612033844, + "rewards/rejected": -13.901841163635254, + "step": 4301 + }, + { + "epoch": 2.968776953596688, + "grad_norm": 0.381662517786026, + "learning_rate": 1.2945914844649023e-07, + "logits/chosen": 3.155467987060547, + "logits/rejected": 3.1283044815063477, + "logps/chosen": -173.77438354492188, + "logps/rejected": -179.69400024414062, + "loss": 0.6076, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.654789924621582, + "rewards/margins": 0.5971270799636841, + "rewards/rejected": -13.251917839050293, + "step": 4302 + }, + { + "epoch": 2.9694669656718995, + "grad_norm": 0.3446969985961914, + "learning_rate": 1.2658227848101266e-07, + "logits/chosen": 3.2313878536224365, + "logits/rejected": 3.3677899837493896, + "logps/chosen": -186.94187927246094, + "logps/rejected": -195.92648315429688, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.77585220336914, + "rewards/margins": 0.9116207361221313, + "rewards/rejected": -14.687471389770508, + "step": 4303 + }, + { + "epoch": 2.9701569777471106, + "grad_norm": 0.4828381836414337, + "learning_rate": 1.237054085155351e-07, + "logits/chosen": 3.301175355911255, + "logits/rejected": 3.516017436981201, + "logps/chosen": -157.1670379638672, + "logps/rejected": -165.95828247070312, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.015250205993652, + "rewards/margins": 0.837421715259552, + "rewards/rejected": -11.85267162322998, + "step": 4304 + }, + { + "epoch": 2.9708469898223218, + "grad_norm": 0.356237530708313, + "learning_rate": 1.2082853855005755e-07, + "logits/chosen": 3.509765148162842, + "logits/rejected": 3.5658936500549316, + "logps/chosen": -157.55923461914062, + "logps/rejected": -179.5614013671875, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.92662239074707, + "rewards/margins": 2.26343035697937, + "rewards/rejected": -13.190052032470703, + "step": 4305 + }, + { + "epoch": 2.9715370018975333, + "grad_norm": 0.44018545746803284, + "learning_rate": 1.1795166858458e-07, + "logits/chosen": 3.589545249938965, + "logits/rejected": 3.589545249938965, + "logps/chosen": -172.99871826171875, + "logps/rejected": -172.99871826171875, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.505720138549805, + "rewards/margins": 0.0, + "rewards/rejected": -12.505718231201172, + "step": 4306 + }, + { + "epoch": 2.9722270139727445, + "grad_norm": 0.46952471137046814, + "learning_rate": 1.1507479861910243e-07, + "logits/chosen": 3.561260461807251, + "logits/rejected": 3.561260461807251, + "logps/chosen": -188.67886352539062, + "logps/rejected": -188.67886352539062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -14.191040992736816, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -14.1910400390625, + "step": 4307 + }, + { + "epoch": 2.9729170260479556, + "grad_norm": 0.3476300835609436, + "learning_rate": 1.1219792865362487e-07, + "logits/chosen": 3.958385944366455, + "logits/rejected": 4.029808521270752, + "logps/chosen": -171.99928283691406, + "logps/rejected": -179.53135681152344, + "loss": 0.6069, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.429266929626465, + "rewards/margins": 0.7188286781311035, + "rewards/rejected": -13.148096084594727, + "step": 4308 + }, + { + "epoch": 2.973607038123167, + "grad_norm": 5.893113136291504, + "learning_rate": 1.093210586881473e-07, + "logits/chosen": 3.5488736629486084, + "logits/rejected": 3.6846935749053955, + "logps/chosen": -183.16641235351562, + "logps/rejected": -184.29896545410156, + "loss": 0.6434, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.547126770019531, + "rewards/margins": 0.13346457481384277, + "rewards/rejected": -13.680591583251953, + "step": 4309 + }, + { + "epoch": 2.9742970501983783, + "grad_norm": 1.6348161697387695, + "learning_rate": 1.0644418872266973e-07, + "logits/chosen": 3.4910717010498047, + "logits/rejected": 3.560668468475342, + "logps/chosen": -165.361328125, + "logps/rejected": -177.80723571777344, + "loss": 0.5256, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.532062530517578, + "rewards/margins": 1.244722604751587, + "rewards/rejected": -12.77678394317627, + "step": 4310 + }, + { + "epoch": 2.97498706227359, + "grad_norm": 0.3415098786354065, + "learning_rate": 1.0356731875719218e-07, + "logits/chosen": 3.6750011444091797, + "logits/rejected": 3.8625288009643555, + "logps/chosen": -168.6241455078125, + "logps/rejected": -177.78982543945312, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.098970413208008, + "rewards/margins": 0.9136289954185486, + "rewards/rejected": -13.012598037719727, + "step": 4311 + }, + { + "epoch": 2.975677074348801, + "grad_norm": 0.36943143606185913, + "learning_rate": 1.0069044879171464e-07, + "logits/chosen": 3.206606388092041, + "logits/rejected": 3.41750431060791, + "logps/chosen": -161.6815185546875, + "logps/rejected": -171.66790771484375, + "loss": 0.6066, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.399707794189453, + "rewards/margins": 0.9426285028457642, + "rewards/rejected": -12.342336654663086, + "step": 4312 + }, + { + "epoch": 2.9763670864240126, + "grad_norm": 0.5131269693374634, + "learning_rate": 9.781357882623707e-08, + "logits/chosen": 3.3740334510803223, + "logits/rejected": 3.3740334510803223, + "logps/chosen": -172.2891845703125, + "logps/rejected": -172.2891845703125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -12.431163787841797, + "rewards/margins": 0.0, + "rewards/rejected": -12.431163787841797, + "step": 4313 + }, + { + "epoch": 2.9770570984992237, + "grad_norm": 9.057720184326172, + "learning_rate": 9.49367088607595e-08, + "logits/chosen": 3.757628917694092, + "logits/rejected": 3.864628553390503, + "logps/chosen": -168.6748046875, + "logps/rejected": -184.2315216064453, + "loss": 0.5727, + "rewards/accuracies": 0.5, + "rewards/chosen": -12.046201705932617, + "rewards/margins": 1.5491658449172974, + "rewards/rejected": -13.595367431640625, + "step": 4314 + }, + { + "epoch": 2.977747110574435, + "grad_norm": 0.4084939658641815, + "learning_rate": 9.205983889528194e-08, + "logits/chosen": 3.50212025642395, + "logits/rejected": 3.5568716526031494, + "logps/chosen": -171.39085388183594, + "logps/rejected": -185.68997192382812, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.289216995239258, + "rewards/margins": 1.4345828294754028, + "rewards/rejected": -13.723799705505371, + "step": 4315 + }, + { + "epoch": 2.9784371226496464, + "grad_norm": 0.42179328203201294, + "learning_rate": 8.918296892980437e-08, + "logits/chosen": 3.432990312576294, + "logits/rejected": 3.6030101776123047, + "logps/chosen": -145.6173858642578, + "logps/rejected": -160.91558837890625, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -9.770232200622559, + "rewards/margins": 1.5113080739974976, + "rewards/rejected": -11.281540870666504, + "step": 4316 + }, + { + "epoch": 2.9791271347248576, + "grad_norm": 0.33707910776138306, + "learning_rate": 8.630609896432682e-08, + "logits/chosen": 3.4228031635284424, + "logits/rejected": 3.5699961185455322, + "logps/chosen": -168.47012329101562, + "logps/rejected": -188.56410217285156, + "loss": 0.5202, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.089561462402344, + "rewards/margins": 2.0469067096710205, + "rewards/rejected": -14.136467933654785, + "step": 4317 + }, + { + "epoch": 2.979817146800069, + "grad_norm": 3.4576334953308105, + "learning_rate": 8.342922899884925e-08, + "logits/chosen": 3.405381202697754, + "logits/rejected": 3.4552645683288574, + "logps/chosen": -146.47816467285156, + "logps/rejected": -166.43601989746094, + "loss": 0.5394, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.04703426361084, + "rewards/margins": 1.7997158765792847, + "rewards/rejected": -11.846749305725098, + "step": 4318 + }, + { + "epoch": 2.9805071588752803, + "grad_norm": 0.37747281789779663, + "learning_rate": 8.05523590333717e-08, + "logits/chosen": 3.758253335952759, + "logits/rejected": 3.803945779800415, + "logps/chosen": -165.95175170898438, + "logps/rejected": -177.39871215820312, + "loss": 0.6065, + "rewards/accuracies": 0.375, + "rewards/chosen": -11.949031829833984, + "rewards/margins": 1.1394466161727905, + "rewards/rejected": -13.088478088378906, + "step": 4319 + }, + { + "epoch": 2.981197170950492, + "grad_norm": 0.5201277136802673, + "learning_rate": 7.767548906789414e-08, + "logits/chosen": 3.830204486846924, + "logits/rejected": 3.830204486846924, + "logps/chosen": -184.68167114257812, + "logps/rejected": -184.68167114257812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.693763732910156, + "rewards/margins": -8.344650268554688e-07, + "rewards/rejected": -13.693763732910156, + "step": 4320 + }, + { + "epoch": 2.981887183025703, + "grad_norm": 0.4202337861061096, + "learning_rate": 7.479861910241658e-08, + "logits/chosen": 3.1598947048187256, + "logits/rejected": 3.253795862197876, + "logps/chosen": -171.6373748779297, + "logps/rejected": -192.34547424316406, + "loss": 0.5199, + "rewards/accuracies": 0.375, + "rewards/chosen": -12.308723449707031, + "rewards/margins": 2.0810952186584473, + "rewards/rejected": -14.38981819152832, + "step": 4321 + }, + { + "epoch": 2.982577195100914, + "grad_norm": 26.889925003051758, + "learning_rate": 7.192174913693901e-08, + "logits/chosen": 3.1956686973571777, + "logits/rejected": 3.060293674468994, + "logps/chosen": -157.14892578125, + "logps/rejected": -182.222900390625, + "loss": 0.8393, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.983800888061523, + "rewards/margins": 2.4200596809387207, + "rewards/rejected": -13.403860092163086, + "step": 4322 + }, + { + "epoch": 2.9832672071761257, + "grad_norm": 0.4184090197086334, + "learning_rate": 6.904487917146146e-08, + "logits/chosen": 3.4337339401245117, + "logits/rejected": 3.469649314880371, + "logps/chosen": -170.95501708984375, + "logps/rejected": -184.697265625, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.34354019165039, + "rewards/margins": 1.398298978805542, + "rewards/rejected": -13.741840362548828, + "step": 4323 + }, + { + "epoch": 2.983957219251337, + "grad_norm": 0.5321756601333618, + "learning_rate": 6.61680092059839e-08, + "logits/chosen": 3.3298513889312744, + "logits/rejected": 3.5430829524993896, + "logps/chosen": -168.84608459472656, + "logps/rejected": -176.54019165039062, + "loss": 0.6068, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.098834037780762, + "rewards/margins": 0.7503643035888672, + "rewards/rejected": -12.849198341369629, + "step": 4324 + }, + { + "epoch": 2.984647231326548, + "grad_norm": 0.3116004765033722, + "learning_rate": 6.329113924050633e-08, + "logits/chosen": 3.704916477203369, + "logits/rejected": 3.956122875213623, + "logps/chosen": -164.22019958496094, + "logps/rejected": -188.2568359375, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.630939483642578, + "rewards/margins": 2.3966221809387207, + "rewards/rejected": -14.027563095092773, + "step": 4325 + }, + { + "epoch": 2.9853372434017595, + "grad_norm": 0.39084774255752563, + "learning_rate": 6.041426927502878e-08, + "logits/chosen": 3.466265916824341, + "logits/rejected": 3.718341827392578, + "logps/chosen": -163.738037109375, + "logps/rejected": -173.28762817382812, + "loss": 0.6066, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.534187316894531, + "rewards/margins": 0.9437185525894165, + "rewards/rejected": -12.4779052734375, + "step": 4326 + }, + { + "epoch": 2.986027255476971, + "grad_norm": 0.35063374042510986, + "learning_rate": 5.7537399309551214e-08, + "logits/chosen": 3.5186307430267334, + "logits/rejected": 3.598336696624756, + "logps/chosen": -155.89544677734375, + "logps/rejected": -182.66763305664062, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.702898025512695, + "rewards/margins": 2.6814026832580566, + "rewards/rejected": -13.384302139282227, + "step": 4327 + }, + { + "epoch": 2.9867172675521823, + "grad_norm": 0.4367833435535431, + "learning_rate": 5.466052934407365e-08, + "logits/chosen": 3.4599709510803223, + "logits/rejected": 3.4599709510803223, + "logps/chosen": -198.2620849609375, + "logps/rejected": -198.2620849609375, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -15.07972240447998, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -15.07972240447998, + "step": 4328 + }, + { + "epoch": 2.9874072796273934, + "grad_norm": 0.6659621596336365, + "learning_rate": 5.178365937859609e-08, + "logits/chosen": 3.6245064735412598, + "logits/rejected": 3.749631881713867, + "logps/chosen": -164.44842529296875, + "logps/rejected": -177.47450256347656, + "loss": 0.6065, + "rewards/accuracies": 0.125, + "rewards/chosen": -11.75085163116455, + "rewards/margins": 1.339130163192749, + "rewards/rejected": -13.089982032775879, + "step": 4329 + }, + { + "epoch": 2.988097291702605, + "grad_norm": 1.638742446899414, + "learning_rate": 4.8906789413118533e-08, + "logits/chosen": 3.158039093017578, + "logits/rejected": 3.281987190246582, + "logps/chosen": -166.2753448486328, + "logps/rejected": -175.51290893554688, + "loss": 0.5288, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.775978088378906, + "rewards/margins": 1.0061795711517334, + "rewards/rejected": -12.782155990600586, + "step": 4330 + }, + { + "epoch": 2.988787303777816, + "grad_norm": 0.4126066565513611, + "learning_rate": 4.602991944764097e-08, + "logits/chosen": 3.685859441757202, + "logits/rejected": 3.685859441757202, + "logps/chosen": -177.23825073242188, + "logps/rejected": -177.23825073242188, + "loss": 0.6931, + "rewards/accuracies": 0.25, + "rewards/chosen": -13.104129791259766, + "rewards/margins": 1.1920928955078125e-07, + "rewards/rejected": -13.104129791259766, + "step": 4331 + }, + { + "epoch": 2.9894773158530272, + "grad_norm": 0.438930481672287, + "learning_rate": 4.315304948216341e-08, + "logits/chosen": 3.7693161964416504, + "logits/rejected": 3.7693161964416504, + "logps/chosen": -167.1541290283203, + "logps/rejected": -167.1541290283203, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.103493690490723, + "rewards/margins": 5.960464477539063e-08, + "rewards/rejected": -12.103493690490723, + "step": 4332 + }, + { + "epoch": 2.990167327928239, + "grad_norm": 0.38305628299713135, + "learning_rate": 4.027617951668585e-08, + "logits/chosen": 3.582365036010742, + "logits/rejected": 3.547403573989868, + "logps/chosen": -172.2504119873047, + "logps/rejected": -185.3028564453125, + "loss": 0.5213, + "rewards/accuracies": 0.25, + "rewards/chosen": -12.465898513793945, + "rewards/margins": 1.3022440671920776, + "rewards/rejected": -13.768142700195312, + "step": 4333 + }, + { + "epoch": 2.99085734000345, + "grad_norm": 0.32738545536994934, + "learning_rate": 3.739930955120829e-08, + "logits/chosen": 3.3025858402252197, + "logits/rejected": 3.4693984985351562, + "logps/chosen": -156.05621337890625, + "logps/rejected": -178.22006225585938, + "loss": 0.5201, + "rewards/accuracies": 0.375, + "rewards/chosen": -10.966703414916992, + "rewards/margins": 2.1253204345703125, + "rewards/rejected": -13.092023849487305, + "step": 4334 + }, + { + "epoch": 2.9915473520786615, + "grad_norm": 0.4203018248081207, + "learning_rate": 3.452243958573073e-08, + "logits/chosen": 3.618143320083618, + "logits/rejected": 3.631239652633667, + "logps/chosen": -174.85301208496094, + "logps/rejected": -180.90846252441406, + "loss": 0.6071, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.577597618103027, + "rewards/margins": 0.6637831330299377, + "rewards/rejected": -13.24138069152832, + "step": 4335 + }, + { + "epoch": 2.9922373641538726, + "grad_norm": 0.35423028469085693, + "learning_rate": 3.1645569620253166e-08, + "logits/chosen": 3.539797306060791, + "logits/rejected": 3.660081386566162, + "logps/chosen": -149.94561767578125, + "logps/rejected": -177.63430786132812, + "loss": 0.5199, + "rewards/accuracies": 0.5, + "rewards/chosen": -10.254694938659668, + "rewards/margins": 2.7401106357574463, + "rewards/rejected": -12.994806289672852, + "step": 4336 + }, + { + "epoch": 2.9929273762290842, + "grad_norm": 0.5347331762313843, + "learning_rate": 2.8768699654775607e-08, + "logits/chosen": 3.771368980407715, + "logits/rejected": 3.9203639030456543, + "logps/chosen": -173.63406372070312, + "logps/rejected": -180.7509307861328, + "loss": 0.6069, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.414774894714355, + "rewards/margins": 0.7231423854827881, + "rewards/rejected": -13.137917518615723, + "step": 4337 + }, + { + "epoch": 2.9936173883042954, + "grad_norm": 0.5090091228485107, + "learning_rate": 2.5891829689298045e-08, + "logits/chosen": 3.570406436920166, + "logits/rejected": 3.570406436920166, + "logps/chosen": -176.12416076660156, + "logps/rejected": -176.12417602539062, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.66588306427002, + "rewards/margins": 3.5762786865234375e-07, + "rewards/rejected": -12.66588306427002, + "step": 4338 + }, + { + "epoch": 2.9943074003795065, + "grad_norm": 0.3408271372318268, + "learning_rate": 2.3014959723820486e-08, + "logits/chosen": 3.9843764305114746, + "logits/rejected": 3.9843764305114746, + "logps/chosen": -181.9000244140625, + "logps/rejected": -181.9000244140625, + "loss": 0.6931, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.336235046386719, + "rewards/margins": 2.384185791015625e-07, + "rewards/rejected": -13.336235046386719, + "step": 4339 + }, + { + "epoch": 2.994997412454718, + "grad_norm": 18.930946350097656, + "learning_rate": 2.0138089758342927e-08, + "logits/chosen": 3.3596925735473633, + "logits/rejected": 3.2881908416748047, + "logps/chosen": -178.07713317871094, + "logps/rejected": -180.84078979492188, + "loss": 0.9947, + "rewards/accuracies": 0.125, + "rewards/chosen": -13.058171272277832, + "rewards/margins": 0.2932482957839966, + "rewards/rejected": -13.351419448852539, + "step": 4340 + }, + { + "epoch": 2.995687424529929, + "grad_norm": 0.32428470253944397, + "learning_rate": 1.7261219792865364e-08, + "logits/chosen": 3.4249939918518066, + "logits/rejected": 3.564899444580078, + "logps/chosen": -157.46481323242188, + "logps/rejected": -192.01419067382812, + "loss": 0.5199, + "rewards/accuracies": 0.25, + "rewards/chosen": -10.878650665283203, + "rewards/margins": 3.441061019897461, + "rewards/rejected": -14.319711685180664, + "step": 4341 + }, + { + "epoch": 2.9963774366051403, + "grad_norm": 0.49158498644828796, + "learning_rate": 1.4384349827387803e-08, + "logits/chosen": 3.0267717838287354, + "logits/rejected": 3.0978503227233887, + "logps/chosen": -145.7841796875, + "logps/rejected": -174.44105529785156, + "loss": 0.4353, + "rewards/accuracies": 0.5, + "rewards/chosen": -9.666278839111328, + "rewards/margins": 2.927975654602051, + "rewards/rejected": -12.594254493713379, + "step": 4342 + }, + { + "epoch": 2.997067448680352, + "grad_norm": 0.349697083234787, + "learning_rate": 1.1507479861910243e-08, + "logits/chosen": 3.4009811878204346, + "logits/rejected": 3.423271894454956, + "logps/chosen": -160.17977905273438, + "logps/rejected": -174.7362060546875, + "loss": 0.6065, + "rewards/accuracies": 0.25, + "rewards/chosen": -11.40654182434082, + "rewards/margins": 1.4637856483459473, + "rewards/rejected": -12.87032699584961, + "step": 4343 + }, + { + "epoch": 2.9977574607555635, + "grad_norm": 1.0379626750946045, + "learning_rate": 8.630609896432682e-09, + "logits/chosen": 3.2493040561676025, + "logits/rejected": 3.262796640396118, + "logps/chosen": -163.5171356201172, + "logps/rejected": -193.3923797607422, + "loss": 0.3535, + "rewards/accuracies": 0.5, + "rewards/chosen": -11.70035171508789, + "rewards/margins": 2.982640266418457, + "rewards/rejected": -14.682991981506348, + "step": 4344 + }, + { + "epoch": 2.9984474728307746, + "grad_norm": 0.4270744025707245, + "learning_rate": 5.753739930955121e-09, + "logits/chosen": 3.7662529945373535, + "logits/rejected": 3.7662529945373535, + "logps/chosen": -179.7417449951172, + "logps/rejected": -179.74172973632812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -13.203372955322266, + "rewards/margins": -2.384185791015625e-07, + "rewards/rejected": -13.203372955322266, + "step": 4345 + }, + { + "epoch": 2.9991374849059858, + "grad_norm": 0.3578815162181854, + "learning_rate": 2.8768699654775607e-09, + "logits/chosen": 3.007091522216797, + "logits/rejected": 3.1798095703125, + "logps/chosen": -135.8365936279297, + "logps/rejected": -176.0757598876953, + "loss": 0.4332, + "rewards/accuracies": 0.5, + "rewards/chosen": -8.887646675109863, + "rewards/margins": 3.949338674545288, + "rewards/rejected": -12.836984634399414, + "step": 4346 + }, + { + "epoch": 2.9998274969811973, + "grad_norm": 0.3254624903202057, + "learning_rate": 0.0, + "logits/chosen": 3.8633079528808594, + "logits/rejected": 3.9135189056396484, + "logps/chosen": -175.4900360107422, + "logps/rejected": -183.610595703125, + "loss": 0.6067, + "rewards/accuracies": 0.125, + "rewards/chosen": -12.835563659667969, + "rewards/margins": 0.8144688606262207, + "rewards/rejected": -13.650032997131348, + "step": 4347 + } + ], + "logging_steps": 1, + "max_steps": 4347, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}