{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013616557734204794, "grad_norm": 114.1466307381031, "learning_rate": 1.0884353741496598e-09, "logits/chosen": -8.211344718933105, "logits/rejected": -7.574012756347656, "logps/chosen": -1.1677236557006836, "logps/rejected": -1.2345831394195557, "loss": 5.8192, "rewards/accuracies": 0.5, "rewards/chosen": -11.677236557006836, "rewards/margins": 0.6685943603515625, "rewards/rejected": -12.345830917358398, "step": 1 }, { "epoch": 0.0002723311546840959, "grad_norm": 76.86048464045298, "learning_rate": 2.1768707482993195e-09, "logits/chosen": -9.145951271057129, "logits/rejected": -7.078125476837158, "logps/chosen": -1.578005313873291, "logps/rejected": -2.109044075012207, "loss": 5.2105, "rewards/accuracies": 0.5, "rewards/chosen": -15.780054092407227, "rewards/margins": 5.310388088226318, "rewards/rejected": -21.090442657470703, "step": 2 }, { "epoch": 0.0004084967320261438, "grad_norm": 76.43961845113635, "learning_rate": 3.2653061224489797e-09, "logits/chosen": -7.927372932434082, "logits/rejected": -6.205313205718994, "logps/chosen": -2.5349032878875732, "logps/rejected": -2.0752487182617188, "loss": 4.8378, "rewards/accuracies": 0.5, "rewards/chosen": -25.34903335571289, "rewards/margins": -4.596545696258545, "rewards/rejected": -20.752487182617188, "step": 3 }, { "epoch": 0.0005446623093681918, "grad_norm": 77.98031568789776, "learning_rate": 4.353741496598639e-09, "logits/chosen": -5.9748406410217285, "logits/rejected": -7.572969436645508, "logps/chosen": -3.1493289470672607, "logps/rejected": -2.46872615814209, "loss": 5.215, "rewards/accuracies": 0.5, "rewards/chosen": -31.493289947509766, "rewards/margins": -6.8060302734375, "rewards/rejected": -24.6872615814209, "step": 4 }, { "epoch": 0.0006808278867102397, "grad_norm": 87.3306205376835, "learning_rate": 5.442176870748299e-09, "logits/chosen": -7.935755729675293, "logits/rejected": -7.728808879852295, "logps/chosen": -1.6264891624450684, "logps/rejected": -1.3496191501617432, "loss": 4.7414, "rewards/accuracies": 0.25, "rewards/chosen": -16.264892578125, "rewards/margins": -2.768700361251831, "rewards/rejected": -13.496191024780273, "step": 5 }, { "epoch": 0.0008169934640522876, "grad_norm": 164.49533571987374, "learning_rate": 6.5306122448979594e-09, "logits/chosen": -7.936881065368652, "logits/rejected": -7.454012393951416, "logps/chosen": -1.0871236324310303, "logps/rejected": -1.2872685194015503, "loss": 6.2777, "rewards/accuracies": 0.5, "rewards/chosen": -10.871236801147461, "rewards/margins": 2.0014491081237793, "rewards/rejected": -12.872684478759766, "step": 6 }, { "epoch": 0.0009531590413943355, "grad_norm": 167.48787329226752, "learning_rate": 7.61904761904762e-09, "logits/chosen": -8.905806541442871, "logits/rejected": -6.656863212585449, "logps/chosen": -1.5222654342651367, "logps/rejected": -3.3753795623779297, "loss": 6.6024, "rewards/accuracies": 1.0, "rewards/chosen": -15.222654342651367, "rewards/margins": 18.531143188476562, "rewards/rejected": -33.7537956237793, "step": 7 }, { "epoch": 0.0010893246187363835, "grad_norm": 127.86369516545541, "learning_rate": 8.707482993197278e-09, "logits/chosen": -8.361207008361816, "logits/rejected": -7.8086347579956055, "logps/chosen": -1.3292715549468994, "logps/rejected": -3.623223304748535, "loss": 6.387, "rewards/accuracies": 1.0, "rewards/chosen": -13.292716026306152, "rewards/margins": 22.939516067504883, "rewards/rejected": -36.23223114013672, "step": 8 }, { "epoch": 0.0012254901960784314, "grad_norm": 171.52450106839234, "learning_rate": 9.795918367346937e-09, "logits/chosen": -8.539985656738281, "logits/rejected": -7.584209442138672, "logps/chosen": -1.1267136335372925, "logps/rejected": -1.5293104648590088, "loss": 5.6042, "rewards/accuracies": 0.5, "rewards/chosen": -11.267135620117188, "rewards/margins": 4.0259690284729, "rewards/rejected": -15.293105125427246, "step": 9 }, { "epoch": 0.0013616557734204794, "grad_norm": 187.39916682211575, "learning_rate": 1.0884353741496598e-08, "logits/chosen": -8.87921142578125, "logits/rejected": -9.262184143066406, "logps/chosen": -0.7852182388305664, "logps/rejected": -0.6963953971862793, "loss": 6.9377, "rewards/accuracies": 0.25, "rewards/chosen": -7.852182388305664, "rewards/margins": -0.8882288932800293, "rewards/rejected": -6.963953495025635, "step": 10 }, { "epoch": 0.0014978213507625272, "grad_norm": 117.86843909063884, "learning_rate": 1.1972789115646258e-08, "logits/chosen": -8.689042091369629, "logits/rejected": -9.15873908996582, "logps/chosen": -1.6882120370864868, "logps/rejected": -1.5802668333053589, "loss": 7.3226, "rewards/accuracies": 0.5, "rewards/chosen": -16.88212013244629, "rewards/margins": -1.0794520378112793, "rewards/rejected": -15.802667617797852, "step": 11 }, { "epoch": 0.0016339869281045752, "grad_norm": 113.2337766979685, "learning_rate": 1.3061224489795919e-08, "logits/chosen": -7.202520370483398, "logits/rejected": -8.234952926635742, "logps/chosen": -2.2109673023223877, "logps/rejected": -2.203378915786743, "loss": 5.1287, "rewards/accuracies": 0.25, "rewards/chosen": -22.10967445373535, "rewards/margins": -0.07588529586791992, "rewards/rejected": -22.033788681030273, "step": 12 }, { "epoch": 0.001770152505446623, "grad_norm": 94.17930024704908, "learning_rate": 1.414965986394558e-08, "logits/chosen": -8.136855125427246, "logits/rejected": -8.138833045959473, "logps/chosen": -1.1407005786895752, "logps/rejected": -1.114087462425232, "loss": 5.523, "rewards/accuracies": 0.5, "rewards/chosen": -11.407005310058594, "rewards/margins": -0.26613128185272217, "rewards/rejected": -11.140874862670898, "step": 13 }, { "epoch": 0.001906318082788671, "grad_norm": 86.03672701426065, "learning_rate": 1.523809523809524e-08, "logits/chosen": -9.734512329101562, "logits/rejected": -9.69428539276123, "logps/chosen": -1.0596859455108643, "logps/rejected": -1.180835485458374, "loss": 4.694, "rewards/accuracies": 0.5, "rewards/chosen": -10.596858978271484, "rewards/margins": 1.2114964723587036, "rewards/rejected": -11.808355331420898, "step": 14 }, { "epoch": 0.002042483660130719, "grad_norm": 79.29614050556592, "learning_rate": 1.6326530612244897e-08, "logits/chosen": -7.513303756713867, "logits/rejected": -8.469420433044434, "logps/chosen": -1.335381031036377, "logps/rejected": -1.1892902851104736, "loss": 5.5952, "rewards/accuracies": 0.5, "rewards/chosen": -13.353809356689453, "rewards/margins": -1.460907220840454, "rewards/rejected": -11.892902374267578, "step": 15 }, { "epoch": 0.002178649237472767, "grad_norm": 125.4236844549834, "learning_rate": 1.7414965986394556e-08, "logits/chosen": -9.610332489013672, "logits/rejected": -9.503217697143555, "logps/chosen": -1.2437952756881714, "logps/rejected": -1.767426609992981, "loss": 5.9875, "rewards/accuracies": 0.75, "rewards/chosen": -12.43795394897461, "rewards/margins": 5.236311912536621, "rewards/rejected": -17.674264907836914, "step": 16 }, { "epoch": 0.0023148148148148147, "grad_norm": 130.21391405647876, "learning_rate": 1.8503401360544215e-08, "logits/chosen": -8.501277923583984, "logits/rejected": -9.421195030212402, "logps/chosen": -1.3410167694091797, "logps/rejected": -1.0022532939910889, "loss": 6.4125, "rewards/accuracies": 0.0, "rewards/chosen": -13.410167694091797, "rewards/margins": -3.387634754180908, "rewards/rejected": -10.02253246307373, "step": 17 }, { "epoch": 0.0024509803921568627, "grad_norm": 115.77977594533623, "learning_rate": 1.9591836734693874e-08, "logits/chosen": -9.150215148925781, "logits/rejected": -7.175837516784668, "logps/chosen": -1.1750270128250122, "logps/rejected": -1.6741033792495728, "loss": 6.0578, "rewards/accuracies": 1.0, "rewards/chosen": -11.75027084350586, "rewards/margins": 4.990763187408447, "rewards/rejected": -16.74103355407715, "step": 18 }, { "epoch": 0.0025871459694989107, "grad_norm": 118.1429288190624, "learning_rate": 2.0680272108843536e-08, "logits/chosen": -8.59371280670166, "logits/rejected": -8.491493225097656, "logps/chosen": -4.535824298858643, "logps/rejected": -1.898592233657837, "loss": 5.3852, "rewards/accuracies": 0.5, "rewards/chosen": -45.358238220214844, "rewards/margins": -26.372318267822266, "rewards/rejected": -18.98592185974121, "step": 19 }, { "epoch": 0.0027233115468409588, "grad_norm": 127.5721256093001, "learning_rate": 2.1768707482993195e-08, "logits/chosen": -7.222818374633789, "logits/rejected": -7.5780534744262695, "logps/chosen": -1.5290307998657227, "logps/rejected": -2.143995523452759, "loss": 6.2404, "rewards/accuracies": 0.75, "rewards/chosen": -15.290307998657227, "rewards/margins": 6.149646759033203, "rewards/rejected": -21.439956665039062, "step": 20 }, { "epoch": 0.0028594771241830064, "grad_norm": 99.40704965752145, "learning_rate": 2.2857142857142854e-08, "logits/chosen": -7.122776031494141, "logits/rejected": -8.08615779876709, "logps/chosen": -1.611696481704712, "logps/rejected": -1.4275271892547607, "loss": 5.4325, "rewards/accuracies": 0.25, "rewards/chosen": -16.11696434020996, "rewards/margins": -1.8416929244995117, "rewards/rejected": -14.27527141571045, "step": 21 }, { "epoch": 0.0029956427015250544, "grad_norm": 89.36148588522657, "learning_rate": 2.3945578231292517e-08, "logits/chosen": -7.899971008300781, "logits/rejected": -7.134831428527832, "logps/chosen": -1.1833109855651855, "logps/rejected": -1.4166796207427979, "loss": 4.9541, "rewards/accuracies": 0.5, "rewards/chosen": -11.833109855651855, "rewards/margins": 2.333686113357544, "rewards/rejected": -14.16679573059082, "step": 22 }, { "epoch": 0.0031318082788671024, "grad_norm": 152.42598823063548, "learning_rate": 2.503401360544218e-08, "logits/chosen": -7.501503944396973, "logits/rejected": -7.411863327026367, "logps/chosen": -1.0078234672546387, "logps/rejected": -1.2176735401153564, "loss": 6.4242, "rewards/accuracies": 0.5, "rewards/chosen": -10.078235626220703, "rewards/margins": 2.0984997749328613, "rewards/rejected": -12.176734924316406, "step": 23 }, { "epoch": 0.0032679738562091504, "grad_norm": 117.57280305127702, "learning_rate": 2.6122448979591838e-08, "logits/chosen": -8.515674591064453, "logits/rejected": -8.054911613464355, "logps/chosen": -1.6368861198425293, "logps/rejected": -1.5409890413284302, "loss": 6.3687, "rewards/accuracies": 0.25, "rewards/chosen": -16.368860244750977, "rewards/margins": -0.9589707851409912, "rewards/rejected": -15.409890174865723, "step": 24 }, { "epoch": 0.0034041394335511985, "grad_norm": 74.29078068598226, "learning_rate": 2.7210884353741497e-08, "logits/chosen": -8.20907974243164, "logits/rejected": -7.5154643058776855, "logps/chosen": -1.421484112739563, "logps/rejected": -1.5936205387115479, "loss": 5.1423, "rewards/accuracies": 0.25, "rewards/chosen": -14.214841842651367, "rewards/margins": 1.721365213394165, "rewards/rejected": -15.936205863952637, "step": 25 }, { "epoch": 0.003540305010893246, "grad_norm": 73.30416088801161, "learning_rate": 2.829931972789116e-08, "logits/chosen": -8.965385437011719, "logits/rejected": -8.74022102355957, "logps/chosen": -1.210025429725647, "logps/rejected": -1.1828711032867432, "loss": 5.1587, "rewards/accuracies": 0.5, "rewards/chosen": -12.10025405883789, "rewards/margins": -0.2715420722961426, "rewards/rejected": -11.82871150970459, "step": 26 }, { "epoch": 0.003676470588235294, "grad_norm": 69.57707526154249, "learning_rate": 2.9387755102040818e-08, "logits/chosen": -7.099234104156494, "logits/rejected": -7.478131294250488, "logps/chosen": -1.679986596107483, "logps/rejected": -1.6992101669311523, "loss": 4.7157, "rewards/accuracies": 0.75, "rewards/chosen": -16.79986572265625, "rewards/margins": 0.19223499298095703, "rewards/rejected": -16.992101669311523, "step": 27 }, { "epoch": 0.003812636165577342, "grad_norm": 76.89270114914073, "learning_rate": 3.047619047619048e-08, "logits/chosen": -8.429234504699707, "logits/rejected": -8.70054817199707, "logps/chosen": -1.4596924781799316, "logps/rejected": -1.416865587234497, "loss": 4.9061, "rewards/accuracies": 0.5, "rewards/chosen": -14.596925735473633, "rewards/margins": -0.42826974391937256, "rewards/rejected": -14.168655395507812, "step": 28 }, { "epoch": 0.00394880174291939, "grad_norm": 195.9487427258406, "learning_rate": 3.156462585034013e-08, "logits/chosen": -9.023954391479492, "logits/rejected": -8.18747329711914, "logps/chosen": -2.179352045059204, "logps/rejected": -2.6804957389831543, "loss": 5.9604, "rewards/accuracies": 0.75, "rewards/chosen": -21.793519973754883, "rewards/margins": 5.011435508728027, "rewards/rejected": -26.804956436157227, "step": 29 }, { "epoch": 0.004084967320261438, "grad_norm": 137.52642175006892, "learning_rate": 3.2653061224489795e-08, "logits/chosen": -10.156384468078613, "logits/rejected": -8.7210693359375, "logps/chosen": -1.4117178916931152, "logps/rejected": -1.9476935863494873, "loss": 6.4322, "rewards/accuracies": 1.0, "rewards/chosen": -14.117178916931152, "rewards/margins": 5.359757423400879, "rewards/rejected": -19.47693634033203, "step": 30 }, { "epoch": 0.004221132897603486, "grad_norm": 77.64172540319154, "learning_rate": 3.3741496598639454e-08, "logits/chosen": -8.296278953552246, "logits/rejected": -8.093879699707031, "logps/chosen": -1.420203685760498, "logps/rejected": -1.0650157928466797, "loss": 5.6083, "rewards/accuracies": 0.25, "rewards/chosen": -14.202037811279297, "rewards/margins": -3.551880121231079, "rewards/rejected": -10.650157928466797, "step": 31 }, { "epoch": 0.004357298474945534, "grad_norm": 224.1673890854367, "learning_rate": 3.482993197278911e-08, "logits/chosen": -9.610221862792969, "logits/rejected": -8.1019287109375, "logps/chosen": -1.2892544269561768, "logps/rejected": -1.3113367557525635, "loss": 6.3824, "rewards/accuracies": 0.5, "rewards/chosen": -12.892545700073242, "rewards/margins": 0.22082161903381348, "rewards/rejected": -13.113367080688477, "step": 32 }, { "epoch": 0.004493464052287581, "grad_norm": 97.9362092198681, "learning_rate": 3.591836734693877e-08, "logits/chosen": -8.641178131103516, "logits/rejected": -7.257102966308594, "logps/chosen": -0.9310708045959473, "logps/rejected": -1.1190235614776611, "loss": 5.8542, "rewards/accuracies": 0.5, "rewards/chosen": -9.310707092285156, "rewards/margins": 1.8795278072357178, "rewards/rejected": -11.190235137939453, "step": 33 }, { "epoch": 0.004629629629629629, "grad_norm": 78.90814047381855, "learning_rate": 3.700680272108843e-08, "logits/chosen": -9.328125, "logits/rejected": -7.129678726196289, "logps/chosen": -0.792837381362915, "logps/rejected": -1.184739589691162, "loss": 5.437, "rewards/accuracies": 1.0, "rewards/chosen": -7.928374290466309, "rewards/margins": 3.91902232170105, "rewards/rejected": -11.847396850585938, "step": 34 }, { "epoch": 0.004765795206971677, "grad_norm": 83.84449815163184, "learning_rate": 3.809523809523809e-08, "logits/chosen": -10.108911514282227, "logits/rejected": -8.289352416992188, "logps/chosen": -0.83558589220047, "logps/rejected": -0.9919118881225586, "loss": 5.3822, "rewards/accuracies": 0.75, "rewards/chosen": -8.355859756469727, "rewards/margins": 1.5632598400115967, "rewards/rejected": -9.919118881225586, "step": 35 }, { "epoch": 0.004901960784313725, "grad_norm": 56.733587648120654, "learning_rate": 3.918367346938775e-08, "logits/chosen": -7.068085670471191, "logits/rejected": -7.348333835601807, "logps/chosen": -1.414238691329956, "logps/rejected": -1.5801136493682861, "loss": 4.5908, "rewards/accuracies": 0.25, "rewards/chosen": -14.142387390136719, "rewards/margins": 1.6587491035461426, "rewards/rejected": -15.801136016845703, "step": 36 }, { "epoch": 0.0050381263616557734, "grad_norm": 147.24495848753503, "learning_rate": 4.0272108843537414e-08, "logits/chosen": -7.519924163818359, "logits/rejected": -8.549153327941895, "logps/chosen": -1.178124189376831, "logps/rejected": -1.1782786846160889, "loss": 5.1081, "rewards/accuracies": 0.5, "rewards/chosen": -11.781242370605469, "rewards/margins": 0.0015451908111572266, "rewards/rejected": -11.782787322998047, "step": 37 }, { "epoch": 0.0051742919389978215, "grad_norm": 82.33515551791946, "learning_rate": 4.136054421768707e-08, "logits/chosen": -8.169211387634277, "logits/rejected": -7.195664882659912, "logps/chosen": -1.086193323135376, "logps/rejected": -2.0211470127105713, "loss": 4.4568, "rewards/accuracies": 1.0, "rewards/chosen": -10.861932754516602, "rewards/margins": 9.349535942077637, "rewards/rejected": -20.211467742919922, "step": 38 }, { "epoch": 0.0053104575163398695, "grad_norm": 154.33189417762586, "learning_rate": 4.244897959183673e-08, "logits/chosen": -6.354782581329346, "logits/rejected": -6.38871955871582, "logps/chosen": -2.1581192016601562, "logps/rejected": -1.4443436861038208, "loss": 6.4702, "rewards/accuracies": 0.25, "rewards/chosen": -21.581192016601562, "rewards/margins": -7.137753963470459, "rewards/rejected": -14.443437576293945, "step": 39 }, { "epoch": 0.0054466230936819175, "grad_norm": 84.10312094932894, "learning_rate": 4.353741496598639e-08, "logits/chosen": -11.454967498779297, "logits/rejected": -7.408640384674072, "logps/chosen": -1.6280879974365234, "logps/rejected": -1.2988227605819702, "loss": 5.1652, "rewards/accuracies": 0.5, "rewards/chosen": -16.280879974365234, "rewards/margins": -3.292651414871216, "rewards/rejected": -12.988227844238281, "step": 40 }, { "epoch": 0.0055827886710239655, "grad_norm": 98.41552039031836, "learning_rate": 4.462585034013605e-08, "logits/chosen": -9.39912223815918, "logits/rejected": -8.352947235107422, "logps/chosen": -0.8734592795372009, "logps/rejected": -1.1101105213165283, "loss": 6.844, "rewards/accuracies": 1.0, "rewards/chosen": -8.73459243774414, "rewards/margins": 2.366513252258301, "rewards/rejected": -11.101105690002441, "step": 41 }, { "epoch": 0.005718954248366013, "grad_norm": 133.43674955444925, "learning_rate": 4.571428571428571e-08, "logits/chosen": -7.2400221824646, "logits/rejected": -8.104711532592773, "logps/chosen": -2.300874710083008, "logps/rejected": -1.7017452716827393, "loss": 6.6227, "rewards/accuracies": 0.25, "rewards/chosen": -23.008747100830078, "rewards/margins": -5.991292953491211, "rewards/rejected": -17.017454147338867, "step": 42 }, { "epoch": 0.005855119825708061, "grad_norm": 124.26319880689016, "learning_rate": 4.680272108843537e-08, "logits/chosen": -6.776821136474609, "logits/rejected": -7.68208122253418, "logps/chosen": -1.5143163204193115, "logps/rejected": -1.5395972728729248, "loss": 6.8612, "rewards/accuracies": 0.5, "rewards/chosen": -15.143162727355957, "rewards/margins": 0.2528102397918701, "rewards/rejected": -15.395973205566406, "step": 43 }, { "epoch": 0.005991285403050109, "grad_norm": 144.04833755610252, "learning_rate": 4.789115646258503e-08, "logits/chosen": -9.737911224365234, "logits/rejected": -7.971320152282715, "logps/chosen": -0.9367937445640564, "logps/rejected": -1.076307773590088, "loss": 7.0768, "rewards/accuracies": 0.5, "rewards/chosen": -9.367937088012695, "rewards/margins": 1.395140290260315, "rewards/rejected": -10.763077735900879, "step": 44 }, { "epoch": 0.006127450980392157, "grad_norm": 101.72183976160838, "learning_rate": 4.897959183673469e-08, "logits/chosen": -7.6925578117370605, "logits/rejected": -7.423181533813477, "logps/chosen": -1.545872449874878, "logps/rejected": -1.6742762327194214, "loss": 5.727, "rewards/accuracies": 0.5, "rewards/chosen": -15.458723068237305, "rewards/margins": 1.2840385437011719, "rewards/rejected": -16.74276351928711, "step": 45 }, { "epoch": 0.006263616557734205, "grad_norm": 125.3616686943247, "learning_rate": 5.006802721088436e-08, "logits/chosen": -7.420891761779785, "logits/rejected": -6.859945297241211, "logps/chosen": -4.174806118011475, "logps/rejected": -1.7693819999694824, "loss": 6.2751, "rewards/accuracies": 0.75, "rewards/chosen": -41.7480583190918, "rewards/margins": -24.054237365722656, "rewards/rejected": -17.693819046020508, "step": 46 }, { "epoch": 0.006399782135076253, "grad_norm": 96.64040500873067, "learning_rate": 5.115646258503401e-08, "logits/chosen": -8.11874771118164, "logits/rejected": -7.624804496765137, "logps/chosen": -1.5863895416259766, "logps/rejected": -1.7539196014404297, "loss": 5.3442, "rewards/accuracies": 0.75, "rewards/chosen": -15.863895416259766, "rewards/margins": 1.6753010749816895, "rewards/rejected": -17.539196014404297, "step": 47 }, { "epoch": 0.006535947712418301, "grad_norm": 99.52356907473843, "learning_rate": 5.2244897959183676e-08, "logits/chosen": -7.503182411193848, "logits/rejected": -7.31477165222168, "logps/chosen": -1.638970136642456, "logps/rejected": -2.0264317989349365, "loss": 5.4639, "rewards/accuracies": 0.5, "rewards/chosen": -16.38970184326172, "rewards/margins": 3.8746156692504883, "rewards/rejected": -20.26431655883789, "step": 48 }, { "epoch": 0.006672113289760349, "grad_norm": 108.7423549903766, "learning_rate": 5.333333333333333e-08, "logits/chosen": -6.89417839050293, "logits/rejected": -7.197060585021973, "logps/chosen": -1.3763270378112793, "logps/rejected": -0.9428555965423584, "loss": 5.5456, "rewards/accuracies": 0.0, "rewards/chosen": -13.763269424438477, "rewards/margins": -4.334713935852051, "rewards/rejected": -9.428556442260742, "step": 49 }, { "epoch": 0.006808278867102397, "grad_norm": 130.15832449324566, "learning_rate": 5.4421768707482993e-08, "logits/chosen": -9.584125518798828, "logits/rejected": -7.581586837768555, "logps/chosen": -1.2842180728912354, "logps/rejected": -1.396111011505127, "loss": 5.9542, "rewards/accuracies": 0.5, "rewards/chosen": -12.842180252075195, "rewards/margins": 1.1189295053482056, "rewards/rejected": -13.961109161376953, "step": 50 }, { "epoch": 0.006944444444444444, "grad_norm": 110.64279836560613, "learning_rate": 5.551020408163265e-08, "logits/chosen": -7.62509298324585, "logits/rejected": -7.249032020568848, "logps/chosen": -1.2819865942001343, "logps/rejected": -1.0427690744400024, "loss": 5.719, "rewards/accuracies": 0.0, "rewards/chosen": -12.819866180419922, "rewards/margins": -2.3921751976013184, "rewards/rejected": -10.427690505981445, "step": 51 }, { "epoch": 0.007080610021786492, "grad_norm": 119.87864609847475, "learning_rate": 5.659863945578232e-08, "logits/chosen": -6.505054473876953, "logits/rejected": -9.39389419555664, "logps/chosen": -4.63499116897583, "logps/rejected": -1.8232948780059814, "loss": 6.5677, "rewards/accuracies": 0.5, "rewards/chosen": -46.34991455078125, "rewards/margins": -28.11696434020996, "rewards/rejected": -18.232948303222656, "step": 52 }, { "epoch": 0.00721677559912854, "grad_norm": 81.26726532006398, "learning_rate": 5.768707482993197e-08, "logits/chosen": -9.371644020080566, "logits/rejected": -9.768196105957031, "logps/chosen": -1.5649704933166504, "logps/rejected": -1.9932059049606323, "loss": 4.8789, "rewards/accuracies": 0.5, "rewards/chosen": -15.64970588684082, "rewards/margins": 4.282353401184082, "rewards/rejected": -19.932058334350586, "step": 53 }, { "epoch": 0.007352941176470588, "grad_norm": 113.15510629076843, "learning_rate": 5.8775510204081636e-08, "logits/chosen": -7.878812789916992, "logits/rejected": -5.903256416320801, "logps/chosen": -1.3815128803253174, "logps/rejected": -1.8615334033966064, "loss": 5.9815, "rewards/accuracies": 0.75, "rewards/chosen": -13.815129280090332, "rewards/margins": 4.800204277038574, "rewards/rejected": -18.615333557128906, "step": 54 }, { "epoch": 0.007489106753812636, "grad_norm": 97.88784055960298, "learning_rate": 5.986394557823129e-08, "logits/chosen": -8.673227310180664, "logits/rejected": -7.9167280197143555, "logps/chosen": -1.2534761428833008, "logps/rejected": -1.3685142993927002, "loss": 5.7756, "rewards/accuracies": 0.5, "rewards/chosen": -12.534761428833008, "rewards/margins": 1.150381326675415, "rewards/rejected": -13.68514347076416, "step": 55 }, { "epoch": 0.007625272331154684, "grad_norm": 107.17320325974283, "learning_rate": 6.095238095238095e-08, "logits/chosen": -7.264116287231445, "logits/rejected": -9.411066055297852, "logps/chosen": -1.898555040359497, "logps/rejected": -1.3082913160324097, "loss": 5.4008, "rewards/accuracies": 0.25, "rewards/chosen": -18.985549926757812, "rewards/margins": -5.902637481689453, "rewards/rejected": -13.08291244506836, "step": 56 }, { "epoch": 0.007761437908496732, "grad_norm": 63.47571034378853, "learning_rate": 6.20408163265306e-08, "logits/chosen": -9.199972152709961, "logits/rejected": -8.315970420837402, "logps/chosen": -1.6043345928192139, "logps/rejected": -1.5709304809570312, "loss": 4.5587, "rewards/accuracies": 0.5, "rewards/chosen": -16.043346405029297, "rewards/margins": -0.3340415954589844, "rewards/rejected": -15.709305763244629, "step": 57 }, { "epoch": 0.00789760348583878, "grad_norm": 70.99979376886064, "learning_rate": 6.312925170068026e-08, "logits/chosen": -8.813874244689941, "logits/rejected": -9.394254684448242, "logps/chosen": -1.4622546434402466, "logps/rejected": -1.5640861988067627, "loss": 5.174, "rewards/accuracies": 0.5, "rewards/chosen": -14.622546195983887, "rewards/margins": 1.0183155536651611, "rewards/rejected": -15.640861511230469, "step": 58 }, { "epoch": 0.008033769063180828, "grad_norm": 72.15643816550936, "learning_rate": 6.421768707482992e-08, "logits/chosen": -8.310294151306152, "logits/rejected": -8.120220184326172, "logps/chosen": -1.091858148574829, "logps/rejected": -1.5291668176651, "loss": 4.9232, "rewards/accuracies": 0.75, "rewards/chosen": -10.918581008911133, "rewards/margins": 4.373086929321289, "rewards/rejected": -15.291667938232422, "step": 59 }, { "epoch": 0.008169934640522876, "grad_norm": 67.97018547407833, "learning_rate": 6.530612244897959e-08, "logits/chosen": -6.65509557723999, "logits/rejected": -9.490769386291504, "logps/chosen": -1.9426493644714355, "logps/rejected": -0.8362451791763306, "loss": 5.0841, "rewards/accuracies": 0.0, "rewards/chosen": -19.426494598388672, "rewards/margins": -11.064042091369629, "rewards/rejected": -8.362451553344727, "step": 60 }, { "epoch": 0.008306100217864924, "grad_norm": 74.71438410984017, "learning_rate": 6.639455782312925e-08, "logits/chosen": -8.316003799438477, "logits/rejected": -7.666031837463379, "logps/chosen": -1.1824958324432373, "logps/rejected": -1.5117268562316895, "loss": 4.5549, "rewards/accuracies": 0.5, "rewards/chosen": -11.824957847595215, "rewards/margins": 3.2923097610473633, "rewards/rejected": -15.117267608642578, "step": 61 }, { "epoch": 0.008442265795206972, "grad_norm": 138.6088203982334, "learning_rate": 6.748299319727891e-08, "logits/chosen": -9.967666625976562, "logits/rejected": -8.454381942749023, "logps/chosen": -1.2460039854049683, "logps/rejected": -1.7386822700500488, "loss": 6.5243, "rewards/accuracies": 0.5, "rewards/chosen": -12.460041046142578, "rewards/margins": 4.92678165435791, "rewards/rejected": -17.386821746826172, "step": 62 }, { "epoch": 0.00857843137254902, "grad_norm": 70.43622085386934, "learning_rate": 6.857142857142857e-08, "logits/chosen": -8.940287590026855, "logits/rejected": -6.79775333404541, "logps/chosen": -1.1265895366668701, "logps/rejected": -2.7872064113616943, "loss": 4.7779, "rewards/accuracies": 0.75, "rewards/chosen": -11.265893936157227, "rewards/margins": 16.606170654296875, "rewards/rejected": -27.8720645904541, "step": 63 }, { "epoch": 0.008714596949891068, "grad_norm": 99.77522156713466, "learning_rate": 6.965986394557823e-08, "logits/chosen": -7.091365814208984, "logits/rejected": -6.561487197875977, "logps/chosen": -1.318249225616455, "logps/rejected": -1.9107496738433838, "loss": 4.8264, "rewards/accuracies": 1.0, "rewards/chosen": -13.18249225616455, "rewards/margins": 5.925003528594971, "rewards/rejected": -19.10749626159668, "step": 64 }, { "epoch": 0.008850762527233115, "grad_norm": 122.70216789510816, "learning_rate": 7.074829931972789e-08, "logits/chosen": -8.370696067810059, "logits/rejected": -7.31377649307251, "logps/chosen": -1.2015694379806519, "logps/rejected": -1.3841997385025024, "loss": 5.8268, "rewards/accuracies": 0.75, "rewards/chosen": -12.015693664550781, "rewards/margins": 1.8263037204742432, "rewards/rejected": -13.841998100280762, "step": 65 }, { "epoch": 0.008986928104575163, "grad_norm": 132.0032304264201, "learning_rate": 7.183673469387754e-08, "logits/chosen": -8.399620056152344, "logits/rejected": -7.441420078277588, "logps/chosen": -1.150929570198059, "logps/rejected": -1.578810214996338, "loss": 6.2916, "rewards/accuracies": 1.0, "rewards/chosen": -11.509295463562012, "rewards/margins": 4.278807163238525, "rewards/rejected": -15.788103103637695, "step": 66 }, { "epoch": 0.00912309368191721, "grad_norm": 63.150089606353056, "learning_rate": 7.292517006802721e-08, "logits/chosen": -7.126203536987305, "logits/rejected": -7.484006404876709, "logps/chosen": -1.4870810508728027, "logps/rejected": -1.5404748916625977, "loss": 4.7745, "rewards/accuracies": 0.5, "rewards/chosen": -14.870810508728027, "rewards/margins": 0.533939003944397, "rewards/rejected": -15.404748916625977, "step": 67 }, { "epoch": 0.009259259259259259, "grad_norm": 125.45364597271929, "learning_rate": 7.401360544217686e-08, "logits/chosen": -9.911307334899902, "logits/rejected": -9.331811904907227, "logps/chosen": -1.6816699504852295, "logps/rejected": -2.4589548110961914, "loss": 6.4537, "rewards/accuracies": 1.0, "rewards/chosen": -16.816699981689453, "rewards/margins": 7.772848129272461, "rewards/rejected": -24.58954620361328, "step": 68 }, { "epoch": 0.009395424836601307, "grad_norm": 86.1941974771448, "learning_rate": 7.510204081632653e-08, "logits/chosen": -6.7245893478393555, "logits/rejected": -5.919681549072266, "logps/chosen": -1.358107089996338, "logps/rejected": -1.5384711027145386, "loss": 5.7238, "rewards/accuracies": 0.75, "rewards/chosen": -13.581069946289062, "rewards/margins": 1.8036401271820068, "rewards/rejected": -15.384710311889648, "step": 69 }, { "epoch": 0.009531590413943355, "grad_norm": 94.41013818517813, "learning_rate": 7.619047619047618e-08, "logits/chosen": -7.906219482421875, "logits/rejected": -7.004551887512207, "logps/chosen": -1.636581301689148, "logps/rejected": -1.9006057977676392, "loss": 5.26, "rewards/accuracies": 0.5, "rewards/chosen": -16.365814208984375, "rewards/margins": 2.640244722366333, "rewards/rejected": -19.006057739257812, "step": 70 }, { "epoch": 0.009667755991285403, "grad_norm": 82.20978279761624, "learning_rate": 7.727891156462584e-08, "logits/chosen": -8.558752059936523, "logits/rejected": -7.616976261138916, "logps/chosen": -0.8557428121566772, "logps/rejected": -1.2832086086273193, "loss": 4.9706, "rewards/accuracies": 0.75, "rewards/chosen": -8.557427406311035, "rewards/margins": 4.274658203125, "rewards/rejected": -12.832085609436035, "step": 71 }, { "epoch": 0.00980392156862745, "grad_norm": 72.28125414239635, "learning_rate": 7.83673469387755e-08, "logits/chosen": -7.2157487869262695, "logits/rejected": -6.2108306884765625, "logps/chosen": -1.5208988189697266, "logps/rejected": -1.798633098602295, "loss": 5.6179, "rewards/accuracies": 0.5, "rewards/chosen": -15.208988189697266, "rewards/margins": 2.7773427963256836, "rewards/rejected": -17.986331939697266, "step": 72 }, { "epoch": 0.009940087145969499, "grad_norm": 174.15594229895135, "learning_rate": 7.945578231292516e-08, "logits/chosen": -8.816532135009766, "logits/rejected": -8.565436363220215, "logps/chosen": -1.0466110706329346, "logps/rejected": -1.3531405925750732, "loss": 8.2356, "rewards/accuracies": 0.75, "rewards/chosen": -10.466110229492188, "rewards/margins": 3.0652964115142822, "rewards/rejected": -13.53140640258789, "step": 73 }, { "epoch": 0.010076252723311547, "grad_norm": 114.1189808677808, "learning_rate": 8.054421768707483e-08, "logits/chosen": -7.946718215942383, "logits/rejected": -8.267077445983887, "logps/chosen": -1.184006690979004, "logps/rejected": -1.359445333480835, "loss": 5.4727, "rewards/accuracies": 0.25, "rewards/chosen": -11.840066909790039, "rewards/margins": 1.7543872594833374, "rewards/rejected": -13.594453811645508, "step": 74 }, { "epoch": 0.010212418300653595, "grad_norm": 99.28729028496868, "learning_rate": 8.16326530612245e-08, "logits/chosen": -7.435127258300781, "logits/rejected": -7.248805999755859, "logps/chosen": -2.4092087745666504, "logps/rejected": -2.0611274242401123, "loss": 6.359, "rewards/accuracies": 0.75, "rewards/chosen": -24.09208869934082, "rewards/margins": -3.4808149337768555, "rewards/rejected": -20.61127281188965, "step": 75 }, { "epoch": 0.010348583877995643, "grad_norm": 71.36210910793139, "learning_rate": 8.272108843537415e-08, "logits/chosen": -7.562510013580322, "logits/rejected": -8.113588333129883, "logps/chosen": -1.5316396951675415, "logps/rejected": -0.9470953941345215, "loss": 4.6281, "rewards/accuracies": 0.0, "rewards/chosen": -15.31639575958252, "rewards/margins": -5.845442295074463, "rewards/rejected": -9.470952987670898, "step": 76 }, { "epoch": 0.010484749455337691, "grad_norm": 152.5020634367813, "learning_rate": 8.380952380952381e-08, "logits/chosen": -8.81529426574707, "logits/rejected": -7.513816833496094, "logps/chosen": -1.0823626518249512, "logps/rejected": -1.5705209970474243, "loss": 6.0376, "rewards/accuracies": 0.5, "rewards/chosen": -10.823626518249512, "rewards/margins": 4.881583213806152, "rewards/rejected": -15.705209732055664, "step": 77 }, { "epoch": 0.010620915032679739, "grad_norm": 115.58135216339514, "learning_rate": 8.489795918367346e-08, "logits/chosen": -6.990825176239014, "logits/rejected": -6.871254920959473, "logps/chosen": -1.3340160846710205, "logps/rejected": -1.018347144126892, "loss": 6.2653, "rewards/accuracies": 0.25, "rewards/chosen": -13.340160369873047, "rewards/margins": -3.156689167022705, "rewards/rejected": -10.183470726013184, "step": 78 }, { "epoch": 0.010757080610021787, "grad_norm": 91.04966573031922, "learning_rate": 8.598639455782313e-08, "logits/chosen": -8.935365676879883, "logits/rejected": -9.263147354125977, "logps/chosen": -0.8826298117637634, "logps/rejected": -0.7888429760932922, "loss": 4.7763, "rewards/accuracies": 0.25, "rewards/chosen": -8.826297760009766, "rewards/margins": -0.9378681182861328, "rewards/rejected": -7.888429641723633, "step": 79 }, { "epoch": 0.010893246187363835, "grad_norm": 92.66196323244837, "learning_rate": 8.707482993197278e-08, "logits/chosen": -7.85068416595459, "logits/rejected": -8.759309768676758, "logps/chosen": -1.5066897869110107, "logps/rejected": -1.4703220129013062, "loss": 6.0878, "rewards/accuracies": 0.5, "rewards/chosen": -15.066898345947266, "rewards/margins": -0.3636772632598877, "rewards/rejected": -14.70322036743164, "step": 80 }, { "epoch": 0.011029411764705883, "grad_norm": 66.24374178406546, "learning_rate": 8.816326530612245e-08, "logits/chosen": -6.682755470275879, "logits/rejected": -7.729180335998535, "logps/chosen": -1.4993747472763062, "logps/rejected": -1.3412177562713623, "loss": 5.0086, "rewards/accuracies": 0.5, "rewards/chosen": -14.99374771118164, "rewards/margins": -1.5815701484680176, "rewards/rejected": -13.412177085876465, "step": 81 }, { "epoch": 0.011165577342047931, "grad_norm": 107.79983831497081, "learning_rate": 8.92517006802721e-08, "logits/chosen": -7.459165096282959, "logits/rejected": -8.513818740844727, "logps/chosen": -1.28342866897583, "logps/rejected": -1.2030141353607178, "loss": 5.5921, "rewards/accuracies": 0.75, "rewards/chosen": -12.8342866897583, "rewards/margins": -0.8041467666625977, "rewards/rejected": -12.030139923095703, "step": 82 }, { "epoch": 0.011301742919389977, "grad_norm": 114.67483127909163, "learning_rate": 9.034013605442176e-08, "logits/chosen": -7.378748416900635, "logits/rejected": -6.690046787261963, "logps/chosen": -1.793397068977356, "logps/rejected": -2.0308127403259277, "loss": 6.2229, "rewards/accuracies": 0.75, "rewards/chosen": -17.933971405029297, "rewards/margins": 2.374157667160034, "rewards/rejected": -20.308128356933594, "step": 83 }, { "epoch": 0.011437908496732025, "grad_norm": 76.31372176217329, "learning_rate": 9.142857142857142e-08, "logits/chosen": -7.920049667358398, "logits/rejected": -9.507408142089844, "logps/chosen": -1.574668288230896, "logps/rejected": -1.28892183303833, "loss": 5.2408, "rewards/accuracies": 0.25, "rewards/chosen": -15.746682167053223, "rewards/margins": -2.8574633598327637, "rewards/rejected": -12.889219284057617, "step": 84 }, { "epoch": 0.011574074074074073, "grad_norm": 53.19374567724993, "learning_rate": 9.251700680272108e-08, "logits/chosen": -7.838123321533203, "logits/rejected": -8.164039611816406, "logps/chosen": -1.3105769157409668, "logps/rejected": -1.5724979639053345, "loss": 4.7178, "rewards/accuracies": 0.5, "rewards/chosen": -13.105770111083984, "rewards/margins": 2.6192095279693604, "rewards/rejected": -15.724979400634766, "step": 85 }, { "epoch": 0.011710239651416121, "grad_norm": 83.11818058397047, "learning_rate": 9.360544217687074e-08, "logits/chosen": -8.956910133361816, "logits/rejected": -7.127848148345947, "logps/chosen": -0.9617190361022949, "logps/rejected": -1.3131762742996216, "loss": 4.5313, "rewards/accuracies": 0.75, "rewards/chosen": -9.61719036102295, "rewards/margins": 3.514573335647583, "rewards/rejected": -13.131763458251953, "step": 86 }, { "epoch": 0.01184640522875817, "grad_norm": 135.50909691562592, "learning_rate": 9.46938775510204e-08, "logits/chosen": -8.015868186950684, "logits/rejected": -7.18034553527832, "logps/chosen": -1.4160012006759644, "logps/rejected": -1.899852991104126, "loss": 6.3869, "rewards/accuracies": 0.75, "rewards/chosen": -14.160012245178223, "rewards/margins": 4.838517189025879, "rewards/rejected": -18.9985294342041, "step": 87 }, { "epoch": 0.011982570806100218, "grad_norm": 194.8526131887022, "learning_rate": 9.578231292517007e-08, "logits/chosen": -8.310699462890625, "logits/rejected": -7.483238220214844, "logps/chosen": -1.567845344543457, "logps/rejected": -2.027216672897339, "loss": 6.5591, "rewards/accuracies": 0.75, "rewards/chosen": -15.678454399108887, "rewards/margins": 4.593711853027344, "rewards/rejected": -20.272167205810547, "step": 88 }, { "epoch": 0.012118736383442266, "grad_norm": 86.41937950338823, "learning_rate": 9.687074829931973e-08, "logits/chosen": -8.387208938598633, "logits/rejected": -8.707887649536133, "logps/chosen": -1.0567429065704346, "logps/rejected": -1.2387934923171997, "loss": 5.2817, "rewards/accuracies": 0.75, "rewards/chosen": -10.567428588867188, "rewards/margins": 1.8205052614212036, "rewards/rejected": -12.387933731079102, "step": 89 }, { "epoch": 0.012254901960784314, "grad_norm": 167.52811527627333, "learning_rate": 9.795918367346938e-08, "logits/chosen": -8.506210327148438, "logits/rejected": -7.366031169891357, "logps/chosen": -1.1873722076416016, "logps/rejected": -1.6340794563293457, "loss": 5.9765, "rewards/accuracies": 0.75, "rewards/chosen": -11.873722076416016, "rewards/margins": 4.467071533203125, "rewards/rejected": -16.34079360961914, "step": 90 }, { "epoch": 0.012391067538126362, "grad_norm": 99.77974952104485, "learning_rate": 9.904761904761905e-08, "logits/chosen": -6.567556381225586, "logits/rejected": -6.088610649108887, "logps/chosen": -2.4839515686035156, "logps/rejected": -1.5262925624847412, "loss": 5.1157, "rewards/accuracies": 0.25, "rewards/chosen": -24.839515686035156, "rewards/margins": -9.576591491699219, "rewards/rejected": -15.262925148010254, "step": 91 }, { "epoch": 0.01252723311546841, "grad_norm": 138.5112647724549, "learning_rate": 1.0013605442176872e-07, "logits/chosen": -8.968034744262695, "logits/rejected": -7.947174072265625, "logps/chosen": -1.3343839645385742, "logps/rejected": -1.4076025485992432, "loss": 6.2193, "rewards/accuracies": 0.75, "rewards/chosen": -13.343840599060059, "rewards/margins": 0.7321840524673462, "rewards/rejected": -14.076025009155273, "step": 92 }, { "epoch": 0.012663398692810458, "grad_norm": 76.83693389649015, "learning_rate": 1.0122448979591835e-07, "logits/chosen": -8.755231857299805, "logits/rejected": -6.5800371170043945, "logps/chosen": -1.6095538139343262, "logps/rejected": -1.6626031398773193, "loss": 4.981, "rewards/accuracies": 0.25, "rewards/chosen": -16.095539093017578, "rewards/margins": 0.5304934978485107, "rewards/rejected": -16.62603187561035, "step": 93 }, { "epoch": 0.012799564270152506, "grad_norm": 67.67459755217072, "learning_rate": 1.0231292517006802e-07, "logits/chosen": -8.35530948638916, "logits/rejected": -7.346632957458496, "logps/chosen": -1.5239591598510742, "logps/rejected": -1.4855124950408936, "loss": 4.976, "rewards/accuracies": 0.5, "rewards/chosen": -15.239592552185059, "rewards/margins": -0.38446712493896484, "rewards/rejected": -14.855125427246094, "step": 94 }, { "epoch": 0.012935729847494554, "grad_norm": 138.79095674842807, "learning_rate": 1.0340136054421769e-07, "logits/chosen": -7.688117027282715, "logits/rejected": -5.764824867248535, "logps/chosen": -4.866403579711914, "logps/rejected": -1.7673399448394775, "loss": 6.2775, "rewards/accuracies": 0.5, "rewards/chosen": -48.664031982421875, "rewards/margins": -30.990633010864258, "rewards/rejected": -17.67340087890625, "step": 95 }, { "epoch": 0.013071895424836602, "grad_norm": 72.14711455913951, "learning_rate": 1.0448979591836735e-07, "logits/chosen": -8.103166580200195, "logits/rejected": -8.934942245483398, "logps/chosen": -1.0589070320129395, "logps/rejected": -1.3673222064971924, "loss": 5.4907, "rewards/accuracies": 0.5, "rewards/chosen": -10.589070320129395, "rewards/margins": 3.0841519832611084, "rewards/rejected": -13.673222541809082, "step": 96 }, { "epoch": 0.01320806100217865, "grad_norm": 80.5400932846621, "learning_rate": 1.0557823129251699e-07, "logits/chosen": -8.798177719116211, "logits/rejected": -7.312837600708008, "logps/chosen": -1.3897900581359863, "logps/rejected": -1.9143118858337402, "loss": 4.662, "rewards/accuracies": 1.0, "rewards/chosen": -13.89790153503418, "rewards/margins": 5.245217800140381, "rewards/rejected": -19.14311981201172, "step": 97 }, { "epoch": 0.013344226579520698, "grad_norm": 132.90113160721006, "learning_rate": 1.0666666666666666e-07, "logits/chosen": -6.230717658996582, "logits/rejected": -6.846138000488281, "logps/chosen": -1.3648370504379272, "logps/rejected": -1.1475584506988525, "loss": 5.6988, "rewards/accuracies": 0.25, "rewards/chosen": -13.648370742797852, "rewards/margins": -2.172786235809326, "rewards/rejected": -11.475584030151367, "step": 98 }, { "epoch": 0.013480392156862746, "grad_norm": 103.64860072059895, "learning_rate": 1.0775510204081632e-07, "logits/chosen": -8.25450325012207, "logits/rejected": -7.826421737670898, "logps/chosen": -1.0872471332550049, "logps/rejected": -1.0798908472061157, "loss": 4.986, "rewards/accuracies": 0.5, "rewards/chosen": -10.87247085571289, "rewards/margins": -0.07356202602386475, "rewards/rejected": -10.798908233642578, "step": 99 }, { "epoch": 0.013616557734204794, "grad_norm": 161.02047160147487, "learning_rate": 1.0884353741496599e-07, "logits/chosen": -7.526217460632324, "logits/rejected": -8.304452896118164, "logps/chosen": -1.9105037450790405, "logps/rejected": -1.7503979206085205, "loss": 5.6975, "rewards/accuracies": 0.5, "rewards/chosen": -19.105037689208984, "rewards/margins": -1.6010582447052002, "rewards/rejected": -17.503978729248047, "step": 100 }, { "epoch": 0.01375272331154684, "grad_norm": 87.66251614498431, "learning_rate": 1.0993197278911564e-07, "logits/chosen": -8.093244552612305, "logits/rejected": -8.369668960571289, "logps/chosen": -0.9477163553237915, "logps/rejected": -0.9063022136688232, "loss": 4.9535, "rewards/accuracies": 0.25, "rewards/chosen": -9.477163314819336, "rewards/margins": -0.4141418933868408, "rewards/rejected": -9.063021659851074, "step": 101 }, { "epoch": 0.013888888888888888, "grad_norm": 147.94781920097606, "learning_rate": 1.110204081632653e-07, "logits/chosen": -8.414246559143066, "logits/rejected": -8.22299861907959, "logps/chosen": -1.2817943096160889, "logps/rejected": -1.3437156677246094, "loss": 6.1949, "rewards/accuracies": 0.25, "rewards/chosen": -12.81794261932373, "rewards/margins": 0.6192144155502319, "rewards/rejected": -13.437156677246094, "step": 102 }, { "epoch": 0.014025054466230936, "grad_norm": 69.18705436076095, "learning_rate": 1.1210884353741497e-07, "logits/chosen": -7.203943729400635, "logits/rejected": -7.0678205490112305, "logps/chosen": -1.3164646625518799, "logps/rejected": -1.4119610786437988, "loss": 4.5866, "rewards/accuracies": 0.5, "rewards/chosen": -13.16464614868164, "rewards/margins": 0.9549641609191895, "rewards/rejected": -14.119610786437988, "step": 103 }, { "epoch": 0.014161220043572984, "grad_norm": 147.6829392865544, "learning_rate": 1.1319727891156464e-07, "logits/chosen": -7.827901363372803, "logits/rejected": -9.09907341003418, "logps/chosen": -1.3967491388320923, "logps/rejected": -1.0639358758926392, "loss": 5.6567, "rewards/accuracies": 0.5, "rewards/chosen": -13.967491149902344, "rewards/margins": -3.328132152557373, "rewards/rejected": -10.639358520507812, "step": 104 }, { "epoch": 0.014297385620915032, "grad_norm": 139.16878999895226, "learning_rate": 1.1428571428571427e-07, "logits/chosen": -8.653733253479004, "logits/rejected": -8.594332695007324, "logps/chosen": -1.2562005519866943, "logps/rejected": -1.2710366249084473, "loss": 5.9136, "rewards/accuracies": 0.5, "rewards/chosen": -12.562005043029785, "rewards/margins": 0.1483595371246338, "rewards/rejected": -12.71036434173584, "step": 105 }, { "epoch": 0.01443355119825708, "grad_norm": 74.16418578351004, "learning_rate": 1.1537414965986394e-07, "logits/chosen": -8.14253044128418, "logits/rejected": -8.070277214050293, "logps/chosen": -1.6088261604309082, "logps/rejected": -1.5226283073425293, "loss": 5.0765, "rewards/accuracies": 0.5, "rewards/chosen": -16.088260650634766, "rewards/margins": -0.8619790077209473, "rewards/rejected": -15.226282119750977, "step": 106 }, { "epoch": 0.014569716775599128, "grad_norm": 90.97523981322425, "learning_rate": 1.164625850340136e-07, "logits/chosen": -8.80117416381836, "logits/rejected": -6.526435852050781, "logps/chosen": -1.2978707551956177, "logps/rejected": -1.7596895694732666, "loss": 5.2639, "rewards/accuracies": 1.0, "rewards/chosen": -12.978708267211914, "rewards/margins": 4.618188858032227, "rewards/rejected": -17.596895217895508, "step": 107 }, { "epoch": 0.014705882352941176, "grad_norm": 152.92236713673708, "learning_rate": 1.1755102040816327e-07, "logits/chosen": -8.181175231933594, "logits/rejected": -8.921409606933594, "logps/chosen": -2.7746469974517822, "logps/rejected": -1.390120029449463, "loss": 4.6746, "rewards/accuracies": 0.5, "rewards/chosen": -27.746471405029297, "rewards/margins": -13.845272064208984, "rewards/rejected": -13.901199340820312, "step": 108 }, { "epoch": 0.014842047930283224, "grad_norm": 112.70207828013707, "learning_rate": 1.1863945578231291e-07, "logits/chosen": -8.3693208694458, "logits/rejected": -6.938942909240723, "logps/chosen": -1.2172410488128662, "logps/rejected": -1.351880431175232, "loss": 4.8407, "rewards/accuracies": 0.5, "rewards/chosen": -12.172409057617188, "rewards/margins": 1.3463950157165527, "rewards/rejected": -13.518804550170898, "step": 109 }, { "epoch": 0.014978213507625272, "grad_norm": 121.54226995243312, "learning_rate": 1.1972789115646258e-07, "logits/chosen": -6.318542957305908, "logits/rejected": -6.55689811706543, "logps/chosen": -3.979107141494751, "logps/rejected": -1.9599238634109497, "loss": 6.4685, "rewards/accuracies": 0.25, "rewards/chosen": -39.79107666015625, "rewards/margins": -20.191835403442383, "rewards/rejected": -19.599239349365234, "step": 110 }, { "epoch": 0.01511437908496732, "grad_norm": 84.58190857401989, "learning_rate": 1.2081632653061225e-07, "logits/chosen": -8.3256196975708, "logits/rejected": -7.410186767578125, "logps/chosen": -1.3572335243225098, "logps/rejected": -1.0426664352416992, "loss": 5.0825, "rewards/accuracies": 0.25, "rewards/chosen": -13.572334289550781, "rewards/margins": -3.14566969871521, "rewards/rejected": -10.426664352416992, "step": 111 }, { "epoch": 0.015250544662309368, "grad_norm": 65.64314693674002, "learning_rate": 1.219047619047619e-07, "logits/chosen": -6.951257705688477, "logits/rejected": -7.209904193878174, "logps/chosen": -1.1895802021026611, "logps/rejected": -1.2549169063568115, "loss": 4.868, "rewards/accuracies": 0.75, "rewards/chosen": -11.895801544189453, "rewards/margins": 0.6533675193786621, "rewards/rejected": -12.549169540405273, "step": 112 }, { "epoch": 0.015386710239651416, "grad_norm": 113.08861912889519, "learning_rate": 1.2299319727891156e-07, "logits/chosen": -8.145864486694336, "logits/rejected": -6.895856857299805, "logps/chosen": -1.325095772743225, "logps/rejected": -1.4878772497177124, "loss": 4.8132, "rewards/accuracies": 0.75, "rewards/chosen": -13.250957489013672, "rewards/margins": 1.627814769744873, "rewards/rejected": -14.878772735595703, "step": 113 }, { "epoch": 0.015522875816993464, "grad_norm": 75.60935770369265, "learning_rate": 1.240816326530612e-07, "logits/chosen": -7.004352569580078, "logits/rejected": -7.219120979309082, "logps/chosen": -1.666276454925537, "logps/rejected": -1.442216157913208, "loss": 4.8391, "rewards/accuracies": 0.25, "rewards/chosen": -16.662765502929688, "rewards/margins": -2.240602493286133, "rewards/rejected": -14.422163009643555, "step": 114 }, { "epoch": 0.01565904139433551, "grad_norm": 145.08244740594614, "learning_rate": 1.251700680272109e-07, "logits/chosen": -7.890751838684082, "logits/rejected": -7.44482421875, "logps/chosen": -0.9217901229858398, "logps/rejected": -1.1771790981292725, "loss": 5.7998, "rewards/accuracies": 0.75, "rewards/chosen": -9.217901229858398, "rewards/margins": 2.5538902282714844, "rewards/rejected": -11.771791458129883, "step": 115 }, { "epoch": 0.01579520697167756, "grad_norm": 81.64108183081876, "learning_rate": 1.2625850340136052e-07, "logits/chosen": -8.327146530151367, "logits/rejected": -7.732615947723389, "logps/chosen": -1.1325106620788574, "logps/rejected": -1.4629204273223877, "loss": 4.8724, "rewards/accuracies": 0.5, "rewards/chosen": -11.32510757446289, "rewards/margins": 3.3040976524353027, "rewards/rejected": -14.629205703735352, "step": 116 }, { "epoch": 0.015931372549019607, "grad_norm": 122.8120274021992, "learning_rate": 1.273469387755102e-07, "logits/chosen": -7.807650089263916, "logits/rejected": -8.775320053100586, "logps/chosen": -1.8602893352508545, "logps/rejected": -1.7113218307495117, "loss": 5.5495, "rewards/accuracies": 0.5, "rewards/chosen": -18.602893829345703, "rewards/margins": -1.48967444896698, "rewards/rejected": -17.113218307495117, "step": 117 }, { "epoch": 0.016067538126361657, "grad_norm": 109.5513659618724, "learning_rate": 1.2843537414965985e-07, "logits/chosen": -7.386325836181641, "logits/rejected": -7.472169399261475, "logps/chosen": -1.4960219860076904, "logps/rejected": -1.2435331344604492, "loss": 4.8499, "rewards/accuracies": 0.25, "rewards/chosen": -14.960220336914062, "rewards/margins": -2.5248897075653076, "rewards/rejected": -12.435330390930176, "step": 118 }, { "epoch": 0.016203703703703703, "grad_norm": 71.06846317108376, "learning_rate": 1.2952380952380953e-07, "logits/chosen": -8.483373641967773, "logits/rejected": -7.18443489074707, "logps/chosen": -0.824257493019104, "logps/rejected": -1.4479196071624756, "loss": 4.4634, "rewards/accuracies": 1.0, "rewards/chosen": -8.242574691772461, "rewards/margins": 6.236621856689453, "rewards/rejected": -14.479196548461914, "step": 119 }, { "epoch": 0.016339869281045753, "grad_norm": 155.3238168301606, "learning_rate": 1.3061224489795918e-07, "logits/chosen": -8.394693374633789, "logits/rejected": -7.223039627075195, "logps/chosen": -1.6742703914642334, "logps/rejected": -1.5777275562286377, "loss": 6.6937, "rewards/accuracies": 0.5, "rewards/chosen": -16.742706298828125, "rewards/margins": -0.9654300212860107, "rewards/rejected": -15.777275085449219, "step": 120 }, { "epoch": 0.0164760348583878, "grad_norm": 65.75487204851369, "learning_rate": 1.3170068027210883e-07, "logits/chosen": -8.552877426147461, "logits/rejected": -9.355611801147461, "logps/chosen": -1.0653274059295654, "logps/rejected": -1.0810471773147583, "loss": 5.119, "rewards/accuracies": 0.5, "rewards/chosen": -10.653273582458496, "rewards/margins": 0.1571979522705078, "rewards/rejected": -10.810471534729004, "step": 121 }, { "epoch": 0.01661220043572985, "grad_norm": 92.27963701703484, "learning_rate": 1.327891156462585e-07, "logits/chosen": -7.977911949157715, "logits/rejected": -8.875438690185547, "logps/chosen": -0.8868775367736816, "logps/rejected": -0.6830118894577026, "loss": 5.8881, "rewards/accuracies": 0.25, "rewards/chosen": -8.8687744140625, "rewards/margins": -2.0386557579040527, "rewards/rejected": -6.8301191329956055, "step": 122 }, { "epoch": 0.016748366013071895, "grad_norm": 133.28121418787444, "learning_rate": 1.3387755102040816e-07, "logits/chosen": -7.237972736358643, "logits/rejected": -7.679261207580566, "logps/chosen": -1.4854984283447266, "logps/rejected": -1.3006603717803955, "loss": 6.7854, "rewards/accuracies": 0.0, "rewards/chosen": -14.85498332977295, "rewards/margins": -1.848380208015442, "rewards/rejected": -13.006603240966797, "step": 123 }, { "epoch": 0.016884531590413945, "grad_norm": 116.64182000018063, "learning_rate": 1.3496598639455781e-07, "logits/chosen": -8.235772132873535, "logits/rejected": -7.443356990814209, "logps/chosen": -1.5524362325668335, "logps/rejected": -2.6640877723693848, "loss": 4.8964, "rewards/accuracies": 0.5, "rewards/chosen": -15.524361610412598, "rewards/margins": 11.116518020629883, "rewards/rejected": -26.640878677368164, "step": 124 }, { "epoch": 0.01702069716775599, "grad_norm": 77.24402407554578, "learning_rate": 1.3605442176870747e-07, "logits/chosen": -8.537908554077148, "logits/rejected": -9.160828590393066, "logps/chosen": -1.2190563678741455, "logps/rejected": -0.9468997716903687, "loss": 4.6712, "rewards/accuracies": 0.5, "rewards/chosen": -12.190563201904297, "rewards/margins": -2.7215657234191895, "rewards/rejected": -9.468997955322266, "step": 125 }, { "epoch": 0.01715686274509804, "grad_norm": 62.60496842150831, "learning_rate": 1.3714285714285715e-07, "logits/chosen": -8.927555084228516, "logits/rejected": -6.9217071533203125, "logps/chosen": -1.3026244640350342, "logps/rejected": -1.621842622756958, "loss": 4.8888, "rewards/accuracies": 0.75, "rewards/chosen": -13.026244163513184, "rewards/margins": 3.1921825408935547, "rewards/rejected": -16.218425750732422, "step": 126 }, { "epoch": 0.017293028322440087, "grad_norm": 71.76649279098766, "learning_rate": 1.382312925170068e-07, "logits/chosen": -8.929429054260254, "logits/rejected": -8.861654281616211, "logps/chosen": -1.2698887586593628, "logps/rejected": -1.2908586263656616, "loss": 4.4883, "rewards/accuracies": 0.75, "rewards/chosen": -12.698887825012207, "rewards/margins": 0.20969891548156738, "rewards/rejected": -12.908586502075195, "step": 127 }, { "epoch": 0.017429193899782137, "grad_norm": 90.7280684147843, "learning_rate": 1.3931972789115645e-07, "logits/chosen": -8.100556373596191, "logits/rejected": -7.284461975097656, "logps/chosen": -1.0994539260864258, "logps/rejected": -1.3498657941818237, "loss": 4.6904, "rewards/accuracies": 0.75, "rewards/chosen": -10.994539260864258, "rewards/margins": 2.5041182041168213, "rewards/rejected": -13.4986572265625, "step": 128 }, { "epoch": 0.017565359477124183, "grad_norm": 119.93755865533443, "learning_rate": 1.404081632653061e-07, "logits/chosen": -8.692374229431152, "logits/rejected": -9.704275131225586, "logps/chosen": -1.1250450611114502, "logps/rejected": -1.0680081844329834, "loss": 5.5376, "rewards/accuracies": 0.5, "rewards/chosen": -11.250452041625977, "rewards/margins": -0.5703698396682739, "rewards/rejected": -10.680082321166992, "step": 129 }, { "epoch": 0.01770152505446623, "grad_norm": 64.18488248079667, "learning_rate": 1.4149659863945578e-07, "logits/chosen": -7.641815662384033, "logits/rejected": -6.911394119262695, "logps/chosen": -1.1077734231948853, "logps/rejected": -1.3053556680679321, "loss": 4.7644, "rewards/accuracies": 0.75, "rewards/chosen": -11.077733993530273, "rewards/margins": 1.9758222103118896, "rewards/rejected": -13.053556442260742, "step": 130 }, { "epoch": 0.01783769063180828, "grad_norm": 96.93506026483561, "learning_rate": 1.4258503401360543e-07, "logits/chosen": -8.411888122558594, "logits/rejected": -7.937638282775879, "logps/chosen": -1.0048186779022217, "logps/rejected": -1.1728907823562622, "loss": 4.828, "rewards/accuracies": 0.75, "rewards/chosen": -10.048186302185059, "rewards/margins": 1.6807211637496948, "rewards/rejected": -11.728907585144043, "step": 131 }, { "epoch": 0.017973856209150325, "grad_norm": 96.99601743690654, "learning_rate": 1.4367346938775509e-07, "logits/chosen": -7.937776565551758, "logits/rejected": -8.09326171875, "logps/chosen": -1.8889554738998413, "logps/rejected": -1.9573417901992798, "loss": 4.5314, "rewards/accuracies": 0.5, "rewards/chosen": -18.889554977416992, "rewards/margins": 0.6838645935058594, "rewards/rejected": -19.57341766357422, "step": 132 }, { "epoch": 0.018110021786492375, "grad_norm": 169.39217875481538, "learning_rate": 1.4476190476190476e-07, "logits/chosen": -8.100971221923828, "logits/rejected": -6.834127426147461, "logps/chosen": -1.3643587827682495, "logps/rejected": -1.4150521755218506, "loss": 5.8794, "rewards/accuracies": 0.75, "rewards/chosen": -13.643587112426758, "rewards/margins": 0.5069348812103271, "rewards/rejected": -14.150522232055664, "step": 133 }, { "epoch": 0.01824618736383442, "grad_norm": 73.43780486551361, "learning_rate": 1.4585034013605442e-07, "logits/chosen": -8.857494354248047, "logits/rejected": -7.973879814147949, "logps/chosen": -1.2026863098144531, "logps/rejected": -1.3635804653167725, "loss": 4.6363, "rewards/accuracies": 0.5, "rewards/chosen": -12.026863098144531, "rewards/margins": 1.6089420318603516, "rewards/rejected": -13.635804176330566, "step": 134 }, { "epoch": 0.01838235294117647, "grad_norm": 132.97652969892755, "learning_rate": 1.4693877551020407e-07, "logits/chosen": -9.920654296875, "logits/rejected": -8.664923667907715, "logps/chosen": -0.8287444114685059, "logps/rejected": -0.9304681420326233, "loss": 5.2533, "rewards/accuracies": 0.5, "rewards/chosen": -8.287444114685059, "rewards/margins": 1.0172369480133057, "rewards/rejected": -9.304681777954102, "step": 135 }, { "epoch": 0.018518518518518517, "grad_norm": 108.6310445221214, "learning_rate": 1.4802721088435372e-07, "logits/chosen": -6.958468914031982, "logits/rejected": -8.03316879272461, "logps/chosen": -1.4720054864883423, "logps/rejected": -1.568041205406189, "loss": 5.461, "rewards/accuracies": 0.5, "rewards/chosen": -14.720054626464844, "rewards/margins": 0.9603571891784668, "rewards/rejected": -15.680412292480469, "step": 136 }, { "epoch": 0.018654684095860567, "grad_norm": 130.04682495899175, "learning_rate": 1.491156462585034e-07, "logits/chosen": -8.107259750366211, "logits/rejected": -7.727598667144775, "logps/chosen": -0.7272666096687317, "logps/rejected": -0.9790180921554565, "loss": 5.1799, "rewards/accuracies": 1.0, "rewards/chosen": -7.2726664543151855, "rewards/margins": 2.5175139904022217, "rewards/rejected": -9.790180206298828, "step": 137 }, { "epoch": 0.018790849673202614, "grad_norm": 125.55864439144133, "learning_rate": 1.5020408163265305e-07, "logits/chosen": -7.8524322509765625, "logits/rejected": -6.454014778137207, "logps/chosen": -1.3398516178131104, "logps/rejected": -1.2288532257080078, "loss": 5.5296, "rewards/accuracies": 0.5, "rewards/chosen": -13.398515701293945, "rewards/margins": -1.1099846363067627, "rewards/rejected": -12.288532257080078, "step": 138 }, { "epoch": 0.018927015250544663, "grad_norm": 120.82654758672258, "learning_rate": 1.5129251700680273e-07, "logits/chosen": -6.387835502624512, "logits/rejected": -5.812603950500488, "logps/chosen": -1.5713181495666504, "logps/rejected": -1.8111991882324219, "loss": 5.0277, "rewards/accuracies": 0.5, "rewards/chosen": -15.713180541992188, "rewards/margins": 2.398810625076294, "rewards/rejected": -18.11199188232422, "step": 139 }, { "epoch": 0.01906318082788671, "grad_norm": 71.8867716521012, "learning_rate": 1.5238095238095236e-07, "logits/chosen": -8.14934253692627, "logits/rejected": -7.855738639831543, "logps/chosen": -0.8292730450630188, "logps/rejected": -0.9465879797935486, "loss": 5.0979, "rewards/accuracies": 0.5, "rewards/chosen": -8.292731285095215, "rewards/margins": 1.1731492280960083, "rewards/rejected": -9.465880393981934, "step": 140 }, { "epoch": 0.01919934640522876, "grad_norm": 133.13783769484897, "learning_rate": 1.5346938775510204e-07, "logits/chosen": -7.077601909637451, "logits/rejected": -6.995731353759766, "logps/chosen": -1.3134288787841797, "logps/rejected": -1.7887707948684692, "loss": 5.9851, "rewards/accuracies": 0.75, "rewards/chosen": -13.13428783416748, "rewards/margins": 4.753420352935791, "rewards/rejected": -17.88770866394043, "step": 141 }, { "epoch": 0.019335511982570806, "grad_norm": 187.32748292609577, "learning_rate": 1.545578231292517e-07, "logits/chosen": -6.660077095031738, "logits/rejected": -6.529077529907227, "logps/chosen": -1.9430222511291504, "logps/rejected": -1.3025259971618652, "loss": 6.2829, "rewards/accuracies": 0.5, "rewards/chosen": -19.430221557617188, "rewards/margins": -6.40496301651001, "rewards/rejected": -13.025259971618652, "step": 142 }, { "epoch": 0.019471677559912855, "grad_norm": 93.32424058033999, "learning_rate": 1.5564625850340137e-07, "logits/chosen": -6.744956016540527, "logits/rejected": -6.147989273071289, "logps/chosen": -1.6654924154281616, "logps/rejected": -1.4076611995697021, "loss": 5.2559, "rewards/accuracies": 0.25, "rewards/chosen": -16.654924392700195, "rewards/margins": -2.578312873840332, "rewards/rejected": -14.076611518859863, "step": 143 }, { "epoch": 0.0196078431372549, "grad_norm": 105.58250000816324, "learning_rate": 1.56734693877551e-07, "logits/chosen": -7.31378173828125, "logits/rejected": -7.078433990478516, "logps/chosen": -1.2827730178833008, "logps/rejected": -1.2402223348617554, "loss": 6.6242, "rewards/accuracies": 0.5, "rewards/chosen": -12.827730178833008, "rewards/margins": -0.4255063533782959, "rewards/rejected": -12.402223587036133, "step": 144 }, { "epoch": 0.01974400871459695, "grad_norm": 88.00451371475083, "learning_rate": 1.5782312925170067e-07, "logits/chosen": -10.263345718383789, "logits/rejected": -7.425136566162109, "logps/chosen": -1.2247413396835327, "logps/rejected": -1.4101629257202148, "loss": 5.2318, "rewards/accuracies": 0.75, "rewards/chosen": -12.247413635253906, "rewards/margins": 1.854215145111084, "rewards/rejected": -14.101628303527832, "step": 145 }, { "epoch": 0.019880174291938998, "grad_norm": 101.8242574204444, "learning_rate": 1.5891156462585032e-07, "logits/chosen": -7.049863815307617, "logits/rejected": -6.860673904418945, "logps/chosen": -1.3652560710906982, "logps/rejected": -1.8334383964538574, "loss": 4.5453, "rewards/accuracies": 0.75, "rewards/chosen": -13.65256118774414, "rewards/margins": 4.681824207305908, "rewards/rejected": -18.33438491821289, "step": 146 }, { "epoch": 0.020016339869281044, "grad_norm": 137.92112931366847, "learning_rate": 1.6e-07, "logits/chosen": -9.203563690185547, "logits/rejected": -8.293973922729492, "logps/chosen": -3.522048234939575, "logps/rejected": -1.0164036750793457, "loss": 5.5492, "rewards/accuracies": 0.75, "rewards/chosen": -35.220481872558594, "rewards/margins": -25.056446075439453, "rewards/rejected": -10.16403579711914, "step": 147 }, { "epoch": 0.020152505446623094, "grad_norm": 142.7548544512285, "learning_rate": 1.6108843537414966e-07, "logits/chosen": -6.722068786621094, "logits/rejected": -6.706391334533691, "logps/chosen": -1.446770191192627, "logps/rejected": -1.3910293579101562, "loss": 4.8614, "rewards/accuracies": 0.25, "rewards/chosen": -14.467700958251953, "rewards/margins": -0.557408332824707, "rewards/rejected": -13.910293579101562, "step": 148 }, { "epoch": 0.02028867102396514, "grad_norm": 147.4414610504222, "learning_rate": 1.621768707482993e-07, "logits/chosen": -6.472548484802246, "logits/rejected": -7.359100341796875, "logps/chosen": -1.774766206741333, "logps/rejected": -1.749028205871582, "loss": 5.616, "rewards/accuracies": 0.75, "rewards/chosen": -17.747661590576172, "rewards/margins": -0.25737929344177246, "rewards/rejected": -17.49028205871582, "step": 149 }, { "epoch": 0.02042483660130719, "grad_norm": 85.40604401509445, "learning_rate": 1.63265306122449e-07, "logits/chosen": -8.287897109985352, "logits/rejected": -7.272768020629883, "logps/chosen": -0.9985866546630859, "logps/rejected": -1.0714044570922852, "loss": 5.0447, "rewards/accuracies": 0.5, "rewards/chosen": -9.98586654663086, "rewards/margins": 0.728177547454834, "rewards/rejected": -10.714043617248535, "step": 150 }, { "epoch": 0.020561002178649236, "grad_norm": 191.94433843514835, "learning_rate": 1.6435374149659864e-07, "logits/chosen": -7.810141563415527, "logits/rejected": -6.642199993133545, "logps/chosen": -1.408130407333374, "logps/rejected": -2.367379903793335, "loss": 5.586, "rewards/accuracies": 1.0, "rewards/chosen": -14.081304550170898, "rewards/margins": 9.59249496459961, "rewards/rejected": -23.673799514770508, "step": 151 }, { "epoch": 0.020697167755991286, "grad_norm": 84.36206782043772, "learning_rate": 1.654421768707483e-07, "logits/chosen": -8.025960922241211, "logits/rejected": -7.982983589172363, "logps/chosen": -1.2618099451065063, "logps/rejected": -1.185113549232483, "loss": 4.9386, "rewards/accuracies": 0.5, "rewards/chosen": -12.6181001663208, "rewards/margins": -0.7669644355773926, "rewards/rejected": -11.85113525390625, "step": 152 }, { "epoch": 0.020833333333333332, "grad_norm": 129.32292423879954, "learning_rate": 1.6653061224489794e-07, "logits/chosen": -8.213658332824707, "logits/rejected": -7.41917610168457, "logps/chosen": -1.7385635375976562, "logps/rejected": -1.744072675704956, "loss": 4.8346, "rewards/accuracies": 0.75, "rewards/chosen": -17.385635375976562, "rewards/margins": 0.05509233474731445, "rewards/rejected": -17.44072723388672, "step": 153 }, { "epoch": 0.020969498910675382, "grad_norm": 123.1360072588639, "learning_rate": 1.6761904761904762e-07, "logits/chosen": -5.90570592880249, "logits/rejected": -5.351279258728027, "logps/chosen": -1.4041664600372314, "logps/rejected": -1.6992828845977783, "loss": 5.4451, "rewards/accuracies": 0.75, "rewards/chosen": -14.041665077209473, "rewards/margins": 2.9511637687683105, "rewards/rejected": -16.992828369140625, "step": 154 }, { "epoch": 0.021105664488017428, "grad_norm": 92.59032196523893, "learning_rate": 1.6870748299319727e-07, "logits/chosen": -8.394842147827148, "logits/rejected": -6.3303375244140625, "logps/chosen": -1.135949969291687, "logps/rejected": -1.2834014892578125, "loss": 4.406, "rewards/accuracies": 0.75, "rewards/chosen": -11.35949993133545, "rewards/margins": 1.4745149612426758, "rewards/rejected": -12.834014892578125, "step": 155 }, { "epoch": 0.021241830065359478, "grad_norm": 102.56602321070297, "learning_rate": 1.6979591836734693e-07, "logits/chosen": -7.4587016105651855, "logits/rejected": -6.154718399047852, "logps/chosen": -1.2901796102523804, "logps/rejected": -1.79585862159729, "loss": 5.3973, "rewards/accuracies": 0.75, "rewards/chosen": -12.901796340942383, "rewards/margins": 5.056789875030518, "rewards/rejected": -17.958585739135742, "step": 156 }, { "epoch": 0.021377995642701524, "grad_norm": 97.38957721642002, "learning_rate": 1.7088435374149658e-07, "logits/chosen": -5.6771650314331055, "logits/rejected": -7.148407936096191, "logps/chosen": -1.198584794998169, "logps/rejected": -0.9971252083778381, "loss": 4.7677, "rewards/accuracies": 0.0, "rewards/chosen": -11.985848426818848, "rewards/margins": -2.0145962238311768, "rewards/rejected": -9.971251487731934, "step": 157 }, { "epoch": 0.021514161220043574, "grad_norm": 104.15165875462495, "learning_rate": 1.7197278911564626e-07, "logits/chosen": -6.29349422454834, "logits/rejected": -6.065828323364258, "logps/chosen": -1.5833168029785156, "logps/rejected": -1.5859633684158325, "loss": 4.5853, "rewards/accuracies": 0.5, "rewards/chosen": -15.833168983459473, "rewards/margins": 0.026464223861694336, "rewards/rejected": -15.859633445739746, "step": 158 }, { "epoch": 0.02165032679738562, "grad_norm": 79.15068341682391, "learning_rate": 1.730612244897959e-07, "logits/chosen": -7.3802618980407715, "logits/rejected": -6.974637031555176, "logps/chosen": -0.9901829957962036, "logps/rejected": -1.0578351020812988, "loss": 4.9174, "rewards/accuracies": 0.5, "rewards/chosen": -9.901830673217773, "rewards/margins": 0.676520824432373, "rewards/rejected": -10.578350067138672, "step": 159 }, { "epoch": 0.02178649237472767, "grad_norm": 68.74837348448261, "learning_rate": 1.7414965986394556e-07, "logits/chosen": -7.3688154220581055, "logits/rejected": -7.208442687988281, "logps/chosen": -1.0561609268188477, "logps/rejected": -1.7540125846862793, "loss": 5.2544, "rewards/accuracies": 1.0, "rewards/chosen": -10.561607360839844, "rewards/margins": 6.978518486022949, "rewards/rejected": -17.54012680053711, "step": 160 }, { "epoch": 0.021922657952069716, "grad_norm": 95.65614437023193, "learning_rate": 1.7523809523809524e-07, "logits/chosen": -6.960417747497559, "logits/rejected": -7.1131367683410645, "logps/chosen": -1.4105963706970215, "logps/rejected": -1.388922929763794, "loss": 4.5346, "rewards/accuracies": 0.5, "rewards/chosen": -14.105962753295898, "rewards/margins": -0.21673369407653809, "rewards/rejected": -13.889229774475098, "step": 161 }, { "epoch": 0.022058823529411766, "grad_norm": 88.99570284408347, "learning_rate": 1.763265306122449e-07, "logits/chosen": -5.26167631149292, "logits/rejected": -5.481692314147949, "logps/chosen": -1.5601022243499756, "logps/rejected": -2.136596918106079, "loss": 5.0081, "rewards/accuracies": 1.0, "rewards/chosen": -15.601022720336914, "rewards/margins": 5.764946460723877, "rewards/rejected": -21.365968704223633, "step": 162 }, { "epoch": 0.022194989106753812, "grad_norm": 137.2807080323382, "learning_rate": 1.7741496598639457e-07, "logits/chosen": -6.738339424133301, "logits/rejected": -5.6601057052612305, "logps/chosen": -1.0017204284667969, "logps/rejected": -0.9605573415756226, "loss": 6.3733, "rewards/accuracies": 0.25, "rewards/chosen": -10.017204284667969, "rewards/margins": -0.41163039207458496, "rewards/rejected": -9.605573654174805, "step": 163 }, { "epoch": 0.022331154684095862, "grad_norm": 71.07082662178453, "learning_rate": 1.785034013605442e-07, "logits/chosen": -6.039597511291504, "logits/rejected": -6.692374229431152, "logps/chosen": -1.6276044845581055, "logps/rejected": -1.286259412765503, "loss": 4.7965, "rewards/accuracies": 0.5, "rewards/chosen": -16.276046752929688, "rewards/margins": -3.413450002670288, "rewards/rejected": -12.862594604492188, "step": 164 }, { "epoch": 0.02246732026143791, "grad_norm": 54.70497607264681, "learning_rate": 1.7959183673469388e-07, "logits/chosen": -7.710261344909668, "logits/rejected": -8.2518892288208, "logps/chosen": -1.2027339935302734, "logps/rejected": -1.2235569953918457, "loss": 4.4275, "rewards/accuracies": 0.5, "rewards/chosen": -12.027338981628418, "rewards/margins": 0.20823073387145996, "rewards/rejected": -12.23556900024414, "step": 165 }, { "epoch": 0.022603485838779955, "grad_norm": 153.98278521390245, "learning_rate": 1.8068027210884353e-07, "logits/chosen": -8.509668350219727, "logits/rejected": -6.1804022789001465, "logps/chosen": -1.461782455444336, "logps/rejected": -2.0017993450164795, "loss": 5.5284, "rewards/accuracies": 0.75, "rewards/chosen": -14.61782455444336, "rewards/margins": 5.400167465209961, "rewards/rejected": -20.01799201965332, "step": 166 }, { "epoch": 0.022739651416122005, "grad_norm": 75.15901475261614, "learning_rate": 1.817687074829932e-07, "logits/chosen": -7.961825370788574, "logits/rejected": -6.909971237182617, "logps/chosen": -1.1711571216583252, "logps/rejected": -1.2304127216339111, "loss": 5.1794, "rewards/accuracies": 0.5, "rewards/chosen": -11.711570739746094, "rewards/margins": 0.5925552845001221, "rewards/rejected": -12.304126739501953, "step": 167 }, { "epoch": 0.02287581699346405, "grad_norm": 109.45754873895234, "learning_rate": 1.8285714285714283e-07, "logits/chosen": -6.563947677612305, "logits/rejected": -8.403340339660645, "logps/chosen": -1.0987374782562256, "logps/rejected": -0.9991467595100403, "loss": 5.7174, "rewards/accuracies": 0.25, "rewards/chosen": -10.987373352050781, "rewards/margins": -0.9959064722061157, "rewards/rejected": -9.991467475891113, "step": 168 }, { "epoch": 0.0230119825708061, "grad_norm": 112.1123860248158, "learning_rate": 1.8394557823129251e-07, "logits/chosen": -8.20693588256836, "logits/rejected": -8.455974578857422, "logps/chosen": -0.8681545257568359, "logps/rejected": -1.1759507656097412, "loss": 5.454, "rewards/accuracies": 0.25, "rewards/chosen": -8.68154525756836, "rewards/margins": 3.077962875366211, "rewards/rejected": -11.75950813293457, "step": 169 }, { "epoch": 0.023148148148148147, "grad_norm": 76.72218375026486, "learning_rate": 1.8503401360544217e-07, "logits/chosen": -6.5804033279418945, "logits/rejected": -5.652115821838379, "logps/chosen": -1.0899848937988281, "logps/rejected": -0.9910410642623901, "loss": 4.645, "rewards/accuracies": 0.5, "rewards/chosen": -10.899848937988281, "rewards/margins": -0.9894390106201172, "rewards/rejected": -9.91041088104248, "step": 170 }, { "epoch": 0.023284313725490197, "grad_norm": 79.72833006512947, "learning_rate": 1.8612244897959182e-07, "logits/chosen": -8.579032897949219, "logits/rejected": -6.2043914794921875, "logps/chosen": -1.2742339372634888, "logps/rejected": -1.6190030574798584, "loss": 5.2016, "rewards/accuracies": 1.0, "rewards/chosen": -12.742340087890625, "rewards/margins": 3.4476895332336426, "rewards/rejected": -16.19002914428711, "step": 171 }, { "epoch": 0.023420479302832243, "grad_norm": 97.46553575067091, "learning_rate": 1.8721088435374147e-07, "logits/chosen": -7.081697940826416, "logits/rejected": -7.289755821228027, "logps/chosen": -0.9433913230895996, "logps/rejected": -1.1108554601669312, "loss": 5.5086, "rewards/accuracies": 0.75, "rewards/chosen": -9.433913230895996, "rewards/margins": 1.6746413707733154, "rewards/rejected": -11.108553886413574, "step": 172 }, { "epoch": 0.023556644880174293, "grad_norm": 77.7655501824403, "learning_rate": 1.8829931972789115e-07, "logits/chosen": -8.084100723266602, "logits/rejected": -8.048393249511719, "logps/chosen": -1.1167643070220947, "logps/rejected": -0.8588655591011047, "loss": 4.5307, "rewards/accuracies": 0.5, "rewards/chosen": -11.167643547058105, "rewards/margins": -2.5789875984191895, "rewards/rejected": -8.588655471801758, "step": 173 }, { "epoch": 0.02369281045751634, "grad_norm": 73.69536119387801, "learning_rate": 1.893877551020408e-07, "logits/chosen": -7.671952724456787, "logits/rejected": -7.287993431091309, "logps/chosen": -1.3024232387542725, "logps/rejected": -1.4799177646636963, "loss": 4.2354, "rewards/accuracies": 0.75, "rewards/chosen": -13.024232864379883, "rewards/margins": 1.7749452590942383, "rewards/rejected": -14.799179077148438, "step": 174 }, { "epoch": 0.02382897603485839, "grad_norm": 73.30163669138452, "learning_rate": 1.9047619047619045e-07, "logits/chosen": -6.510768413543701, "logits/rejected": -6.433928489685059, "logps/chosen": -1.4896583557128906, "logps/rejected": -1.4321060180664062, "loss": 5.2671, "rewards/accuracies": 0.5, "rewards/chosen": -14.896583557128906, "rewards/margins": -0.5755228996276855, "rewards/rejected": -14.321060180664062, "step": 175 }, { "epoch": 0.023965141612200435, "grad_norm": 84.59154091137295, "learning_rate": 1.9156462585034013e-07, "logits/chosen": -7.444751739501953, "logits/rejected": -6.793341636657715, "logps/chosen": -0.9935526251792908, "logps/rejected": -1.2576138973236084, "loss": 5.225, "rewards/accuracies": 1.0, "rewards/chosen": -9.935525894165039, "rewards/margins": 2.6406126022338867, "rewards/rejected": -12.576138496398926, "step": 176 }, { "epoch": 0.024101307189542485, "grad_norm": 94.40683894409909, "learning_rate": 1.9265306122448978e-07, "logits/chosen": -8.296958923339844, "logits/rejected": -7.922490119934082, "logps/chosen": -0.9406330585479736, "logps/rejected": -1.1321996450424194, "loss": 5.1181, "rewards/accuracies": 0.75, "rewards/chosen": -9.406330108642578, "rewards/margins": 1.9156659841537476, "rewards/rejected": -11.321996688842773, "step": 177 }, { "epoch": 0.02423747276688453, "grad_norm": 99.27857201672498, "learning_rate": 1.9374149659863946e-07, "logits/chosen": -7.959944725036621, "logits/rejected": -6.8451433181762695, "logps/chosen": -1.232956886291504, "logps/rejected": -1.5422444343566895, "loss": 4.9527, "rewards/accuracies": 0.5, "rewards/chosen": -12.329568862915039, "rewards/margins": 3.0928754806518555, "rewards/rejected": -15.422444343566895, "step": 178 }, { "epoch": 0.02437363834422658, "grad_norm": 96.55958330988591, "learning_rate": 1.948299319727891e-07, "logits/chosen": -7.794098854064941, "logits/rejected": -5.9459075927734375, "logps/chosen": -1.4828112125396729, "logps/rejected": -2.5785632133483887, "loss": 5.2391, "rewards/accuracies": 0.75, "rewards/chosen": -14.82811164855957, "rewards/margins": 10.95751953125, "rewards/rejected": -25.78563117980957, "step": 179 }, { "epoch": 0.024509803921568627, "grad_norm": 83.74751977372985, "learning_rate": 1.9591836734693877e-07, "logits/chosen": -7.918519973754883, "logits/rejected": -6.533925533294678, "logps/chosen": -1.2089519500732422, "logps/rejected": -1.3907301425933838, "loss": 5.6945, "rewards/accuracies": 0.75, "rewards/chosen": -12.089519500732422, "rewards/margins": 1.8177810907363892, "rewards/rejected": -13.90730094909668, "step": 180 }, { "epoch": 0.024645969498910677, "grad_norm": 68.04222975287135, "learning_rate": 1.9700680272108842e-07, "logits/chosen": -7.509442329406738, "logits/rejected": -6.4137959480285645, "logps/chosen": -1.4295858144760132, "logps/rejected": -1.257176399230957, "loss": 5.2318, "rewards/accuracies": 0.5, "rewards/chosen": -14.295858383178711, "rewards/margins": -1.7240939140319824, "rewards/rejected": -12.57176399230957, "step": 181 }, { "epoch": 0.024782135076252723, "grad_norm": 70.04948463922646, "learning_rate": 1.980952380952381e-07, "logits/chosen": -6.858748435974121, "logits/rejected": -5.326076507568359, "logps/chosen": -1.170296311378479, "logps/rejected": -1.025686264038086, "loss": 5.2616, "rewards/accuracies": 0.25, "rewards/chosen": -11.702963829040527, "rewards/margins": -1.4461005926132202, "rewards/rejected": -10.25686264038086, "step": 182 }, { "epoch": 0.024918300653594773, "grad_norm": 67.06831822147421, "learning_rate": 1.9918367346938773e-07, "logits/chosen": -5.9540486335754395, "logits/rejected": -6.134858131408691, "logps/chosen": -1.4768328666687012, "logps/rejected": -1.4597630500793457, "loss": 4.5049, "rewards/accuracies": 0.75, "rewards/chosen": -14.768327713012695, "rewards/margins": -0.17069673538208008, "rewards/rejected": -14.597631454467773, "step": 183 }, { "epoch": 0.02505446623093682, "grad_norm": 80.87774508979096, "learning_rate": 2.0027210884353743e-07, "logits/chosen": -6.767053604125977, "logits/rejected": -6.290744304656982, "logps/chosen": -1.2690455913543701, "logps/rejected": -1.3472695350646973, "loss": 5.2294, "rewards/accuracies": 0.75, "rewards/chosen": -12.69045639038086, "rewards/margins": 0.7822391986846924, "rewards/rejected": -13.472695350646973, "step": 184 }, { "epoch": 0.025190631808278865, "grad_norm": 116.3431268533652, "learning_rate": 2.0136054421768706e-07, "logits/chosen": -8.071076393127441, "logits/rejected": -6.2148756980896, "logps/chosen": -0.7556376457214355, "logps/rejected": -1.154641032218933, "loss": 6.0319, "rewards/accuracies": 0.75, "rewards/chosen": -7.5563764572143555, "rewards/margins": 3.990034580230713, "rewards/rejected": -11.546411514282227, "step": 185 }, { "epoch": 0.025326797385620915, "grad_norm": 115.56135296089637, "learning_rate": 2.024489795918367e-07, "logits/chosen": -5.8168463706970215, "logits/rejected": -6.642218589782715, "logps/chosen": -1.6020214557647705, "logps/rejected": -1.114095687866211, "loss": 4.9616, "rewards/accuracies": 0.25, "rewards/chosen": -16.020214080810547, "rewards/margins": -4.879256725311279, "rewards/rejected": -11.140957832336426, "step": 186 }, { "epoch": 0.02546296296296296, "grad_norm": 67.26543233381646, "learning_rate": 2.035374149659864e-07, "logits/chosen": -6.950218200683594, "logits/rejected": -4.895829200744629, "logps/chosen": -1.0128856897354126, "logps/rejected": -1.8647680282592773, "loss": 4.0077, "rewards/accuracies": 0.75, "rewards/chosen": -10.128856658935547, "rewards/margins": 8.518823623657227, "rewards/rejected": -18.647680282592773, "step": 187 }, { "epoch": 0.02559912854030501, "grad_norm": 90.73890023080358, "learning_rate": 2.0462585034013604e-07, "logits/chosen": -6.8982133865356445, "logits/rejected": -7.051582336425781, "logps/chosen": -1.4017748832702637, "logps/rejected": -1.128211259841919, "loss": 5.2079, "rewards/accuracies": 0.5, "rewards/chosen": -14.017749786376953, "rewards/margins": -2.7356364727020264, "rewards/rejected": -11.282113075256348, "step": 188 }, { "epoch": 0.025735294117647058, "grad_norm": 96.41724944087588, "learning_rate": 2.057142857142857e-07, "logits/chosen": -6.817584037780762, "logits/rejected": -8.678714752197266, "logps/chosen": -1.083337664604187, "logps/rejected": -1.046867847442627, "loss": 5.2113, "rewards/accuracies": 0.5, "rewards/chosen": -10.833375930786133, "rewards/margins": -0.36469900608062744, "rewards/rejected": -10.468677520751953, "step": 189 }, { "epoch": 0.025871459694989107, "grad_norm": 83.25695770923153, "learning_rate": 2.0680272108843537e-07, "logits/chosen": -6.661531448364258, "logits/rejected": -6.242840766906738, "logps/chosen": -1.2485356330871582, "logps/rejected": -1.082529067993164, "loss": 4.4928, "rewards/accuracies": 0.25, "rewards/chosen": -12.485357284545898, "rewards/margins": -1.66006600856781, "rewards/rejected": -10.82529067993164, "step": 190 }, { "epoch": 0.026007625272331154, "grad_norm": 70.5737780269745, "learning_rate": 2.0789115646258502e-07, "logits/chosen": -6.130125045776367, "logits/rejected": -5.073531150817871, "logps/chosen": -1.3772631883621216, "logps/rejected": -1.27760910987854, "loss": 4.6737, "rewards/accuracies": 0.75, "rewards/chosen": -13.772631645202637, "rewards/margins": -0.9965405464172363, "rewards/rejected": -12.776091575622559, "step": 191 }, { "epoch": 0.026143790849673203, "grad_norm": 58.367081528081165, "learning_rate": 2.089795918367347e-07, "logits/chosen": -7.82505989074707, "logits/rejected": -6.476356506347656, "logps/chosen": -1.4139668941497803, "logps/rejected": -1.6912710666656494, "loss": 4.4833, "rewards/accuracies": 0.5, "rewards/chosen": -14.139669418334961, "rewards/margins": 2.773041248321533, "rewards/rejected": -16.912710189819336, "step": 192 }, { "epoch": 0.02627995642701525, "grad_norm": 61.6651807762063, "learning_rate": 2.1006802721088435e-07, "logits/chosen": -6.099628448486328, "logits/rejected": -7.124472618103027, "logps/chosen": -1.1536598205566406, "logps/rejected": -0.8781601786613464, "loss": 5.311, "rewards/accuracies": 0.0, "rewards/chosen": -11.536596298217773, "rewards/margins": -2.754995107650757, "rewards/rejected": -8.781601905822754, "step": 193 }, { "epoch": 0.0264161220043573, "grad_norm": 87.1504705483159, "learning_rate": 2.1115646258503398e-07, "logits/chosen": -7.453646659851074, "logits/rejected": -6.912619590759277, "logps/chosen": -0.8000520467758179, "logps/rejected": -0.9588027000427246, "loss": 5.1691, "rewards/accuracies": 0.75, "rewards/chosen": -8.000520706176758, "rewards/margins": 1.587506651878357, "rewards/rejected": -9.588027000427246, "step": 194 }, { "epoch": 0.026552287581699346, "grad_norm": 103.33590340183373, "learning_rate": 2.1224489795918369e-07, "logits/chosen": -7.2746901512146, "logits/rejected": -6.692259788513184, "logps/chosen": -1.3528649806976318, "logps/rejected": -1.20949125289917, "loss": 4.6167, "rewards/accuracies": 0.75, "rewards/chosen": -13.52864933013916, "rewards/margins": -1.433736801147461, "rewards/rejected": -12.0949125289917, "step": 195 }, { "epoch": 0.026688453159041396, "grad_norm": 118.7605415818877, "learning_rate": 2.133333333333333e-07, "logits/chosen": -6.24162483215332, "logits/rejected": -4.508004188537598, "logps/chosen": -0.903314471244812, "logps/rejected": -1.5306448936462402, "loss": 5.6855, "rewards/accuracies": 0.75, "rewards/chosen": -9.033143997192383, "rewards/margins": 6.273303031921387, "rewards/rejected": -15.306447982788086, "step": 196 }, { "epoch": 0.026824618736383442, "grad_norm": 72.31368849575523, "learning_rate": 2.1442176870748296e-07, "logits/chosen": -6.148506164550781, "logits/rejected": -7.032370567321777, "logps/chosen": -1.2420457601547241, "logps/rejected": -0.9879764914512634, "loss": 5.4145, "rewards/accuracies": 0.25, "rewards/chosen": -12.42045783996582, "rewards/margins": -2.5406928062438965, "rewards/rejected": -9.879764556884766, "step": 197 }, { "epoch": 0.02696078431372549, "grad_norm": 61.23065509770097, "learning_rate": 2.1551020408163264e-07, "logits/chosen": -7.058525085449219, "logits/rejected": -6.249241828918457, "logps/chosen": -1.1598646640777588, "logps/rejected": -1.441483974456787, "loss": 5.3416, "rewards/accuracies": 0.75, "rewards/chosen": -11.59864616394043, "rewards/margins": 2.816193103790283, "rewards/rejected": -14.414838790893555, "step": 198 }, { "epoch": 0.027096949891067538, "grad_norm": 116.70120451114593, "learning_rate": 2.165986394557823e-07, "logits/chosen": -7.153243064880371, "logits/rejected": -6.457721710205078, "logps/chosen": -0.8915928602218628, "logps/rejected": -1.0144236087799072, "loss": 4.8856, "rewards/accuracies": 0.75, "rewards/chosen": -8.91592788696289, "rewards/margins": 1.2283074855804443, "rewards/rejected": -10.144235610961914, "step": 199 }, { "epoch": 0.027233115468409588, "grad_norm": 72.40451493157046, "learning_rate": 2.1768707482993197e-07, "logits/chosen": -7.039863586425781, "logits/rejected": -5.742154121398926, "logps/chosen": -1.1692641973495483, "logps/rejected": -1.4311308860778809, "loss": 4.776, "rewards/accuracies": 0.75, "rewards/chosen": -11.692641258239746, "rewards/margins": 2.618666648864746, "rewards/rejected": -14.311307907104492, "step": 200 }, { "epoch": 0.027369281045751634, "grad_norm": 105.33046777142792, "learning_rate": 2.1877551020408163e-07, "logits/chosen": -6.790611267089844, "logits/rejected": -6.1442718505859375, "logps/chosen": -1.0044852495193481, "logps/rejected": -0.8278141617774963, "loss": 4.6958, "rewards/accuracies": 0.25, "rewards/chosen": -10.044853210449219, "rewards/margins": -1.766710877418518, "rewards/rejected": -8.278141021728516, "step": 201 }, { "epoch": 0.02750544662309368, "grad_norm": 56.480333990031674, "learning_rate": 2.1986394557823128e-07, "logits/chosen": -4.502762794494629, "logits/rejected": -4.234026908874512, "logps/chosen": -1.476064682006836, "logps/rejected": -1.517585277557373, "loss": 4.4695, "rewards/accuracies": 0.25, "rewards/chosen": -14.76064682006836, "rewards/margins": 0.41520583629608154, "rewards/rejected": -15.17585277557373, "step": 202 }, { "epoch": 0.02764161220043573, "grad_norm": 91.62946975206289, "learning_rate": 2.2095238095238096e-07, "logits/chosen": -6.68681001663208, "logits/rejected": -6.122259140014648, "logps/chosen": -1.2721357345581055, "logps/rejected": -1.5630892515182495, "loss": 5.5299, "rewards/accuracies": 1.0, "rewards/chosen": -12.721357345581055, "rewards/margins": 2.9095358848571777, "rewards/rejected": -15.630891799926758, "step": 203 }, { "epoch": 0.027777777777777776, "grad_norm": 75.07575471569352, "learning_rate": 2.220408163265306e-07, "logits/chosen": -5.518540382385254, "logits/rejected": -5.904209613800049, "logps/chosen": -2.094674825668335, "logps/rejected": -1.1049268245697021, "loss": 5.6318, "rewards/accuracies": 0.0, "rewards/chosen": -20.946748733520508, "rewards/margins": -9.897480010986328, "rewards/rejected": -11.04926872253418, "step": 204 }, { "epoch": 0.027913943355119826, "grad_norm": 77.50770347599598, "learning_rate": 2.2312925170068024e-07, "logits/chosen": -5.377975940704346, "logits/rejected": -5.207005500793457, "logps/chosen": -1.399886965751648, "logps/rejected": -1.221920371055603, "loss": 5.1013, "rewards/accuracies": 0.25, "rewards/chosen": -13.998869895935059, "rewards/margins": -1.7796659469604492, "rewards/rejected": -12.21920394897461, "step": 205 }, { "epoch": 0.028050108932461872, "grad_norm": 82.4322279404147, "learning_rate": 2.2421768707482994e-07, "logits/chosen": -5.701888084411621, "logits/rejected": -5.295608997344971, "logps/chosen": -1.2542833089828491, "logps/rejected": -1.351365089416504, "loss": 4.6043, "rewards/accuracies": 0.75, "rewards/chosen": -12.54283332824707, "rewards/margins": 0.9708186388015747, "rewards/rejected": -13.513651847839355, "step": 206 }, { "epoch": 0.028186274509803922, "grad_norm": 136.82930422236254, "learning_rate": 2.2530612244897957e-07, "logits/chosen": -6.315647125244141, "logits/rejected": -6.732501029968262, "logps/chosen": -1.0674160718917847, "logps/rejected": -1.1574244499206543, "loss": 5.053, "rewards/accuracies": 0.5, "rewards/chosen": -10.67416000366211, "rewards/margins": 0.9000828266143799, "rewards/rejected": -11.574243545532227, "step": 207 }, { "epoch": 0.02832244008714597, "grad_norm": 72.4359417661519, "learning_rate": 2.2639455782312927e-07, "logits/chosen": -5.756917953491211, "logits/rejected": -5.640669822692871, "logps/chosen": -1.105139970779419, "logps/rejected": -1.0349900722503662, "loss": 5.9381, "rewards/accuracies": 0.25, "rewards/chosen": -11.051400184631348, "rewards/margins": -0.7014998197555542, "rewards/rejected": -10.34990119934082, "step": 208 }, { "epoch": 0.028458605664488018, "grad_norm": 61.22684484759386, "learning_rate": 2.274829931972789e-07, "logits/chosen": -7.9694929122924805, "logits/rejected": -6.877322196960449, "logps/chosen": -0.625891923904419, "logps/rejected": -1.001348614692688, "loss": 5.3777, "rewards/accuracies": 1.0, "rewards/chosen": -6.2589192390441895, "rewards/margins": 3.7545676231384277, "rewards/rejected": -10.013486862182617, "step": 209 }, { "epoch": 0.028594771241830064, "grad_norm": 79.62462810757066, "learning_rate": 2.2857142857142855e-07, "logits/chosen": -6.145734786987305, "logits/rejected": -5.478681564331055, "logps/chosen": -1.0469872951507568, "logps/rejected": -1.3055447340011597, "loss": 4.6382, "rewards/accuracies": 1.0, "rewards/chosen": -10.46987247467041, "rewards/margins": 2.5855751037597656, "rewards/rejected": -13.055447578430176, "step": 210 }, { "epoch": 0.028730936819172114, "grad_norm": 71.31357401147491, "learning_rate": 2.2965986394557823e-07, "logits/chosen": -6.855096340179443, "logits/rejected": -4.953028678894043, "logps/chosen": -1.691826581954956, "logps/rejected": -1.3644262552261353, "loss": 5.4208, "rewards/accuracies": 0.5, "rewards/chosen": -16.91826629638672, "rewards/margins": -3.274003505706787, "rewards/rejected": -13.644262313842773, "step": 211 }, { "epoch": 0.02886710239651416, "grad_norm": 76.58918075000632, "learning_rate": 2.3074829931972788e-07, "logits/chosen": -7.463372230529785, "logits/rejected": -6.737998962402344, "logps/chosen": -1.1528640985488892, "logps/rejected": -1.1275687217712402, "loss": 4.6627, "rewards/accuracies": 0.5, "rewards/chosen": -11.528640747070312, "rewards/margins": -0.2529531717300415, "rewards/rejected": -11.275687217712402, "step": 212 }, { "epoch": 0.02900326797385621, "grad_norm": 136.91719761616997, "learning_rate": 2.3183673469387753e-07, "logits/chosen": -6.638874053955078, "logits/rejected": -6.179720878601074, "logps/chosen": -1.0043089389801025, "logps/rejected": -1.470382571220398, "loss": 5.1253, "rewards/accuracies": 0.75, "rewards/chosen": -10.043089866638184, "rewards/margins": 4.660735130310059, "rewards/rejected": -14.703824996948242, "step": 213 }, { "epoch": 0.029139433551198256, "grad_norm": 77.57457712666577, "learning_rate": 2.329251700680272e-07, "logits/chosen": -6.933961868286133, "logits/rejected": -5.310276031494141, "logps/chosen": -0.9395098686218262, "logps/rejected": -1.1492676734924316, "loss": 4.7131, "rewards/accuracies": 0.5, "rewards/chosen": -9.395098686218262, "rewards/margins": 2.097578525543213, "rewards/rejected": -11.492677688598633, "step": 214 }, { "epoch": 0.029275599128540306, "grad_norm": 63.55906381894036, "learning_rate": 2.3401360544217686e-07, "logits/chosen": -7.894040107727051, "logits/rejected": -6.691250801086426, "logps/chosen": -1.2535067796707153, "logps/rejected": -1.3696892261505127, "loss": 4.5394, "rewards/accuracies": 0.75, "rewards/chosen": -12.535067558288574, "rewards/margins": 1.1618250608444214, "rewards/rejected": -13.696892738342285, "step": 215 }, { "epoch": 0.029411764705882353, "grad_norm": 78.30885681022556, "learning_rate": 2.3510204081632654e-07, "logits/chosen": -5.697659492492676, "logits/rejected": -4.982385635375977, "logps/chosen": -1.7605178356170654, "logps/rejected": -2.1654295921325684, "loss": 4.9137, "rewards/accuracies": 0.5, "rewards/chosen": -17.605178833007812, "rewards/margins": 4.049118995666504, "rewards/rejected": -21.654296875, "step": 216 }, { "epoch": 0.029547930283224402, "grad_norm": 62.09341804124108, "learning_rate": 2.361904761904762e-07, "logits/chosen": -6.962115287780762, "logits/rejected": -6.143084526062012, "logps/chosen": -1.0354642868041992, "logps/rejected": -1.2692071199417114, "loss": 5.8084, "rewards/accuracies": 0.5, "rewards/chosen": -10.354642868041992, "rewards/margins": 2.3374288082122803, "rewards/rejected": -12.692071914672852, "step": 217 }, { "epoch": 0.02968409586056645, "grad_norm": 194.27908701239292, "learning_rate": 2.3727891156462582e-07, "logits/chosen": -5.615656852722168, "logits/rejected": -4.609704971313477, "logps/chosen": -1.2224977016448975, "logps/rejected": -1.3477003574371338, "loss": 5.1518, "rewards/accuracies": 0.75, "rewards/chosen": -12.2249755859375, "rewards/margins": 1.252026915550232, "rewards/rejected": -13.47700309753418, "step": 218 }, { "epoch": 0.0298202614379085, "grad_norm": 62.49515112751743, "learning_rate": 2.3836734693877553e-07, "logits/chosen": -5.848294258117676, "logits/rejected": -4.78947639465332, "logps/chosen": -1.5041086673736572, "logps/rejected": -1.9121339321136475, "loss": 4.5845, "rewards/accuracies": 0.75, "rewards/chosen": -15.041086196899414, "rewards/margins": 4.080253601074219, "rewards/rejected": -19.121339797973633, "step": 219 }, { "epoch": 0.029956427015250545, "grad_norm": 66.62784254424413, "learning_rate": 2.3945578231292515e-07, "logits/chosen": -7.249790191650391, "logits/rejected": -6.993615627288818, "logps/chosen": -1.1668280363082886, "logps/rejected": -1.1935391426086426, "loss": 4.5609, "rewards/accuracies": 0.5, "rewards/chosen": -11.668279647827148, "rewards/margins": 0.2671109437942505, "rewards/rejected": -11.93539047241211, "step": 220 }, { "epoch": 0.03009259259259259, "grad_norm": 58.40283878205376, "learning_rate": 2.405442176870748e-07, "logits/chosen": -6.601233005523682, "logits/rejected": -6.202434539794922, "logps/chosen": -1.5255262851715088, "logps/rejected": -1.5346779823303223, "loss": 4.6333, "rewards/accuracies": 0.75, "rewards/chosen": -15.255263328552246, "rewards/margins": 0.09151554107666016, "rewards/rejected": -15.346778869628906, "step": 221 }, { "epoch": 0.03022875816993464, "grad_norm": 104.48901953297994, "learning_rate": 2.416326530612245e-07, "logits/chosen": -4.526758670806885, "logits/rejected": -4.371284008026123, "logps/chosen": -0.9522086977958679, "logps/rejected": -1.0991909503936768, "loss": 5.8593, "rewards/accuracies": 0.75, "rewards/chosen": -9.522087097167969, "rewards/margins": 1.4698230028152466, "rewards/rejected": -10.991910934448242, "step": 222 }, { "epoch": 0.030364923747276687, "grad_norm": 60.14996073416937, "learning_rate": 2.427210884353741e-07, "logits/chosen": -6.021376609802246, "logits/rejected": -5.64418888092041, "logps/chosen": -1.384101152420044, "logps/rejected": -1.3105504512786865, "loss": 5.4161, "rewards/accuracies": 0.25, "rewards/chosen": -13.841011047363281, "rewards/margins": -0.7355060577392578, "rewards/rejected": -13.105504989624023, "step": 223 }, { "epoch": 0.030501089324618737, "grad_norm": 73.97540727183505, "learning_rate": 2.438095238095238e-07, "logits/chosen": -7.346739292144775, "logits/rejected": -6.452864646911621, "logps/chosen": -1.2245633602142334, "logps/rejected": -1.2087429761886597, "loss": 4.5699, "rewards/accuracies": 0.75, "rewards/chosen": -12.245634078979492, "rewards/margins": -0.15820443630218506, "rewards/rejected": -12.08742904663086, "step": 224 }, { "epoch": 0.030637254901960783, "grad_norm": 69.95087811826433, "learning_rate": 2.4489795918367347e-07, "logits/chosen": -7.0311970710754395, "logits/rejected": -7.328324317932129, "logps/chosen": -1.4359487295150757, "logps/rejected": -1.1880981922149658, "loss": 5.156, "rewards/accuracies": 0.25, "rewards/chosen": -14.359487533569336, "rewards/margins": -2.4785051345825195, "rewards/rejected": -11.880982398986816, "step": 225 }, { "epoch": 0.030773420479302833, "grad_norm": 59.44744527515022, "learning_rate": 2.459863945578231e-07, "logits/chosen": -7.7552618980407715, "logits/rejected": -6.726212024688721, "logps/chosen": -1.0393712520599365, "logps/rejected": -1.3850208520889282, "loss": 4.2634, "rewards/accuracies": 0.75, "rewards/chosen": -10.393712043762207, "rewards/margins": 3.4564967155456543, "rewards/rejected": -13.850208282470703, "step": 226 }, { "epoch": 0.03090958605664488, "grad_norm": 61.86202628007466, "learning_rate": 2.4707482993197277e-07, "logits/chosen": -6.196245193481445, "logits/rejected": -5.664061069488525, "logps/chosen": -1.2878174781799316, "logps/rejected": -1.5794470310211182, "loss": 4.3617, "rewards/accuracies": 1.0, "rewards/chosen": -12.878175735473633, "rewards/margins": 2.9162960052490234, "rewards/rejected": -15.794471740722656, "step": 227 }, { "epoch": 0.03104575163398693, "grad_norm": 65.2414416870879, "learning_rate": 2.481632653061224e-07, "logits/chosen": -5.041067123413086, "logits/rejected": -4.838307857513428, "logps/chosen": -1.2869311571121216, "logps/rejected": -1.5623878240585327, "loss": 4.495, "rewards/accuracies": 0.5, "rewards/chosen": -12.869311332702637, "rewards/margins": 2.7545652389526367, "rewards/rejected": -15.623876571655273, "step": 228 }, { "epoch": 0.031181917211328975, "grad_norm": 103.61266108847221, "learning_rate": 2.492517006802721e-07, "logits/chosen": -6.699453353881836, "logits/rejected": -6.316040992736816, "logps/chosen": -1.4382461309432983, "logps/rejected": -1.2933666706085205, "loss": 4.6222, "rewards/accuracies": 0.25, "rewards/chosen": -14.382461547851562, "rewards/margins": -1.4487944841384888, "rewards/rejected": -12.933666229248047, "step": 229 }, { "epoch": 0.03131808278867102, "grad_norm": 56.60251125164447, "learning_rate": 2.503401360544218e-07, "logits/chosen": -6.429636001586914, "logits/rejected": -4.45748233795166, "logps/chosen": -0.8151422739028931, "logps/rejected": -1.4166911840438843, "loss": 5.0471, "rewards/accuracies": 1.0, "rewards/chosen": -8.151422500610352, "rewards/margins": 6.01548957824707, "rewards/rejected": -14.166912078857422, "step": 230 }, { "epoch": 0.03145424836601307, "grad_norm": 103.04138187604374, "learning_rate": 2.5142857142857143e-07, "logits/chosen": -6.237385272979736, "logits/rejected": -6.88185453414917, "logps/chosen": -1.1841727495193481, "logps/rejected": -0.8830050230026245, "loss": 4.834, "rewards/accuracies": 0.25, "rewards/chosen": -11.841728210449219, "rewards/margins": -3.0116770267486572, "rewards/rejected": -8.830050468444824, "step": 231 }, { "epoch": 0.03159041394335512, "grad_norm": 62.129766564570225, "learning_rate": 2.5251700680272103e-07, "logits/chosen": -6.272815704345703, "logits/rejected": -6.050259590148926, "logps/chosen": -1.3385813236236572, "logps/rejected": -1.1527156829833984, "loss": 5.1696, "rewards/accuracies": 0.25, "rewards/chosen": -13.38581371307373, "rewards/margins": -1.8586556911468506, "rewards/rejected": -11.5271577835083, "step": 232 }, { "epoch": 0.03172657952069717, "grad_norm": 64.97048324187509, "learning_rate": 2.5360544217687074e-07, "logits/chosen": -6.52187967300415, "logits/rejected": -5.35231876373291, "logps/chosen": -0.7651889324188232, "logps/rejected": -1.1424862146377563, "loss": 5.0345, "rewards/accuracies": 0.75, "rewards/chosen": -7.651888847351074, "rewards/margins": 3.7729735374450684, "rewards/rejected": -11.424861907958984, "step": 233 }, { "epoch": 0.031862745098039214, "grad_norm": 71.6213463546921, "learning_rate": 2.546938775510204e-07, "logits/chosen": -5.955198287963867, "logits/rejected": -5.022669792175293, "logps/chosen": -1.0090972185134888, "logps/rejected": -1.6245440244674683, "loss": 4.2366, "rewards/accuracies": 1.0, "rewards/chosen": -10.090971946716309, "rewards/margins": 6.154468059539795, "rewards/rejected": -16.245439529418945, "step": 234 }, { "epoch": 0.03199891067538126, "grad_norm": 94.12603895643487, "learning_rate": 2.557823129251701e-07, "logits/chosen": -5.043076515197754, "logits/rejected": -4.527265548706055, "logps/chosen": -1.1433395147323608, "logps/rejected": -1.2495653629302979, "loss": 5.1634, "rewards/accuracies": 0.75, "rewards/chosen": -11.433395385742188, "rewards/margins": 1.0622570514678955, "rewards/rejected": -12.495652198791504, "step": 235 }, { "epoch": 0.03213507625272331, "grad_norm": 66.94369496685466, "learning_rate": 2.568707482993197e-07, "logits/chosen": -5.487996578216553, "logits/rejected": -5.347557067871094, "logps/chosen": -0.8969215154647827, "logps/rejected": -0.9316048622131348, "loss": 4.6605, "rewards/accuracies": 0.25, "rewards/chosen": -8.96921443939209, "rewards/margins": 0.34683430194854736, "rewards/rejected": -9.316048622131348, "step": 236 }, { "epoch": 0.03227124183006536, "grad_norm": 69.2373029335222, "learning_rate": 2.5795918367346935e-07, "logits/chosen": -5.951333999633789, "logits/rejected": -6.019161224365234, "logps/chosen": -1.0771560668945312, "logps/rejected": -0.8683271408081055, "loss": 5.1164, "rewards/accuracies": 0.25, "rewards/chosen": -10.771560668945312, "rewards/margins": -2.088289260864258, "rewards/rejected": -8.683271408081055, "step": 237 }, { "epoch": 0.032407407407407406, "grad_norm": 95.09701899094627, "learning_rate": 2.5904761904761905e-07, "logits/chosen": -4.775607585906982, "logits/rejected": -4.902871131896973, "logps/chosen": -1.538007378578186, "logps/rejected": -1.1899385452270508, "loss": 6.3959, "rewards/accuracies": 0.5, "rewards/chosen": -15.380073547363281, "rewards/margins": -3.48068904876709, "rewards/rejected": -11.899385452270508, "step": 238 }, { "epoch": 0.032543572984749455, "grad_norm": 117.22008093150545, "learning_rate": 2.601360544217687e-07, "logits/chosen": -5.904189586639404, "logits/rejected": -4.84073543548584, "logps/chosen": -1.3049538135528564, "logps/rejected": -1.7636070251464844, "loss": 5.2874, "rewards/accuracies": 0.75, "rewards/chosen": -13.049537658691406, "rewards/margins": 4.5865325927734375, "rewards/rejected": -17.636070251464844, "step": 239 }, { "epoch": 0.032679738562091505, "grad_norm": 96.66844165249017, "learning_rate": 2.6122448979591836e-07, "logits/chosen": -5.6741790771484375, "logits/rejected": -5.74574089050293, "logps/chosen": -1.1981593370437622, "logps/rejected": -1.5953609943389893, "loss": 5.3706, "rewards/accuracies": 0.5, "rewards/chosen": -11.981593132019043, "rewards/margins": 3.9720165729522705, "rewards/rejected": -15.953609466552734, "step": 240 }, { "epoch": 0.03281590413943355, "grad_norm": 63.68893045904735, "learning_rate": 2.62312925170068e-07, "logits/chosen": -6.569314002990723, "logits/rejected": -5.0147905349731445, "logps/chosen": -1.0044927597045898, "logps/rejected": -1.1354849338531494, "loss": 4.0248, "rewards/accuracies": 0.75, "rewards/chosen": -10.044926643371582, "rewards/margins": 1.309922695159912, "rewards/rejected": -11.354848861694336, "step": 241 }, { "epoch": 0.0329520697167756, "grad_norm": 73.3670993478221, "learning_rate": 2.6340136054421766e-07, "logits/chosen": -5.521923542022705, "logits/rejected": -5.294943809509277, "logps/chosen": -1.1957913637161255, "logps/rejected": -0.9369980692863464, "loss": 5.1792, "rewards/accuracies": 0.5, "rewards/chosen": -11.957913398742676, "rewards/margins": -2.587932825088501, "rewards/rejected": -9.369979858398438, "step": 242 }, { "epoch": 0.03308823529411765, "grad_norm": 67.84197497193351, "learning_rate": 2.6448979591836737e-07, "logits/chosen": -6.474400043487549, "logits/rejected": -5.056929111480713, "logps/chosen": -1.1984574794769287, "logps/rejected": -1.4924933910369873, "loss": 4.6672, "rewards/accuracies": 0.75, "rewards/chosen": -11.984574317932129, "rewards/margins": 2.940359354019165, "rewards/rejected": -14.924932479858398, "step": 243 }, { "epoch": 0.0332244008714597, "grad_norm": 57.308658956394545, "learning_rate": 2.65578231292517e-07, "logits/chosen": -5.604331970214844, "logits/rejected": -5.682088851928711, "logps/chosen": -1.3401546478271484, "logps/rejected": -1.7893145084381104, "loss": 4.4413, "rewards/accuracies": 0.75, "rewards/chosen": -13.401546478271484, "rewards/margins": 4.491598606109619, "rewards/rejected": -17.893146514892578, "step": 244 }, { "epoch": 0.03336056644880174, "grad_norm": 90.77730958492798, "learning_rate": 2.666666666666666e-07, "logits/chosen": -6.021014213562012, "logits/rejected": -6.30087947845459, "logps/chosen": -1.0005228519439697, "logps/rejected": -0.9910122156143188, "loss": 5.2992, "rewards/accuracies": 0.5, "rewards/chosen": -10.005228996276855, "rewards/margins": -0.09510648250579834, "rewards/rejected": -9.91012191772461, "step": 245 }, { "epoch": 0.03349673202614379, "grad_norm": 63.547566454714264, "learning_rate": 2.677551020408163e-07, "logits/chosen": -6.363530158996582, "logits/rejected": -5.341102600097656, "logps/chosen": -1.0328192710876465, "logps/rejected": -1.2250367403030396, "loss": 4.3594, "rewards/accuracies": 0.75, "rewards/chosen": -10.328191757202148, "rewards/margins": 1.922175645828247, "rewards/rejected": -12.250368118286133, "step": 246 }, { "epoch": 0.03363289760348584, "grad_norm": 71.25722896478057, "learning_rate": 2.68843537414966e-07, "logits/chosen": -5.572047233581543, "logits/rejected": -5.132349014282227, "logps/chosen": -1.0486743450164795, "logps/rejected": -1.3887054920196533, "loss": 4.8229, "rewards/accuracies": 0.5, "rewards/chosen": -10.486742973327637, "rewards/margins": 3.400311231613159, "rewards/rejected": -13.887054443359375, "step": 247 }, { "epoch": 0.03376906318082789, "grad_norm": 48.25483382944756, "learning_rate": 2.6993197278911563e-07, "logits/chosen": -5.587971210479736, "logits/rejected": -4.888079643249512, "logps/chosen": -1.018635630607605, "logps/rejected": -1.04044771194458, "loss": 3.9453, "rewards/accuracies": 0.5, "rewards/chosen": -10.186356544494629, "rewards/margins": 0.21812069416046143, "rewards/rejected": -10.4044771194458, "step": 248 }, { "epoch": 0.03390522875816993, "grad_norm": 58.15812929388971, "learning_rate": 2.710204081632653e-07, "logits/chosen": -6.11392879486084, "logits/rejected": -5.03648567199707, "logps/chosen": -1.0052895545959473, "logps/rejected": -1.0711268186569214, "loss": 4.6013, "rewards/accuracies": 0.5, "rewards/chosen": -10.052896499633789, "rewards/margins": 0.6583713293075562, "rewards/rejected": -10.711267471313477, "step": 249 }, { "epoch": 0.03404139433551198, "grad_norm": 65.30866071072909, "learning_rate": 2.7210884353741493e-07, "logits/chosen": -6.044029235839844, "logits/rejected": -5.183504104614258, "logps/chosen": -1.089360237121582, "logps/rejected": -1.3705317974090576, "loss": 4.4488, "rewards/accuracies": 0.75, "rewards/chosen": -10.89360237121582, "rewards/margins": 2.811715602874756, "rewards/rejected": -13.705318450927734, "step": 250 }, { "epoch": 0.03417755991285403, "grad_norm": 57.4053048100066, "learning_rate": 2.7319727891156464e-07, "logits/chosen": -6.623503684997559, "logits/rejected": -5.899845600128174, "logps/chosen": -1.10856032371521, "logps/rejected": -1.375737190246582, "loss": 4.9093, "rewards/accuracies": 0.5, "rewards/chosen": -11.085603713989258, "rewards/margins": 2.6717677116394043, "rewards/rejected": -13.757370948791504, "step": 251 }, { "epoch": 0.03431372549019608, "grad_norm": 68.93087216361234, "learning_rate": 2.742857142857143e-07, "logits/chosen": -4.927043914794922, "logits/rejected": -4.6741533279418945, "logps/chosen": -1.3462625741958618, "logps/rejected": -1.3670933246612549, "loss": 5.0532, "rewards/accuracies": 0.5, "rewards/chosen": -13.462625503540039, "rewards/margins": 0.20830845832824707, "rewards/rejected": -13.670934677124023, "step": 252 }, { "epoch": 0.034449891067538124, "grad_norm": 78.19863175129578, "learning_rate": 2.7537414965986394e-07, "logits/chosen": -9.363874435424805, "logits/rejected": -7.845788955688477, "logps/chosen": -0.8910379409790039, "logps/rejected": -1.08247709274292, "loss": 5.3969, "rewards/accuracies": 0.5, "rewards/chosen": -8.910379409790039, "rewards/margins": 1.914391040802002, "rewards/rejected": -10.824769973754883, "step": 253 }, { "epoch": 0.034586056644880174, "grad_norm": 98.14738407838406, "learning_rate": 2.764625850340136e-07, "logits/chosen": -5.11147403717041, "logits/rejected": -4.387681484222412, "logps/chosen": -1.1952247619628906, "logps/rejected": -1.1681249141693115, "loss": 5.6966, "rewards/accuracies": 0.5, "rewards/chosen": -11.95224666595459, "rewards/margins": -0.27099788188934326, "rewards/rejected": -11.681249618530273, "step": 254 }, { "epoch": 0.034722222222222224, "grad_norm": 77.57095842737702, "learning_rate": 2.7755102040816325e-07, "logits/chosen": -4.062272071838379, "logits/rejected": -5.8715410232543945, "logps/chosen": -1.1749825477600098, "logps/rejected": -1.3008275032043457, "loss": 5.1191, "rewards/accuracies": 0.5, "rewards/chosen": -11.749824523925781, "rewards/margins": 1.2584497928619385, "rewards/rejected": -13.00827407836914, "step": 255 }, { "epoch": 0.034858387799564274, "grad_norm": 88.12204704195358, "learning_rate": 2.786394557823129e-07, "logits/chosen": -4.978282928466797, "logits/rejected": -5.032160758972168, "logps/chosen": -1.1378146409988403, "logps/rejected": -1.1246583461761475, "loss": 3.9981, "rewards/accuracies": 0.5, "rewards/chosen": -11.37814712524414, "rewards/margins": -0.13156390190124512, "rewards/rejected": -11.246583938598633, "step": 256 }, { "epoch": 0.034994553376906316, "grad_norm": 73.8351288098107, "learning_rate": 2.797278911564626e-07, "logits/chosen": -6.188664436340332, "logits/rejected": -6.1422576904296875, "logps/chosen": -1.3975176811218262, "logps/rejected": -1.3318121433258057, "loss": 4.9609, "rewards/accuracies": 0.25, "rewards/chosen": -13.975176811218262, "rewards/margins": -0.6570560932159424, "rewards/rejected": -13.318120956420898, "step": 257 }, { "epoch": 0.035130718954248366, "grad_norm": 77.04059209661084, "learning_rate": 2.808163265306122e-07, "logits/chosen": -3.5865583419799805, "logits/rejected": -5.211180210113525, "logps/chosen": -1.1150599718093872, "logps/rejected": -1.1279733180999756, "loss": 4.7276, "rewards/accuracies": 0.75, "rewards/chosen": -11.150598526000977, "rewards/margins": 0.1291351318359375, "rewards/rejected": -11.279733657836914, "step": 258 }, { "epoch": 0.035266884531590416, "grad_norm": 66.66712116757404, "learning_rate": 2.819047619047619e-07, "logits/chosen": -5.319080352783203, "logits/rejected": -4.613959789276123, "logps/chosen": -1.2092119455337524, "logps/rejected": -1.38112473487854, "loss": 4.4729, "rewards/accuracies": 0.25, "rewards/chosen": -12.092119216918945, "rewards/margins": 1.7191277742385864, "rewards/rejected": -13.811246871948242, "step": 259 }, { "epoch": 0.03540305010893246, "grad_norm": 53.26694010940253, "learning_rate": 2.8299319727891156e-07, "logits/chosen": -7.482570648193359, "logits/rejected": -6.389812469482422, "logps/chosen": -1.184424877166748, "logps/rejected": -1.3781447410583496, "loss": 4.2651, "rewards/accuracies": 0.5, "rewards/chosen": -11.844247817993164, "rewards/margins": 1.9371987581253052, "rewards/rejected": -13.78144645690918, "step": 260 }, { "epoch": 0.03553921568627451, "grad_norm": 54.176787649214354, "learning_rate": 2.840816326530612e-07, "logits/chosen": -5.978750228881836, "logits/rejected": -5.551822662353516, "logps/chosen": -0.9772324562072754, "logps/rejected": -1.0748043060302734, "loss": 4.8221, "rewards/accuracies": 1.0, "rewards/chosen": -9.772324562072754, "rewards/margins": 0.9757180213928223, "rewards/rejected": -10.748042106628418, "step": 261 }, { "epoch": 0.03567538126361656, "grad_norm": 66.664806814064, "learning_rate": 2.8517006802721087e-07, "logits/chosen": -5.673116683959961, "logits/rejected": -5.13516902923584, "logps/chosen": -1.2682722806930542, "logps/rejected": -1.5272258520126343, "loss": 4.9832, "rewards/accuracies": 0.75, "rewards/chosen": -12.682723045349121, "rewards/margins": 2.5895354747772217, "rewards/rejected": -15.272258758544922, "step": 262 }, { "epoch": 0.03581154684095861, "grad_norm": 59.91748095731296, "learning_rate": 2.862585034013605e-07, "logits/chosen": -6.447942733764648, "logits/rejected": -5.6805524826049805, "logps/chosen": -0.9962539076805115, "logps/rejected": -1.2909497022628784, "loss": 4.461, "rewards/accuracies": 0.75, "rewards/chosen": -9.962538719177246, "rewards/margins": 2.946958541870117, "rewards/rejected": -12.909497261047363, "step": 263 }, { "epoch": 0.03594771241830065, "grad_norm": 55.97479846461005, "learning_rate": 2.8734693877551017e-07, "logits/chosen": -5.6748762130737305, "logits/rejected": -5.556204319000244, "logps/chosen": -1.0425621271133423, "logps/rejected": -0.7304791808128357, "loss": 4.7079, "rewards/accuracies": 0.25, "rewards/chosen": -10.425621032714844, "rewards/margins": -3.1208291053771973, "rewards/rejected": -7.3047919273376465, "step": 264 }, { "epoch": 0.0360838779956427, "grad_norm": 61.16058931205353, "learning_rate": 2.884353741496599e-07, "logits/chosen": -6.118993759155273, "logits/rejected": -5.219854354858398, "logps/chosen": -1.3427917957305908, "logps/rejected": -1.3493716716766357, "loss": 4.8154, "rewards/accuracies": 0.25, "rewards/chosen": -13.42791748046875, "rewards/margins": 0.06579875946044922, "rewards/rejected": -13.493717193603516, "step": 265 }, { "epoch": 0.03622004357298475, "grad_norm": 63.515818854204575, "learning_rate": 2.8952380952380953e-07, "logits/chosen": -5.471975326538086, "logits/rejected": -5.962667465209961, "logps/chosen": -1.1339610815048218, "logps/rejected": -0.893965482711792, "loss": 4.9408, "rewards/accuracies": 0.25, "rewards/chosen": -11.339611053466797, "rewards/margins": -2.3999555110931396, "rewards/rejected": -8.939655303955078, "step": 266 }, { "epoch": 0.0363562091503268, "grad_norm": 89.28554828495842, "learning_rate": 2.906122448979592e-07, "logits/chosen": -5.867156028747559, "logits/rejected": -5.653191566467285, "logps/chosen": -1.7523956298828125, "logps/rejected": -1.2520313262939453, "loss": 5.7334, "rewards/accuracies": 0.0, "rewards/chosen": -17.523956298828125, "rewards/margins": -5.003640174865723, "rewards/rejected": -12.52031421661377, "step": 267 }, { "epoch": 0.03649237472766884, "grad_norm": 60.89498108842642, "learning_rate": 2.9170068027210883e-07, "logits/chosen": -7.2450761795043945, "logits/rejected": -7.492772102355957, "logps/chosen": -0.693792998790741, "logps/rejected": -0.5410770177841187, "loss": 4.2041, "rewards/accuracies": 0.5, "rewards/chosen": -6.937929630279541, "rewards/margins": -1.5271596908569336, "rewards/rejected": -5.410769939422607, "step": 268 }, { "epoch": 0.03662854030501089, "grad_norm": 73.40782375570238, "learning_rate": 2.927891156462585e-07, "logits/chosen": -5.03182315826416, "logits/rejected": -5.203419208526611, "logps/chosen": -0.8716521263122559, "logps/rejected": -1.2112388610839844, "loss": 5.1114, "rewards/accuracies": 0.5, "rewards/chosen": -8.716522216796875, "rewards/margins": 3.3958678245544434, "rewards/rejected": -12.11238956451416, "step": 269 }, { "epoch": 0.03676470588235294, "grad_norm": 82.4989027353636, "learning_rate": 2.9387755102040814e-07, "logits/chosen": -5.914813041687012, "logits/rejected": -6.236295700073242, "logps/chosen": -1.0074737071990967, "logps/rejected": -1.2283413410186768, "loss": 4.345, "rewards/accuracies": 0.75, "rewards/chosen": -10.074736595153809, "rewards/margins": 2.208677291870117, "rewards/rejected": -12.283413887023926, "step": 270 }, { "epoch": 0.03690087145969499, "grad_norm": 56.683897816670466, "learning_rate": 2.949659863945578e-07, "logits/chosen": -5.1891303062438965, "logits/rejected": -4.342794418334961, "logps/chosen": -1.2542762756347656, "logps/rejected": -1.4795784950256348, "loss": 4.9206, "rewards/accuracies": 0.75, "rewards/chosen": -12.542762756347656, "rewards/margins": 2.253021717071533, "rewards/rejected": -14.795784950256348, "step": 271 }, { "epoch": 0.037037037037037035, "grad_norm": 73.00412768811887, "learning_rate": 2.9605442176870744e-07, "logits/chosen": -5.744339942932129, "logits/rejected": -4.7394890785217285, "logps/chosen": -1.1511955261230469, "logps/rejected": -1.0473639965057373, "loss": 4.8546, "rewards/accuracies": 0.5, "rewards/chosen": -11.511956214904785, "rewards/margins": -1.0383156538009644, "rewards/rejected": -10.473640441894531, "step": 272 }, { "epoch": 0.037173202614379085, "grad_norm": 47.943558420834904, "learning_rate": 2.9714285714285715e-07, "logits/chosen": -4.224227428436279, "logits/rejected": -4.2890095710754395, "logps/chosen": -1.323521614074707, "logps/rejected": -1.2335691452026367, "loss": 5.0834, "rewards/accuracies": 0.5, "rewards/chosen": -13.23521614074707, "rewards/margins": -0.899524450302124, "rewards/rejected": -12.335691452026367, "step": 273 }, { "epoch": 0.037309368191721135, "grad_norm": 45.23249096934942, "learning_rate": 2.982312925170068e-07, "logits/chosen": -5.175953388214111, "logits/rejected": -5.670047760009766, "logps/chosen": -1.3690056800842285, "logps/rejected": -1.2710869312286377, "loss": 4.4505, "rewards/accuracies": 0.75, "rewards/chosen": -13.690056800842285, "rewards/margins": -0.9791874885559082, "rewards/rejected": -12.710868835449219, "step": 274 }, { "epoch": 0.037445533769063184, "grad_norm": 101.81107993656474, "learning_rate": 2.9931972789115645e-07, "logits/chosen": -6.438827991485596, "logits/rejected": -4.477993488311768, "logps/chosen": -1.4407565593719482, "logps/rejected": -2.0651755332946777, "loss": 5.1641, "rewards/accuracies": 0.5, "rewards/chosen": -14.40756607055664, "rewards/margins": 6.24418830871582, "rewards/rejected": -20.65175437927246, "step": 275 }, { "epoch": 0.03758169934640523, "grad_norm": 50.52675591009575, "learning_rate": 3.004081632653061e-07, "logits/chosen": -6.486952781677246, "logits/rejected": -6.977458953857422, "logps/chosen": -1.0360772609710693, "logps/rejected": -1.4686963558197021, "loss": 4.6235, "rewards/accuracies": 0.5, "rewards/chosen": -10.360772132873535, "rewards/margins": 4.326190948486328, "rewards/rejected": -14.686963081359863, "step": 276 }, { "epoch": 0.03771786492374728, "grad_norm": 59.04572550867243, "learning_rate": 3.0149659863945576e-07, "logits/chosen": -6.62136173248291, "logits/rejected": -6.684057235717773, "logps/chosen": -1.1884666681289673, "logps/rejected": -1.4418880939483643, "loss": 5.0253, "rewards/accuracies": 0.75, "rewards/chosen": -11.88466739654541, "rewards/margins": 2.534213066101074, "rewards/rejected": -14.418880462646484, "step": 277 }, { "epoch": 0.03785403050108933, "grad_norm": 51.474232343825726, "learning_rate": 3.0258503401360546e-07, "logits/chosen": -4.66886043548584, "logits/rejected": -4.881121635437012, "logps/chosen": -1.4682432413101196, "logps/rejected": -0.8505520820617676, "loss": 4.6748, "rewards/accuracies": 0.0, "rewards/chosen": -14.682432174682617, "rewards/margins": -6.176910877227783, "rewards/rejected": -8.505520820617676, "step": 278 }, { "epoch": 0.03799019607843137, "grad_norm": 59.46845057819673, "learning_rate": 3.0367346938775506e-07, "logits/chosen": -4.250846862792969, "logits/rejected": -4.090045928955078, "logps/chosen": -0.9794827699661255, "logps/rejected": -0.9620204567909241, "loss": 5.0954, "rewards/accuracies": 0.5, "rewards/chosen": -9.79482650756836, "rewards/margins": -0.1746230125427246, "rewards/rejected": -9.62020492553711, "step": 279 }, { "epoch": 0.03812636165577342, "grad_norm": 81.45401538772694, "learning_rate": 3.047619047619047e-07, "logits/chosen": -5.700243949890137, "logits/rejected": -5.550249099731445, "logps/chosen": -1.2983471155166626, "logps/rejected": -1.1378917694091797, "loss": 5.4449, "rewards/accuracies": 0.25, "rewards/chosen": -12.983470916748047, "rewards/margins": -1.604552984237671, "rewards/rejected": -11.378917694091797, "step": 280 }, { "epoch": 0.03826252723311547, "grad_norm": 51.746731100736916, "learning_rate": 3.058503401360544e-07, "logits/chosen": -5.497616767883301, "logits/rejected": -5.22199010848999, "logps/chosen": -1.2074272632598877, "logps/rejected": -1.392014503479004, "loss": 4.2037, "rewards/accuracies": 0.5, "rewards/chosen": -12.074272155761719, "rewards/margins": 1.845873475074768, "rewards/rejected": -13.920145988464355, "step": 281 }, { "epoch": 0.03839869281045752, "grad_norm": 81.42446689293689, "learning_rate": 3.0693877551020407e-07, "logits/chosen": -5.306422710418701, "logits/rejected": -7.0699462890625, "logps/chosen": -1.1654083728790283, "logps/rejected": -1.1180654764175415, "loss": 4.2958, "rewards/accuracies": 0.5, "rewards/chosen": -11.654083251953125, "rewards/margins": -0.4734284281730652, "rewards/rejected": -11.180654525756836, "step": 282 }, { "epoch": 0.03853485838779956, "grad_norm": 85.28601029015509, "learning_rate": 3.080272108843537e-07, "logits/chosen": -5.311562538146973, "logits/rejected": -3.963003158569336, "logps/chosen": -1.0550737380981445, "logps/rejected": -1.9785308837890625, "loss": 5.7733, "rewards/accuracies": 1.0, "rewards/chosen": -10.550737380981445, "rewards/margins": 9.23457145690918, "rewards/rejected": -19.785308837890625, "step": 283 }, { "epoch": 0.03867102396514161, "grad_norm": 82.89479518815811, "learning_rate": 3.091156462585034e-07, "logits/chosen": -4.117852687835693, "logits/rejected": -3.5049514770507812, "logps/chosen": -1.2195240259170532, "logps/rejected": -1.4119523763656616, "loss": 5.0577, "rewards/accuracies": 0.25, "rewards/chosen": -12.195240020751953, "rewards/margins": 1.924283504486084, "rewards/rejected": -14.119524002075195, "step": 284 }, { "epoch": 0.03880718954248366, "grad_norm": 60.26087213923758, "learning_rate": 3.1020408163265303e-07, "logits/chosen": -7.015053749084473, "logits/rejected": -6.443515777587891, "logps/chosen": -0.6964254379272461, "logps/rejected": -1.0593734979629517, "loss": 4.6616, "rewards/accuracies": 0.5, "rewards/chosen": -6.964254379272461, "rewards/margins": 3.6294798851013184, "rewards/rejected": -10.593734741210938, "step": 285 }, { "epoch": 0.03894335511982571, "grad_norm": 76.71687439288748, "learning_rate": 3.1129251700680274e-07, "logits/chosen": -5.824742317199707, "logits/rejected": -5.608431816101074, "logps/chosen": -1.022223949432373, "logps/rejected": -1.494560718536377, "loss": 5.3093, "rewards/accuracies": 0.75, "rewards/chosen": -10.222238540649414, "rewards/margins": 4.723368167877197, "rewards/rejected": -14.945606231689453, "step": 286 }, { "epoch": 0.039079520697167754, "grad_norm": 48.11281649992343, "learning_rate": 3.123809523809524e-07, "logits/chosen": -6.561643600463867, "logits/rejected": -5.773690223693848, "logps/chosen": -1.1647915840148926, "logps/rejected": -1.3229095935821533, "loss": 4.3772, "rewards/accuracies": 0.5, "rewards/chosen": -11.647916793823242, "rewards/margins": 1.581178903579712, "rewards/rejected": -13.229095458984375, "step": 287 }, { "epoch": 0.0392156862745098, "grad_norm": 101.91462629033805, "learning_rate": 3.13469387755102e-07, "logits/chosen": -6.1120195388793945, "logits/rejected": -4.842982769012451, "logps/chosen": -1.114365577697754, "logps/rejected": -1.148498773574829, "loss": 5.1493, "rewards/accuracies": 0.5, "rewards/chosen": -11.143655776977539, "rewards/margins": 0.34133267402648926, "rewards/rejected": -11.48498821258545, "step": 288 }, { "epoch": 0.03935185185185185, "grad_norm": 58.586709869111985, "learning_rate": 3.145578231292517e-07, "logits/chosen": -5.140995979309082, "logits/rejected": -5.744998931884766, "logps/chosen": -1.2991633415222168, "logps/rejected": -0.978131890296936, "loss": 4.7759, "rewards/accuracies": 0.25, "rewards/chosen": -12.991634368896484, "rewards/margins": -3.2103145122528076, "rewards/rejected": -9.781318664550781, "step": 289 }, { "epoch": 0.0394880174291939, "grad_norm": 85.71902529513093, "learning_rate": 3.1564625850340134e-07, "logits/chosen": -6.510519504547119, "logits/rejected": -5.653317451477051, "logps/chosen": -0.7901195287704468, "logps/rejected": -0.8980304598808289, "loss": 5.4417, "rewards/accuracies": 0.5, "rewards/chosen": -7.901196002960205, "rewards/margins": 1.0791089534759521, "rewards/rejected": -8.980304718017578, "step": 290 }, { "epoch": 0.039624183006535946, "grad_norm": 63.6967584546626, "learning_rate": 3.16734693877551e-07, "logits/chosen": -5.364646911621094, "logits/rejected": -5.718958854675293, "logps/chosen": -0.8741910457611084, "logps/rejected": -0.9376577138900757, "loss": 5.5728, "rewards/accuracies": 0.5, "rewards/chosen": -8.741909980773926, "rewards/margins": 0.634666919708252, "rewards/rejected": -9.376577377319336, "step": 291 }, { "epoch": 0.039760348583877995, "grad_norm": 124.29152590483952, "learning_rate": 3.1782312925170065e-07, "logits/chosen": -6.3695173263549805, "logits/rejected": -6.648038864135742, "logps/chosen": -1.1529583930969238, "logps/rejected": -0.9785535335540771, "loss": 4.5874, "rewards/accuracies": 0.25, "rewards/chosen": -11.529583930969238, "rewards/margins": -1.7440489530563354, "rewards/rejected": -9.78553581237793, "step": 292 }, { "epoch": 0.039896514161220045, "grad_norm": 63.53746644444534, "learning_rate": 3.189115646258503e-07, "logits/chosen": -4.745383262634277, "logits/rejected": -4.805789947509766, "logps/chosen": -1.211514949798584, "logps/rejected": -1.1108471155166626, "loss": 4.4226, "rewards/accuracies": 0.5, "rewards/chosen": -12.115150451660156, "rewards/margins": -1.006678819656372, "rewards/rejected": -11.108470916748047, "step": 293 }, { "epoch": 0.04003267973856209, "grad_norm": 71.56180968292765, "learning_rate": 3.2e-07, "logits/chosen": -4.7049102783203125, "logits/rejected": -4.994217872619629, "logps/chosen": -1.4565553665161133, "logps/rejected": -1.3495374917984009, "loss": 4.8107, "rewards/accuracies": 0.5, "rewards/chosen": -14.56555461883545, "rewards/margins": -1.0701792240142822, "rewards/rejected": -13.49537467956543, "step": 294 }, { "epoch": 0.04016884531590414, "grad_norm": 64.27970013723564, "learning_rate": 3.2108843537414966e-07, "logits/chosen": -3.9368350505828857, "logits/rejected": -4.138067245483398, "logps/chosen": -1.4147156476974487, "logps/rejected": -1.2015376091003418, "loss": 4.8499, "rewards/accuracies": 0.25, "rewards/chosen": -14.147156715393066, "rewards/margins": -2.1317806243896484, "rewards/rejected": -12.015376091003418, "step": 295 }, { "epoch": 0.04030501089324619, "grad_norm": 69.40538567174809, "learning_rate": 3.221768707482993e-07, "logits/chosen": -6.214529991149902, "logits/rejected": -4.970919609069824, "logps/chosen": -0.8570600152015686, "logps/rejected": -1.350068211555481, "loss": 4.6123, "rewards/accuracies": 0.75, "rewards/chosen": -8.570600509643555, "rewards/margins": 4.930081367492676, "rewards/rejected": -13.500682830810547, "step": 296 }, { "epoch": 0.04044117647058824, "grad_norm": 62.912887250279155, "learning_rate": 3.2326530612244896e-07, "logits/chosen": -5.182707786560059, "logits/rejected": -5.297575950622559, "logps/chosen": -1.0879995822906494, "logps/rejected": -1.397923231124878, "loss": 4.6399, "rewards/accuracies": 0.75, "rewards/chosen": -10.879995346069336, "rewards/margins": 3.099235773086548, "rewards/rejected": -13.979232788085938, "step": 297 }, { "epoch": 0.04057734204793028, "grad_norm": 56.44378818566525, "learning_rate": 3.243537414965986e-07, "logits/chosen": -7.36495304107666, "logits/rejected": -6.3211565017700195, "logps/chosen": -0.8207796812057495, "logps/rejected": -1.0906308889389038, "loss": 4.9065, "rewards/accuracies": 0.75, "rewards/chosen": -8.207796096801758, "rewards/margins": 2.698512315750122, "rewards/rejected": -10.906309127807617, "step": 298 }, { "epoch": 0.04071350762527233, "grad_norm": 75.23329800036315, "learning_rate": 3.2544217687074827e-07, "logits/chosen": -5.045053482055664, "logits/rejected": -4.893669128417969, "logps/chosen": -1.4212563037872314, "logps/rejected": -1.2317540645599365, "loss": 5.4285, "rewards/accuracies": 0.5, "rewards/chosen": -14.212562561035156, "rewards/margins": -1.8950209617614746, "rewards/rejected": -12.317541122436523, "step": 299 }, { "epoch": 0.04084967320261438, "grad_norm": 53.10982447079129, "learning_rate": 3.26530612244898e-07, "logits/chosen": -4.828580379486084, "logits/rejected": -4.032795429229736, "logps/chosen": -1.0820941925048828, "logps/rejected": -1.4866735935211182, "loss": 5.3607, "rewards/accuracies": 0.5, "rewards/chosen": -10.820941925048828, "rewards/margins": 4.045794486999512, "rewards/rejected": -14.86673641204834, "step": 300 }, { "epoch": 0.04098583877995643, "grad_norm": 66.96904129367707, "learning_rate": 3.2761904761904757e-07, "logits/chosen": -4.872203826904297, "logits/rejected": -4.1192426681518555, "logps/chosen": -1.389347791671753, "logps/rejected": -2.259610652923584, "loss": 4.5198, "rewards/accuracies": 0.75, "rewards/chosen": -13.893478393554688, "rewards/margins": 8.702627182006836, "rewards/rejected": -22.596105575561523, "step": 301 }, { "epoch": 0.04112200435729847, "grad_norm": 73.52684445258411, "learning_rate": 3.287074829931973e-07, "logits/chosen": -8.252922058105469, "logits/rejected": -6.003087043762207, "logps/chosen": -0.6266273856163025, "logps/rejected": -0.892977237701416, "loss": 5.399, "rewards/accuracies": 1.0, "rewards/chosen": -6.2662739753723145, "rewards/margins": 2.6634984016418457, "rewards/rejected": -8.92977237701416, "step": 302 }, { "epoch": 0.04125816993464052, "grad_norm": 64.7885280541169, "learning_rate": 3.2979591836734693e-07, "logits/chosen": -5.001124858856201, "logits/rejected": -5.295206546783447, "logps/chosen": -1.039398193359375, "logps/rejected": -1.0232157707214355, "loss": 4.858, "rewards/accuracies": 0.5, "rewards/chosen": -10.39398193359375, "rewards/margins": -0.16182303428649902, "rewards/rejected": -10.232158660888672, "step": 303 }, { "epoch": 0.04139433551198257, "grad_norm": 53.44314392064221, "learning_rate": 3.308843537414966e-07, "logits/chosen": -3.8118338584899902, "logits/rejected": -3.767061471939087, "logps/chosen": -1.2660624980926514, "logps/rejected": -1.5525192022323608, "loss": 4.3837, "rewards/accuracies": 0.75, "rewards/chosen": -12.660625457763672, "rewards/margins": 2.8645665645599365, "rewards/rejected": -15.525192260742188, "step": 304 }, { "epoch": 0.04153050108932462, "grad_norm": 111.33391115443968, "learning_rate": 3.3197278911564624e-07, "logits/chosen": -5.365604400634766, "logits/rejected": -4.919011116027832, "logps/chosen": -0.9577054977416992, "logps/rejected": -1.2963093519210815, "loss": 5.3884, "rewards/accuracies": 0.75, "rewards/chosen": -9.577054977416992, "rewards/margins": 3.386038064956665, "rewards/rejected": -12.963092803955078, "step": 305 }, { "epoch": 0.041666666666666664, "grad_norm": 68.26294673178148, "learning_rate": 3.330612244897959e-07, "logits/chosen": -5.444345951080322, "logits/rejected": -5.243512153625488, "logps/chosen": -1.289650797843933, "logps/rejected": -1.1728914976119995, "loss": 4.8367, "rewards/accuracies": 0.25, "rewards/chosen": -12.89650821685791, "rewards/margins": -1.167593240737915, "rewards/rejected": -11.728914260864258, "step": 306 }, { "epoch": 0.041802832244008714, "grad_norm": 45.83251318701521, "learning_rate": 3.3414965986394554e-07, "logits/chosen": -5.622945785522461, "logits/rejected": -4.702096462249756, "logps/chosen": -0.8104870319366455, "logps/rejected": -1.3166258335113525, "loss": 4.1325, "rewards/accuracies": 1.0, "rewards/chosen": -8.104870796203613, "rewards/margins": 5.0613861083984375, "rewards/rejected": -13.16625690460205, "step": 307 }, { "epoch": 0.041938997821350764, "grad_norm": 65.01352811772412, "learning_rate": 3.3523809523809525e-07, "logits/chosen": -4.006753921508789, "logits/rejected": -4.324404239654541, "logps/chosen": -1.6145024299621582, "logps/rejected": -1.6190104484558105, "loss": 4.6983, "rewards/accuracies": 0.75, "rewards/chosen": -16.1450252532959, "rewards/margins": 0.04508066177368164, "rewards/rejected": -16.190105438232422, "step": 308 }, { "epoch": 0.042075163398692814, "grad_norm": 73.40716928272855, "learning_rate": 3.363265306122449e-07, "logits/chosen": -4.923980712890625, "logits/rejected": -4.357913017272949, "logps/chosen": -1.2101860046386719, "logps/rejected": -1.0742626190185547, "loss": 4.9399, "rewards/accuracies": 0.25, "rewards/chosen": -12.101859092712402, "rewards/margins": -1.3592329025268555, "rewards/rejected": -10.742626190185547, "step": 309 }, { "epoch": 0.042211328976034856, "grad_norm": 86.64242989388097, "learning_rate": 3.3741496598639455e-07, "logits/chosen": -5.090981960296631, "logits/rejected": -4.790233135223389, "logps/chosen": -1.2187621593475342, "logps/rejected": -1.46649968624115, "loss": 5.7045, "rewards/accuracies": 0.5, "rewards/chosen": -12.1876220703125, "rewards/margins": 2.47737455368042, "rewards/rejected": -14.664997100830078, "step": 310 }, { "epoch": 0.042347494553376906, "grad_norm": 84.52761647877323, "learning_rate": 3.385034013605442e-07, "logits/chosen": -4.32208251953125, "logits/rejected": -4.260677337646484, "logps/chosen": -1.5720665454864502, "logps/rejected": -1.6891428232192993, "loss": 4.703, "rewards/accuracies": 0.5, "rewards/chosen": -15.720664978027344, "rewards/margins": 1.1707630157470703, "rewards/rejected": -16.891427993774414, "step": 311 }, { "epoch": 0.042483660130718956, "grad_norm": 56.415207724586466, "learning_rate": 3.3959183673469385e-07, "logits/chosen": -4.906296730041504, "logits/rejected": -3.4032950401306152, "logps/chosen": -1.1015807390213013, "logps/rejected": -1.6876873970031738, "loss": 4.575, "rewards/accuracies": 1.0, "rewards/chosen": -11.015806198120117, "rewards/margins": 5.861067771911621, "rewards/rejected": -16.876874923706055, "step": 312 }, { "epoch": 0.042619825708061, "grad_norm": 112.02772938904974, "learning_rate": 3.4068027210884356e-07, "logits/chosen": -5.677821636199951, "logits/rejected": -4.527283668518066, "logps/chosen": -1.5534473657608032, "logps/rejected": -1.644795536994934, "loss": 4.8996, "rewards/accuracies": 0.5, "rewards/chosen": -15.534473419189453, "rewards/margins": 0.9134814739227295, "rewards/rejected": -16.447956085205078, "step": 313 }, { "epoch": 0.04275599128540305, "grad_norm": 67.18467578850294, "learning_rate": 3.4176870748299316e-07, "logits/chosen": -5.308394908905029, "logits/rejected": -5.2597455978393555, "logps/chosen": -0.9776167869567871, "logps/rejected": -1.4059151411056519, "loss": 4.9014, "rewards/accuracies": 1.0, "rewards/chosen": -9.776167869567871, "rewards/margins": 4.282983303070068, "rewards/rejected": -14.059150695800781, "step": 314 }, { "epoch": 0.0428921568627451, "grad_norm": 75.33241825063263, "learning_rate": 3.428571428571428e-07, "logits/chosen": -5.659082412719727, "logits/rejected": -4.746170997619629, "logps/chosen": -1.3662632703781128, "logps/rejected": -1.3791353702545166, "loss": 4.9001, "rewards/accuracies": 0.5, "rewards/chosen": -13.66263198852539, "rewards/margins": 0.12872076034545898, "rewards/rejected": -13.791353225708008, "step": 315 }, { "epoch": 0.04302832244008715, "grad_norm": 56.62935616086173, "learning_rate": 3.439455782312925e-07, "logits/chosen": -4.99162483215332, "logits/rejected": -4.908379554748535, "logps/chosen": -0.9431143999099731, "logps/rejected": -0.9593716263771057, "loss": 4.2346, "rewards/accuracies": 0.5, "rewards/chosen": -9.431144714355469, "rewards/margins": 0.16257143020629883, "rewards/rejected": -9.59371566772461, "step": 316 }, { "epoch": 0.04316448801742919, "grad_norm": 75.45895738515944, "learning_rate": 3.4503401360544217e-07, "logits/chosen": -5.662214279174805, "logits/rejected": -4.625615119934082, "logps/chosen": -2.197333335876465, "logps/rejected": -1.1858606338500977, "loss": 5.417, "rewards/accuracies": 0.5, "rewards/chosen": -21.97333526611328, "rewards/margins": -10.114727973937988, "rewards/rejected": -11.858606338500977, "step": 317 }, { "epoch": 0.04330065359477124, "grad_norm": 69.00190768507453, "learning_rate": 3.461224489795918e-07, "logits/chosen": -6.073953628540039, "logits/rejected": -5.639466762542725, "logps/chosen": -0.8199307322502136, "logps/rejected": -0.9989339113235474, "loss": 6.7057, "rewards/accuracies": 0.75, "rewards/chosen": -8.19930648803711, "rewards/margins": 1.790032148361206, "rewards/rejected": -9.989338874816895, "step": 318 }, { "epoch": 0.04343681917211329, "grad_norm": 47.132847487006686, "learning_rate": 3.472108843537415e-07, "logits/chosen": -5.244969367980957, "logits/rejected": -4.629319190979004, "logps/chosen": -0.9945318102836609, "logps/rejected": -0.9217130541801453, "loss": 5.2587, "rewards/accuracies": 0.5, "rewards/chosen": -9.945318222045898, "rewards/margins": -0.7281875610351562, "rewards/rejected": -9.217130661010742, "step": 319 }, { "epoch": 0.04357298474945534, "grad_norm": 80.67572060200001, "learning_rate": 3.482993197278911e-07, "logits/chosen": -6.84930419921875, "logits/rejected": -5.082592964172363, "logps/chosen": -0.8338396549224854, "logps/rejected": -0.7946495413780212, "loss": 4.6799, "rewards/accuracies": 0.25, "rewards/chosen": -8.338396072387695, "rewards/margins": -0.39190101623535156, "rewards/rejected": -7.946495056152344, "step": 320 }, { "epoch": 0.04370915032679738, "grad_norm": 53.715382075407234, "learning_rate": 3.4938775510204083e-07, "logits/chosen": -4.414486885070801, "logits/rejected": -4.325369834899902, "logps/chosen": -1.1800625324249268, "logps/rejected": -1.2570714950561523, "loss": 4.3018, "rewards/accuracies": 0.75, "rewards/chosen": -11.80062484741211, "rewards/margins": 0.7700886726379395, "rewards/rejected": -12.570714950561523, "step": 321 }, { "epoch": 0.04384531590413943, "grad_norm": 76.3922283044073, "learning_rate": 3.504761904761905e-07, "logits/chosen": -5.061430931091309, "logits/rejected": -5.193187713623047, "logps/chosen": -0.9471786022186279, "logps/rejected": -1.0105222463607788, "loss": 4.7564, "rewards/accuracies": 0.75, "rewards/chosen": -9.471786499023438, "rewards/margins": 0.6334362030029297, "rewards/rejected": -10.10522174835205, "step": 322 }, { "epoch": 0.04398148148148148, "grad_norm": 78.80229104300813, "learning_rate": 3.515646258503401e-07, "logits/chosen": -3.73931884765625, "logits/rejected": -5.390448570251465, "logps/chosen": -1.020543098449707, "logps/rejected": -0.8303165435791016, "loss": 4.9138, "rewards/accuracies": 0.25, "rewards/chosen": -10.20543098449707, "rewards/margins": -1.902264952659607, "rewards/rejected": -8.303165435791016, "step": 323 }, { "epoch": 0.04411764705882353, "grad_norm": 82.43983763027742, "learning_rate": 3.526530612244898e-07, "logits/chosen": -4.164395809173584, "logits/rejected": -4.700032711029053, "logps/chosen": -1.0258872509002686, "logps/rejected": -1.0510218143463135, "loss": 5.5313, "rewards/accuracies": 0.75, "rewards/chosen": -10.25887393951416, "rewards/margins": 0.25134384632110596, "rewards/rejected": -10.510217666625977, "step": 324 }, { "epoch": 0.044253812636165575, "grad_norm": 65.96171987316494, "learning_rate": 3.5374149659863944e-07, "logits/chosen": -4.379007339477539, "logits/rejected": -7.247016906738281, "logps/chosen": -1.0006804466247559, "logps/rejected": -0.7425208687782288, "loss": 4.8815, "rewards/accuracies": 0.5, "rewards/chosen": -10.006805419921875, "rewards/margins": -2.5815958976745605, "rewards/rejected": -7.425209045410156, "step": 325 }, { "epoch": 0.044389978213507625, "grad_norm": 67.045670852833, "learning_rate": 3.5482993197278915e-07, "logits/chosen": -4.473821640014648, "logits/rejected": -5.27337646484375, "logps/chosen": -1.364248275756836, "logps/rejected": -0.941375195980072, "loss": 5.2318, "rewards/accuracies": 0.0, "rewards/chosen": -13.64248275756836, "rewards/margins": -4.228731155395508, "rewards/rejected": -9.413751602172852, "step": 326 }, { "epoch": 0.044526143790849675, "grad_norm": 75.04393921776482, "learning_rate": 3.5591836734693875e-07, "logits/chosen": -5.228488445281982, "logits/rejected": -4.250424385070801, "logps/chosen": -1.8979551792144775, "logps/rejected": -1.6222800016403198, "loss": 4.702, "rewards/accuracies": 0.5, "rewards/chosen": -18.979551315307617, "rewards/margins": -2.756751537322998, "rewards/rejected": -16.222801208496094, "step": 327 }, { "epoch": 0.044662309368191724, "grad_norm": 49.55325983822202, "learning_rate": 3.570068027210884e-07, "logits/chosen": -5.328608989715576, "logits/rejected": -4.373261451721191, "logps/chosen": -1.0470333099365234, "logps/rejected": -1.5911309719085693, "loss": 4.4238, "rewards/accuracies": 0.5, "rewards/chosen": -10.470333099365234, "rewards/margins": 5.440976142883301, "rewards/rejected": -15.911310195922852, "step": 328 }, { "epoch": 0.04479847494553377, "grad_norm": 49.88808470156949, "learning_rate": 3.580952380952381e-07, "logits/chosen": -5.277864456176758, "logits/rejected": -3.787621259689331, "logps/chosen": -0.8987516164779663, "logps/rejected": -1.3171722888946533, "loss": 4.8759, "rewards/accuracies": 1.0, "rewards/chosen": -8.987516403198242, "rewards/margins": 4.184206962585449, "rewards/rejected": -13.171723365783691, "step": 329 }, { "epoch": 0.04493464052287582, "grad_norm": 56.35995975770236, "learning_rate": 3.5918367346938776e-07, "logits/chosen": -4.28376579284668, "logits/rejected": -3.742021083831787, "logps/chosen": -1.1659579277038574, "logps/rejected": -1.6126478910446167, "loss": 4.1243, "rewards/accuracies": 0.75, "rewards/chosen": -11.659578323364258, "rewards/margins": 4.466899394989014, "rewards/rejected": -16.12647819519043, "step": 330 }, { "epoch": 0.04507080610021787, "grad_norm": 52.58990131690406, "learning_rate": 3.602721088435374e-07, "logits/chosen": -4.0313568115234375, "logits/rejected": -5.071317672729492, "logps/chosen": -1.0978147983551025, "logps/rejected": -0.9905543923377991, "loss": 4.6989, "rewards/accuracies": 0.25, "rewards/chosen": -10.9781494140625, "rewards/margins": -1.0726053714752197, "rewards/rejected": -9.90554428100586, "step": 331 }, { "epoch": 0.04520697167755991, "grad_norm": 51.608049607727914, "learning_rate": 3.6136054421768706e-07, "logits/chosen": -4.0459394454956055, "logits/rejected": -3.6338887214660645, "logps/chosen": -1.3316205739974976, "logps/rejected": -1.599829912185669, "loss": 5.2421, "rewards/accuracies": 0.75, "rewards/chosen": -13.316205978393555, "rewards/margins": 2.682093381881714, "rewards/rejected": -15.998298645019531, "step": 332 }, { "epoch": 0.04534313725490196, "grad_norm": 69.42275051159423, "learning_rate": 3.624489795918367e-07, "logits/chosen": -3.898859739303589, "logits/rejected": -3.7162675857543945, "logps/chosen": -1.6424038410186768, "logps/rejected": -1.857055425643921, "loss": 4.8124, "rewards/accuracies": 0.75, "rewards/chosen": -16.42403793334961, "rewards/margins": 2.1465158462524414, "rewards/rejected": -18.570552825927734, "step": 333 }, { "epoch": 0.04547930283224401, "grad_norm": 53.622499284965485, "learning_rate": 3.635374149659864e-07, "logits/chosen": -4.4776411056518555, "logits/rejected": -4.470993518829346, "logps/chosen": -1.687505841255188, "logps/rejected": -1.3818199634552002, "loss": 4.875, "rewards/accuracies": 0.25, "rewards/chosen": -16.875059127807617, "rewards/margins": -3.0568594932556152, "rewards/rejected": -13.81820011138916, "step": 334 }, { "epoch": 0.04561546840958606, "grad_norm": 59.11941552248355, "learning_rate": 3.64625850340136e-07, "logits/chosen": -5.771968364715576, "logits/rejected": -4.812268257141113, "logps/chosen": -0.8832271099090576, "logps/rejected": -1.2285106182098389, "loss": 4.221, "rewards/accuracies": 1.0, "rewards/chosen": -8.832271575927734, "rewards/margins": 3.452834129333496, "rewards/rejected": -12.28510570526123, "step": 335 }, { "epoch": 0.0457516339869281, "grad_norm": 88.1622514093425, "learning_rate": 3.6571428571428567e-07, "logits/chosen": -4.847055912017822, "logits/rejected": -4.330475330352783, "logps/chosen": -0.914954662322998, "logps/rejected": -1.245988130569458, "loss": 5.2333, "rewards/accuracies": 0.75, "rewards/chosen": -9.149547576904297, "rewards/margins": 3.310335159301758, "rewards/rejected": -12.459882736206055, "step": 336 }, { "epoch": 0.04588779956427015, "grad_norm": 52.001334009645625, "learning_rate": 3.668027210884354e-07, "logits/chosen": -6.903139114379883, "logits/rejected": -5.824688911437988, "logps/chosen": -0.5480578541755676, "logps/rejected": -0.6635133028030396, "loss": 4.3156, "rewards/accuracies": 0.75, "rewards/chosen": -5.480578422546387, "rewards/margins": 1.1545543670654297, "rewards/rejected": -6.635132789611816, "step": 337 }, { "epoch": 0.0460239651416122, "grad_norm": 80.05198967809932, "learning_rate": 3.6789115646258503e-07, "logits/chosen": -4.128746032714844, "logits/rejected": -4.49485969543457, "logps/chosen": -1.1353280544281006, "logps/rejected": -1.1029114723205566, "loss": 5.3823, "rewards/accuracies": 0.5, "rewards/chosen": -11.353281021118164, "rewards/margins": -0.32416510581970215, "rewards/rejected": -11.029115676879883, "step": 338 }, { "epoch": 0.04616013071895425, "grad_norm": 81.94806348695083, "learning_rate": 3.689795918367347e-07, "logits/chosen": -4.851099014282227, "logits/rejected": -4.12558650970459, "logps/chosen": -1.1570490598678589, "logps/rejected": -1.3034899234771729, "loss": 4.7139, "rewards/accuracies": 0.75, "rewards/chosen": -11.570490837097168, "rewards/margins": 1.464409351348877, "rewards/rejected": -13.034900665283203, "step": 339 }, { "epoch": 0.046296296296296294, "grad_norm": 103.62559251002375, "learning_rate": 3.7006802721088433e-07, "logits/chosen": -5.333596229553223, "logits/rejected": -4.665900230407715, "logps/chosen": -1.1381239891052246, "logps/rejected": -1.030791997909546, "loss": 5.3047, "rewards/accuracies": 0.5, "rewards/chosen": -11.381240844726562, "rewards/margins": -1.0733213424682617, "rewards/rejected": -10.3079195022583, "step": 340 }, { "epoch": 0.046432461873638343, "grad_norm": 52.47840897312059, "learning_rate": 3.71156462585034e-07, "logits/chosen": -3.387547016143799, "logits/rejected": -4.031194686889648, "logps/chosen": -1.4097591638565063, "logps/rejected": -1.5441861152648926, "loss": 4.533, "rewards/accuracies": 0.5, "rewards/chosen": -14.097591400146484, "rewards/margins": 1.3442699909210205, "rewards/rejected": -15.441862106323242, "step": 341 }, { "epoch": 0.04656862745098039, "grad_norm": 64.26360427992691, "learning_rate": 3.7224489795918364e-07, "logits/chosen": -4.687727928161621, "logits/rejected": -3.7767446041107178, "logps/chosen": -1.1348202228546143, "logps/rejected": -1.4387342929840088, "loss": 4.6277, "rewards/accuracies": 1.0, "rewards/chosen": -11.348201751708984, "rewards/margins": 3.0391411781311035, "rewards/rejected": -14.387343406677246, "step": 342 }, { "epoch": 0.04670479302832244, "grad_norm": 47.44529818345173, "learning_rate": 3.7333333333333334e-07, "logits/chosen": -3.479304313659668, "logits/rejected": -3.206601858139038, "logps/chosen": -1.4744844436645508, "logps/rejected": -1.1833782196044922, "loss": 4.8163, "rewards/accuracies": 0.25, "rewards/chosen": -14.744844436645508, "rewards/margins": -2.911062240600586, "rewards/rejected": -11.833782196044922, "step": 343 }, { "epoch": 0.046840958605664486, "grad_norm": 64.26531673086846, "learning_rate": 3.7442176870748294e-07, "logits/chosen": -3.7847156524658203, "logits/rejected": -4.481882095336914, "logps/chosen": -1.8097883462905884, "logps/rejected": -1.3306057453155518, "loss": 4.6497, "rewards/accuracies": 0.25, "rewards/chosen": -18.097885131835938, "rewards/margins": -4.791826248168945, "rewards/rejected": -13.306057929992676, "step": 344 }, { "epoch": 0.046977124183006536, "grad_norm": 72.58462310643739, "learning_rate": 3.7551020408163265e-07, "logits/chosen": -5.742620468139648, "logits/rejected": -3.8809614181518555, "logps/chosen": -0.7879772186279297, "logps/rejected": -1.2834994792938232, "loss": 4.6548, "rewards/accuracies": 1.0, "rewards/chosen": -7.879772186279297, "rewards/margins": 4.955223083496094, "rewards/rejected": -12.83499526977539, "step": 345 }, { "epoch": 0.047113289760348585, "grad_norm": 75.7942208749751, "learning_rate": 3.765986394557823e-07, "logits/chosen": -5.122191905975342, "logits/rejected": -3.5494017601013184, "logps/chosen": -1.184884786605835, "logps/rejected": -1.0418821573257446, "loss": 4.7957, "rewards/accuracies": 0.5, "rewards/chosen": -11.848846435546875, "rewards/margins": -1.430025339126587, "rewards/rejected": -10.418821334838867, "step": 346 }, { "epoch": 0.047249455337690635, "grad_norm": 84.8430919001219, "learning_rate": 3.7768707482993195e-07, "logits/chosen": -4.72030782699585, "logits/rejected": -4.733547210693359, "logps/chosen": -1.1555423736572266, "logps/rejected": -1.5096678733825684, "loss": 4.648, "rewards/accuracies": 0.5, "rewards/chosen": -11.55542278289795, "rewards/margins": 3.5412545204162598, "rewards/rejected": -15.096677780151367, "step": 347 }, { "epoch": 0.04738562091503268, "grad_norm": 61.499147763932804, "learning_rate": 3.787755102040816e-07, "logits/chosen": -4.381627082824707, "logits/rejected": -3.8670363426208496, "logps/chosen": -0.9360236525535583, "logps/rejected": -1.106676697731018, "loss": 4.1167, "rewards/accuracies": 0.75, "rewards/chosen": -9.360236167907715, "rewards/margins": 1.7065308094024658, "rewards/rejected": -11.066766738891602, "step": 348 }, { "epoch": 0.04752178649237473, "grad_norm": 70.53092724254896, "learning_rate": 3.7986394557823126e-07, "logits/chosen": -2.729912519454956, "logits/rejected": -4.400382995605469, "logps/chosen": -1.1213295459747314, "logps/rejected": -2.0715765953063965, "loss": 4.9453, "rewards/accuracies": 0.75, "rewards/chosen": -11.213296890258789, "rewards/margins": 9.502470016479492, "rewards/rejected": -20.71576690673828, "step": 349 }, { "epoch": 0.04765795206971678, "grad_norm": 60.41701033824524, "learning_rate": 3.809523809523809e-07, "logits/chosen": -6.25025749206543, "logits/rejected": -5.233442306518555, "logps/chosen": -1.1553184986114502, "logps/rejected": -1.1237400770187378, "loss": 5.2371, "rewards/accuracies": 0.5, "rewards/chosen": -11.553184509277344, "rewards/margins": -0.3157843351364136, "rewards/rejected": -11.23740005493164, "step": 350 }, { "epoch": 0.04779411764705882, "grad_norm": 39.688776218157535, "learning_rate": 3.820408163265306e-07, "logits/chosen": -3.9296278953552246, "logits/rejected": -3.9206624031066895, "logps/chosen": -1.246860146522522, "logps/rejected": -1.5343701839447021, "loss": 4.0967, "rewards/accuracies": 0.5, "rewards/chosen": -12.468602180480957, "rewards/margins": 2.8751001358032227, "rewards/rejected": -15.34370231628418, "step": 351 }, { "epoch": 0.04793028322440087, "grad_norm": 91.18671826620812, "learning_rate": 3.8312925170068026e-07, "logits/chosen": -3.2646350860595703, "logits/rejected": -2.839343309402466, "logps/chosen": -1.4661448001861572, "logps/rejected": -1.6291890144348145, "loss": 4.7386, "rewards/accuracies": 0.5, "rewards/chosen": -14.661447525024414, "rewards/margins": 1.6304430961608887, "rewards/rejected": -16.29189109802246, "step": 352 }, { "epoch": 0.04806644880174292, "grad_norm": 99.64857481042004, "learning_rate": 3.842176870748299e-07, "logits/chosen": -5.036593437194824, "logits/rejected": -4.73375129699707, "logps/chosen": -1.1730133295059204, "logps/rejected": -1.1294384002685547, "loss": 4.4026, "rewards/accuracies": 0.5, "rewards/chosen": -11.730133056640625, "rewards/margins": -0.4357491731643677, "rewards/rejected": -11.294384002685547, "step": 353 }, { "epoch": 0.04820261437908497, "grad_norm": 70.43635347849056, "learning_rate": 3.8530612244897957e-07, "logits/chosen": -5.037467956542969, "logits/rejected": -3.364009141921997, "logps/chosen": -1.0652697086334229, "logps/rejected": -1.746490240097046, "loss": 4.5528, "rewards/accuracies": 1.0, "rewards/chosen": -10.652698516845703, "rewards/margins": 6.812203407287598, "rewards/rejected": -17.464900970458984, "step": 354 }, { "epoch": 0.04833877995642701, "grad_norm": 59.65530580634134, "learning_rate": 3.863945578231292e-07, "logits/chosen": -4.721744537353516, "logits/rejected": -4.3982648849487305, "logps/chosen": -1.0750972032546997, "logps/rejected": -1.1355454921722412, "loss": 5.0939, "rewards/accuracies": 0.75, "rewards/chosen": -10.750971794128418, "rewards/margins": 0.6044834852218628, "rewards/rejected": -11.35545539855957, "step": 355 }, { "epoch": 0.04847494553376906, "grad_norm": 46.770668291367954, "learning_rate": 3.8748299319727893e-07, "logits/chosen": -5.162968635559082, "logits/rejected": -5.444433689117432, "logps/chosen": -0.8656356334686279, "logps/rejected": -0.8235973119735718, "loss": 4.9787, "rewards/accuracies": 0.5, "rewards/chosen": -8.656355857849121, "rewards/margins": -0.4203832149505615, "rewards/rejected": -8.235973358154297, "step": 356 }, { "epoch": 0.04861111111111111, "grad_norm": 66.49217632232175, "learning_rate": 3.8857142857142853e-07, "logits/chosen": -4.034242153167725, "logits/rejected": -3.3635754585266113, "logps/chosen": -1.3328701257705688, "logps/rejected": -1.2513222694396973, "loss": 4.941, "rewards/accuracies": 0.5, "rewards/chosen": -13.32870101928711, "rewards/margins": -0.8154784440994263, "rewards/rejected": -12.513222694396973, "step": 357 }, { "epoch": 0.04874727668845316, "grad_norm": 50.072997862043174, "learning_rate": 3.896598639455782e-07, "logits/chosen": -2.774782657623291, "logits/rejected": -2.750211000442505, "logps/chosen": -1.333456039428711, "logps/rejected": -2.016745090484619, "loss": 4.3693, "rewards/accuracies": 0.5, "rewards/chosen": -13.33456039428711, "rewards/margins": 6.832892417907715, "rewards/rejected": -20.167451858520508, "step": 358 }, { "epoch": 0.048883442265795204, "grad_norm": 48.626430561867856, "learning_rate": 3.907482993197279e-07, "logits/chosen": -4.561434745788574, "logits/rejected": -3.3318333625793457, "logps/chosen": -1.0798548460006714, "logps/rejected": -1.0834531784057617, "loss": 4.0742, "rewards/accuracies": 0.5, "rewards/chosen": -10.79854965209961, "rewards/margins": 0.03598320484161377, "rewards/rejected": -10.834531784057617, "step": 359 }, { "epoch": 0.049019607843137254, "grad_norm": 71.1432026917616, "learning_rate": 3.9183673469387754e-07, "logits/chosen": -3.937601089477539, "logits/rejected": -5.394858360290527, "logps/chosen": -1.0354994535446167, "logps/rejected": -0.7686932682991028, "loss": 4.7626, "rewards/accuracies": 0.25, "rewards/chosen": -10.354995727539062, "rewards/margins": -2.668062686920166, "rewards/rejected": -7.68693208694458, "step": 360 }, { "epoch": 0.049155773420479304, "grad_norm": 57.589488422016544, "learning_rate": 3.929251700680272e-07, "logits/chosen": -4.250298023223877, "logits/rejected": -3.810303211212158, "logps/chosen": -0.9952207207679749, "logps/rejected": -1.0889822244644165, "loss": 4.7041, "rewards/accuracies": 0.75, "rewards/chosen": -9.952207565307617, "rewards/margins": 0.937615156173706, "rewards/rejected": -10.889822006225586, "step": 361 }, { "epoch": 0.049291938997821354, "grad_norm": 63.333065680523944, "learning_rate": 3.9401360544217684e-07, "logits/chosen": -5.630011558532715, "logits/rejected": -5.683361530303955, "logps/chosen": -1.3218655586242676, "logps/rejected": -0.7940860986709595, "loss": 4.863, "rewards/accuracies": 0.0, "rewards/chosen": -13.218655586242676, "rewards/margins": -5.27779483795166, "rewards/rejected": -7.940860748291016, "step": 362 }, { "epoch": 0.0494281045751634, "grad_norm": 49.946540758737406, "learning_rate": 3.951020408163265e-07, "logits/chosen": -4.430416107177734, "logits/rejected": -4.594329357147217, "logps/chosen": -1.14668869972229, "logps/rejected": -1.1622904539108276, "loss": 4.715, "rewards/accuracies": 0.25, "rewards/chosen": -11.466887474060059, "rewards/margins": 0.15601706504821777, "rewards/rejected": -11.622903823852539, "step": 363 }, { "epoch": 0.049564270152505446, "grad_norm": 75.65677594959018, "learning_rate": 3.961904761904762e-07, "logits/chosen": -4.715834617614746, "logits/rejected": -4.765722274780273, "logps/chosen": -2.1950907707214355, "logps/rejected": -1.3353166580200195, "loss": 5.1503, "rewards/accuracies": 0.25, "rewards/chosen": -21.950904846191406, "rewards/margins": -8.597740173339844, "rewards/rejected": -13.353166580200195, "step": 364 }, { "epoch": 0.049700435729847496, "grad_norm": 59.49474022139297, "learning_rate": 3.9727891156462585e-07, "logits/chosen": -4.238061428070068, "logits/rejected": -3.64263916015625, "logps/chosen": -1.2017842531204224, "logps/rejected": -1.4413776397705078, "loss": 4.5683, "rewards/accuracies": 0.75, "rewards/chosen": -12.017842292785645, "rewards/margins": 2.3959343433380127, "rewards/rejected": -14.413776397705078, "step": 365 }, { "epoch": 0.049836601307189546, "grad_norm": 61.31130797899487, "learning_rate": 3.9836734693877545e-07, "logits/chosen": -4.462696075439453, "logits/rejected": -2.9877586364746094, "logps/chosen": -0.9624910354614258, "logps/rejected": -1.4527757167816162, "loss": 4.2574, "rewards/accuracies": 1.0, "rewards/chosen": -9.624910354614258, "rewards/margins": 4.902846336364746, "rewards/rejected": -14.52775764465332, "step": 366 }, { "epoch": 0.04997276688453159, "grad_norm": 35.89373304338028, "learning_rate": 3.9945578231292516e-07, "logits/chosen": -5.073393821716309, "logits/rejected": -4.809358596801758, "logps/chosen": -0.9978084564208984, "logps/rejected": -1.0534707307815552, "loss": 4.8495, "rewards/accuracies": 0.5, "rewards/chosen": -9.978084564208984, "rewards/margins": 0.556623101234436, "rewards/rejected": -10.534708023071289, "step": 367 }, { "epoch": 0.05010893246187364, "grad_norm": 70.47804626430614, "learning_rate": 4.0054421768707486e-07, "logits/chosen": -4.609557151794434, "logits/rejected": -3.538032293319702, "logps/chosen": -1.2886557579040527, "logps/rejected": -1.4615461826324463, "loss": 4.8944, "rewards/accuracies": 0.5, "rewards/chosen": -12.886558532714844, "rewards/margins": 1.7289037704467773, "rewards/rejected": -14.615461349487305, "step": 368 }, { "epoch": 0.05024509803921569, "grad_norm": 62.440594736931246, "learning_rate": 4.016326530612245e-07, "logits/chosen": -5.322005271911621, "logits/rejected": -4.109156608581543, "logps/chosen": -1.0093988180160522, "logps/rejected": -1.3403376340866089, "loss": 5.0942, "rewards/accuracies": 0.75, "rewards/chosen": -10.093987464904785, "rewards/margins": 3.309389114379883, "rewards/rejected": -13.403376579284668, "step": 369 }, { "epoch": 0.05038126361655773, "grad_norm": 48.339595778900176, "learning_rate": 4.027210884353741e-07, "logits/chosen": -4.733573913574219, "logits/rejected": -4.270490646362305, "logps/chosen": -1.0108896493911743, "logps/rejected": -1.106526255607605, "loss": 4.6791, "rewards/accuracies": 0.25, "rewards/chosen": -10.108896255493164, "rewards/margins": 0.9563665390014648, "rewards/rejected": -11.065262794494629, "step": 370 }, { "epoch": 0.05051742919389978, "grad_norm": 64.31593728825284, "learning_rate": 4.0380952380952377e-07, "logits/chosen": -4.218809127807617, "logits/rejected": -3.940826177597046, "logps/chosen": -0.7416388988494873, "logps/rejected": -0.8213395476341248, "loss": 5.1248, "rewards/accuracies": 0.75, "rewards/chosen": -7.416388511657715, "rewards/margins": 0.7970068454742432, "rewards/rejected": -8.213395118713379, "step": 371 }, { "epoch": 0.05065359477124183, "grad_norm": 50.62888602434962, "learning_rate": 4.048979591836734e-07, "logits/chosen": -4.656181335449219, "logits/rejected": -3.1050233840942383, "logps/chosen": -1.013460636138916, "logps/rejected": -1.09527587890625, "loss": 4.3055, "rewards/accuracies": 0.25, "rewards/chosen": -10.13460636138916, "rewards/margins": 0.8181521892547607, "rewards/rejected": -10.9527587890625, "step": 372 }, { "epoch": 0.05078976034858388, "grad_norm": 50.01530705316231, "learning_rate": 4.059863945578232e-07, "logits/chosen": -4.433231830596924, "logits/rejected": -5.109742164611816, "logps/chosen": -1.093780517578125, "logps/rejected": -0.8204267024993896, "loss": 3.7494, "rewards/accuracies": 0.25, "rewards/chosen": -10.93780517578125, "rewards/margins": -2.7335383892059326, "rewards/rejected": -8.204266548156738, "step": 373 }, { "epoch": 0.05092592592592592, "grad_norm": 49.319680041038744, "learning_rate": 4.070748299319728e-07, "logits/chosen": -5.030124664306641, "logits/rejected": -4.256789207458496, "logps/chosen": -1.061401128768921, "logps/rejected": -1.315832257270813, "loss": 4.6776, "rewards/accuracies": 0.75, "rewards/chosen": -10.614011764526367, "rewards/margins": 2.544311285018921, "rewards/rejected": -13.158323287963867, "step": 374 }, { "epoch": 0.05106209150326797, "grad_norm": 43.68441813473087, "learning_rate": 4.0816326530612243e-07, "logits/chosen": -5.896186828613281, "logits/rejected": -4.19079065322876, "logps/chosen": -0.7971887588500977, "logps/rejected": -0.9658806324005127, "loss": 4.7059, "rewards/accuracies": 0.75, "rewards/chosen": -7.971887588500977, "rewards/margins": 1.6869187355041504, "rewards/rejected": -9.658805847167969, "step": 375 }, { "epoch": 0.05119825708061002, "grad_norm": 52.760022760601025, "learning_rate": 4.092517006802721e-07, "logits/chosen": -4.487117767333984, "logits/rejected": -6.3478593826293945, "logps/chosen": -1.1551398038864136, "logps/rejected": -1.0892930030822754, "loss": 4.5358, "rewards/accuracies": 0.25, "rewards/chosen": -11.551397323608398, "rewards/margins": -0.6584687232971191, "rewards/rejected": -10.892929077148438, "step": 376 }, { "epoch": 0.05133442265795207, "grad_norm": 59.55413225205095, "learning_rate": 4.1034013605442173e-07, "logits/chosen": -3.759427547454834, "logits/rejected": -5.074109077453613, "logps/chosen": -1.3292748928070068, "logps/rejected": -1.1516237258911133, "loss": 4.7997, "rewards/accuracies": 0.5, "rewards/chosen": -13.292749404907227, "rewards/margins": -1.7765127420425415, "rewards/rejected": -11.516237258911133, "step": 377 }, { "epoch": 0.051470588235294115, "grad_norm": 50.03941503449033, "learning_rate": 4.114285714285714e-07, "logits/chosen": -4.392102241516113, "logits/rejected": -4.103697299957275, "logps/chosen": -1.0755290985107422, "logps/rejected": -1.002875804901123, "loss": 4.898, "rewards/accuracies": 0.25, "rewards/chosen": -10.755290985107422, "rewards/margins": -0.7265328168869019, "rewards/rejected": -10.028759002685547, "step": 378 }, { "epoch": 0.051606753812636165, "grad_norm": 58.57907231887316, "learning_rate": 4.125170068027211e-07, "logits/chosen": -5.784555435180664, "logits/rejected": -4.2676873207092285, "logps/chosen": -1.0702967643737793, "logps/rejected": -1.447671890258789, "loss": 5.0389, "rewards/accuracies": 1.0, "rewards/chosen": -10.702967643737793, "rewards/margins": 3.7737512588500977, "rewards/rejected": -14.47671890258789, "step": 379 }, { "epoch": 0.051742919389978215, "grad_norm": 56.84761649791471, "learning_rate": 4.1360544217687074e-07, "logits/chosen": -4.891299247741699, "logits/rejected": -4.225244998931885, "logps/chosen": -0.942592203617096, "logps/rejected": -1.2216744422912598, "loss": 4.9448, "rewards/accuracies": 0.75, "rewards/chosen": -9.425922393798828, "rewards/margins": 2.790821075439453, "rewards/rejected": -12.216743469238281, "step": 380 }, { "epoch": 0.051879084967320264, "grad_norm": 39.86424248709267, "learning_rate": 4.146938775510204e-07, "logits/chosen": -3.847580909729004, "logits/rejected": -4.6515302658081055, "logps/chosen": -1.4245232343673706, "logps/rejected": -1.3031435012817383, "loss": 4.1825, "rewards/accuracies": 0.25, "rewards/chosen": -14.245231628417969, "rewards/margins": -1.2137963771820068, "rewards/rejected": -13.031435012817383, "step": 381 }, { "epoch": 0.05201525054466231, "grad_norm": 65.88429283558246, "learning_rate": 4.1578231292517005e-07, "logits/chosen": -4.4065656661987305, "logits/rejected": -3.639303207397461, "logps/chosen": -1.1241507530212402, "logps/rejected": -1.0169777870178223, "loss": 4.8693, "rewards/accuracies": 0.25, "rewards/chosen": -11.241507530212402, "rewards/margins": -1.0717291831970215, "rewards/rejected": -10.169777870178223, "step": 382 }, { "epoch": 0.05215141612200436, "grad_norm": 63.549812385286906, "learning_rate": 4.168707482993197e-07, "logits/chosen": -4.33826208114624, "logits/rejected": -3.186058282852173, "logps/chosen": -1.1618280410766602, "logps/rejected": -1.484189510345459, "loss": 4.7735, "rewards/accuracies": 0.75, "rewards/chosen": -11.618281364440918, "rewards/margins": 3.223614454269409, "rewards/rejected": -14.84189510345459, "step": 383 }, { "epoch": 0.05228758169934641, "grad_norm": 43.18810535380108, "learning_rate": 4.179591836734694e-07, "logits/chosen": -4.7782697677612305, "logits/rejected": -4.969653129577637, "logps/chosen": -0.9616748094558716, "logps/rejected": -0.7381795644760132, "loss": 4.7153, "rewards/accuracies": 0.25, "rewards/chosen": -9.616748809814453, "rewards/margins": -2.234952926635742, "rewards/rejected": -7.381795406341553, "step": 384 }, { "epoch": 0.05242374727668846, "grad_norm": 47.09229290299882, "learning_rate": 4.1904761904761906e-07, "logits/chosen": -2.751847267150879, "logits/rejected": -3.1571831703186035, "logps/chosen": -1.4500106573104858, "logps/rejected": -1.4934954643249512, "loss": 4.7225, "rewards/accuracies": 0.75, "rewards/chosen": -14.500106811523438, "rewards/margins": 0.4348485469818115, "rewards/rejected": -14.934954643249512, "step": 385 }, { "epoch": 0.0525599128540305, "grad_norm": 86.12620138624588, "learning_rate": 4.201360544217687e-07, "logits/chosen": -3.4587931632995605, "logits/rejected": -3.0349366664886475, "logps/chosen": -1.2382431030273438, "logps/rejected": -1.494152307510376, "loss": 4.4886, "rewards/accuracies": 1.0, "rewards/chosen": -12.382431983947754, "rewards/margins": 2.5590903759002686, "rewards/rejected": -14.941522598266602, "step": 386 }, { "epoch": 0.05269607843137255, "grad_norm": 38.8893282253715, "learning_rate": 4.2122448979591836e-07, "logits/chosen": -3.719339609146118, "logits/rejected": -3.216487407684326, "logps/chosen": -1.2164249420166016, "logps/rejected": -1.5033557415008545, "loss": 4.5824, "rewards/accuracies": 0.75, "rewards/chosen": -12.1642484664917, "rewards/margins": 2.869309186935425, "rewards/rejected": -15.033557891845703, "step": 387 }, { "epoch": 0.0528322440087146, "grad_norm": 48.35433424876795, "learning_rate": 4.2231292517006796e-07, "logits/chosen": -3.389918565750122, "logits/rejected": -2.6068432331085205, "logps/chosen": -1.2881076335906982, "logps/rejected": -1.9020192623138428, "loss": 4.3053, "rewards/accuracies": 1.0, "rewards/chosen": -12.881075859069824, "rewards/margins": 6.1391167640686035, "rewards/rejected": -19.020193099975586, "step": 388 }, { "epoch": 0.05296840958605664, "grad_norm": 50.91057344692353, "learning_rate": 4.234013605442177e-07, "logits/chosen": -4.003451347351074, "logits/rejected": -3.210225820541382, "logps/chosen": -1.6003930568695068, "logps/rejected": -1.6900222301483154, "loss": 4.66, "rewards/accuracies": 0.5, "rewards/chosen": -16.003931045532227, "rewards/margins": 0.896291971206665, "rewards/rejected": -16.900222778320312, "step": 389 }, { "epoch": 0.05310457516339869, "grad_norm": 44.746153392733916, "learning_rate": 4.2448979591836737e-07, "logits/chosen": -4.248198509216309, "logits/rejected": -4.6256232261657715, "logps/chosen": -0.9387425184249878, "logps/rejected": -1.111342191696167, "loss": 4.2304, "rewards/accuracies": 0.25, "rewards/chosen": -9.387425422668457, "rewards/margins": 1.725996494293213, "rewards/rejected": -11.113421440124512, "step": 390 }, { "epoch": 0.05324074074074074, "grad_norm": 56.45366498519587, "learning_rate": 4.25578231292517e-07, "logits/chosen": -3.3149781227111816, "logits/rejected": -4.226804733276367, "logps/chosen": -1.2171587944030762, "logps/rejected": -1.3357594013214111, "loss": 4.8798, "rewards/accuracies": 0.5, "rewards/chosen": -12.171586990356445, "rewards/margins": 1.1860058307647705, "rewards/rejected": -13.357593536376953, "step": 391 }, { "epoch": 0.05337690631808279, "grad_norm": 55.36836792578739, "learning_rate": 4.266666666666666e-07, "logits/chosen": -4.612475395202637, "logits/rejected": -4.072447299957275, "logps/chosen": -1.1316940784454346, "logps/rejected": -1.74626886844635, "loss": 4.3281, "rewards/accuracies": 1.0, "rewards/chosen": -11.316941261291504, "rewards/margins": 6.145747661590576, "rewards/rejected": -17.462688446044922, "step": 392 }, { "epoch": 0.053513071895424834, "grad_norm": 65.47139386378133, "learning_rate": 4.277551020408163e-07, "logits/chosen": -4.157782554626465, "logits/rejected": -3.5690855979919434, "logps/chosen": -1.3194139003753662, "logps/rejected": -1.1529537439346313, "loss": 4.8992, "rewards/accuracies": 0.5, "rewards/chosen": -13.194137573242188, "rewards/margins": -1.6646009683609009, "rewards/rejected": -11.529537200927734, "step": 393 }, { "epoch": 0.053649237472766884, "grad_norm": 57.63009003555696, "learning_rate": 4.2884353741496593e-07, "logits/chosen": -2.6153717041015625, "logits/rejected": -3.171003818511963, "logps/chosen": -1.7021191120147705, "logps/rejected": -1.4870145320892334, "loss": 5.1292, "rewards/accuracies": 0.25, "rewards/chosen": -17.021190643310547, "rewards/margins": -2.151045799255371, "rewards/rejected": -14.870145797729492, "step": 394 }, { "epoch": 0.05378540305010893, "grad_norm": 53.772765354701356, "learning_rate": 4.299319727891157e-07, "logits/chosen": -5.377692699432373, "logits/rejected": -4.610713958740234, "logps/chosen": -1.076084017753601, "logps/rejected": -1.31722891330719, "loss": 4.8871, "rewards/accuracies": 0.75, "rewards/chosen": -10.760839462280273, "rewards/margins": 2.4114491939544678, "rewards/rejected": -13.17228889465332, "step": 395 }, { "epoch": 0.05392156862745098, "grad_norm": 60.45137974302771, "learning_rate": 4.310204081632653e-07, "logits/chosen": -3.5165205001831055, "logits/rejected": -2.8365631103515625, "logps/chosen": -1.6021865606307983, "logps/rejected": -1.4897844791412354, "loss": 5.1689, "rewards/accuracies": 0.75, "rewards/chosen": -16.021865844726562, "rewards/margins": -1.1240203380584717, "rewards/rejected": -14.897844314575195, "step": 396 }, { "epoch": 0.054057734204793026, "grad_norm": 61.94147790998974, "learning_rate": 4.3210884353741494e-07, "logits/chosen": -4.007455348968506, "logits/rejected": -3.336472511291504, "logps/chosen": -1.0130470991134644, "logps/rejected": -1.2166389226913452, "loss": 5.0902, "rewards/accuracies": 0.75, "rewards/chosen": -10.130471229553223, "rewards/margins": 2.0359179973602295, "rewards/rejected": -12.166389465332031, "step": 397 }, { "epoch": 0.054193899782135076, "grad_norm": 42.11986098676664, "learning_rate": 4.331972789115646e-07, "logits/chosen": -4.578699111938477, "logits/rejected": -4.8498311042785645, "logps/chosen": -0.8790339231491089, "logps/rejected": -0.9052631855010986, "loss": 4.3585, "rewards/accuracies": 0.5, "rewards/chosen": -8.790338516235352, "rewards/margins": 0.2622934579849243, "rewards/rejected": -9.052632331848145, "step": 398 }, { "epoch": 0.054330065359477125, "grad_norm": 49.2787181808803, "learning_rate": 4.3428571428571424e-07, "logits/chosen": -3.4915337562561035, "logits/rejected": -3.350977897644043, "logps/chosen": -1.5776159763336182, "logps/rejected": -1.8437435626983643, "loss": 4.6383, "rewards/accuracies": 0.5, "rewards/chosen": -15.77616024017334, "rewards/margins": 2.6612753868103027, "rewards/rejected": -18.437435150146484, "step": 399 }, { "epoch": 0.054466230936819175, "grad_norm": 52.92816465153872, "learning_rate": 4.3537414965986395e-07, "logits/chosen": -3.761528968811035, "logits/rejected": -4.591896057128906, "logps/chosen": -0.9632505178451538, "logps/rejected": -1.0243831872940063, "loss": 5.3516, "rewards/accuracies": 0.5, "rewards/chosen": -9.632505416870117, "rewards/margins": 0.6113263368606567, "rewards/rejected": -10.243831634521484, "step": 400 }, { "epoch": 0.05460239651416122, "grad_norm": 45.68669367097855, "learning_rate": 4.364625850340136e-07, "logits/chosen": -4.95993709564209, "logits/rejected": -3.418145179748535, "logps/chosen": -0.9764912128448486, "logps/rejected": -1.1860324144363403, "loss": 4.5043, "rewards/accuracies": 0.75, "rewards/chosen": -9.764911651611328, "rewards/margins": 2.095412492752075, "rewards/rejected": -11.860323905944824, "step": 401 }, { "epoch": 0.05473856209150327, "grad_norm": 61.88048242463245, "learning_rate": 4.3755102040816325e-07, "logits/chosen": -3.989450216293335, "logits/rejected": -2.737484931945801, "logps/chosen": -1.0540305376052856, "logps/rejected": -1.4780402183532715, "loss": 4.3024, "rewards/accuracies": 1.0, "rewards/chosen": -10.540304183959961, "rewards/margins": 4.2400970458984375, "rewards/rejected": -14.780402183532715, "step": 402 }, { "epoch": 0.05487472766884532, "grad_norm": 44.8862226916202, "learning_rate": 4.386394557823129e-07, "logits/chosen": -3.970062732696533, "logits/rejected": -4.168905258178711, "logps/chosen": -0.8776034712791443, "logps/rejected": -0.9131577610969543, "loss": 3.8417, "rewards/accuracies": 0.25, "rewards/chosen": -8.776034355163574, "rewards/margins": 0.3555431365966797, "rewards/rejected": -9.13157844543457, "step": 403 }, { "epoch": 0.05501089324618736, "grad_norm": 50.74687637635371, "learning_rate": 4.3972789115646256e-07, "logits/chosen": -4.007328510284424, "logits/rejected": -4.83897066116333, "logps/chosen": -0.7331565618515015, "logps/rejected": -0.6233339309692383, "loss": 4.7121, "rewards/accuracies": 0.25, "rewards/chosen": -7.331565856933594, "rewards/margins": -1.09822678565979, "rewards/rejected": -6.233338832855225, "step": 404 }, { "epoch": 0.05514705882352941, "grad_norm": 46.81590078438447, "learning_rate": 4.4081632653061216e-07, "logits/chosen": -3.5568270683288574, "logits/rejected": -2.8460776805877686, "logps/chosen": -0.8345403671264648, "logps/rejected": -1.0704164505004883, "loss": 5.2342, "rewards/accuracies": 0.75, "rewards/chosen": -8.345403671264648, "rewards/margins": 2.358760118484497, "rewards/rejected": -10.704164505004883, "step": 405 }, { "epoch": 0.05528322440087146, "grad_norm": 58.56844435701675, "learning_rate": 4.419047619047619e-07, "logits/chosen": -4.017667293548584, "logits/rejected": -1.9660165309906006, "logps/chosen": -0.7921370267868042, "logps/rejected": -2.1258630752563477, "loss": 4.9128, "rewards/accuracies": 1.0, "rewards/chosen": -7.921370506286621, "rewards/margins": 13.337261199951172, "rewards/rejected": -21.258630752563477, "step": 406 }, { "epoch": 0.05541938997821351, "grad_norm": 55.490557231435055, "learning_rate": 4.4299319727891157e-07, "logits/chosen": -3.3159987926483154, "logits/rejected": -3.43422269821167, "logps/chosen": -0.9569330215454102, "logps/rejected": -1.2971683740615845, "loss": 5.2842, "rewards/accuracies": 0.5, "rewards/chosen": -9.569331169128418, "rewards/margins": 3.402353525161743, "rewards/rejected": -12.971684455871582, "step": 407 }, { "epoch": 0.05555555555555555, "grad_norm": 42.9670271230426, "learning_rate": 4.440816326530612e-07, "logits/chosen": -3.3089730739593506, "logits/rejected": -3.762237071990967, "logps/chosen": -0.8744107484817505, "logps/rejected": -1.3445937633514404, "loss": 4.8553, "rewards/accuracies": 0.5, "rewards/chosen": -8.744108200073242, "rewards/margins": 4.70182991027832, "rewards/rejected": -13.445937156677246, "step": 408 }, { "epoch": 0.0556917211328976, "grad_norm": 60.44621646457349, "learning_rate": 4.451700680272108e-07, "logits/chosen": -3.933250904083252, "logits/rejected": -3.399423837661743, "logps/chosen": -1.0699964761734009, "logps/rejected": -1.0447535514831543, "loss": 4.8824, "rewards/accuracies": 0.25, "rewards/chosen": -10.69996452331543, "rewards/margins": -0.2524292469024658, "rewards/rejected": -10.447535514831543, "step": 409 }, { "epoch": 0.05582788671023965, "grad_norm": 69.99590233280637, "learning_rate": 4.4625850340136047e-07, "logits/chosen": -4.385718822479248, "logits/rejected": -4.861325740814209, "logps/chosen": -1.31895112991333, "logps/rejected": -1.0323377847671509, "loss": 4.8869, "rewards/accuracies": 0.0, "rewards/chosen": -13.189512252807617, "rewards/margins": -2.866133689880371, "rewards/rejected": -10.323378562927246, "step": 410 }, { "epoch": 0.0559640522875817, "grad_norm": 48.5989541910103, "learning_rate": 4.4734693877551023e-07, "logits/chosen": -2.8287084102630615, "logits/rejected": -2.548316717147827, "logps/chosen": -1.1642886400222778, "logps/rejected": -1.5006481409072876, "loss": 4.738, "rewards/accuracies": 0.75, "rewards/chosen": -11.642887115478516, "rewards/margins": 3.3635945320129395, "rewards/rejected": -15.006481170654297, "step": 411 }, { "epoch": 0.056100217864923745, "grad_norm": 63.727842686277626, "learning_rate": 4.484353741496599e-07, "logits/chosen": -4.365447998046875, "logits/rejected": -3.498565673828125, "logps/chosen": -0.9711335897445679, "logps/rejected": -0.8645371198654175, "loss": 4.6315, "rewards/accuracies": 0.25, "rewards/chosen": -9.711336135864258, "rewards/margins": -1.0659648180007935, "rewards/rejected": -8.645370483398438, "step": 412 }, { "epoch": 0.056236383442265794, "grad_norm": 44.7639665818356, "learning_rate": 4.495238095238095e-07, "logits/chosen": -3.38047456741333, "logits/rejected": -3.778137683868408, "logps/chosen": -1.2060441970825195, "logps/rejected": -1.2078670263290405, "loss": 4.4809, "rewards/accuracies": 0.5, "rewards/chosen": -12.060441017150879, "rewards/margins": 0.018228888511657715, "rewards/rejected": -12.078670501708984, "step": 413 }, { "epoch": 0.056372549019607844, "grad_norm": 50.34038629148576, "learning_rate": 4.5061224489795913e-07, "logits/chosen": -3.9107115268707275, "logits/rejected": -3.606355667114258, "logps/chosen": -1.11201012134552, "logps/rejected": -1.0204873085021973, "loss": 4.327, "rewards/accuracies": 0.25, "rewards/chosen": -11.120100975036621, "rewards/margins": -0.9152282476425171, "rewards/rejected": -10.204872131347656, "step": 414 }, { "epoch": 0.056508714596949894, "grad_norm": 59.891614677366846, "learning_rate": 4.517006802721088e-07, "logits/chosen": -3.925548553466797, "logits/rejected": -3.6692562103271484, "logps/chosen": -1.0378334522247314, "logps/rejected": -1.129392385482788, "loss": 4.3964, "rewards/accuracies": 0.5, "rewards/chosen": -10.378334045410156, "rewards/margins": 0.9155896902084351, "rewards/rejected": -11.293924331665039, "step": 415 }, { "epoch": 0.05664488017429194, "grad_norm": 53.506946702027584, "learning_rate": 4.5278911564625854e-07, "logits/chosen": -4.348017692565918, "logits/rejected": -4.019487380981445, "logps/chosen": -1.0946455001831055, "logps/rejected": -0.9418354034423828, "loss": 4.4053, "rewards/accuracies": 0.5, "rewards/chosen": -10.946455001831055, "rewards/margins": -1.5281000137329102, "rewards/rejected": -9.418354034423828, "step": 416 }, { "epoch": 0.056781045751633986, "grad_norm": 45.855944218796346, "learning_rate": 4.5387755102040814e-07, "logits/chosen": -2.7912869453430176, "logits/rejected": -3.9245877265930176, "logps/chosen": -1.4037315845489502, "logps/rejected": -1.0789549350738525, "loss": 4.8322, "rewards/accuracies": 0.0, "rewards/chosen": -14.037315368652344, "rewards/margins": -3.2477667331695557, "rewards/rejected": -10.789548873901367, "step": 417 }, { "epoch": 0.056917211328976036, "grad_norm": 53.15335621757715, "learning_rate": 4.549659863945578e-07, "logits/chosen": -3.7729601860046387, "logits/rejected": -3.731234312057495, "logps/chosen": -1.2479910850524902, "logps/rejected": -1.5364620685577393, "loss": 4.685, "rewards/accuracies": 0.75, "rewards/chosen": -12.479910850524902, "rewards/margins": 2.884709596633911, "rewards/rejected": -15.364620208740234, "step": 418 }, { "epoch": 0.057053376906318086, "grad_norm": 44.23034612914322, "learning_rate": 4.5605442176870745e-07, "logits/chosen": -2.6709446907043457, "logits/rejected": -3.309422492980957, "logps/chosen": -1.3592281341552734, "logps/rejected": -1.4277238845825195, "loss": 4.0537, "rewards/accuracies": 0.5, "rewards/chosen": -13.592281341552734, "rewards/margins": 0.6849575042724609, "rewards/rejected": -14.277238845825195, "step": 419 }, { "epoch": 0.05718954248366013, "grad_norm": 46.14710204673767, "learning_rate": 4.571428571428571e-07, "logits/chosen": -3.9485347270965576, "logits/rejected": -4.034809112548828, "logps/chosen": -1.2876791954040527, "logps/rejected": -0.9872628450393677, "loss": 5.019, "rewards/accuracies": 0.25, "rewards/chosen": -12.876791000366211, "rewards/margins": -3.0041627883911133, "rewards/rejected": -9.872628211975098, "step": 420 }, { "epoch": 0.05732570806100218, "grad_norm": 87.55713279833216, "learning_rate": 4.5823129251700675e-07, "logits/chosen": -4.348410606384277, "logits/rejected": -4.340848922729492, "logps/chosen": -1.6238884925842285, "logps/rejected": -1.380614995956421, "loss": 5.5685, "rewards/accuracies": 0.5, "rewards/chosen": -16.23888397216797, "rewards/margins": -2.432734966278076, "rewards/rejected": -13.80614948272705, "step": 421 }, { "epoch": 0.05746187363834423, "grad_norm": 50.677974000199214, "learning_rate": 4.5931972789115646e-07, "logits/chosen": -4.455499649047852, "logits/rejected": -5.983485221862793, "logps/chosen": -1.112761378288269, "logps/rejected": -0.8814716339111328, "loss": 4.6696, "rewards/accuracies": 0.25, "rewards/chosen": -11.127613067626953, "rewards/margins": -2.3128976821899414, "rewards/rejected": -8.814716339111328, "step": 422 }, { "epoch": 0.05759803921568627, "grad_norm": 57.866129051929605, "learning_rate": 4.604081632653061e-07, "logits/chosen": -2.8621087074279785, "logits/rejected": -3.2761898040771484, "logps/chosen": -1.648512601852417, "logps/rejected": -1.3260743618011475, "loss": 4.4992, "rewards/accuracies": 0.25, "rewards/chosen": -16.485126495361328, "rewards/margins": -3.2243828773498535, "rewards/rejected": -13.260743141174316, "step": 423 }, { "epoch": 0.05773420479302832, "grad_norm": 73.70093797528712, "learning_rate": 4.6149659863945576e-07, "logits/chosen": -3.3768510818481445, "logits/rejected": -3.6393320560455322, "logps/chosen": -1.107578992843628, "logps/rejected": -1.1483232975006104, "loss": 4.5815, "rewards/accuracies": 0.5, "rewards/chosen": -11.075788497924805, "rewards/margins": 0.4074440002441406, "rewards/rejected": -11.483232498168945, "step": 424 }, { "epoch": 0.05787037037037037, "grad_norm": 60.92409832826714, "learning_rate": 4.625850340136054e-07, "logits/chosen": -4.6054840087890625, "logits/rejected": -2.7485876083374023, "logps/chosen": -0.8664693236351013, "logps/rejected": -1.6634843349456787, "loss": 4.8066, "rewards/accuracies": 1.0, "rewards/chosen": -8.664692878723145, "rewards/margins": 7.970150947570801, "rewards/rejected": -16.634843826293945, "step": 425 }, { "epoch": 0.05800653594771242, "grad_norm": 46.6272340091888, "learning_rate": 4.6367346938775507e-07, "logits/chosen": -3.728294849395752, "logits/rejected": -3.700622081756592, "logps/chosen": -1.316657543182373, "logps/rejected": -1.0281788110733032, "loss": 4.358, "rewards/accuracies": 0.5, "rewards/chosen": -13.16657543182373, "rewards/margins": -2.884787082672119, "rewards/rejected": -10.281787872314453, "step": 426 }, { "epoch": 0.05814270152505446, "grad_norm": 47.414764438906474, "learning_rate": 4.6476190476190477e-07, "logits/chosen": -3.9443864822387695, "logits/rejected": -3.8702392578125, "logps/chosen": -0.7735919952392578, "logps/rejected": -0.8687381744384766, "loss": 4.4073, "rewards/accuracies": 0.25, "rewards/chosen": -7.735919952392578, "rewards/margins": 0.9514614343643188, "rewards/rejected": -8.687381744384766, "step": 427 }, { "epoch": 0.05827886710239651, "grad_norm": 50.53715178388315, "learning_rate": 4.658503401360544e-07, "logits/chosen": -3.7760205268859863, "logits/rejected": -2.8079514503479004, "logps/chosen": -0.9912483096122742, "logps/rejected": -1.0823636054992676, "loss": 5.4885, "rewards/accuracies": 0.5, "rewards/chosen": -9.912483215332031, "rewards/margins": 0.9111528396606445, "rewards/rejected": -10.823636054992676, "step": 428 }, { "epoch": 0.05841503267973856, "grad_norm": 74.98241239798743, "learning_rate": 4.669387755102041e-07, "logits/chosen": -2.4879651069641113, "logits/rejected": -2.4984042644500732, "logps/chosen": -1.1001005172729492, "logps/rejected": -1.2046961784362793, "loss": 4.3753, "rewards/accuracies": 0.75, "rewards/chosen": -11.001005172729492, "rewards/margins": 1.045956015586853, "rewards/rejected": -12.046960830688477, "step": 429 }, { "epoch": 0.05855119825708061, "grad_norm": 53.790096389606994, "learning_rate": 4.6802721088435373e-07, "logits/chosen": -3.5369386672973633, "logits/rejected": -3.8102762699127197, "logps/chosen": -0.9720373749732971, "logps/rejected": -0.9755396842956543, "loss": 5.2722, "rewards/accuracies": 0.75, "rewards/chosen": -9.72037410736084, "rewards/margins": 0.03502213954925537, "rewards/rejected": -9.755395889282227, "step": 430 }, { "epoch": 0.058687363834422655, "grad_norm": 67.00063378767558, "learning_rate": 4.6911564625850333e-07, "logits/chosen": -4.447195053100586, "logits/rejected": -4.0393877029418945, "logps/chosen": -0.817156970500946, "logps/rejected": -0.9682495594024658, "loss": 5.7689, "rewards/accuracies": 0.75, "rewards/chosen": -8.17156982421875, "rewards/margins": 1.510925531387329, "rewards/rejected": -9.6824951171875, "step": 431 }, { "epoch": 0.058823529411764705, "grad_norm": 58.691391362644204, "learning_rate": 4.702040816326531e-07, "logits/chosen": -3.0873851776123047, "logits/rejected": -3.8270740509033203, "logps/chosen": -1.0501163005828857, "logps/rejected": -1.7744768857955933, "loss": 5.6094, "rewards/accuracies": 0.5, "rewards/chosen": -10.501163482666016, "rewards/margins": 7.243605613708496, "rewards/rejected": -17.744770050048828, "step": 432 }, { "epoch": 0.058959694989106755, "grad_norm": 53.43824286738626, "learning_rate": 4.7129251700680274e-07, "logits/chosen": -4.134598731994629, "logits/rejected": -2.519047260284424, "logps/chosen": -0.9638064503669739, "logps/rejected": -2.3212029933929443, "loss": 5.1956, "rewards/accuracies": 0.75, "rewards/chosen": -9.638065338134766, "rewards/margins": 13.573965072631836, "rewards/rejected": -23.2120304107666, "step": 433 }, { "epoch": 0.059095860566448805, "grad_norm": 41.68834330275679, "learning_rate": 4.723809523809524e-07, "logits/chosen": -3.853814125061035, "logits/rejected": -3.050342082977295, "logps/chosen": -1.1811935901641846, "logps/rejected": -1.1955151557922363, "loss": 4.1439, "rewards/accuracies": 0.5, "rewards/chosen": -11.811936378479004, "rewards/margins": 0.14321565628051758, "rewards/rejected": -11.95515251159668, "step": 434 }, { "epoch": 0.05923202614379085, "grad_norm": 50.743349072712306, "learning_rate": 4.73469387755102e-07, "logits/chosen": -3.979781150817871, "logits/rejected": -2.5384795665740967, "logps/chosen": -1.0466687679290771, "logps/rejected": -1.5874942541122437, "loss": 4.3178, "rewards/accuracies": 0.75, "rewards/chosen": -10.46668815612793, "rewards/margins": 5.408254623413086, "rewards/rejected": -15.874943733215332, "step": 435 }, { "epoch": 0.0593681917211329, "grad_norm": 58.653100377337914, "learning_rate": 4.7455782312925164e-07, "logits/chosen": -3.9931976795196533, "logits/rejected": -3.668680191040039, "logps/chosen": -1.0595136880874634, "logps/rejected": -0.9401049613952637, "loss": 5.1119, "rewards/accuracies": 0.25, "rewards/chosen": -10.595136642456055, "rewards/margins": -1.1940879821777344, "rewards/rejected": -9.40104866027832, "step": 436 }, { "epoch": 0.05950435729847495, "grad_norm": 51.73253208013592, "learning_rate": 4.756462585034013e-07, "logits/chosen": -4.125868797302246, "logits/rejected": -3.47910737991333, "logps/chosen": -0.9183812737464905, "logps/rejected": -1.1499073505401611, "loss": 4.5405, "rewards/accuracies": 0.75, "rewards/chosen": -9.183812141418457, "rewards/margins": 2.315260887145996, "rewards/rejected": -11.49907398223877, "step": 437 }, { "epoch": 0.059640522875817, "grad_norm": 57.95226933065235, "learning_rate": 4.7673469387755105e-07, "logits/chosen": -3.3008499145507812, "logits/rejected": -2.1456799507141113, "logps/chosen": -1.211259365081787, "logps/rejected": -1.2761374711990356, "loss": 4.8281, "rewards/accuracies": 0.5, "rewards/chosen": -12.112593650817871, "rewards/margins": 0.6487799882888794, "rewards/rejected": -12.761373519897461, "step": 438 }, { "epoch": 0.05977668845315904, "grad_norm": 46.7468864196759, "learning_rate": 4.778231292517007e-07, "logits/chosen": -3.258173942565918, "logits/rejected": -2.0370960235595703, "logps/chosen": -1.163620948791504, "logps/rejected": -1.6498100757598877, "loss": 4.5403, "rewards/accuracies": 0.75, "rewards/chosen": -11.636210441589355, "rewards/margins": 4.86189079284668, "rewards/rejected": -16.49810218811035, "step": 439 }, { "epoch": 0.05991285403050109, "grad_norm": 51.96817124645959, "learning_rate": 4.789115646258503e-07, "logits/chosen": -4.159543037414551, "logits/rejected": -4.327371120452881, "logps/chosen": -1.0910027027130127, "logps/rejected": -1.162089467048645, "loss": 4.5716, "rewards/accuracies": 0.75, "rewards/chosen": -10.910026550292969, "rewards/margins": 0.710867166519165, "rewards/rejected": -11.620894432067871, "step": 440 }, { "epoch": 0.06004901960784314, "grad_norm": 52.38519181045346, "learning_rate": 4.8e-07, "logits/chosen": -4.739843368530273, "logits/rejected": -4.272103309631348, "logps/chosen": -1.0090323686599731, "logps/rejected": -1.314993977546692, "loss": 4.721, "rewards/accuracies": 1.0, "rewards/chosen": -10.090323448181152, "rewards/margins": 3.0596163272857666, "rewards/rejected": -13.149940490722656, "step": 441 }, { "epoch": 0.06018518518518518, "grad_norm": 52.496628501263885, "learning_rate": 4.810884353741496e-07, "logits/chosen": -4.138759613037109, "logits/rejected": -4.138718128204346, "logps/chosen": -0.8354073762893677, "logps/rejected": -1.1500244140625, "loss": 4.2424, "rewards/accuracies": 0.75, "rewards/chosen": -8.354073524475098, "rewards/margins": 3.1461710929870605, "rewards/rejected": -11.500244140625, "step": 442 }, { "epoch": 0.06032135076252723, "grad_norm": 59.10361177680367, "learning_rate": 4.821768707482994e-07, "logits/chosen": -4.489090442657471, "logits/rejected": -4.223858833312988, "logps/chosen": -0.8937998414039612, "logps/rejected": -1.2671387195587158, "loss": 4.492, "rewards/accuracies": 0.75, "rewards/chosen": -8.93799877166748, "rewards/margins": 3.733388900756836, "rewards/rejected": -12.67138671875, "step": 443 }, { "epoch": 0.06045751633986928, "grad_norm": 39.80622864306376, "learning_rate": 4.83265306122449e-07, "logits/chosen": -4.973862648010254, "logits/rejected": -4.001169204711914, "logps/chosen": -0.7247560620307922, "logps/rejected": -0.9546008706092834, "loss": 4.3455, "rewards/accuracies": 1.0, "rewards/chosen": -7.247560501098633, "rewards/margins": 2.298448085784912, "rewards/rejected": -9.546009063720703, "step": 444 }, { "epoch": 0.06059368191721133, "grad_norm": 46.02458082986114, "learning_rate": 4.843537414965987e-07, "logits/chosen": -2.9560928344726562, "logits/rejected": -3.513561725616455, "logps/chosen": -1.3659014701843262, "logps/rejected": -1.6342802047729492, "loss": 4.3847, "rewards/accuracies": 0.5, "rewards/chosen": -13.659013748168945, "rewards/margins": 2.683788537979126, "rewards/rejected": -16.342802047729492, "step": 445 }, { "epoch": 0.060729847494553374, "grad_norm": 68.47259209659465, "learning_rate": 4.854421768707482e-07, "logits/chosen": -4.805624485015869, "logits/rejected": -3.6315250396728516, "logps/chosen": -1.0689924955368042, "logps/rejected": -1.1455459594726562, "loss": 4.8904, "rewards/accuracies": 0.5, "rewards/chosen": -10.689924240112305, "rewards/margins": 0.7655353546142578, "rewards/rejected": -11.455459594726562, "step": 446 }, { "epoch": 0.060866013071895424, "grad_norm": 67.71471245145474, "learning_rate": 4.865306122448979e-07, "logits/chosen": -2.8034379482269287, "logits/rejected": -2.122114896774292, "logps/chosen": -1.1339224576950073, "logps/rejected": -1.473841667175293, "loss": 5.0206, "rewards/accuracies": 1.0, "rewards/chosen": -11.339224815368652, "rewards/margins": 3.3991920948028564, "rewards/rejected": -14.73841667175293, "step": 447 }, { "epoch": 0.06100217864923747, "grad_norm": 52.2491678652555, "learning_rate": 4.876190476190476e-07, "logits/chosen": -4.422534942626953, "logits/rejected": -3.429605484008789, "logps/chosen": -1.192283034324646, "logps/rejected": -0.9996863603591919, "loss": 4.7247, "rewards/accuracies": 0.5, "rewards/chosen": -11.922830581665039, "rewards/margins": -1.9259672164916992, "rewards/rejected": -9.99686336517334, "step": 448 }, { "epoch": 0.06113834422657952, "grad_norm": 58.0851861358179, "learning_rate": 4.887074829931973e-07, "logits/chosen": -3.5463151931762695, "logits/rejected": -2.7396655082702637, "logps/chosen": -1.245950698852539, "logps/rejected": -1.6209993362426758, "loss": 4.8211, "rewards/accuracies": 0.75, "rewards/chosen": -12.45950698852539, "rewards/margins": 3.750485897064209, "rewards/rejected": -16.209993362426758, "step": 449 }, { "epoch": 0.061274509803921566, "grad_norm": 46.514902946733045, "learning_rate": 4.897959183673469e-07, "logits/chosen": -3.8762245178222656, "logits/rejected": -2.6820287704467773, "logps/chosen": -1.0636796951293945, "logps/rejected": -1.3907639980316162, "loss": 4.6935, "rewards/accuracies": 0.75, "rewards/chosen": -10.636796951293945, "rewards/margins": 3.270843029022217, "rewards/rejected": -13.90764045715332, "step": 450 }, { "epoch": 0.061410675381263616, "grad_norm": 50.76247721982234, "learning_rate": 4.908843537414966e-07, "logits/chosen": -3.5851612091064453, "logits/rejected": -3.2284152507781982, "logps/chosen": -0.7779825329780579, "logps/rejected": -0.9971481561660767, "loss": 4.2899, "rewards/accuracies": 0.75, "rewards/chosen": -7.779825210571289, "rewards/margins": 2.1916558742523193, "rewards/rejected": -9.971481323242188, "step": 451 }, { "epoch": 0.061546840958605666, "grad_norm": 45.828198777715656, "learning_rate": 4.919727891156462e-07, "logits/chosen": -3.9039225578308105, "logits/rejected": -3.265124797821045, "logps/chosen": -1.0069327354431152, "logps/rejected": -1.3279647827148438, "loss": 4.1619, "rewards/accuracies": 0.75, "rewards/chosen": -10.069328308105469, "rewards/margins": 3.2103207111358643, "rewards/rejected": -13.279648780822754, "step": 452 }, { "epoch": 0.061683006535947715, "grad_norm": 69.43580505718566, "learning_rate": 4.930612244897959e-07, "logits/chosen": -3.110739231109619, "logits/rejected": -4.229578495025635, "logps/chosen": -1.0588157176971436, "logps/rejected": -0.8863184452056885, "loss": 5.177, "rewards/accuracies": 0.25, "rewards/chosen": -10.588157653808594, "rewards/margins": -1.724973440170288, "rewards/rejected": -8.863183975219727, "step": 453 }, { "epoch": 0.06181917211328976, "grad_norm": 45.379611367260125, "learning_rate": 4.941496598639455e-07, "logits/chosen": -2.806413412094116, "logits/rejected": -3.8524351119995117, "logps/chosen": -1.17108154296875, "logps/rejected": -0.9822512269020081, "loss": 3.9489, "rewards/accuracies": 0.5, "rewards/chosen": -11.7108154296875, "rewards/margins": -1.8883037567138672, "rewards/rejected": -9.822511672973633, "step": 454 }, { "epoch": 0.06195533769063181, "grad_norm": 51.93873353275732, "learning_rate": 4.952380952380952e-07, "logits/chosen": -2.795647144317627, "logits/rejected": -4.194257736206055, "logps/chosen": -1.1835383176803589, "logps/rejected": -1.1161210536956787, "loss": 4.7971, "rewards/accuracies": 0.25, "rewards/chosen": -11.835383415222168, "rewards/margins": -0.67417311668396, "rewards/rejected": -11.161210060119629, "step": 455 }, { "epoch": 0.06209150326797386, "grad_norm": 51.01682721535262, "learning_rate": 4.963265306122448e-07, "logits/chosen": -4.68477725982666, "logits/rejected": -4.566251754760742, "logps/chosen": -0.8473321795463562, "logps/rejected": -0.8451428413391113, "loss": 4.5951, "rewards/accuracies": 0.5, "rewards/chosen": -8.473321914672852, "rewards/margins": -0.02189415693283081, "rewards/rejected": -8.451428413391113, "step": 456 }, { "epoch": 0.06222766884531591, "grad_norm": 90.23549687809697, "learning_rate": 4.974149659863945e-07, "logits/chosen": -3.1310272216796875, "logits/rejected": -2.846999168395996, "logps/chosen": -1.9583301544189453, "logps/rejected": -2.040076732635498, "loss": 5.0545, "rewards/accuracies": 0.75, "rewards/chosen": -19.583301544189453, "rewards/margins": 0.8174660205841064, "rewards/rejected": -20.400768280029297, "step": 457 }, { "epoch": 0.06236383442265795, "grad_norm": 45.269285754337446, "learning_rate": 4.985034013605442e-07, "logits/chosen": -3.829638957977295, "logits/rejected": -2.588376998901367, "logps/chosen": -0.9150474071502686, "logps/rejected": -1.184133529663086, "loss": 5.0787, "rewards/accuracies": 0.5, "rewards/chosen": -9.150474548339844, "rewards/margins": 2.6908602714538574, "rewards/rejected": -11.841334342956543, "step": 458 }, { "epoch": 0.0625, "grad_norm": 52.420884384630206, "learning_rate": 4.995918367346939e-07, "logits/chosen": -3.354618549346924, "logits/rejected": -3.17584228515625, "logps/chosen": -1.1467227935791016, "logps/rejected": -1.1778664588928223, "loss": 4.9923, "rewards/accuracies": 0.75, "rewards/chosen": -11.467228889465332, "rewards/margins": 0.3114355802536011, "rewards/rejected": -11.778663635253906, "step": 459 }, { "epoch": 0.06263616557734204, "grad_norm": 68.242007034197, "learning_rate": 5.006802721088436e-07, "logits/chosen": -4.154873847961426, "logits/rejected": -2.9726951122283936, "logps/chosen": -1.3183729648590088, "logps/rejected": -1.7081583738327026, "loss": 4.8621, "rewards/accuracies": 1.0, "rewards/chosen": -13.183730125427246, "rewards/margins": 3.8978543281555176, "rewards/rejected": -17.081584930419922, "step": 460 }, { "epoch": 0.0627723311546841, "grad_norm": 56.68175274044204, "learning_rate": 5.017687074829932e-07, "logits/chosen": -4.292179107666016, "logits/rejected": -3.23134708404541, "logps/chosen": -0.9359563589096069, "logps/rejected": -1.007216215133667, "loss": 4.7817, "rewards/accuracies": 0.75, "rewards/chosen": -9.359562873840332, "rewards/margins": 0.7125993967056274, "rewards/rejected": -10.072162628173828, "step": 461 }, { "epoch": 0.06290849673202614, "grad_norm": 45.26985405773494, "learning_rate": 5.028571428571429e-07, "logits/chosen": -3.4874684810638428, "logits/rejected": -3.455533027648926, "logps/chosen": -0.804561972618103, "logps/rejected": -0.9082500338554382, "loss": 5.0331, "rewards/accuracies": 0.75, "rewards/chosen": -8.04561996459961, "rewards/margins": 1.0368801355361938, "rewards/rejected": -9.082500457763672, "step": 462 }, { "epoch": 0.06304466230936819, "grad_norm": 40.862119201517714, "learning_rate": 5.039455782312925e-07, "logits/chosen": -1.9846842288970947, "logits/rejected": -2.6712048053741455, "logps/chosen": -1.4726109504699707, "logps/rejected": -1.25473952293396, "loss": 4.8576, "rewards/accuracies": 0.25, "rewards/chosen": -14.726110458374023, "rewards/margins": -2.1787147521972656, "rewards/rejected": -12.547395706176758, "step": 463 }, { "epoch": 0.06318082788671024, "grad_norm": 40.68208220830565, "learning_rate": 5.050340136054421e-07, "logits/chosen": -2.2969954013824463, "logits/rejected": -1.9334020614624023, "logps/chosen": -1.2705832719802856, "logps/rejected": -1.5885778665542603, "loss": 4.1497, "rewards/accuracies": 0.75, "rewards/chosen": -12.705833435058594, "rewards/margins": 3.179945945739746, "rewards/rejected": -15.88577938079834, "step": 464 }, { "epoch": 0.06331699346405228, "grad_norm": 58.27344586656203, "learning_rate": 5.061224489795918e-07, "logits/chosen": -2.9570345878601074, "logits/rejected": -2.241210699081421, "logps/chosen": -1.7653484344482422, "logps/rejected": -2.335049629211426, "loss": 4.944, "rewards/accuracies": 0.75, "rewards/chosen": -17.653484344482422, "rewards/margins": 5.6970109939575195, "rewards/rejected": -23.350496292114258, "step": 465 }, { "epoch": 0.06345315904139434, "grad_norm": 56.32228561288063, "learning_rate": 5.072108843537415e-07, "logits/chosen": -4.360490322113037, "logits/rejected": -3.2545666694641113, "logps/chosen": -0.8597768545150757, "logps/rejected": -1.1643985509872437, "loss": 4.616, "rewards/accuracies": 0.5, "rewards/chosen": -8.597768783569336, "rewards/margins": 3.046217441558838, "rewards/rejected": -11.643985748291016, "step": 466 }, { "epoch": 0.06358932461873638, "grad_norm": 49.95174957039178, "learning_rate": 5.082993197278911e-07, "logits/chosen": -2.1089706420898438, "logits/rejected": -1.9942501783370972, "logps/chosen": -1.1448436975479126, "logps/rejected": -1.2309846878051758, "loss": 4.425, "rewards/accuracies": 0.75, "rewards/chosen": -11.448436737060547, "rewards/margins": 0.86141037940979, "rewards/rejected": -12.309846878051758, "step": 467 }, { "epoch": 0.06372549019607843, "grad_norm": 44.16774580077828, "learning_rate": 5.093877551020408e-07, "logits/chosen": -2.653045654296875, "logits/rejected": -1.8875985145568848, "logps/chosen": -1.2760251760482788, "logps/rejected": -1.6877803802490234, "loss": 4.4469, "rewards/accuracies": 0.5, "rewards/chosen": -12.760251998901367, "rewards/margins": 4.117550849914551, "rewards/rejected": -16.877803802490234, "step": 468 }, { "epoch": 0.06386165577342048, "grad_norm": 59.941774330662945, "learning_rate": 5.104761904761904e-07, "logits/chosen": -2.060730457305908, "logits/rejected": -2.7613019943237305, "logps/chosen": -1.6004455089569092, "logps/rejected": -1.51969313621521, "loss": 5.1016, "rewards/accuracies": 0.5, "rewards/chosen": -16.004453659057617, "rewards/margins": -0.8075222969055176, "rewards/rejected": -15.196931838989258, "step": 469 }, { "epoch": 0.06399782135076253, "grad_norm": 41.94586428850396, "learning_rate": 5.115646258503402e-07, "logits/chosen": -2.015530824661255, "logits/rejected": -2.0496065616607666, "logps/chosen": -1.333191156387329, "logps/rejected": -1.5202157497406006, "loss": 4.7789, "rewards/accuracies": 0.5, "rewards/chosen": -13.331911087036133, "rewards/margins": 1.8702468872070312, "rewards/rejected": -15.202157974243164, "step": 470 }, { "epoch": 0.06413398692810457, "grad_norm": 46.51777142933087, "learning_rate": 5.126530612244897e-07, "logits/chosen": -3.4222207069396973, "logits/rejected": -3.055570125579834, "logps/chosen": -0.8962790966033936, "logps/rejected": -1.0016157627105713, "loss": 4.4768, "rewards/accuracies": 0.75, "rewards/chosen": -8.962791442871094, "rewards/margins": 1.053367018699646, "rewards/rejected": -10.016158103942871, "step": 471 }, { "epoch": 0.06427015250544663, "grad_norm": 52.3872862876935, "learning_rate": 5.137414965986394e-07, "logits/chosen": -3.448495864868164, "logits/rejected": -2.6980953216552734, "logps/chosen": -0.9410432577133179, "logps/rejected": -0.9795297980308533, "loss": 4.6091, "rewards/accuracies": 0.5, "rewards/chosen": -9.410432815551758, "rewards/margins": 0.384865403175354, "rewards/rejected": -9.795297622680664, "step": 472 }, { "epoch": 0.06440631808278867, "grad_norm": 45.11120379902936, "learning_rate": 5.14829931972789e-07, "logits/chosen": -4.022689342498779, "logits/rejected": -3.196662664413452, "logps/chosen": -0.8039114475250244, "logps/rejected": -1.2090609073638916, "loss": 4.4187, "rewards/accuracies": 0.75, "rewards/chosen": -8.039113998413086, "rewards/margins": 4.051494598388672, "rewards/rejected": -12.090608596801758, "step": 473 }, { "epoch": 0.06454248366013073, "grad_norm": 36.8173256798753, "learning_rate": 5.159183673469387e-07, "logits/chosen": -2.3641560077667236, "logits/rejected": -2.9541401863098145, "logps/chosen": -0.7087069749832153, "logps/rejected": -1.0835912227630615, "loss": 4.7348, "rewards/accuracies": 1.0, "rewards/chosen": -7.087069511413574, "rewards/margins": 3.7488441467285156, "rewards/rejected": -10.83591365814209, "step": 474 }, { "epoch": 0.06467864923747277, "grad_norm": 45.139410342142156, "learning_rate": 5.170068027210885e-07, "logits/chosen": -3.845733642578125, "logits/rejected": -3.827660083770752, "logps/chosen": -0.9940779805183411, "logps/rejected": -0.8782870769500732, "loss": 3.8387, "rewards/accuracies": 0.25, "rewards/chosen": -9.940779685974121, "rewards/margins": -1.1579089164733887, "rewards/rejected": -8.78287124633789, "step": 475 }, { "epoch": 0.06481481481481481, "grad_norm": 57.51805531890884, "learning_rate": 5.180952380952381e-07, "logits/chosen": -3.4426534175872803, "logits/rejected": -3.0436577796936035, "logps/chosen": -1.7406007051467896, "logps/rejected": -1.2587840557098389, "loss": 4.1414, "rewards/accuracies": 0.25, "rewards/chosen": -17.406007766723633, "rewards/margins": -4.8181657791137695, "rewards/rejected": -12.587841987609863, "step": 476 }, { "epoch": 0.06495098039215687, "grad_norm": 55.86005498731564, "learning_rate": 5.191836734693878e-07, "logits/chosen": -3.0768518447875977, "logits/rejected": -1.910510778427124, "logps/chosen": -1.1477091312408447, "logps/rejected": -1.3497154712677002, "loss": 5.1364, "rewards/accuracies": 0.25, "rewards/chosen": -11.477090835571289, "rewards/margins": 2.020064353942871, "rewards/rejected": -13.49715518951416, "step": 477 }, { "epoch": 0.06508714596949891, "grad_norm": 47.65826170947195, "learning_rate": 5.202721088435374e-07, "logits/chosen": -2.8557732105255127, "logits/rejected": -2.014413833618164, "logps/chosen": -0.9643675684928894, "logps/rejected": -1.2137538194656372, "loss": 4.5681, "rewards/accuracies": 0.75, "rewards/chosen": -9.643675804138184, "rewards/margins": 2.4938621520996094, "rewards/rejected": -12.137537956237793, "step": 478 }, { "epoch": 0.06522331154684095, "grad_norm": 95.11470108394312, "learning_rate": 5.213605442176871e-07, "logits/chosen": -2.961052417755127, "logits/rejected": -2.1055870056152344, "logps/chosen": -1.0963523387908936, "logps/rejected": -1.3360427618026733, "loss": 5.7739, "rewards/accuracies": 0.5, "rewards/chosen": -10.963523864746094, "rewards/margins": 2.396904230117798, "rewards/rejected": -13.360427856445312, "step": 479 }, { "epoch": 0.06535947712418301, "grad_norm": 46.34763022098749, "learning_rate": 5.224489795918367e-07, "logits/chosen": -3.3133623600006104, "logits/rejected": -2.7986373901367188, "logps/chosen": -1.0011088848114014, "logps/rejected": -1.1091097593307495, "loss": 4.7753, "rewards/accuracies": 0.5, "rewards/chosen": -10.011088371276855, "rewards/margins": 1.0800093412399292, "rewards/rejected": -11.091097831726074, "step": 480 }, { "epoch": 0.06549564270152505, "grad_norm": 51.90832644874036, "learning_rate": 5.235374149659864e-07, "logits/chosen": -3.126180648803711, "logits/rejected": -2.4339141845703125, "logps/chosen": -1.3860628604888916, "logps/rejected": -1.6059900522232056, "loss": 4.4939, "rewards/accuracies": 0.75, "rewards/chosen": -13.86063003540039, "rewards/margins": 2.1992714405059814, "rewards/rejected": -16.059900283813477, "step": 481 }, { "epoch": 0.0656318082788671, "grad_norm": 67.70441558603748, "learning_rate": 5.24625850340136e-07, "logits/chosen": -3.0055291652679443, "logits/rejected": -1.5121899843215942, "logps/chosen": -1.1013017892837524, "logps/rejected": -1.4546605348587036, "loss": 5.1027, "rewards/accuracies": 0.75, "rewards/chosen": -11.013017654418945, "rewards/margins": 3.53358793258667, "rewards/rejected": -14.546605110168457, "step": 482 }, { "epoch": 0.06576797385620915, "grad_norm": 48.709839379173935, "learning_rate": 5.257142857142857e-07, "logits/chosen": -3.293050765991211, "logits/rejected": -3.4682796001434326, "logps/chosen": -1.2368773221969604, "logps/rejected": -1.2902576923370361, "loss": 4.7356, "rewards/accuracies": 0.5, "rewards/chosen": -12.368772506713867, "rewards/margins": 0.5338044166564941, "rewards/rejected": -12.902578353881836, "step": 483 }, { "epoch": 0.0659041394335512, "grad_norm": 51.855017890333194, "learning_rate": 5.268027210884353e-07, "logits/chosen": -3.149684190750122, "logits/rejected": -2.3304836750030518, "logps/chosen": -1.2580931186676025, "logps/rejected": -1.484892725944519, "loss": 4.0661, "rewards/accuracies": 0.75, "rewards/chosen": -12.580930709838867, "rewards/margins": 2.2679967880249023, "rewards/rejected": -14.848928451538086, "step": 484 }, { "epoch": 0.06604030501089325, "grad_norm": 42.11105117371229, "learning_rate": 5.27891156462585e-07, "logits/chosen": -3.6756911277770996, "logits/rejected": -2.7144665718078613, "logps/chosen": -0.8044530153274536, "logps/rejected": -1.265647053718567, "loss": 4.7366, "rewards/accuracies": 0.75, "rewards/chosen": -8.044529914855957, "rewards/margins": 4.611940860748291, "rewards/rejected": -12.656471252441406, "step": 485 }, { "epoch": 0.0661764705882353, "grad_norm": 44.19972393323689, "learning_rate": 5.289795918367347e-07, "logits/chosen": -3.7175045013427734, "logits/rejected": -3.3369345664978027, "logps/chosen": -1.16758131980896, "logps/rejected": -1.6672298908233643, "loss": 4.6451, "rewards/accuracies": 1.0, "rewards/chosen": -11.675813674926758, "rewards/margins": 4.996485710144043, "rewards/rejected": -16.672298431396484, "step": 486 }, { "epoch": 0.06631263616557734, "grad_norm": 44.60575363357122, "learning_rate": 5.300680272108844e-07, "logits/chosen": -3.1067185401916504, "logits/rejected": -2.461934804916382, "logps/chosen": -1.1379724740982056, "logps/rejected": -1.180034875869751, "loss": 5.0355, "rewards/accuracies": 0.5, "rewards/chosen": -11.379724502563477, "rewards/margins": 0.420623779296875, "rewards/rejected": -11.800348281860352, "step": 487 }, { "epoch": 0.0664488017429194, "grad_norm": 41.481436691007644, "learning_rate": 5.31156462585034e-07, "logits/chosen": -2.4957308769226074, "logits/rejected": -1.2822606563568115, "logps/chosen": -1.122103214263916, "logps/rejected": -1.4508724212646484, "loss": 4.1279, "rewards/accuracies": 0.75, "rewards/chosen": -11.221031188964844, "rewards/margins": 3.2876923084259033, "rewards/rejected": -14.508724212646484, "step": 488 }, { "epoch": 0.06658496732026144, "grad_norm": 38.91594213408899, "learning_rate": 5.322448979591836e-07, "logits/chosen": -3.589585304260254, "logits/rejected": -2.4638161659240723, "logps/chosen": -0.9106356501579285, "logps/rejected": -1.1717262268066406, "loss": 4.8046, "rewards/accuracies": 1.0, "rewards/chosen": -9.106355667114258, "rewards/margins": 2.6109066009521484, "rewards/rejected": -11.717262268066406, "step": 489 }, { "epoch": 0.06672113289760348, "grad_norm": 46.338783075051005, "learning_rate": 5.333333333333332e-07, "logits/chosen": -2.7533159255981445, "logits/rejected": -2.0751523971557617, "logps/chosen": -1.0145727396011353, "logps/rejected": -1.2366523742675781, "loss": 4.5132, "rewards/accuracies": 0.75, "rewards/chosen": -10.145727157592773, "rewards/margins": 2.2207961082458496, "rewards/rejected": -12.366523742675781, "step": 490 }, { "epoch": 0.06685729847494554, "grad_norm": 52.280264541358925, "learning_rate": 5.34421768707483e-07, "logits/chosen": -3.603062629699707, "logits/rejected": -2.540248394012451, "logps/chosen": -1.0106991529464722, "logps/rejected": -1.4668488502502441, "loss": 4.6235, "rewards/accuracies": 1.0, "rewards/chosen": -10.1069917678833, "rewards/margins": 4.561498165130615, "rewards/rejected": -14.668490409851074, "step": 491 }, { "epoch": 0.06699346405228758, "grad_norm": 37.26853240778612, "learning_rate": 5.355102040816326e-07, "logits/chosen": -3.20969820022583, "logits/rejected": -2.14206600189209, "logps/chosen": -0.9957271218299866, "logps/rejected": -1.6459245681762695, "loss": 4.0902, "rewards/accuracies": 0.75, "rewards/chosen": -9.957271575927734, "rewards/margins": 6.501974582672119, "rewards/rejected": -16.459245681762695, "step": 492 }, { "epoch": 0.06712962962962964, "grad_norm": 44.93164341542624, "learning_rate": 5.365986394557823e-07, "logits/chosen": -1.9093960523605347, "logits/rejected": -2.954043388366699, "logps/chosen": -1.2237757444381714, "logps/rejected": -1.3916022777557373, "loss": 4.5394, "rewards/accuracies": 0.75, "rewards/chosen": -12.237756729125977, "rewards/margins": 1.6782655715942383, "rewards/rejected": -13.916023254394531, "step": 493 }, { "epoch": 0.06726579520697168, "grad_norm": 41.683993713637626, "learning_rate": 5.37687074829932e-07, "logits/chosen": -3.652287006378174, "logits/rejected": -3.5349388122558594, "logps/chosen": -0.8389030694961548, "logps/rejected": -1.0158255100250244, "loss": 4.2277, "rewards/accuracies": 0.75, "rewards/chosen": -8.389030456542969, "rewards/margins": 1.7692242860794067, "rewards/rejected": -10.158254623413086, "step": 494 }, { "epoch": 0.06740196078431372, "grad_norm": 45.41707603327784, "learning_rate": 5.387755102040816e-07, "logits/chosen": -4.650323390960693, "logits/rejected": -4.1649980545043945, "logps/chosen": -0.7319206595420837, "logps/rejected": -0.6417104005813599, "loss": 4.1789, "rewards/accuracies": 0.5, "rewards/chosen": -7.319206714630127, "rewards/margins": -0.9021024107933044, "rewards/rejected": -6.417104721069336, "step": 495 }, { "epoch": 0.06753812636165578, "grad_norm": 52.72820689140351, "learning_rate": 5.398639455782313e-07, "logits/chosen": -1.763332486152649, "logits/rejected": -2.0758895874023438, "logps/chosen": -1.378919005393982, "logps/rejected": -1.2810397148132324, "loss": 4.4728, "rewards/accuracies": 0.5, "rewards/chosen": -13.789189338684082, "rewards/margins": -0.9787914752960205, "rewards/rejected": -12.81039810180664, "step": 496 }, { "epoch": 0.06767429193899782, "grad_norm": 43.92788969192835, "learning_rate": 5.409523809523809e-07, "logits/chosen": -3.0815505981445312, "logits/rejected": -3.3933563232421875, "logps/chosen": -0.963021457195282, "logps/rejected": -0.9071799516677856, "loss": 4.9592, "rewards/accuracies": 0.5, "rewards/chosen": -9.63021469116211, "rewards/margins": -0.558415412902832, "rewards/rejected": -9.071800231933594, "step": 497 }, { "epoch": 0.06781045751633986, "grad_norm": 40.08541397959765, "learning_rate": 5.420408163265306e-07, "logits/chosen": -2.90879487991333, "logits/rejected": -2.7838528156280518, "logps/chosen": -0.914370059967041, "logps/rejected": -0.9558181762695312, "loss": 4.2109, "rewards/accuracies": 0.5, "rewards/chosen": -9.14370059967041, "rewards/margins": 0.41448163986206055, "rewards/rejected": -9.558181762695312, "step": 498 }, { "epoch": 0.06794662309368192, "grad_norm": 70.55860688839681, "learning_rate": 5.431292517006802e-07, "logits/chosen": -2.4314324855804443, "logits/rejected": -3.0937514305114746, "logps/chosen": -1.2093437910079956, "logps/rejected": -1.604689598083496, "loss": 5.1059, "rewards/accuracies": 0.75, "rewards/chosen": -12.093437194824219, "rewards/margins": 3.9534592628479004, "rewards/rejected": -16.046897888183594, "step": 499 }, { "epoch": 0.06808278867102396, "grad_norm": 52.38013718461145, "learning_rate": 5.442176870748299e-07, "logits/chosen": -2.922597646713257, "logits/rejected": -2.981602430343628, "logps/chosen": -1.353858232498169, "logps/rejected": -1.1737871170043945, "loss": 4.8179, "rewards/accuracies": 0.25, "rewards/chosen": -13.538581848144531, "rewards/margins": -1.8007118701934814, "rewards/rejected": -11.737870216369629, "step": 500 }, { "epoch": 0.068218954248366, "grad_norm": 47.08768851160921, "learning_rate": 5.453061224489795e-07, "logits/chosen": -3.4934327602386475, "logits/rejected": -2.0843870639801025, "logps/chosen": -1.106413722038269, "logps/rejected": -1.4786524772644043, "loss": 4.4632, "rewards/accuracies": 0.75, "rewards/chosen": -11.06413745880127, "rewards/margins": 3.7223877906799316, "rewards/rejected": -14.786524772644043, "step": 501 }, { "epoch": 0.06835511982570806, "grad_norm": 42.646407984377866, "learning_rate": 5.463945578231293e-07, "logits/chosen": -2.5200142860412598, "logits/rejected": -3.198709011077881, "logps/chosen": -1.2127735614776611, "logps/rejected": -1.5204434394836426, "loss": 4.1789, "rewards/accuracies": 0.25, "rewards/chosen": -12.12773609161377, "rewards/margins": 3.0766983032226562, "rewards/rejected": -15.204434394836426, "step": 502 }, { "epoch": 0.0684912854030501, "grad_norm": 49.10985414770245, "learning_rate": 5.474829931972789e-07, "logits/chosen": -3.0949418544769287, "logits/rejected": -2.5613958835601807, "logps/chosen": -1.4624040126800537, "logps/rejected": -1.0002403259277344, "loss": 5.3135, "rewards/accuracies": 0.0, "rewards/chosen": -14.624039649963379, "rewards/margins": -4.621635437011719, "rewards/rejected": -10.00240421295166, "step": 503 }, { "epoch": 0.06862745098039216, "grad_norm": 53.66045543272274, "learning_rate": 5.485714285714286e-07, "logits/chosen": -2.6678225994110107, "logits/rejected": -3.668166160583496, "logps/chosen": -1.0135260820388794, "logps/rejected": -0.7359984517097473, "loss": 4.3491, "rewards/accuracies": 0.25, "rewards/chosen": -10.135261535644531, "rewards/margins": -2.7752761840820312, "rewards/rejected": -7.359984397888184, "step": 504 }, { "epoch": 0.0687636165577342, "grad_norm": 48.390648472242574, "learning_rate": 5.496598639455782e-07, "logits/chosen": -3.422950267791748, "logits/rejected": -3.5057175159454346, "logps/chosen": -0.7419409155845642, "logps/rejected": -0.7836423516273499, "loss": 4.7847, "rewards/accuracies": 0.75, "rewards/chosen": -7.41940975189209, "rewards/margins": 0.4170141816139221, "rewards/rejected": -7.836422920227051, "step": 505 }, { "epoch": 0.06889978213507625, "grad_norm": 51.422004746982076, "learning_rate": 5.507482993197279e-07, "logits/chosen": -1.7364307641983032, "logits/rejected": -1.9594916105270386, "logps/chosen": -1.1852530241012573, "logps/rejected": -1.1967533826828003, "loss": 4.7557, "rewards/accuracies": 0.5, "rewards/chosen": -11.852529525756836, "rewards/margins": 0.11500406265258789, "rewards/rejected": -11.967534065246582, "step": 506 }, { "epoch": 0.0690359477124183, "grad_norm": 40.868177552667646, "learning_rate": 5.518367346938775e-07, "logits/chosen": -1.70644211769104, "logits/rejected": -1.5603753328323364, "logps/chosen": -1.1102569103240967, "logps/rejected": -1.178175687789917, "loss": 4.2951, "rewards/accuracies": 0.5, "rewards/chosen": -11.102569580078125, "rewards/margins": 0.6791872978210449, "rewards/rejected": -11.781757354736328, "step": 507 }, { "epoch": 0.06917211328976035, "grad_norm": 55.985650205819674, "learning_rate": 5.529251700680272e-07, "logits/chosen": -3.3667140007019043, "logits/rejected": -2.224297046661377, "logps/chosen": -0.7313827276229858, "logps/rejected": -1.0872513055801392, "loss": 4.1882, "rewards/accuracies": 0.75, "rewards/chosen": -7.3138275146484375, "rewards/margins": 3.558684825897217, "rewards/rejected": -10.872512817382812, "step": 508 }, { "epoch": 0.06930827886710239, "grad_norm": 41.711723213347675, "learning_rate": 5.540136054421768e-07, "logits/chosen": -2.145443916320801, "logits/rejected": -1.9072749614715576, "logps/chosen": -1.2331466674804688, "logps/rejected": -1.035029411315918, "loss": 4.6434, "rewards/accuracies": 0.25, "rewards/chosen": -12.331466674804688, "rewards/margins": -1.9811729192733765, "rewards/rejected": -10.35029411315918, "step": 509 }, { "epoch": 0.06944444444444445, "grad_norm": 48.6969249257357, "learning_rate": 5.551020408163265e-07, "logits/chosen": -3.2514524459838867, "logits/rejected": -3.084397315979004, "logps/chosen": -1.0264800786972046, "logps/rejected": -0.9999182224273682, "loss": 4.7357, "rewards/accuracies": 0.5, "rewards/chosen": -10.264801025390625, "rewards/margins": -0.2656186819076538, "rewards/rejected": -9.999181747436523, "step": 510 }, { "epoch": 0.06958061002178649, "grad_norm": 41.338736788741876, "learning_rate": 5.561904761904761e-07, "logits/chosen": -3.243659019470215, "logits/rejected": -4.22566032409668, "logps/chosen": -0.9824234247207642, "logps/rejected": -1.0260745286941528, "loss": 4.5057, "rewards/accuracies": 0.5, "rewards/chosen": -9.824234008789062, "rewards/margins": 0.43651092052459717, "rewards/rejected": -10.26074504852295, "step": 511 }, { "epoch": 0.06971677559912855, "grad_norm": 53.015797273715684, "learning_rate": 5.572789115646258e-07, "logits/chosen": -1.9176048040390015, "logits/rejected": -2.2862987518310547, "logps/chosen": -1.2337210178375244, "logps/rejected": -1.2263426780700684, "loss": 4.5489, "rewards/accuracies": 0.25, "rewards/chosen": -12.337209701538086, "rewards/margins": -0.0737830400466919, "rewards/rejected": -12.263425827026367, "step": 512 }, { "epoch": 0.06985294117647059, "grad_norm": 48.81533291148636, "learning_rate": 5.583673469387756e-07, "logits/chosen": -3.0930299758911133, "logits/rejected": -3.5402979850769043, "logps/chosen": -1.1017425060272217, "logps/rejected": -1.254098653793335, "loss": 4.7493, "rewards/accuracies": 0.75, "rewards/chosen": -11.017424583435059, "rewards/margins": 1.5235621929168701, "rewards/rejected": -12.540987014770508, "step": 513 }, { "epoch": 0.06998910675381263, "grad_norm": 44.86252361887883, "learning_rate": 5.594557823129252e-07, "logits/chosen": -2.4983644485473633, "logits/rejected": -2.4844465255737305, "logps/chosen": -1.2743804454803467, "logps/rejected": -1.3881995677947998, "loss": 5.2127, "rewards/accuracies": 0.5, "rewards/chosen": -12.743804931640625, "rewards/margins": 1.1381914615631104, "rewards/rejected": -13.881996154785156, "step": 514 }, { "epoch": 0.07012527233115469, "grad_norm": 49.66063471196953, "learning_rate": 5.605442176870748e-07, "logits/chosen": -2.482285499572754, "logits/rejected": -2.758878707885742, "logps/chosen": -1.1795098781585693, "logps/rejected": -0.9755443334579468, "loss": 4.7912, "rewards/accuracies": 0.25, "rewards/chosen": -11.795099258422852, "rewards/margins": -2.0396554470062256, "rewards/rejected": -9.755443572998047, "step": 515 }, { "epoch": 0.07026143790849673, "grad_norm": 65.53536927526527, "learning_rate": 5.616326530612244e-07, "logits/chosen": -4.666674613952637, "logits/rejected": -4.059658527374268, "logps/chosen": -1.0243947505950928, "logps/rejected": -0.9925739765167236, "loss": 5.2659, "rewards/accuracies": 0.5, "rewards/chosen": -10.24394702911377, "rewards/margins": -0.31820738315582275, "rewards/rejected": -9.925739288330078, "step": 516 }, { "epoch": 0.07039760348583878, "grad_norm": 48.105296664947865, "learning_rate": 5.627210884353741e-07, "logits/chosen": -1.4204787015914917, "logits/rejected": -2.3813610076904297, "logps/chosen": -1.6692068576812744, "logps/rejected": -1.4618620872497559, "loss": 4.3731, "rewards/accuracies": 0.25, "rewards/chosen": -16.69207000732422, "rewards/margins": -2.0734477043151855, "rewards/rejected": -14.618620872497559, "step": 517 }, { "epoch": 0.07053376906318083, "grad_norm": 47.101382086919, "learning_rate": 5.638095238095238e-07, "logits/chosen": -3.296934127807617, "logits/rejected": -2.1067724227905273, "logps/chosen": -1.3198870420455933, "logps/rejected": -1.0719776153564453, "loss": 4.5646, "rewards/accuracies": 0.5, "rewards/chosen": -13.198869705200195, "rewards/margins": -2.479094982147217, "rewards/rejected": -10.719775199890137, "step": 518 }, { "epoch": 0.07066993464052287, "grad_norm": 53.27673088404286, "learning_rate": 5.648979591836735e-07, "logits/chosen": -4.094464302062988, "logits/rejected": -4.086215019226074, "logps/chosen": -0.779076337814331, "logps/rejected": -0.6638896465301514, "loss": 4.8983, "rewards/accuracies": 0.0, "rewards/chosen": -7.790763854980469, "rewards/margins": -1.1518672704696655, "rewards/rejected": -6.638896465301514, "step": 519 }, { "epoch": 0.07080610021786492, "grad_norm": 53.04887052392134, "learning_rate": 5.659863945578231e-07, "logits/chosen": -3.440378427505493, "logits/rejected": -3.5102202892303467, "logps/chosen": -1.3583707809448242, "logps/rejected": -0.9817022681236267, "loss": 4.6782, "rewards/accuracies": 0.25, "rewards/chosen": -13.583707809448242, "rewards/margins": -3.7666847705841064, "rewards/rejected": -9.817022323608398, "step": 520 }, { "epoch": 0.07094226579520697, "grad_norm": 46.50922088523107, "learning_rate": 5.670748299319728e-07, "logits/chosen": -3.4665215015411377, "logits/rejected": -2.627963066101074, "logps/chosen": -1.4215099811553955, "logps/rejected": -1.7134976387023926, "loss": 4.3753, "rewards/accuracies": 1.0, "rewards/chosen": -14.215099334716797, "rewards/margins": 2.9198765754699707, "rewards/rejected": -17.13497543334961, "step": 521 }, { "epoch": 0.07107843137254902, "grad_norm": 40.09358103728764, "learning_rate": 5.681632653061224e-07, "logits/chosen": -3.9202537536621094, "logits/rejected": -2.3913755416870117, "logps/chosen": -0.7618062496185303, "logps/rejected": -1.132534384727478, "loss": 4.0472, "rewards/accuracies": 1.0, "rewards/chosen": -7.618062973022461, "rewards/margins": 3.7072811126708984, "rewards/rejected": -11.32534408569336, "step": 522 }, { "epoch": 0.07121459694989107, "grad_norm": 44.66995740132622, "learning_rate": 5.692517006802721e-07, "logits/chosen": -3.7018542289733887, "logits/rejected": -3.556548833847046, "logps/chosen": -1.0133777856826782, "logps/rejected": -0.8385543823242188, "loss": 4.4956, "rewards/accuracies": 0.0, "rewards/chosen": -10.133777618408203, "rewards/margins": -1.748234748840332, "rewards/rejected": -8.385543823242188, "step": 523 }, { "epoch": 0.07135076252723312, "grad_norm": 42.594198792241826, "learning_rate": 5.703401360544217e-07, "logits/chosen": -2.858088970184326, "logits/rejected": -1.4272640943527222, "logps/chosen": -1.38235342502594, "logps/rejected": -2.3508710861206055, "loss": 4.3528, "rewards/accuracies": 0.75, "rewards/chosen": -13.82353401184082, "rewards/margins": 9.68517780303955, "rewards/rejected": -23.508712768554688, "step": 524 }, { "epoch": 0.07148692810457516, "grad_norm": 62.44522888793085, "learning_rate": 5.714285714285714e-07, "logits/chosen": -2.6862549781799316, "logits/rejected": -2.7906336784362793, "logps/chosen": -1.2222659587860107, "logps/rejected": -1.3346775770187378, "loss": 5.0464, "rewards/accuracies": 0.5, "rewards/chosen": -12.22265911102295, "rewards/margins": 1.1241161823272705, "rewards/rejected": -13.346776008605957, "step": 525 }, { "epoch": 0.07162309368191722, "grad_norm": 44.63890258207661, "learning_rate": 5.72517006802721e-07, "logits/chosen": -2.8498058319091797, "logits/rejected": -1.588024616241455, "logps/chosen": -1.1915925741195679, "logps/rejected": -1.8604580163955688, "loss": 4.2729, "rewards/accuracies": 0.75, "rewards/chosen": -11.915925979614258, "rewards/margins": 6.688653469085693, "rewards/rejected": -18.60457992553711, "step": 526 }, { "epoch": 0.07175925925925926, "grad_norm": 80.20369953965832, "learning_rate": 5.736054421768707e-07, "logits/chosen": -2.0794918537139893, "logits/rejected": -2.626549243927002, "logps/chosen": -1.1030290126800537, "logps/rejected": -1.2535693645477295, "loss": 4.0124, "rewards/accuracies": 1.0, "rewards/chosen": -11.030289649963379, "rewards/margins": 1.5054036378860474, "rewards/rejected": -12.535694122314453, "step": 527 }, { "epoch": 0.0718954248366013, "grad_norm": 45.349301936562675, "learning_rate": 5.746938775510203e-07, "logits/chosen": -2.6132538318634033, "logits/rejected": -2.2892906665802, "logps/chosen": -1.1982331275939941, "logps/rejected": -1.046767234802246, "loss": 4.7384, "rewards/accuracies": 0.25, "rewards/chosen": -11.982330322265625, "rewards/margins": -1.5146574974060059, "rewards/rejected": -10.467673301696777, "step": 528 }, { "epoch": 0.07203159041394336, "grad_norm": 47.48912264063065, "learning_rate": 5.757823129251701e-07, "logits/chosen": -3.647869348526001, "logits/rejected": -3.366572141647339, "logps/chosen": -1.0528995990753174, "logps/rejected": -1.3496793508529663, "loss": 4.3818, "rewards/accuracies": 0.75, "rewards/chosen": -10.528996467590332, "rewards/margins": 2.9677975177764893, "rewards/rejected": -13.496793746948242, "step": 529 }, { "epoch": 0.0721677559912854, "grad_norm": 36.09975103518595, "learning_rate": 5.768707482993198e-07, "logits/chosen": -3.042247772216797, "logits/rejected": -2.245488405227661, "logps/chosen": -1.0453712940216064, "logps/rejected": -1.561471939086914, "loss": 4.1794, "rewards/accuracies": 1.0, "rewards/chosen": -10.453712463378906, "rewards/margins": 5.161006927490234, "rewards/rejected": -15.61471939086914, "step": 530 }, { "epoch": 0.07230392156862746, "grad_norm": 53.95801812969083, "learning_rate": 5.779591836734694e-07, "logits/chosen": -3.0535759925842285, "logits/rejected": -3.4696521759033203, "logps/chosen": -0.9643874168395996, "logps/rejected": -1.2675821781158447, "loss": 4.5439, "rewards/accuracies": 1.0, "rewards/chosen": -9.643874168395996, "rewards/margins": 3.031947374343872, "rewards/rejected": -12.675821304321289, "step": 531 }, { "epoch": 0.0724400871459695, "grad_norm": 39.32530441712375, "learning_rate": 5.790476190476191e-07, "logits/chosen": -2.213926315307617, "logits/rejected": -1.2346551418304443, "logps/chosen": -1.4523228406906128, "logps/rejected": -2.0848255157470703, "loss": 4.0901, "rewards/accuracies": 0.75, "rewards/chosen": -14.523228645324707, "rewards/margins": 6.32502555847168, "rewards/rejected": -20.848255157470703, "step": 532 }, { "epoch": 0.07257625272331154, "grad_norm": 56.21513376812006, "learning_rate": 5.801360544217686e-07, "logits/chosen": -2.9332780838012695, "logits/rejected": -2.846619129180908, "logps/chosen": -1.2396957874298096, "logps/rejected": -1.5473666191101074, "loss": 5.32, "rewards/accuracies": 0.75, "rewards/chosen": -12.396957397460938, "rewards/margins": 3.0767085552215576, "rewards/rejected": -15.47366714477539, "step": 533 }, { "epoch": 0.0727124183006536, "grad_norm": 46.01874032070396, "learning_rate": 5.812244897959184e-07, "logits/chosen": -3.1870458126068115, "logits/rejected": -3.1699070930480957, "logps/chosen": -0.907354474067688, "logps/rejected": -1.159095048904419, "loss": 4.603, "rewards/accuracies": 0.75, "rewards/chosen": -9.0735445022583, "rewards/margins": 2.517406702041626, "rewards/rejected": -11.590951919555664, "step": 534 }, { "epoch": 0.07284858387799564, "grad_norm": 45.54360247026886, "learning_rate": 5.82312925170068e-07, "logits/chosen": -3.084937572479248, "logits/rejected": -2.024446964263916, "logps/chosen": -1.00600266456604, "logps/rejected": -1.5856248140335083, "loss": 4.3489, "rewards/accuracies": 0.75, "rewards/chosen": -10.060026168823242, "rewards/margins": 5.796221733093262, "rewards/rejected": -15.85624885559082, "step": 535 }, { "epoch": 0.07298474945533769, "grad_norm": 47.89850356588516, "learning_rate": 5.834013605442177e-07, "logits/chosen": -2.9349966049194336, "logits/rejected": -2.188476085662842, "logps/chosen": -1.1325953006744385, "logps/rejected": -1.3625712394714355, "loss": 4.8539, "rewards/accuracies": 0.75, "rewards/chosen": -11.325952529907227, "rewards/margins": 2.29975962638855, "rewards/rejected": -13.625712394714355, "step": 536 }, { "epoch": 0.07312091503267974, "grad_norm": 40.401818799397155, "learning_rate": 5.844897959183673e-07, "logits/chosen": -2.659346103668213, "logits/rejected": -1.6673572063446045, "logps/chosen": -1.0310468673706055, "logps/rejected": -1.176596999168396, "loss": 4.5618, "rewards/accuracies": 0.75, "rewards/chosen": -10.310469627380371, "rewards/margins": 1.455500602722168, "rewards/rejected": -11.765970230102539, "step": 537 }, { "epoch": 0.07325708061002179, "grad_norm": 43.33854282400204, "learning_rate": 5.85578231292517e-07, "logits/chosen": -1.8851029872894287, "logits/rejected": -1.9067118167877197, "logps/chosen": -0.908757209777832, "logps/rejected": -1.255047082901001, "loss": 4.5164, "rewards/accuracies": 1.0, "rewards/chosen": -9.08757209777832, "rewards/margins": 3.4628992080688477, "rewards/rejected": -12.550471305847168, "step": 538 }, { "epoch": 0.07339324618736383, "grad_norm": 43.78297520058338, "learning_rate": 5.866666666666666e-07, "logits/chosen": -3.7002968788146973, "logits/rejected": -3.242711305618286, "logps/chosen": -1.1328978538513184, "logps/rejected": -1.4458469152450562, "loss": 4.7127, "rewards/accuracies": 1.0, "rewards/chosen": -11.3289794921875, "rewards/margins": 3.1294898986816406, "rewards/rejected": -14.45846939086914, "step": 539 }, { "epoch": 0.07352941176470588, "grad_norm": 59.82716166022771, "learning_rate": 5.877551020408163e-07, "logits/chosen": -2.583240270614624, "logits/rejected": -3.1657462120056152, "logps/chosen": -1.3673057556152344, "logps/rejected": -1.1530280113220215, "loss": 5.188, "rewards/accuracies": 0.25, "rewards/chosen": -13.673057556152344, "rewards/margins": -2.1427786350250244, "rewards/rejected": -11.530279159545898, "step": 540 }, { "epoch": 0.07366557734204793, "grad_norm": 43.527967613888016, "learning_rate": 5.888435374149659e-07, "logits/chosen": -2.3948464393615723, "logits/rejected": -2.453195333480835, "logps/chosen": -1.2009484767913818, "logps/rejected": -1.1729698181152344, "loss": 4.4634, "rewards/accuracies": 0.25, "rewards/chosen": -12.009485244750977, "rewards/margins": -0.2797858715057373, "rewards/rejected": -11.72969913482666, "step": 541 }, { "epoch": 0.07380174291938998, "grad_norm": 42.59137822994886, "learning_rate": 5.899319727891156e-07, "logits/chosen": -2.440309524536133, "logits/rejected": -1.603435754776001, "logps/chosen": -1.046288013458252, "logps/rejected": -1.2866750955581665, "loss": 4.297, "rewards/accuracies": 0.75, "rewards/chosen": -10.462881088256836, "rewards/margins": 2.403870105743408, "rewards/rejected": -12.866750717163086, "step": 542 }, { "epoch": 0.07393790849673203, "grad_norm": 42.29635124302855, "learning_rate": 5.910204081632652e-07, "logits/chosen": -2.2993502616882324, "logits/rejected": -2.435619354248047, "logps/chosen": -0.9456726908683777, "logps/rejected": -0.9917744398117065, "loss": 4.7685, "rewards/accuracies": 0.5, "rewards/chosen": -9.456727027893066, "rewards/margins": 0.4610171318054199, "rewards/rejected": -9.917743682861328, "step": 543 }, { "epoch": 0.07407407407407407, "grad_norm": 48.51328729075138, "learning_rate": 5.921088435374149e-07, "logits/chosen": -2.1309900283813477, "logits/rejected": -3.0422372817993164, "logps/chosen": -1.6436516046524048, "logps/rejected": -1.1252344846725464, "loss": 4.2011, "rewards/accuracies": 0.25, "rewards/chosen": -16.43651580810547, "rewards/margins": -5.184171676635742, "rewards/rejected": -11.252344131469727, "step": 544 }, { "epoch": 0.07421023965141613, "grad_norm": 65.75172685448237, "learning_rate": 5.931972789115646e-07, "logits/chosen": -3.520249843597412, "logits/rejected": -2.576744556427002, "logps/chosen": -0.9151773452758789, "logps/rejected": -1.057251214981079, "loss": 4.818, "rewards/accuracies": 0.5, "rewards/chosen": -9.151773452758789, "rewards/margins": 1.4207377433776855, "rewards/rejected": -10.572511672973633, "step": 545 }, { "epoch": 0.07434640522875817, "grad_norm": 56.12494558449554, "learning_rate": 5.942857142857143e-07, "logits/chosen": -2.6616082191467285, "logits/rejected": -2.229566812515259, "logps/chosen": -1.3175793886184692, "logps/rejected": -1.4426337480545044, "loss": 4.7484, "rewards/accuracies": 0.5, "rewards/chosen": -13.175793647766113, "rewards/margins": 1.2505439519882202, "rewards/rejected": -14.426337242126465, "step": 546 }, { "epoch": 0.07448257080610021, "grad_norm": 54.212132084712835, "learning_rate": 5.95374149659864e-07, "logits/chosen": -1.2937507629394531, "logits/rejected": -2.2965714931488037, "logps/chosen": -1.6116535663604736, "logps/rejected": -1.9092856645584106, "loss": 5.1724, "rewards/accuracies": 0.5, "rewards/chosen": -16.116535186767578, "rewards/margins": 2.9763214588165283, "rewards/rejected": -19.092857360839844, "step": 547 }, { "epoch": 0.07461873638344227, "grad_norm": 43.482388690947076, "learning_rate": 5.964625850340136e-07, "logits/chosen": -3.2796988487243652, "logits/rejected": -1.8712987899780273, "logps/chosen": -0.8769482374191284, "logps/rejected": -1.241309642791748, "loss": 4.7835, "rewards/accuracies": 1.0, "rewards/chosen": -8.769482612609863, "rewards/margins": 3.6436145305633545, "rewards/rejected": -12.413097381591797, "step": 548 }, { "epoch": 0.07475490196078431, "grad_norm": 41.34192208715584, "learning_rate": 5.975510204081633e-07, "logits/chosen": -3.2644505500793457, "logits/rejected": -2.041151285171509, "logps/chosen": -1.0843509435653687, "logps/rejected": -1.6412169933319092, "loss": 4.3628, "rewards/accuracies": 0.5, "rewards/chosen": -10.843509674072266, "rewards/margins": 5.568659782409668, "rewards/rejected": -16.412168502807617, "step": 549 }, { "epoch": 0.07489106753812637, "grad_norm": 49.36974972809444, "learning_rate": 5.986394557823129e-07, "logits/chosen": -3.4046130180358887, "logits/rejected": -2.7451353073120117, "logps/chosen": -1.1363427639007568, "logps/rejected": -1.2418558597564697, "loss": 4.1305, "rewards/accuracies": 0.25, "rewards/chosen": -11.363428115844727, "rewards/margins": 1.055131435394287, "rewards/rejected": -12.418558120727539, "step": 550 }, { "epoch": 0.07502723311546841, "grad_norm": 54.399646991910316, "learning_rate": 5.997278911564626e-07, "logits/chosen": -4.006908893585205, "logits/rejected": -4.397706985473633, "logps/chosen": -0.929789662361145, "logps/rejected": -1.1951799392700195, "loss": 4.8395, "rewards/accuracies": 0.5, "rewards/chosen": -9.297896385192871, "rewards/margins": 2.653902530670166, "rewards/rejected": -11.951799392700195, "step": 551 }, { "epoch": 0.07516339869281045, "grad_norm": 50.144936632855455, "learning_rate": 6.008163265306122e-07, "logits/chosen": -3.5099809169769287, "logits/rejected": -2.379624366760254, "logps/chosen": -0.9581728577613831, "logps/rejected": -1.0792137384414673, "loss": 4.8317, "rewards/accuracies": 0.5, "rewards/chosen": -9.581727981567383, "rewards/margins": 1.2104097604751587, "rewards/rejected": -10.79213809967041, "step": 552 }, { "epoch": 0.07529956427015251, "grad_norm": 47.904798212407826, "learning_rate": 6.019047619047619e-07, "logits/chosen": -2.6384267807006836, "logits/rejected": -2.4540510177612305, "logps/chosen": -1.073359727859497, "logps/rejected": -1.1735190153121948, "loss": 4.6375, "rewards/accuracies": 0.5, "rewards/chosen": -10.733596801757812, "rewards/margins": 1.0015934705734253, "rewards/rejected": -11.735189437866211, "step": 553 }, { "epoch": 0.07543572984749455, "grad_norm": 41.95315096993988, "learning_rate": 6.029931972789115e-07, "logits/chosen": -1.925865650177002, "logits/rejected": -1.6063957214355469, "logps/chosen": -1.1238300800323486, "logps/rejected": -1.1565053462982178, "loss": 3.9817, "rewards/accuracies": 0.5, "rewards/chosen": -11.238300323486328, "rewards/margins": 0.3267536163330078, "rewards/rejected": -11.565053939819336, "step": 554 }, { "epoch": 0.0755718954248366, "grad_norm": 46.756597965995006, "learning_rate": 6.040816326530612e-07, "logits/chosen": -2.179116725921631, "logits/rejected": -1.8805646896362305, "logps/chosen": -1.0722317695617676, "logps/rejected": -1.037813663482666, "loss": 4.8022, "rewards/accuracies": 0.25, "rewards/chosen": -10.72231674194336, "rewards/margins": -0.3441808223724365, "rewards/rejected": -10.37813663482666, "step": 555 }, { "epoch": 0.07570806100217865, "grad_norm": 40.313886577019375, "learning_rate": 6.051700680272109e-07, "logits/chosen": -3.671840190887451, "logits/rejected": -3.199397087097168, "logps/chosen": -0.935076117515564, "logps/rejected": -1.1351709365844727, "loss": 4.4952, "rewards/accuracies": 0.75, "rewards/chosen": -9.350761413574219, "rewards/margins": 2.0009491443634033, "rewards/rejected": -11.351710319519043, "step": 556 }, { "epoch": 0.0758442265795207, "grad_norm": 45.858084850049366, "learning_rate": 6.062585034013606e-07, "logits/chosen": -1.701064944267273, "logits/rejected": -2.795102596282959, "logps/chosen": -1.633382797241211, "logps/rejected": -1.0506991147994995, "loss": 4.0834, "rewards/accuracies": 0.25, "rewards/chosen": -16.33382797241211, "rewards/margins": -5.826837062835693, "rewards/rejected": -10.506990432739258, "step": 557 }, { "epoch": 0.07598039215686274, "grad_norm": 43.745754811721184, "learning_rate": 6.073469387755101e-07, "logits/chosen": -4.603307247161865, "logits/rejected": -2.691140651702881, "logps/chosen": -1.0311882495880127, "logps/rejected": -1.3798604011535645, "loss": 4.4201, "rewards/accuracies": 0.75, "rewards/chosen": -10.311882019042969, "rewards/margins": 3.486722707748413, "rewards/rejected": -13.798604965209961, "step": 558 }, { "epoch": 0.0761165577342048, "grad_norm": 43.99479877839875, "learning_rate": 6.084353741496598e-07, "logits/chosen": -2.004666566848755, "logits/rejected": -3.7684807777404785, "logps/chosen": -0.912187397480011, "logps/rejected": -0.9345219135284424, "loss": 4.6838, "rewards/accuracies": 0.5, "rewards/chosen": -9.12187385559082, "rewards/margins": 0.22334527969360352, "rewards/rejected": -9.345219612121582, "step": 559 }, { "epoch": 0.07625272331154684, "grad_norm": 56.76280155562993, "learning_rate": 6.095238095238094e-07, "logits/chosen": -3.3215222358703613, "logits/rejected": -2.0298211574554443, "logps/chosen": -1.0113604068756104, "logps/rejected": -1.4177435636520386, "loss": 4.6418, "rewards/accuracies": 0.75, "rewards/chosen": -10.113603591918945, "rewards/margins": 4.063831806182861, "rewards/rejected": -14.177435874938965, "step": 560 }, { "epoch": 0.0763888888888889, "grad_norm": 43.712306320209926, "learning_rate": 6.106122448979592e-07, "logits/chosen": -1.7523083686828613, "logits/rejected": -2.540046215057373, "logps/chosen": -1.2748870849609375, "logps/rejected": -1.1419124603271484, "loss": 4.9561, "rewards/accuracies": 0.25, "rewards/chosen": -12.748870849609375, "rewards/margins": -1.3297455310821533, "rewards/rejected": -11.419124603271484, "step": 561 }, { "epoch": 0.07652505446623094, "grad_norm": 39.34105355999166, "learning_rate": 6.117006802721088e-07, "logits/chosen": -2.174428939819336, "logits/rejected": -3.996283769607544, "logps/chosen": -1.127501130104065, "logps/rejected": -0.8990464210510254, "loss": 4.2886, "rewards/accuracies": 0.25, "rewards/chosen": -11.27501106262207, "rewards/margins": -2.2845466136932373, "rewards/rejected": -8.990464210510254, "step": 562 }, { "epoch": 0.07666122004357298, "grad_norm": 47.27422737830784, "learning_rate": 6.127891156462585e-07, "logits/chosen": -2.7595884799957275, "logits/rejected": -3.3444643020629883, "logps/chosen": -0.9465582966804504, "logps/rejected": -1.205259084701538, "loss": 4.6209, "rewards/accuracies": 0.75, "rewards/chosen": -9.465582847595215, "rewards/margins": 2.587008476257324, "rewards/rejected": -12.052591323852539, "step": 563 }, { "epoch": 0.07679738562091504, "grad_norm": 39.96083324348169, "learning_rate": 6.138775510204081e-07, "logits/chosen": -3.7806780338287354, "logits/rejected": -2.898282289505005, "logps/chosen": -0.8692531585693359, "logps/rejected": -0.9604839086532593, "loss": 4.0891, "rewards/accuracies": 0.5, "rewards/chosen": -8.69253158569336, "rewards/margins": 0.912307620048523, "rewards/rejected": -9.604839324951172, "step": 564 }, { "epoch": 0.07693355119825708, "grad_norm": 47.05891396925553, "learning_rate": 6.149659863945578e-07, "logits/chosen": -1.7988016605377197, "logits/rejected": -1.2815321683883667, "logps/chosen": -1.27614426612854, "logps/rejected": -1.4418425559997559, "loss": 4.7159, "rewards/accuracies": 0.75, "rewards/chosen": -12.761442184448242, "rewards/margins": 1.6569828987121582, "rewards/rejected": -14.418425559997559, "step": 565 }, { "epoch": 0.07706971677559912, "grad_norm": 53.73971184016499, "learning_rate": 6.160544217687075e-07, "logits/chosen": -2.8820126056671143, "logits/rejected": -1.7666704654693604, "logps/chosen": -1.0985972881317139, "logps/rejected": -1.4381980895996094, "loss": 4.743, "rewards/accuracies": 0.75, "rewards/chosen": -10.985973358154297, "rewards/margins": 3.396008253097534, "rewards/rejected": -14.38198184967041, "step": 566 }, { "epoch": 0.07720588235294118, "grad_norm": 44.04197384714748, "learning_rate": 6.171428571428571e-07, "logits/chosen": -2.249223232269287, "logits/rejected": -2.3329615592956543, "logps/chosen": -0.9925230741500854, "logps/rejected": -1.3587489128112793, "loss": 4.211, "rewards/accuracies": 0.75, "rewards/chosen": -9.925230026245117, "rewards/margins": 3.6622586250305176, "rewards/rejected": -13.587489128112793, "step": 567 }, { "epoch": 0.07734204793028322, "grad_norm": 38.442531743044796, "learning_rate": 6.182312925170068e-07, "logits/chosen": -2.5031373500823975, "logits/rejected": -2.8118319511413574, "logps/chosen": -1.5199079513549805, "logps/rejected": -1.2774145603179932, "loss": 4.9204, "rewards/accuracies": 0.5, "rewards/chosen": -15.199079513549805, "rewards/margins": -2.4249343872070312, "rewards/rejected": -12.774145126342773, "step": 568 }, { "epoch": 0.07747821350762528, "grad_norm": 44.696799778767506, "learning_rate": 6.193197278911564e-07, "logits/chosen": -1.7371604442596436, "logits/rejected": -2.1293768882751465, "logps/chosen": -0.9531384706497192, "logps/rejected": -1.4922828674316406, "loss": 4.0725, "rewards/accuracies": 0.75, "rewards/chosen": -9.53138542175293, "rewards/margins": 5.391443252563477, "rewards/rejected": -14.922828674316406, "step": 569 }, { "epoch": 0.07761437908496732, "grad_norm": 49.00582638733023, "learning_rate": 6.204081632653061e-07, "logits/chosen": -2.266124725341797, "logits/rejected": -2.743696928024292, "logps/chosen": -0.9483022689819336, "logps/rejected": -1.517376184463501, "loss": 4.6759, "rewards/accuracies": 0.5, "rewards/chosen": -9.483022689819336, "rewards/margins": 5.690740585327148, "rewards/rejected": -15.173762321472168, "step": 570 }, { "epoch": 0.07775054466230936, "grad_norm": 47.33793842817806, "learning_rate": 6.214965986394557e-07, "logits/chosen": -2.44290828704834, "logits/rejected": -2.327439546585083, "logps/chosen": -1.1810088157653809, "logps/rejected": -1.180440902709961, "loss": 4.7275, "rewards/accuracies": 0.5, "rewards/chosen": -11.810087203979492, "rewards/margins": -0.005678892135620117, "rewards/rejected": -11.80440902709961, "step": 571 }, { "epoch": 0.07788671023965142, "grad_norm": 48.151373140757414, "learning_rate": 6.225850340136055e-07, "logits/chosen": -3.924765110015869, "logits/rejected": -2.7104251384735107, "logps/chosen": -0.8243378400802612, "logps/rejected": -0.9799274802207947, "loss": 4.1381, "rewards/accuracies": 0.5, "rewards/chosen": -8.243378639221191, "rewards/margins": 1.5558961629867554, "rewards/rejected": -9.799274444580078, "step": 572 }, { "epoch": 0.07802287581699346, "grad_norm": 44.106216943961556, "learning_rate": 6.236734693877551e-07, "logits/chosen": -2.1748886108398438, "logits/rejected": -2.3103883266448975, "logps/chosen": -1.191980242729187, "logps/rejected": -1.1168222427368164, "loss": 4.9068, "rewards/accuracies": 0.25, "rewards/chosen": -11.919801712036133, "rewards/margins": -0.7515791654586792, "rewards/rejected": -11.16822338104248, "step": 573 }, { "epoch": 0.07815904139433551, "grad_norm": 46.093596456626365, "learning_rate": 6.247619047619048e-07, "logits/chosen": -2.0175912380218506, "logits/rejected": -1.9599215984344482, "logps/chosen": -1.259394884109497, "logps/rejected": -1.5934698581695557, "loss": 4.6704, "rewards/accuracies": 1.0, "rewards/chosen": -12.593948364257812, "rewards/margins": 3.3407492637634277, "rewards/rejected": -15.934698104858398, "step": 574 }, { "epoch": 0.07829520697167756, "grad_norm": 41.136694340989266, "learning_rate": 6.258503401360544e-07, "logits/chosen": -2.1196842193603516, "logits/rejected": -1.332553505897522, "logps/chosen": -1.5293503999710083, "logps/rejected": -1.452556848526001, "loss": 4.2727, "rewards/accuracies": 0.5, "rewards/chosen": -15.29350471496582, "rewards/margins": -0.7679362297058105, "rewards/rejected": -14.525568008422852, "step": 575 }, { "epoch": 0.0784313725490196, "grad_norm": 48.18608372849713, "learning_rate": 6.26938775510204e-07, "logits/chosen": -3.4053733348846436, "logits/rejected": -2.249399423599243, "logps/chosen": -1.187802791595459, "logps/rejected": -1.5030646324157715, "loss": 4.1786, "rewards/accuracies": 0.75, "rewards/chosen": -11.878026962280273, "rewards/margins": 3.1526191234588623, "rewards/rejected": -15.030646324157715, "step": 576 }, { "epoch": 0.07856753812636165, "grad_norm": 49.45520749866785, "learning_rate": 6.280272108843537e-07, "logits/chosen": -2.1888279914855957, "logits/rejected": -2.373161792755127, "logps/chosen": -1.8322969675064087, "logps/rejected": -1.4023537635803223, "loss": 5.0274, "rewards/accuracies": 0.5, "rewards/chosen": -18.322969436645508, "rewards/margins": -4.299431800842285, "rewards/rejected": -14.023536682128906, "step": 577 }, { "epoch": 0.0787037037037037, "grad_norm": 46.838717302092896, "learning_rate": 6.291156462585034e-07, "logits/chosen": -2.863309860229492, "logits/rejected": -2.748157024383545, "logps/chosen": -1.0235795974731445, "logps/rejected": -1.026947259902954, "loss": 4.4133, "rewards/accuracies": 0.5, "rewards/chosen": -10.235795974731445, "rewards/margins": 0.033676743507385254, "rewards/rejected": -10.269472122192383, "step": 578 }, { "epoch": 0.07883986928104575, "grad_norm": 51.62842616916561, "learning_rate": 6.30204081632653e-07, "logits/chosen": -3.5555646419525146, "logits/rejected": -1.2084641456604004, "logps/chosen": -1.4692548513412476, "logps/rejected": -1.762378215789795, "loss": 4.5166, "rewards/accuracies": 0.75, "rewards/chosen": -14.692547798156738, "rewards/margins": 2.93123459815979, "rewards/rejected": -17.623783111572266, "step": 579 }, { "epoch": 0.0789760348583878, "grad_norm": 60.75186483378536, "learning_rate": 6.312925170068027e-07, "logits/chosen": -1.7952966690063477, "logits/rejected": -2.805755853652954, "logps/chosen": -1.3675873279571533, "logps/rejected": -1.246650218963623, "loss": 5.1494, "rewards/accuracies": 0.25, "rewards/chosen": -13.675874710083008, "rewards/margins": -1.2093708515167236, "rewards/rejected": -12.466503143310547, "step": 580 }, { "epoch": 0.07911220043572985, "grad_norm": 39.789979410666824, "learning_rate": 6.323809523809523e-07, "logits/chosen": -1.9663374423980713, "logits/rejected": -2.4888558387756348, "logps/chosen": -1.0883268117904663, "logps/rejected": -1.1395796537399292, "loss": 4.8861, "rewards/accuracies": 0.75, "rewards/chosen": -10.883268356323242, "rewards/margins": 0.5125281810760498, "rewards/rejected": -11.395795822143555, "step": 581 }, { "epoch": 0.07924836601307189, "grad_norm": 45.80266511954022, "learning_rate": 6.33469387755102e-07, "logits/chosen": -2.9404304027557373, "logits/rejected": -3.0226383209228516, "logps/chosen": -1.2058212757110596, "logps/rejected": -1.457509994506836, "loss": 5.0813, "rewards/accuracies": 0.75, "rewards/chosen": -12.058212280273438, "rewards/margins": 2.5168871879577637, "rewards/rejected": -14.57509994506836, "step": 582 }, { "epoch": 0.07938453159041395, "grad_norm": 45.798708039518324, "learning_rate": 6.345578231292518e-07, "logits/chosen": -3.190579891204834, "logits/rejected": -2.5421926975250244, "logps/chosen": -1.029565453529358, "logps/rejected": -1.1365678310394287, "loss": 4.4425, "rewards/accuracies": 0.5, "rewards/chosen": -10.295655250549316, "rewards/margins": 1.0700243711471558, "rewards/rejected": -11.365679740905762, "step": 583 }, { "epoch": 0.07952069716775599, "grad_norm": 44.729564353282996, "learning_rate": 6.356462585034013e-07, "logits/chosen": -3.2205543518066406, "logits/rejected": -2.5102734565734863, "logps/chosen": -1.357964038848877, "logps/rejected": -1.6317024230957031, "loss": 4.1065, "rewards/accuracies": 0.75, "rewards/chosen": -13.57964038848877, "rewards/margins": 2.7373838424682617, "rewards/rejected": -16.31702423095703, "step": 584 }, { "epoch": 0.07965686274509803, "grad_norm": 44.77401675139446, "learning_rate": 6.36734693877551e-07, "logits/chosen": -2.950859308242798, "logits/rejected": -3.7898221015930176, "logps/chosen": -1.0918176174163818, "logps/rejected": -1.1528356075286865, "loss": 4.3049, "rewards/accuracies": 0.5, "rewards/chosen": -10.91817569732666, "rewards/margins": 0.6101806163787842, "rewards/rejected": -11.528356552124023, "step": 585 }, { "epoch": 0.07979302832244009, "grad_norm": 47.88916309895052, "learning_rate": 6.378231292517006e-07, "logits/chosen": -2.9758925437927246, "logits/rejected": -2.6580958366394043, "logps/chosen": -1.2605466842651367, "logps/rejected": -1.3192414045333862, "loss": 4.7075, "rewards/accuracies": 0.75, "rewards/chosen": -12.605466842651367, "rewards/margins": 0.5869479179382324, "rewards/rejected": -13.192414283752441, "step": 586 }, { "epoch": 0.07992919389978213, "grad_norm": 50.69025262153446, "learning_rate": 6.389115646258503e-07, "logits/chosen": -3.985750198364258, "logits/rejected": -3.3366851806640625, "logps/chosen": -0.83809494972229, "logps/rejected": -0.9132397770881653, "loss": 4.1274, "rewards/accuracies": 0.75, "rewards/chosen": -8.380949020385742, "rewards/margins": 0.7514486312866211, "rewards/rejected": -9.13239860534668, "step": 587 }, { "epoch": 0.08006535947712418, "grad_norm": 52.87468881519949, "learning_rate": 6.4e-07, "logits/chosen": -3.160942554473877, "logits/rejected": -2.0663280487060547, "logps/chosen": -1.2791564464569092, "logps/rejected": -1.3609875440597534, "loss": 4.5632, "rewards/accuracies": 0.75, "rewards/chosen": -12.79156494140625, "rewards/margins": 0.8183104991912842, "rewards/rejected": -13.609875679016113, "step": 588 }, { "epoch": 0.08020152505446623, "grad_norm": 50.39739072516855, "learning_rate": 6.410884353741497e-07, "logits/chosen": -2.665773391723633, "logits/rejected": -2.2744333744049072, "logps/chosen": -1.3363343477249146, "logps/rejected": -1.3927106857299805, "loss": 4.9347, "rewards/accuracies": 0.5, "rewards/chosen": -13.363343238830566, "rewards/margins": 0.5637643337249756, "rewards/rejected": -13.927106857299805, "step": 589 }, { "epoch": 0.08033769063180828, "grad_norm": 47.19267353907659, "learning_rate": 6.421768707482993e-07, "logits/chosen": -1.8304202556610107, "logits/rejected": -2.1784238815307617, "logps/chosen": -1.1479711532592773, "logps/rejected": -1.330281376838684, "loss": 4.4437, "rewards/accuracies": 0.75, "rewards/chosen": -11.479711532592773, "rewards/margins": 1.8231022357940674, "rewards/rejected": -13.302813529968262, "step": 590 }, { "epoch": 0.08047385620915033, "grad_norm": 43.975350629103495, "learning_rate": 6.43265306122449e-07, "logits/chosen": -1.3222476243972778, "logits/rejected": -1.0123072862625122, "logps/chosen": -1.4118249416351318, "logps/rejected": -1.3707385063171387, "loss": 5.0176, "rewards/accuracies": 0.5, "rewards/chosen": -14.11824893951416, "rewards/margins": -0.41086292266845703, "rewards/rejected": -13.707386016845703, "step": 591 }, { "epoch": 0.08061002178649238, "grad_norm": 48.65571417480757, "learning_rate": 6.443537414965986e-07, "logits/chosen": -3.408067226409912, "logits/rejected": -1.6773698329925537, "logps/chosen": -0.7919538617134094, "logps/rejected": -1.171349048614502, "loss": 4.4747, "rewards/accuracies": 1.0, "rewards/chosen": -7.919538497924805, "rewards/margins": 3.7939531803131104, "rewards/rejected": -11.713491439819336, "step": 592 }, { "epoch": 0.08074618736383442, "grad_norm": 41.553542974275196, "learning_rate": 6.454421768707483e-07, "logits/chosen": -3.3986945152282715, "logits/rejected": -2.1686952114105225, "logps/chosen": -0.7907664775848389, "logps/rejected": -1.0912569761276245, "loss": 4.4695, "rewards/accuracies": 0.5, "rewards/chosen": -7.9076642990112305, "rewards/margins": 3.0049057006835938, "rewards/rejected": -10.912569999694824, "step": 593 }, { "epoch": 0.08088235294117647, "grad_norm": 43.82411779584038, "learning_rate": 6.465306122448979e-07, "logits/chosen": -3.7682509422302246, "logits/rejected": -1.8877925872802734, "logps/chosen": -1.45662522315979, "logps/rejected": -1.9482979774475098, "loss": 4.2309, "rewards/accuracies": 1.0, "rewards/chosen": -14.566252708435059, "rewards/margins": 4.9167280197143555, "rewards/rejected": -19.48297882080078, "step": 594 }, { "epoch": 0.08101851851851852, "grad_norm": 42.544132533178335, "learning_rate": 6.476190476190476e-07, "logits/chosen": -3.5360963344573975, "logits/rejected": -3.0905234813690186, "logps/chosen": -0.9950023889541626, "logps/rejected": -1.2738765478134155, "loss": 4.4094, "rewards/accuracies": 0.5, "rewards/chosen": -9.950023651123047, "rewards/margins": 2.7887420654296875, "rewards/rejected": -12.738765716552734, "step": 595 }, { "epoch": 0.08115468409586056, "grad_norm": 44.828004751200176, "learning_rate": 6.487074829931972e-07, "logits/chosen": -3.2545647621154785, "logits/rejected": -2.746997117996216, "logps/chosen": -1.220887303352356, "logps/rejected": -1.2467271089553833, "loss": 4.5996, "rewards/accuracies": 0.5, "rewards/chosen": -12.208873748779297, "rewards/margins": 0.25839662551879883, "rewards/rejected": -12.467270851135254, "step": 596 }, { "epoch": 0.08129084967320262, "grad_norm": 43.50626682298884, "learning_rate": 6.497959183673469e-07, "logits/chosen": -3.8778274059295654, "logits/rejected": -3.4082159996032715, "logps/chosen": -0.9326948523521423, "logps/rejected": -1.0492100715637207, "loss": 4.5342, "rewards/accuracies": 0.75, "rewards/chosen": -9.326948165893555, "rewards/margins": 1.1651530265808105, "rewards/rejected": -10.492101669311523, "step": 597 }, { "epoch": 0.08142701525054466, "grad_norm": 44.59749419734501, "learning_rate": 6.508843537414965e-07, "logits/chosen": -2.8062100410461426, "logits/rejected": -1.5085694789886475, "logps/chosen": -1.1097147464752197, "logps/rejected": -1.308516263961792, "loss": 4.736, "rewards/accuracies": 0.75, "rewards/chosen": -11.097147941589355, "rewards/margins": 1.9880144596099854, "rewards/rejected": -13.085162162780762, "step": 598 }, { "epoch": 0.08156318082788672, "grad_norm": 52.810366334852624, "learning_rate": 6.519727891156463e-07, "logits/chosen": -1.652611255645752, "logits/rejected": -1.7713758945465088, "logps/chosen": -1.5805513858795166, "logps/rejected": -1.4866876602172852, "loss": 4.5614, "rewards/accuracies": 0.25, "rewards/chosen": -15.805513381958008, "rewards/margins": -0.9386374950408936, "rewards/rejected": -14.866876602172852, "step": 599 }, { "epoch": 0.08169934640522876, "grad_norm": 41.760859061283654, "learning_rate": 6.53061224489796e-07, "logits/chosen": -3.1800832748413086, "logits/rejected": -1.2567379474639893, "logps/chosen": -1.1777392625808716, "logps/rejected": -1.2899768352508545, "loss": 4.5501, "rewards/accuracies": 0.5, "rewards/chosen": -11.777393341064453, "rewards/margins": 1.1223751306533813, "rewards/rejected": -12.899767875671387, "step": 600 }, { "epoch": 0.0818355119825708, "grad_norm": 56.62314879662598, "learning_rate": 6.541496598639456e-07, "logits/chosen": -2.701206684112549, "logits/rejected": -2.5864596366882324, "logps/chosen": -0.932152271270752, "logps/rejected": -1.0466785430908203, "loss": 4.1319, "rewards/accuracies": 0.75, "rewards/chosen": -9.321523666381836, "rewards/margins": 1.1452614068984985, "rewards/rejected": -10.466784477233887, "step": 601 }, { "epoch": 0.08197167755991286, "grad_norm": 44.51880462370142, "learning_rate": 6.552380952380951e-07, "logits/chosen": -3.3284997940063477, "logits/rejected": -1.1025123596191406, "logps/chosen": -1.0463488101959229, "logps/rejected": -1.6446454524993896, "loss": 4.0288, "rewards/accuracies": 1.0, "rewards/chosen": -10.46348762512207, "rewards/margins": 5.982967376708984, "rewards/rejected": -16.446455001831055, "step": 602 }, { "epoch": 0.0821078431372549, "grad_norm": 67.16230616140636, "learning_rate": 6.563265306122448e-07, "logits/chosen": -3.403555154800415, "logits/rejected": -1.404071569442749, "logps/chosen": -1.358604907989502, "logps/rejected": -1.7409112453460693, "loss": 4.4323, "rewards/accuracies": 0.75, "rewards/chosen": -13.586050033569336, "rewards/margins": 3.8230628967285156, "rewards/rejected": -17.40911293029785, "step": 603 }, { "epoch": 0.08224400871459694, "grad_norm": 42.53483316441076, "learning_rate": 6.574149659863946e-07, "logits/chosen": -3.2369728088378906, "logits/rejected": -3.8374440670013428, "logps/chosen": -1.0714023113250732, "logps/rejected": -1.0009973049163818, "loss": 4.5801, "rewards/accuracies": 0.5, "rewards/chosen": -10.71402359008789, "rewards/margins": -0.704052209854126, "rewards/rejected": -10.009971618652344, "step": 604 }, { "epoch": 0.082380174291939, "grad_norm": 39.94812631928239, "learning_rate": 6.585034013605442e-07, "logits/chosen": -0.6565009355545044, "logits/rejected": -1.1553157567977905, "logps/chosen": -1.4609384536743164, "logps/rejected": -1.5988821983337402, "loss": 3.9836, "rewards/accuracies": 0.5, "rewards/chosen": -14.609384536743164, "rewards/margins": 1.37943696975708, "rewards/rejected": -15.988821029663086, "step": 605 }, { "epoch": 0.08251633986928104, "grad_norm": 47.63461483858564, "learning_rate": 6.595918367346939e-07, "logits/chosen": -1.2774815559387207, "logits/rejected": -0.5503163933753967, "logps/chosen": -1.463466763496399, "logps/rejected": -1.6915215253829956, "loss": 3.9886, "rewards/accuracies": 0.75, "rewards/chosen": -14.634666442871094, "rewards/margins": 2.280548095703125, "rewards/rejected": -16.91521453857422, "step": 606 }, { "epoch": 0.08265250544662309, "grad_norm": 74.67082453882801, "learning_rate": 6.606802721088435e-07, "logits/chosen": -2.02715802192688, "logits/rejected": -1.6356555223464966, "logps/chosen": -2.181460380554199, "logps/rejected": -1.6771330833435059, "loss": 5.287, "rewards/accuracies": 0.5, "rewards/chosen": -21.81460189819336, "rewards/margins": -5.043273448944092, "rewards/rejected": -16.771329879760742, "step": 607 }, { "epoch": 0.08278867102396514, "grad_norm": 40.71648264815114, "learning_rate": 6.617687074829932e-07, "logits/chosen": -3.2184267044067383, "logits/rejected": -2.1720314025878906, "logps/chosen": -0.9443331956863403, "logps/rejected": -1.4901862144470215, "loss": 4.8521, "rewards/accuracies": 1.0, "rewards/chosen": -9.44333267211914, "rewards/margins": 5.458529472351074, "rewards/rejected": -14.901861190795898, "step": 608 }, { "epoch": 0.08292483660130719, "grad_norm": 54.61812277169453, "learning_rate": 6.628571428571428e-07, "logits/chosen": -4.100066661834717, "logits/rejected": -2.3986003398895264, "logps/chosen": -0.9464295506477356, "logps/rejected": -1.6009061336517334, "loss": 4.2677, "rewards/accuracies": 0.75, "rewards/chosen": -9.464295387268066, "rewards/margins": 6.544765949249268, "rewards/rejected": -16.009061813354492, "step": 609 }, { "epoch": 0.08306100217864924, "grad_norm": 40.14855311598911, "learning_rate": 6.639455782312925e-07, "logits/chosen": -1.5236937999725342, "logits/rejected": -2.4503440856933594, "logps/chosen": -1.5483078956604004, "logps/rejected": -1.457519292831421, "loss": 4.5906, "rewards/accuracies": 0.5, "rewards/chosen": -15.483078002929688, "rewards/margins": -0.9078867435455322, "rewards/rejected": -14.57519245147705, "step": 610 }, { "epoch": 0.08319716775599129, "grad_norm": 43.23584903389519, "learning_rate": 6.650340136054421e-07, "logits/chosen": -1.7164719104766846, "logits/rejected": -3.2310895919799805, "logps/chosen": -1.4800490140914917, "logps/rejected": -1.019094467163086, "loss": 5.2262, "rewards/accuracies": 0.25, "rewards/chosen": -14.800490379333496, "rewards/margins": -4.609546661376953, "rewards/rejected": -10.190943717956543, "step": 611 }, { "epoch": 0.08333333333333333, "grad_norm": 54.717986981745334, "learning_rate": 6.661224489795918e-07, "logits/chosen": -1.1660511493682861, "logits/rejected": -0.7566147446632385, "logps/chosen": -1.3291029930114746, "logps/rejected": -1.4748488664627075, "loss": 4.5578, "rewards/accuracies": 1.0, "rewards/chosen": -13.291029930114746, "rewards/margins": 1.4574581384658813, "rewards/rejected": -14.748488426208496, "step": 612 }, { "epoch": 0.08346949891067539, "grad_norm": 48.89047375105783, "learning_rate": 6.672108843537414e-07, "logits/chosen": -1.5531785488128662, "logits/rejected": -3.430889368057251, "logps/chosen": -1.2994029521942139, "logps/rejected": -1.1016721725463867, "loss": 4.9354, "rewards/accuracies": 0.5, "rewards/chosen": -12.994028091430664, "rewards/margins": -1.9773058891296387, "rewards/rejected": -11.016721725463867, "step": 613 }, { "epoch": 0.08360566448801743, "grad_norm": 48.37378563461317, "learning_rate": 6.682993197278911e-07, "logits/chosen": -2.3766462802886963, "logits/rejected": -0.8229681253433228, "logps/chosen": -1.180281400680542, "logps/rejected": -1.3814853429794312, "loss": 4.5935, "rewards/accuracies": 0.75, "rewards/chosen": -11.802814483642578, "rewards/margins": 2.0120389461517334, "rewards/rejected": -13.81485366821289, "step": 614 }, { "epoch": 0.08374183006535947, "grad_norm": 39.42201634982396, "learning_rate": 6.693877551020408e-07, "logits/chosen": -3.761796236038208, "logits/rejected": -2.83357834815979, "logps/chosen": -1.477515697479248, "logps/rejected": -1.9957828521728516, "loss": 4.671, "rewards/accuracies": 0.75, "rewards/chosen": -14.775156021118164, "rewards/margins": 5.182671546936035, "rewards/rejected": -19.957828521728516, "step": 615 }, { "epoch": 0.08387799564270153, "grad_norm": 35.555666615753964, "learning_rate": 6.704761904761905e-07, "logits/chosen": -1.8137555122375488, "logits/rejected": -2.266355037689209, "logps/chosen": -1.2301747798919678, "logps/rejected": -1.0568057298660278, "loss": 4.3869, "rewards/accuracies": 0.25, "rewards/chosen": -12.30174732208252, "rewards/margins": -1.733690857887268, "rewards/rejected": -10.5680570602417, "step": 616 }, { "epoch": 0.08401416122004357, "grad_norm": 45.39601785623791, "learning_rate": 6.715646258503401e-07, "logits/chosen": -1.8461618423461914, "logits/rejected": -2.3754959106445312, "logps/chosen": -1.2395329475402832, "logps/rejected": -1.3929966688156128, "loss": 4.4692, "rewards/accuracies": 0.5, "rewards/chosen": -12.395328521728516, "rewards/margins": 1.534637212753296, "rewards/rejected": -13.929966926574707, "step": 617 }, { "epoch": 0.08415032679738563, "grad_norm": 64.4313578206256, "learning_rate": 6.726530612244898e-07, "logits/chosen": -3.4530155658721924, "logits/rejected": -4.14552116394043, "logps/chosen": -1.0229074954986572, "logps/rejected": -0.8667797446250916, "loss": 4.8147, "rewards/accuracies": 0.25, "rewards/chosen": -10.22907543182373, "rewards/margins": -1.561277985572815, "rewards/rejected": -8.667797088623047, "step": 618 }, { "epoch": 0.08428649237472767, "grad_norm": 50.015264856752225, "learning_rate": 6.737414965986393e-07, "logits/chosen": -2.146958589553833, "logits/rejected": -0.797439694404602, "logps/chosen": -1.1030113697052002, "logps/rejected": -1.433666467666626, "loss": 4.465, "rewards/accuracies": 0.75, "rewards/chosen": -11.030113220214844, "rewards/margins": 3.3065507411956787, "rewards/rejected": -14.336664199829102, "step": 619 }, { "epoch": 0.08442265795206971, "grad_norm": 43.47979866448672, "learning_rate": 6.748299319727891e-07, "logits/chosen": -1.0039610862731934, "logits/rejected": -1.3385077714920044, "logps/chosen": -1.3408281803131104, "logps/rejected": -1.2685799598693848, "loss": 5.4488, "rewards/accuracies": 0.25, "rewards/chosen": -13.408281326293945, "rewards/margins": -0.7224826812744141, "rewards/rejected": -12.685798645019531, "step": 620 }, { "epoch": 0.08455882352941177, "grad_norm": 49.37233557207004, "learning_rate": 6.759183673469388e-07, "logits/chosen": -2.7208497524261475, "logits/rejected": -2.389752149581909, "logps/chosen": -1.4126777648925781, "logps/rejected": -1.077986717224121, "loss": 4.5352, "rewards/accuracies": 0.25, "rewards/chosen": -14.126777648925781, "rewards/margins": -3.3469111919403076, "rewards/rejected": -10.779867172241211, "step": 621 }, { "epoch": 0.08469498910675381, "grad_norm": 54.274935356289866, "learning_rate": 6.770068027210884e-07, "logits/chosen": -1.1023409366607666, "logits/rejected": -1.422572374343872, "logps/chosen": -1.2761646509170532, "logps/rejected": -1.5361549854278564, "loss": 4.6813, "rewards/accuracies": 0.75, "rewards/chosen": -12.761646270751953, "rewards/margins": 2.599902629852295, "rewards/rejected": -15.361549377441406, "step": 622 }, { "epoch": 0.08483115468409586, "grad_norm": 45.047537163657154, "learning_rate": 6.780952380952381e-07, "logits/chosen": -3.00809907913208, "logits/rejected": -1.9754618406295776, "logps/chosen": -1.0676162242889404, "logps/rejected": -1.3699674606323242, "loss": 4.0774, "rewards/accuracies": 0.75, "rewards/chosen": -10.676162719726562, "rewards/margins": 3.023513078689575, "rewards/rejected": -13.699674606323242, "step": 623 }, { "epoch": 0.08496732026143791, "grad_norm": 38.17507430476359, "learning_rate": 6.791836734693877e-07, "logits/chosen": -2.3432865142822266, "logits/rejected": -1.2568225860595703, "logps/chosen": -0.9527786374092102, "logps/rejected": -2.043945789337158, "loss": 4.1999, "rewards/accuracies": 1.0, "rewards/chosen": -9.527786254882812, "rewards/margins": 10.91167163848877, "rewards/rejected": -20.439456939697266, "step": 624 }, { "epoch": 0.08510348583877995, "grad_norm": 49.74276500280352, "learning_rate": 6.802721088435374e-07, "logits/chosen": -0.7630000710487366, "logits/rejected": -0.8498649001121521, "logps/chosen": -1.3009533882141113, "logps/rejected": -1.2995781898498535, "loss": 4.9997, "rewards/accuracies": 0.5, "rewards/chosen": -13.009532928466797, "rewards/margins": -0.013751626014709473, "rewards/rejected": -12.995780944824219, "step": 625 }, { "epoch": 0.085239651416122, "grad_norm": 49.2524451326264, "learning_rate": 6.813605442176871e-07, "logits/chosen": -1.1068241596221924, "logits/rejected": -1.2969887256622314, "logps/chosen": -1.289494276046753, "logps/rejected": -1.5535030364990234, "loss": 4.5528, "rewards/accuracies": 0.75, "rewards/chosen": -12.894943237304688, "rewards/margins": 2.6400883197784424, "rewards/rejected": -15.53503131866455, "step": 626 }, { "epoch": 0.08537581699346405, "grad_norm": 50.088929152958755, "learning_rate": 6.824489795918367e-07, "logits/chosen": -2.8237311840057373, "logits/rejected": -2.5643885135650635, "logps/chosen": -0.8168638348579407, "logps/rejected": -1.1600303649902344, "loss": 4.1942, "rewards/accuracies": 0.75, "rewards/chosen": -8.168638229370117, "rewards/margins": 3.4316651821136475, "rewards/rejected": -11.600303649902344, "step": 627 }, { "epoch": 0.0855119825708061, "grad_norm": 66.56537063088898, "learning_rate": 6.835374149659863e-07, "logits/chosen": 0.17426913976669312, "logits/rejected": -0.22887703776359558, "logps/chosen": -1.6473190784454346, "logps/rejected": -1.4763453006744385, "loss": 4.8237, "rewards/accuracies": 0.25, "rewards/chosen": -16.473190307617188, "rewards/margins": -1.7097373008728027, "rewards/rejected": -14.76345443725586, "step": 628 }, { "epoch": 0.08564814814814815, "grad_norm": 47.83487313590153, "learning_rate": 6.84625850340136e-07, "logits/chosen": -0.5187608003616333, "logits/rejected": -0.18763595819473267, "logps/chosen": -1.2085174322128296, "logps/rejected": -1.953974962234497, "loss": 4.335, "rewards/accuracies": 1.0, "rewards/chosen": -12.085174560546875, "rewards/margins": 7.4545745849609375, "rewards/rejected": -19.539749145507812, "step": 629 }, { "epoch": 0.0857843137254902, "grad_norm": 46.04685287461739, "learning_rate": 6.857142857142856e-07, "logits/chosen": -0.48565828800201416, "logits/rejected": -0.2621687054634094, "logps/chosen": -1.5707354545593262, "logps/rejected": -1.7580457925796509, "loss": 4.417, "rewards/accuracies": 0.75, "rewards/chosen": -15.707354545593262, "rewards/margins": 1.8731021881103516, "rewards/rejected": -17.58045768737793, "step": 630 }, { "epoch": 0.08592047930283224, "grad_norm": 42.41202475472628, "learning_rate": 6.868027210884354e-07, "logits/chosen": -2.087164878845215, "logits/rejected": -2.4070029258728027, "logps/chosen": -1.232012152671814, "logps/rejected": -1.4048118591308594, "loss": 4.2495, "rewards/accuracies": 0.75, "rewards/chosen": -12.320121765136719, "rewards/margins": 1.7279956340789795, "rewards/rejected": -14.048116683959961, "step": 631 }, { "epoch": 0.0860566448801743, "grad_norm": 46.5008630221996, "learning_rate": 6.87891156462585e-07, "logits/chosen": -2.8778481483459473, "logits/rejected": -4.394512176513672, "logps/chosen": -1.2697117328643799, "logps/rejected": -0.7011905312538147, "loss": 4.4044, "rewards/accuracies": 0.0, "rewards/chosen": -12.69711685180664, "rewards/margins": -5.685211181640625, "rewards/rejected": -7.011905193328857, "step": 632 }, { "epoch": 0.08619281045751634, "grad_norm": 38.95528477058799, "learning_rate": 6.889795918367347e-07, "logits/chosen": -1.7776405811309814, "logits/rejected": -1.8134522438049316, "logps/chosen": -1.155928134918213, "logps/rejected": -1.254547119140625, "loss": 4.4514, "rewards/accuracies": 0.5, "rewards/chosen": -11.559281349182129, "rewards/margins": 0.9861913919448853, "rewards/rejected": -12.545472145080566, "step": 633 }, { "epoch": 0.08632897603485838, "grad_norm": 52.86284507441252, "learning_rate": 6.900680272108843e-07, "logits/chosen": -1.9607799053192139, "logits/rejected": -1.2823646068572998, "logps/chosen": -1.0692086219787598, "logps/rejected": -1.708937406539917, "loss": 4.4277, "rewards/accuracies": 1.0, "rewards/chosen": -10.692086219787598, "rewards/margins": 6.3972883224487305, "rewards/rejected": -17.089374542236328, "step": 634 }, { "epoch": 0.08646514161220044, "grad_norm": 43.352035416065725, "learning_rate": 6.91156462585034e-07, "logits/chosen": -0.057373642921447754, "logits/rejected": -0.47937241196632385, "logps/chosen": -1.5343042612075806, "logps/rejected": -1.8394567966461182, "loss": 4.1931, "rewards/accuracies": 0.5, "rewards/chosen": -15.343042373657227, "rewards/margins": 3.0515260696411133, "rewards/rejected": -18.394569396972656, "step": 635 }, { "epoch": 0.08660130718954248, "grad_norm": 48.05867952747105, "learning_rate": 6.922448979591836e-07, "logits/chosen": -1.7633569240570068, "logits/rejected": -1.424130916595459, "logps/chosen": -1.3606767654418945, "logps/rejected": -1.436800241470337, "loss": 4.9794, "rewards/accuracies": 0.5, "rewards/chosen": -13.606766700744629, "rewards/margins": 0.7612347602844238, "rewards/rejected": -14.368000984191895, "step": 636 }, { "epoch": 0.08673747276688454, "grad_norm": 47.506631696744485, "learning_rate": 6.933333333333333e-07, "logits/chosen": -1.957249402999878, "logits/rejected": -1.777008056640625, "logps/chosen": -1.194991111755371, "logps/rejected": -1.4991106986999512, "loss": 4.6467, "rewards/accuracies": 0.75, "rewards/chosen": -11.949911117553711, "rewards/margins": 3.041196584701538, "rewards/rejected": -14.991107940673828, "step": 637 }, { "epoch": 0.08687363834422658, "grad_norm": 40.63508389008491, "learning_rate": 6.94421768707483e-07, "logits/chosen": -2.484070062637329, "logits/rejected": -1.609302282333374, "logps/chosen": -1.1508885622024536, "logps/rejected": -1.1758496761322021, "loss": 4.1903, "rewards/accuracies": 0.75, "rewards/chosen": -11.508885383605957, "rewards/margins": 0.24961042404174805, "rewards/rejected": -11.758495330810547, "step": 638 }, { "epoch": 0.08700980392156862, "grad_norm": 43.463912497917846, "learning_rate": 6.955102040816326e-07, "logits/chosen": -2.878520965576172, "logits/rejected": -2.8841681480407715, "logps/chosen": -0.9700735211372375, "logps/rejected": -0.9170460104942322, "loss": 4.4368, "rewards/accuracies": 0.25, "rewards/chosen": -9.700736045837402, "rewards/margins": -0.5302753448486328, "rewards/rejected": -9.170459747314453, "step": 639 }, { "epoch": 0.08714596949891068, "grad_norm": 47.71409658539993, "learning_rate": 6.965986394557823e-07, "logits/chosen": -2.790623664855957, "logits/rejected": -1.6341476440429688, "logps/chosen": -1.3916947841644287, "logps/rejected": -1.9466526508331299, "loss": 4.3759, "rewards/accuracies": 0.5, "rewards/chosen": -13.916948318481445, "rewards/margins": 5.549578666687012, "rewards/rejected": -19.46652603149414, "step": 640 }, { "epoch": 0.08728213507625272, "grad_norm": 48.36440264565924, "learning_rate": 6.976870748299319e-07, "logits/chosen": -1.7861464023590088, "logits/rejected": -0.504084050655365, "logps/chosen": -1.5342729091644287, "logps/rejected": -1.6775221824645996, "loss": 5.3509, "rewards/accuracies": 0.75, "rewards/chosen": -15.342729568481445, "rewards/margins": 1.4324915409088135, "rewards/rejected": -16.77522087097168, "step": 641 }, { "epoch": 0.08741830065359477, "grad_norm": 42.057725629663075, "learning_rate": 6.987755102040817e-07, "logits/chosen": -1.5015590190887451, "logits/rejected": -1.7396845817565918, "logps/chosen": -1.4911224842071533, "logps/rejected": -1.397226333618164, "loss": 4.1284, "rewards/accuracies": 0.25, "rewards/chosen": -14.911224365234375, "rewards/margins": -0.938962459564209, "rewards/rejected": -13.972262382507324, "step": 642 }, { "epoch": 0.08755446623093682, "grad_norm": 42.01422328365056, "learning_rate": 6.998639455782313e-07, "logits/chosen": -1.4321460723876953, "logits/rejected": -0.8742757439613342, "logps/chosen": -1.3788492679595947, "logps/rejected": -1.168602466583252, "loss": 4.5654, "rewards/accuracies": 0.25, "rewards/chosen": -13.788494110107422, "rewards/margins": -2.1024692058563232, "rewards/rejected": -11.68602466583252, "step": 643 }, { "epoch": 0.08769063180827887, "grad_norm": 47.039146974124364, "learning_rate": 7.00952380952381e-07, "logits/chosen": -3.179210901260376, "logits/rejected": -2.7196786403656006, "logps/chosen": -1.1540155410766602, "logps/rejected": -1.078901767730713, "loss": 4.4357, "rewards/accuracies": 0.25, "rewards/chosen": -11.540156364440918, "rewards/margins": -0.7511377334594727, "rewards/rejected": -10.789018630981445, "step": 644 }, { "epoch": 0.08782679738562091, "grad_norm": 41.02644998816562, "learning_rate": 7.020408163265305e-07, "logits/chosen": -1.507667899131775, "logits/rejected": -2.143038034439087, "logps/chosen": -1.1204370260238647, "logps/rejected": -1.2364996671676636, "loss": 4.9344, "rewards/accuracies": 0.75, "rewards/chosen": -11.204370498657227, "rewards/margins": 1.1606258153915405, "rewards/rejected": -12.364996910095215, "step": 645 }, { "epoch": 0.08796296296296297, "grad_norm": 43.3142536280653, "learning_rate": 7.031292517006802e-07, "logits/chosen": -2.777125835418701, "logits/rejected": -2.3167309761047363, "logps/chosen": -0.8613138198852539, "logps/rejected": -1.0177910327911377, "loss": 4.4402, "rewards/accuracies": 1.0, "rewards/chosen": -8.613138198852539, "rewards/margins": 1.564771294593811, "rewards/rejected": -10.177909851074219, "step": 646 }, { "epoch": 0.08809912854030501, "grad_norm": 41.13941345821237, "learning_rate": 7.042176870748299e-07, "logits/chosen": 0.5076740980148315, "logits/rejected": -0.6996023654937744, "logps/chosen": -1.6211514472961426, "logps/rejected": -1.937082052230835, "loss": 4.589, "rewards/accuracies": 0.75, "rewards/chosen": -16.211515426635742, "rewards/margins": 3.159304618835449, "rewards/rejected": -19.370819091796875, "step": 647 }, { "epoch": 0.08823529411764706, "grad_norm": 41.45653869873007, "learning_rate": 7.053061224489796e-07, "logits/chosen": -1.014059066772461, "logits/rejected": -2.3023793697357178, "logps/chosen": -1.2494194507598877, "logps/rejected": -0.9857445955276489, "loss": 4.8491, "rewards/accuracies": 0.0, "rewards/chosen": -12.494194984436035, "rewards/margins": -2.6367485523223877, "rewards/rejected": -9.85744571685791, "step": 648 }, { "epoch": 0.08837145969498911, "grad_norm": 51.77357792512685, "learning_rate": 7.063945578231292e-07, "logits/chosen": -1.3247195482254028, "logits/rejected": -0.1585642695426941, "logps/chosen": -1.5682958364486694, "logps/rejected": -1.4894814491271973, "loss": 4.5376, "rewards/accuracies": 0.5, "rewards/chosen": -15.682958602905273, "rewards/margins": -0.7881433963775635, "rewards/rejected": -14.894814491271973, "step": 649 }, { "epoch": 0.08850762527233115, "grad_norm": 40.62419119022593, "learning_rate": 7.074829931972789e-07, "logits/chosen": -2.6472909450531006, "logits/rejected": -0.5212327241897583, "logps/chosen": -1.4638659954071045, "logps/rejected": -1.4837125539779663, "loss": 4.2876, "rewards/accuracies": 0.5, "rewards/chosen": -14.638659477233887, "rewards/margins": 0.19846642017364502, "rewards/rejected": -14.837125778198242, "step": 650 }, { "epoch": 0.0886437908496732, "grad_norm": 40.89181703649058, "learning_rate": 7.085714285714285e-07, "logits/chosen": -1.2926735877990723, "logits/rejected": -0.9279320240020752, "logps/chosen": -1.0575752258300781, "logps/rejected": -0.9190181493759155, "loss": 4.7388, "rewards/accuracies": 0.25, "rewards/chosen": -10.575752258300781, "rewards/margins": -1.3855714797973633, "rewards/rejected": -9.190180778503418, "step": 651 }, { "epoch": 0.08877995642701525, "grad_norm": 46.38708688370896, "learning_rate": 7.096598639455783e-07, "logits/chosen": -1.5076947212219238, "logits/rejected": -1.4862940311431885, "logps/chosen": -1.5216021537780762, "logps/rejected": -1.299424171447754, "loss": 4.1455, "rewards/accuracies": 0.25, "rewards/chosen": -15.216020584106445, "rewards/margins": -2.2217800617218018, "rewards/rejected": -12.994241714477539, "step": 652 }, { "epoch": 0.08891612200435729, "grad_norm": 48.3864863672375, "learning_rate": 7.107482993197278e-07, "logits/chosen": -2.4289488792419434, "logits/rejected": -1.2777515649795532, "logps/chosen": -1.1267143487930298, "logps/rejected": -1.2668046951293945, "loss": 3.7734, "rewards/accuracies": 0.75, "rewards/chosen": -11.267143249511719, "rewards/margins": 1.4009038209915161, "rewards/rejected": -12.668046951293945, "step": 653 }, { "epoch": 0.08905228758169935, "grad_norm": 38.35880018128992, "learning_rate": 7.118367346938775e-07, "logits/chosen": -2.913335084915161, "logits/rejected": -0.5989069938659668, "logps/chosen": -0.9481298923492432, "logps/rejected": -1.07088041305542, "loss": 4.2234, "rewards/accuracies": 0.75, "rewards/chosen": -9.481298446655273, "rewards/margins": 1.2275053262710571, "rewards/rejected": -10.708805084228516, "step": 654 }, { "epoch": 0.08918845315904139, "grad_norm": 40.28690216171138, "learning_rate": 7.129251700680271e-07, "logits/chosen": -2.2218005657196045, "logits/rejected": -1.6588599681854248, "logps/chosen": -1.1588168144226074, "logps/rejected": -1.3977575302124023, "loss": 4.7783, "rewards/accuracies": 0.75, "rewards/chosen": -11.588167190551758, "rewards/margins": 2.3894081115722656, "rewards/rejected": -13.977575302124023, "step": 655 }, { "epoch": 0.08932461873638345, "grad_norm": 49.63238634742794, "learning_rate": 7.140136054421768e-07, "logits/chosen": -0.8765358924865723, "logits/rejected": -1.762982726097107, "logps/chosen": -1.1561572551727295, "logps/rejected": -1.3207387924194336, "loss": 4.4146, "rewards/accuracies": 0.75, "rewards/chosen": -11.561573028564453, "rewards/margins": 1.645815372467041, "rewards/rejected": -13.207387924194336, "step": 656 }, { "epoch": 0.08946078431372549, "grad_norm": 47.06298325802054, "learning_rate": 7.151020408163264e-07, "logits/chosen": 0.384247362613678, "logits/rejected": 1.0668420791625977, "logps/chosen": -1.4139719009399414, "logps/rejected": -1.5222705602645874, "loss": 4.5132, "rewards/accuracies": 0.75, "rewards/chosen": -14.13971996307373, "rewards/margins": 1.0829854011535645, "rewards/rejected": -15.22270393371582, "step": 657 }, { "epoch": 0.08959694989106753, "grad_norm": 52.126776805173854, "learning_rate": 7.161904761904762e-07, "logits/chosen": -2.3037514686584473, "logits/rejected": -1.087194561958313, "logps/chosen": -1.1090755462646484, "logps/rejected": -1.6414690017700195, "loss": 4.3876, "rewards/accuracies": 0.75, "rewards/chosen": -11.090755462646484, "rewards/margins": 5.3239336013793945, "rewards/rejected": -16.414688110351562, "step": 658 }, { "epoch": 0.08973311546840959, "grad_norm": 47.07580814232174, "learning_rate": 7.172789115646259e-07, "logits/chosen": -1.1230690479278564, "logits/rejected": -0.014736682176589966, "logps/chosen": -1.2368491888046265, "logps/rejected": -1.3704285621643066, "loss": 4.9117, "rewards/accuracies": 0.5, "rewards/chosen": -12.368492126464844, "rewards/margins": 1.3357934951782227, "rewards/rejected": -13.70428466796875, "step": 659 }, { "epoch": 0.08986928104575163, "grad_norm": 46.42438831393787, "learning_rate": 7.183673469387755e-07, "logits/chosen": -2.912750244140625, "logits/rejected": -1.9408273696899414, "logps/chosen": -1.057301640510559, "logps/rejected": -1.0691763162612915, "loss": 4.3078, "rewards/accuracies": 0.5, "rewards/chosen": -10.573016166687012, "rewards/margins": 0.11874651908874512, "rewards/rejected": -10.691762924194336, "step": 660 }, { "epoch": 0.09000544662309368, "grad_norm": 51.50931085421901, "learning_rate": 7.194557823129252e-07, "logits/chosen": -2.8921871185302734, "logits/rejected": -1.6095951795578003, "logps/chosen": -1.1666688919067383, "logps/rejected": -1.3741774559020996, "loss": 4.7868, "rewards/accuracies": 0.5, "rewards/chosen": -11.666688919067383, "rewards/margins": 2.0750856399536133, "rewards/rejected": -13.741774559020996, "step": 661 }, { "epoch": 0.09014161220043573, "grad_norm": 52.162037097689655, "learning_rate": 7.205442176870748e-07, "logits/chosen": -0.6305122375488281, "logits/rejected": 0.25764983892440796, "logps/chosen": -1.4661346673965454, "logps/rejected": -1.3292301893234253, "loss": 4.8727, "rewards/accuracies": 0.5, "rewards/chosen": -14.661347389221191, "rewards/margins": -1.3690452575683594, "rewards/rejected": -13.292302131652832, "step": 662 }, { "epoch": 0.09027777777777778, "grad_norm": 50.760965750421526, "learning_rate": 7.216326530612245e-07, "logits/chosen": -1.2042865753173828, "logits/rejected": 0.6945802569389343, "logps/chosen": -1.2335216999053955, "logps/rejected": -1.86216402053833, "loss": 4.1927, "rewards/accuracies": 1.0, "rewards/chosen": -12.335216522216797, "rewards/margins": 6.2864227294921875, "rewards/rejected": -18.621639251708984, "step": 663 }, { "epoch": 0.09041394335511982, "grad_norm": 49.681003798517054, "learning_rate": 7.227210884353741e-07, "logits/chosen": -0.9583320617675781, "logits/rejected": -0.8848538398742676, "logps/chosen": -1.542762279510498, "logps/rejected": -1.5025750398635864, "loss": 4.7822, "rewards/accuracies": 0.5, "rewards/chosen": -15.427621841430664, "rewards/margins": -0.40187156200408936, "rewards/rejected": -15.025751113891602, "step": 664 }, { "epoch": 0.09055010893246188, "grad_norm": 39.16365353244718, "learning_rate": 7.238095238095238e-07, "logits/chosen": -3.033884048461914, "logits/rejected": -1.3297502994537354, "logps/chosen": -1.04249906539917, "logps/rejected": -1.5134327411651611, "loss": 4.7399, "rewards/accuracies": 0.75, "rewards/chosen": -10.424991607666016, "rewards/margins": 4.7093353271484375, "rewards/rejected": -15.134326934814453, "step": 665 }, { "epoch": 0.09068627450980392, "grad_norm": 42.09730965676938, "learning_rate": 7.248979591836734e-07, "logits/chosen": -0.6269055008888245, "logits/rejected": -0.8042051792144775, "logps/chosen": -1.721808671951294, "logps/rejected": -1.617808222770691, "loss": 4.5791, "rewards/accuracies": 0.25, "rewards/chosen": -17.21808624267578, "rewards/margins": -1.0400035381317139, "rewards/rejected": -16.178081512451172, "step": 666 }, { "epoch": 0.09082244008714598, "grad_norm": 43.71926385407577, "learning_rate": 7.259863945578231e-07, "logits/chosen": -2.028883457183838, "logits/rejected": -2.100399971008301, "logps/chosen": -1.2648894786834717, "logps/rejected": -1.160454511642456, "loss": 4.5965, "rewards/accuracies": 0.0, "rewards/chosen": -12.648895263671875, "rewards/margins": -1.0443507432937622, "rewards/rejected": -11.604544639587402, "step": 667 }, { "epoch": 0.09095860566448802, "grad_norm": 42.93072671206798, "learning_rate": 7.270748299319728e-07, "logits/chosen": -0.08947589993476868, "logits/rejected": 0.5510728359222412, "logps/chosen": -1.759604811668396, "logps/rejected": -1.7882659435272217, "loss": 4.2408, "rewards/accuracies": 0.5, "rewards/chosen": -17.596046447753906, "rewards/margins": 0.28661155700683594, "rewards/rejected": -17.882659912109375, "step": 668 }, { "epoch": 0.09109477124183006, "grad_norm": 55.335355565981466, "learning_rate": 7.281632653061225e-07, "logits/chosen": -0.5225589871406555, "logits/rejected": -0.20845964550971985, "logps/chosen": -1.4536821842193604, "logps/rejected": -1.741528034210205, "loss": 4.6765, "rewards/accuracies": 1.0, "rewards/chosen": -14.536821365356445, "rewards/margins": 2.878457546234131, "rewards/rejected": -17.415279388427734, "step": 669 }, { "epoch": 0.09123093681917212, "grad_norm": 48.52931872430723, "learning_rate": 7.29251700680272e-07, "logits/chosen": -0.9874422550201416, "logits/rejected": -1.0176893472671509, "logps/chosen": -1.275254487991333, "logps/rejected": -1.5272136926651, "loss": 4.5543, "rewards/accuracies": 0.75, "rewards/chosen": -12.752545356750488, "rewards/margins": 2.5195915699005127, "rewards/rejected": -15.272136688232422, "step": 670 }, { "epoch": 0.09136710239651416, "grad_norm": 44.05339357540184, "learning_rate": 7.303401360544217e-07, "logits/chosen": 0.3797943592071533, "logits/rejected": -0.28525519371032715, "logps/chosen": -1.6250470876693726, "logps/rejected": -1.8116841316223145, "loss": 4.6731, "rewards/accuracies": 0.5, "rewards/chosen": -16.250471115112305, "rewards/margins": 1.8663709163665771, "rewards/rejected": -18.116840362548828, "step": 671 }, { "epoch": 0.0915032679738562, "grad_norm": 40.71059595107537, "learning_rate": 7.314285714285713e-07, "logits/chosen": -0.7830665111541748, "logits/rejected": -0.5724908113479614, "logps/chosen": -1.2008665800094604, "logps/rejected": -1.5368106365203857, "loss": 4.7824, "rewards/accuracies": 0.75, "rewards/chosen": -12.008665084838867, "rewards/margins": 3.359440326690674, "rewards/rejected": -15.368104934692383, "step": 672 }, { "epoch": 0.09163943355119826, "grad_norm": 45.84473045036917, "learning_rate": 7.32517006802721e-07, "logits/chosen": -2.7611351013183594, "logits/rejected": -1.7336971759796143, "logps/chosen": -0.9858628511428833, "logps/rejected": -1.2811355590820312, "loss": 4.0954, "rewards/accuracies": 1.0, "rewards/chosen": -9.85862922668457, "rewards/margins": 2.9527273178100586, "rewards/rejected": -12.811355590820312, "step": 673 }, { "epoch": 0.0917755991285403, "grad_norm": 42.9230527457608, "learning_rate": 7.336054421768707e-07, "logits/chosen": -1.2866332530975342, "logits/rejected": 0.45228278636932373, "logps/chosen": -1.2401069402694702, "logps/rejected": -1.5210261344909668, "loss": 4.7376, "rewards/accuracies": 0.75, "rewards/chosen": -12.401069641113281, "rewards/margins": 2.809192657470703, "rewards/rejected": -15.210261344909668, "step": 674 }, { "epoch": 0.09191176470588236, "grad_norm": 41.50398605934026, "learning_rate": 7.346938775510204e-07, "logits/chosen": -0.819078266620636, "logits/rejected": -0.3011375665664673, "logps/chosen": -1.5574102401733398, "logps/rejected": -1.73284912109375, "loss": 4.4043, "rewards/accuracies": 0.75, "rewards/chosen": -15.574103355407715, "rewards/margins": 1.7543883323669434, "rewards/rejected": -17.3284912109375, "step": 675 }, { "epoch": 0.0920479302832244, "grad_norm": 42.97044011523364, "learning_rate": 7.357823129251701e-07, "logits/chosen": -1.66579270362854, "logits/rejected": -1.3691505193710327, "logps/chosen": -1.045379400253296, "logps/rejected": -1.1746675968170166, "loss": 4.8539, "rewards/accuracies": 0.5, "rewards/chosen": -10.453794479370117, "rewards/margins": 1.2928812503814697, "rewards/rejected": -11.746675491333008, "step": 676 }, { "epoch": 0.09218409586056645, "grad_norm": 45.3852589585807, "learning_rate": 7.368707482993197e-07, "logits/chosen": -0.9753716588020325, "logits/rejected": -2.5370125770568848, "logps/chosen": -1.1684207916259766, "logps/rejected": -0.9640143513679504, "loss": 4.6153, "rewards/accuracies": 0.0, "rewards/chosen": -11.684207916259766, "rewards/margins": -2.044064521789551, "rewards/rejected": -9.640143394470215, "step": 677 }, { "epoch": 0.0923202614379085, "grad_norm": 43.594824888285785, "learning_rate": 7.379591836734694e-07, "logits/chosen": 0.0014775395393371582, "logits/rejected": 0.01697292923927307, "logps/chosen": -1.5368703603744507, "logps/rejected": -1.6865746974945068, "loss": 4.1063, "rewards/accuracies": 0.75, "rewards/chosen": -15.368703842163086, "rewards/margins": 1.4970426559448242, "rewards/rejected": -16.865745544433594, "step": 678 }, { "epoch": 0.09245642701525054, "grad_norm": 37.81105063939631, "learning_rate": 7.39047619047619e-07, "logits/chosen": -1.1204562187194824, "logits/rejected": 1.3370299339294434, "logps/chosen": -1.2844232320785522, "logps/rejected": -1.5332773923873901, "loss": 3.9683, "rewards/accuracies": 0.75, "rewards/chosen": -12.844232559204102, "rewards/margins": 2.4885406494140625, "rewards/rejected": -15.332773208618164, "step": 679 }, { "epoch": 0.09259259259259259, "grad_norm": 51.412109694308654, "learning_rate": 7.401360544217687e-07, "logits/chosen": -1.3611550331115723, "logits/rejected": -0.7353662848472595, "logps/chosen": -1.722870111465454, "logps/rejected": -1.6674996614456177, "loss": 5.2694, "rewards/accuracies": 0.75, "rewards/chosen": -17.228702545166016, "rewards/margins": -0.5537054538726807, "rewards/rejected": -16.67499542236328, "step": 680 }, { "epoch": 0.09272875816993464, "grad_norm": 40.61052099610408, "learning_rate": 7.412244897959183e-07, "logits/chosen": -2.2304747104644775, "logits/rejected": -0.6143679022789001, "logps/chosen": -1.1939982175827026, "logps/rejected": -1.3303887844085693, "loss": 4.1127, "rewards/accuracies": 0.5, "rewards/chosen": -11.939981460571289, "rewards/margins": 1.3639068603515625, "rewards/rejected": -13.303888320922852, "step": 681 }, { "epoch": 0.09286492374727669, "grad_norm": 46.76448429145674, "learning_rate": 7.42312925170068e-07, "logits/chosen": -2.4520740509033203, "logits/rejected": -1.8481818437576294, "logps/chosen": -1.0003111362457275, "logps/rejected": -1.0156445503234863, "loss": 4.722, "rewards/accuracies": 0.75, "rewards/chosen": -10.003110885620117, "rewards/margins": 0.15333425998687744, "rewards/rejected": -10.15644645690918, "step": 682 }, { "epoch": 0.09300108932461873, "grad_norm": 47.731771780236286, "learning_rate": 7.434013605442176e-07, "logits/chosen": -0.4123488962650299, "logits/rejected": -0.9875236749649048, "logps/chosen": -1.2919455766677856, "logps/rejected": -1.0651922225952148, "loss": 4.0166, "rewards/accuracies": 0.25, "rewards/chosen": -12.919454574584961, "rewards/margins": -2.2675328254699707, "rewards/rejected": -10.651922225952148, "step": 683 }, { "epoch": 0.09313725490196079, "grad_norm": 43.48713858845032, "learning_rate": 7.444897959183673e-07, "logits/chosen": -1.3348259925842285, "logits/rejected": -0.08408054709434509, "logps/chosen": -0.9921106696128845, "logps/rejected": -1.5779616832733154, "loss": 4.6619, "rewards/accuracies": 1.0, "rewards/chosen": -9.921106338500977, "rewards/margins": 5.858510494232178, "rewards/rejected": -15.779617309570312, "step": 684 }, { "epoch": 0.09327342047930283, "grad_norm": 47.57180100849747, "learning_rate": 7.45578231292517e-07, "logits/chosen": -0.025071382522583008, "logits/rejected": 0.7266275882720947, "logps/chosen": -1.5947189331054688, "logps/rejected": -1.8634390830993652, "loss": 4.8943, "rewards/accuracies": 0.75, "rewards/chosen": -15.947188377380371, "rewards/margins": 2.6872026920318604, "rewards/rejected": -18.63439178466797, "step": 685 }, { "epoch": 0.09340958605664489, "grad_norm": 48.056744781454604, "learning_rate": 7.466666666666667e-07, "logits/chosen": -1.1261622905731201, "logits/rejected": -0.629636824131012, "logps/chosen": -1.5971834659576416, "logps/rejected": -1.6774283647537231, "loss": 4.7887, "rewards/accuracies": 0.5, "rewards/chosen": -15.971835136413574, "rewards/margins": 0.8024492263793945, "rewards/rejected": -16.77428436279297, "step": 686 }, { "epoch": 0.09354575163398693, "grad_norm": 42.46765567089297, "learning_rate": 7.477551020408163e-07, "logits/chosen": -1.2263332605361938, "logits/rejected": -0.4848198890686035, "logps/chosen": -1.4264137744903564, "logps/rejected": -1.4181618690490723, "loss": 4.3822, "rewards/accuracies": 0.25, "rewards/chosen": -14.264137268066406, "rewards/margins": -0.08251798152923584, "rewards/rejected": -14.181619644165039, "step": 687 }, { "epoch": 0.09368191721132897, "grad_norm": 43.39960487451937, "learning_rate": 7.488435374149659e-07, "logits/chosen": -0.7010926008224487, "logits/rejected": 1.5825648307800293, "logps/chosen": -1.3506109714508057, "logps/rejected": -1.8319224119186401, "loss": 4.7517, "rewards/accuracies": 1.0, "rewards/chosen": -13.506109237670898, "rewards/margins": 4.813115119934082, "rewards/rejected": -18.319225311279297, "step": 688 }, { "epoch": 0.09381808278867103, "grad_norm": 45.92590793779929, "learning_rate": 7.499319727891155e-07, "logits/chosen": 0.0669097900390625, "logits/rejected": 0.5542805194854736, "logps/chosen": -1.9274545907974243, "logps/rejected": -1.9936411380767822, "loss": 4.611, "rewards/accuracies": 0.75, "rewards/chosen": -19.274545669555664, "rewards/margins": 0.6618654727935791, "rewards/rejected": -19.936410903930664, "step": 689 }, { "epoch": 0.09395424836601307, "grad_norm": 38.477609034526964, "learning_rate": 7.510204081632653e-07, "logits/chosen": -2.074528455734253, "logits/rejected": 0.2922220528125763, "logps/chosen": -1.0959200859069824, "logps/rejected": -1.2657861709594727, "loss": 4.4568, "rewards/accuracies": 0.75, "rewards/chosen": -10.959199905395508, "rewards/margins": 1.6986621618270874, "rewards/rejected": -12.657861709594727, "step": 690 }, { "epoch": 0.09409041394335511, "grad_norm": 47.644655916137815, "learning_rate": 7.521088435374149e-07, "logits/chosen": 0.019661176949739456, "logits/rejected": -0.17806220054626465, "logps/chosen": -1.4486339092254639, "logps/rejected": -1.6424745321273804, "loss": 4.7444, "rewards/accuracies": 0.5, "rewards/chosen": -14.48633861541748, "rewards/margins": 1.9384058713912964, "rewards/rejected": -16.424745559692383, "step": 691 }, { "epoch": 0.09422657952069717, "grad_norm": 42.58671143499203, "learning_rate": 7.531972789115646e-07, "logits/chosen": -1.6215264797210693, "logits/rejected": -1.5511436462402344, "logps/chosen": -1.2827777862548828, "logps/rejected": -1.1078402996063232, "loss": 4.6183, "rewards/accuracies": 0.25, "rewards/chosen": -12.827776908874512, "rewards/margins": -1.7493733167648315, "rewards/rejected": -11.07840347290039, "step": 692 }, { "epoch": 0.09436274509803921, "grad_norm": 53.99476350343462, "learning_rate": 7.542857142857142e-07, "logits/chosen": -2.1922998428344727, "logits/rejected": -1.5359280109405518, "logps/chosen": -1.1636403799057007, "logps/rejected": -1.3527809381484985, "loss": 4.9882, "rewards/accuracies": 0.75, "rewards/chosen": -11.636404037475586, "rewards/margins": 1.8914051055908203, "rewards/rejected": -13.527809143066406, "step": 693 }, { "epoch": 0.09449891067538127, "grad_norm": 47.56082092367841, "learning_rate": 7.553741496598639e-07, "logits/chosen": -1.6094223260879517, "logits/rejected": -1.2937045097351074, "logps/chosen": -1.172528624534607, "logps/rejected": -1.2489964962005615, "loss": 4.8692, "rewards/accuracies": 0.5, "rewards/chosen": -11.725286483764648, "rewards/margins": 0.764678955078125, "rewards/rejected": -12.489965438842773, "step": 694 }, { "epoch": 0.09463507625272331, "grad_norm": 45.82283729314198, "learning_rate": 7.564625850340137e-07, "logits/chosen": 0.7002053260803223, "logits/rejected": 0.06629269570112228, "logps/chosen": -1.1784634590148926, "logps/rejected": -1.212536334991455, "loss": 4.4581, "rewards/accuracies": 0.5, "rewards/chosen": -11.784634590148926, "rewards/margins": 0.3407285213470459, "rewards/rejected": -12.125362396240234, "step": 695 }, { "epoch": 0.09477124183006536, "grad_norm": 58.52897615729702, "learning_rate": 7.575510204081632e-07, "logits/chosen": -0.594469428062439, "logits/rejected": -0.24466761946678162, "logps/chosen": -1.5836455821990967, "logps/rejected": -1.5614933967590332, "loss": 5.285, "rewards/accuracies": 0.25, "rewards/chosen": -15.836456298828125, "rewards/margins": -0.22152304649353027, "rewards/rejected": -15.614933967590332, "step": 696 }, { "epoch": 0.09490740740740741, "grad_norm": 61.416004309602414, "learning_rate": 7.586394557823129e-07, "logits/chosen": -0.844778835773468, "logits/rejected": -0.7908359169960022, "logps/chosen": -1.253016710281372, "logps/rejected": -1.215678334236145, "loss": 5.4285, "rewards/accuracies": 0.5, "rewards/chosen": -12.530166625976562, "rewards/margins": -0.3733830451965332, "rewards/rejected": -12.156784057617188, "step": 697 }, { "epoch": 0.09504357298474946, "grad_norm": 44.04789720185715, "learning_rate": 7.597278911564625e-07, "logits/chosen": -1.1157164573669434, "logits/rejected": -1.470064640045166, "logps/chosen": -1.3065361976623535, "logps/rejected": -1.283850908279419, "loss": 4.1608, "rewards/accuracies": 0.5, "rewards/chosen": -13.065362930297852, "rewards/margins": -0.2268533706665039, "rewards/rejected": -12.838508605957031, "step": 698 }, { "epoch": 0.0951797385620915, "grad_norm": 76.06578873434339, "learning_rate": 7.608163265306122e-07, "logits/chosen": 0.5225315093994141, "logits/rejected": 1.388078212738037, "logps/chosen": -1.4327392578125, "logps/rejected": -1.6005779504776, "loss": 4.7732, "rewards/accuracies": 0.75, "rewards/chosen": -14.327392578125, "rewards/margins": 1.6783864498138428, "rewards/rejected": -16.005779266357422, "step": 699 }, { "epoch": 0.09531590413943355, "grad_norm": 38.91136358172403, "learning_rate": 7.619047619047618e-07, "logits/chosen": -0.9638204574584961, "logits/rejected": -1.8729140758514404, "logps/chosen": -1.0394686460494995, "logps/rejected": -1.2613301277160645, "loss": 4.0696, "rewards/accuracies": 0.75, "rewards/chosen": -10.394686698913574, "rewards/margins": 2.2186150550842285, "rewards/rejected": -12.613302230834961, "step": 700 }, { "epoch": 0.0954520697167756, "grad_norm": 48.83103082948936, "learning_rate": 7.629931972789116e-07, "logits/chosen": 0.44406160712242126, "logits/rejected": 0.1375807374715805, "logps/chosen": -1.7583752870559692, "logps/rejected": -1.6735632419586182, "loss": 4.4896, "rewards/accuracies": 0.25, "rewards/chosen": -17.583751678466797, "rewards/margins": -0.8481202125549316, "rewards/rejected": -16.735633850097656, "step": 701 }, { "epoch": 0.09558823529411764, "grad_norm": 45.218741850317464, "learning_rate": 7.640816326530612e-07, "logits/chosen": -0.43145158886909485, "logits/rejected": -0.7273286581039429, "logps/chosen": -1.319587230682373, "logps/rejected": -1.342893362045288, "loss": 4.0565, "rewards/accuracies": 0.5, "rewards/chosen": -13.195871353149414, "rewards/margins": 0.2330610752105713, "rewards/rejected": -13.428932189941406, "step": 702 }, { "epoch": 0.0957244008714597, "grad_norm": 40.938167859133976, "learning_rate": 7.651700680272109e-07, "logits/chosen": -3.82958984375, "logits/rejected": -1.218679666519165, "logps/chosen": -1.1181875467300415, "logps/rejected": -1.3271405696868896, "loss": 4.1757, "rewards/accuracies": 0.75, "rewards/chosen": -11.181875228881836, "rewards/margins": 2.0895307064056396, "rewards/rejected": -13.271406173706055, "step": 703 }, { "epoch": 0.09586056644880174, "grad_norm": 45.27328655928583, "learning_rate": 7.662585034013605e-07, "logits/chosen": -0.378587543964386, "logits/rejected": -0.5299003720283508, "logps/chosen": -1.216627597808838, "logps/rejected": -1.393390417098999, "loss": 4.5245, "rewards/accuracies": 0.5, "rewards/chosen": -12.166275978088379, "rewards/margins": 1.7676281929016113, "rewards/rejected": -13.933904647827148, "step": 704 }, { "epoch": 0.0959967320261438, "grad_norm": 46.22226285091805, "learning_rate": 7.673469387755102e-07, "logits/chosen": -0.9818664789199829, "logits/rejected": -1.150753140449524, "logps/chosen": -1.323944091796875, "logps/rejected": -1.795841932296753, "loss": 4.5777, "rewards/accuracies": 1.0, "rewards/chosen": -13.23944091796875, "rewards/margins": 4.718977928161621, "rewards/rejected": -17.958419799804688, "step": 705 }, { "epoch": 0.09613289760348584, "grad_norm": 41.83583385984584, "learning_rate": 7.684353741496598e-07, "logits/chosen": -1.8001468181610107, "logits/rejected": -0.4546681046485901, "logps/chosen": -1.3939483165740967, "logps/rejected": -1.5299582481384277, "loss": 3.8141, "rewards/accuracies": 0.5, "rewards/chosen": -13.939482688903809, "rewards/margins": 1.3600986003875732, "rewards/rejected": -15.299581527709961, "step": 706 }, { "epoch": 0.09626906318082788, "grad_norm": 40.95756653939361, "learning_rate": 7.695238095238095e-07, "logits/chosen": -2.39776611328125, "logits/rejected": -1.056107759475708, "logps/chosen": -0.9639331698417664, "logps/rejected": -1.080049991607666, "loss": 4.5, "rewards/accuracies": 0.75, "rewards/chosen": -9.639331817626953, "rewards/margins": 1.1611673831939697, "rewards/rejected": -10.800498962402344, "step": 707 }, { "epoch": 0.09640522875816994, "grad_norm": 42.05871346663438, "learning_rate": 7.706122448979591e-07, "logits/chosen": -1.4461543560028076, "logits/rejected": -1.676652193069458, "logps/chosen": -1.1278717517852783, "logps/rejected": -1.2751262187957764, "loss": 4.5308, "rewards/accuracies": 0.75, "rewards/chosen": -11.278717994689941, "rewards/margins": 1.4725443124771118, "rewards/rejected": -12.751262664794922, "step": 708 }, { "epoch": 0.09654139433551198, "grad_norm": 40.08943107070954, "learning_rate": 7.717006802721088e-07, "logits/chosen": 0.6075837016105652, "logits/rejected": 0.8728734254837036, "logps/chosen": -1.377319574356079, "logps/rejected": -1.39141845703125, "loss": 3.8173, "rewards/accuracies": 0.5, "rewards/chosen": -13.773197174072266, "rewards/margins": 0.14098763465881348, "rewards/rejected": -13.914183616638184, "step": 709 }, { "epoch": 0.09667755991285402, "grad_norm": 52.426585838505645, "learning_rate": 7.727891156462584e-07, "logits/chosen": -1.335512399673462, "logits/rejected": -1.1903434991836548, "logps/chosen": -1.095657229423523, "logps/rejected": -1.202784776687622, "loss": 4.7098, "rewards/accuracies": 0.5, "rewards/chosen": -10.956572532653809, "rewards/margins": 1.0712757110595703, "rewards/rejected": -12.027848243713379, "step": 710 }, { "epoch": 0.09681372549019608, "grad_norm": 41.460696012497564, "learning_rate": 7.738775510204082e-07, "logits/chosen": 0.41538113355636597, "logits/rejected": 1.0969359874725342, "logps/chosen": -1.3142106533050537, "logps/rejected": -1.6236939430236816, "loss": 4.6141, "rewards/accuracies": 0.75, "rewards/chosen": -13.142107009887695, "rewards/margins": 3.094832181930542, "rewards/rejected": -16.2369384765625, "step": 711 }, { "epoch": 0.09694989106753812, "grad_norm": 40.04325750583041, "learning_rate": 7.749659863945579e-07, "logits/chosen": 0.7049305438995361, "logits/rejected": -0.1027403473854065, "logps/chosen": -1.8300504684448242, "logps/rejected": -1.8080768585205078, "loss": 4.7575, "rewards/accuracies": 0.5, "rewards/chosen": -18.300504684448242, "rewards/margins": -0.21973729133605957, "rewards/rejected": -18.080768585205078, "step": 712 }, { "epoch": 0.09708605664488018, "grad_norm": 51.95554158910165, "learning_rate": 7.760544217687075e-07, "logits/chosen": -0.9180151224136353, "logits/rejected": -1.3526248931884766, "logps/chosen": -1.437736988067627, "logps/rejected": -1.3673402070999146, "loss": 5.0882, "rewards/accuracies": 0.25, "rewards/chosen": -14.37736988067627, "rewards/margins": -0.7039682865142822, "rewards/rejected": -13.673401832580566, "step": 713 }, { "epoch": 0.09722222222222222, "grad_norm": 42.386057925850444, "learning_rate": 7.771428571428571e-07, "logits/chosen": -2.036989450454712, "logits/rejected": 0.5961278676986694, "logps/chosen": -0.9952622652053833, "logps/rejected": -1.3730106353759766, "loss": 4.3023, "rewards/accuracies": 0.5, "rewards/chosen": -9.952622413635254, "rewards/margins": 3.7774839401245117, "rewards/rejected": -13.730106353759766, "step": 714 }, { "epoch": 0.09735838779956427, "grad_norm": 37.28317498520363, "learning_rate": 7.782312925170067e-07, "logits/chosen": 0.17235952615737915, "logits/rejected": 0.16617803275585175, "logps/chosen": -2.1389901638031006, "logps/rejected": -2.6632509231567383, "loss": 3.9073, "rewards/accuracies": 1.0, "rewards/chosen": -21.389902114868164, "rewards/margins": 5.242608070373535, "rewards/rejected": -26.63250732421875, "step": 715 }, { "epoch": 0.09749455337690632, "grad_norm": 56.1895806775161, "learning_rate": 7.793197278911564e-07, "logits/chosen": -0.7039341926574707, "logits/rejected": 0.431105375289917, "logps/chosen": -1.6820502281188965, "logps/rejected": -1.826953649520874, "loss": 4.5759, "rewards/accuracies": 0.75, "rewards/chosen": -16.82050323486328, "rewards/margins": 1.4490342140197754, "rewards/rejected": -18.2695369720459, "step": 716 }, { "epoch": 0.09763071895424837, "grad_norm": 42.00465105037301, "learning_rate": 7.804081632653061e-07, "logits/chosen": 1.3665099143981934, "logits/rejected": 0.19601020216941833, "logps/chosen": -1.8020567893981934, "logps/rejected": -2.335940361022949, "loss": 4.289, "rewards/accuracies": 0.5, "rewards/chosen": -18.020566940307617, "rewards/margins": 5.33883810043335, "rewards/rejected": -23.359405517578125, "step": 717 }, { "epoch": 0.09776688453159041, "grad_norm": 44.35484332661711, "learning_rate": 7.814965986394558e-07, "logits/chosen": 0.7743087410926819, "logits/rejected": 1.0919036865234375, "logps/chosen": -1.7339521646499634, "logps/rejected": -1.8304390907287598, "loss": 4.3987, "rewards/accuracies": 0.75, "rewards/chosen": -17.339521408081055, "rewards/margins": 0.9648697376251221, "rewards/rejected": -18.30438995361328, "step": 718 }, { "epoch": 0.09790305010893247, "grad_norm": 42.61019959090062, "learning_rate": 7.825850340136054e-07, "logits/chosen": 0.8494248986244202, "logits/rejected": 2.393946647644043, "logps/chosen": -1.810532808303833, "logps/rejected": -1.5025948286056519, "loss": 4.6478, "rewards/accuracies": 0.5, "rewards/chosen": -18.105327606201172, "rewards/margins": -3.079379081726074, "rewards/rejected": -15.025949478149414, "step": 719 }, { "epoch": 0.09803921568627451, "grad_norm": 37.72936695455801, "learning_rate": 7.836734693877551e-07, "logits/chosen": -0.49009692668914795, "logits/rejected": -0.6882311105728149, "logps/chosen": -1.0606231689453125, "logps/rejected": -1.2865989208221436, "loss": 3.8677, "rewards/accuracies": 0.75, "rewards/chosen": -10.606231689453125, "rewards/margins": 2.2597579956054688, "rewards/rejected": -12.865989685058594, "step": 720 }, { "epoch": 0.09817538126361655, "grad_norm": 59.441092391606794, "learning_rate": 7.847619047619047e-07, "logits/chosen": 0.13285762071609497, "logits/rejected": 1.87190842628479, "logps/chosen": -1.344689130783081, "logps/rejected": -1.8044698238372803, "loss": 4.8789, "rewards/accuracies": 1.0, "rewards/chosen": -13.446891784667969, "rewards/margins": 4.597805976867676, "rewards/rejected": -18.044696807861328, "step": 721 }, { "epoch": 0.09831154684095861, "grad_norm": 49.6766291152574, "learning_rate": 7.858503401360544e-07, "logits/chosen": -0.47632408142089844, "logits/rejected": 0.5335745811462402, "logps/chosen": -1.3056421279907227, "logps/rejected": -1.4092220067977905, "loss": 4.4355, "rewards/accuracies": 0.5, "rewards/chosen": -13.05642032623291, "rewards/margins": 1.0358002185821533, "rewards/rejected": -14.092220306396484, "step": 722 }, { "epoch": 0.09844771241830065, "grad_norm": 40.256374132137275, "learning_rate": 7.86938775510204e-07, "logits/chosen": -0.824160635471344, "logits/rejected": 0.9001928567886353, "logps/chosen": -1.2354365587234497, "logps/rejected": -1.707230567932129, "loss": 4.085, "rewards/accuracies": 0.75, "rewards/chosen": -12.354365348815918, "rewards/margins": 4.717940330505371, "rewards/rejected": -17.07230567932129, "step": 723 }, { "epoch": 0.09858387799564271, "grad_norm": 44.06204272060546, "learning_rate": 7.880272108843537e-07, "logits/chosen": 0.0488055944442749, "logits/rejected": -0.6544159650802612, "logps/chosen": -2.258899211883545, "logps/rejected": -1.8060481548309326, "loss": 5.3003, "rewards/accuracies": 0.25, "rewards/chosen": -22.588993072509766, "rewards/margins": -4.528511047363281, "rewards/rejected": -18.060482025146484, "step": 724 }, { "epoch": 0.09872004357298475, "grad_norm": 43.87298547506148, "learning_rate": 7.891156462585033e-07, "logits/chosen": -1.183397889137268, "logits/rejected": -0.511728823184967, "logps/chosen": -1.0698847770690918, "logps/rejected": -1.4847863912582397, "loss": 4.2498, "rewards/accuracies": 1.0, "rewards/chosen": -10.698846817016602, "rewards/margins": 4.149016380310059, "rewards/rejected": -14.847864151000977, "step": 725 }, { "epoch": 0.0988562091503268, "grad_norm": 39.489952043180004, "learning_rate": 7.90204081632653e-07, "logits/chosen": -1.4574799537658691, "logits/rejected": -1.5426701307296753, "logps/chosen": -1.139153242111206, "logps/rejected": -1.0022411346435547, "loss": 4.7504, "rewards/accuracies": 0.25, "rewards/chosen": -11.391532897949219, "rewards/margins": -1.369121789932251, "rewards/rejected": -10.022411346435547, "step": 726 }, { "epoch": 0.09899237472766885, "grad_norm": 43.38088870537801, "learning_rate": 7.912925170068027e-07, "logits/chosen": -1.0274544954299927, "logits/rejected": 1.3594375848770142, "logps/chosen": -1.339028239250183, "logps/rejected": -1.745755672454834, "loss": 4.6918, "rewards/accuracies": 1.0, "rewards/chosen": -13.39028263092041, "rewards/margins": 4.067275047302246, "rewards/rejected": -17.457557678222656, "step": 727 }, { "epoch": 0.09912854030501089, "grad_norm": 45.55314076930362, "learning_rate": 7.923809523809524e-07, "logits/chosen": -2.158655881881714, "logits/rejected": 0.19467759132385254, "logps/chosen": -1.4235544204711914, "logps/rejected": -1.8347347974777222, "loss": 4.4757, "rewards/accuracies": 0.5, "rewards/chosen": -14.235544204711914, "rewards/margins": 4.111804008483887, "rewards/rejected": -18.347347259521484, "step": 728 }, { "epoch": 0.09926470588235294, "grad_norm": 58.25214772749213, "learning_rate": 7.93469387755102e-07, "logits/chosen": -0.26922571659088135, "logits/rejected": -0.8202770352363586, "logps/chosen": -1.2572438716888428, "logps/rejected": -1.1610374450683594, "loss": 4.6367, "rewards/accuracies": 0.25, "rewards/chosen": -12.572439193725586, "rewards/margins": -0.9620649814605713, "rewards/rejected": -11.610374450683594, "step": 729 }, { "epoch": 0.09940087145969499, "grad_norm": 40.37349820209022, "learning_rate": 7.945578231292517e-07, "logits/chosen": -0.6377133131027222, "logits/rejected": -0.34134382009506226, "logps/chosen": -1.4691821336746216, "logps/rejected": -1.4256865978240967, "loss": 4.099, "rewards/accuracies": 0.5, "rewards/chosen": -14.691821098327637, "rewards/margins": -0.4349546432495117, "rewards/rejected": -14.256866455078125, "step": 730 }, { "epoch": 0.09953703703703703, "grad_norm": 37.92461121781647, "learning_rate": 7.956462585034014e-07, "logits/chosen": -1.1494933366775513, "logits/rejected": 0.4769717752933502, "logps/chosen": -1.3526298999786377, "logps/rejected": -1.967796802520752, "loss": 4.5606, "rewards/accuracies": 0.75, "rewards/chosen": -13.526298522949219, "rewards/margins": 6.151669502258301, "rewards/rejected": -19.677967071533203, "step": 731 }, { "epoch": 0.09967320261437909, "grad_norm": 42.477671721218805, "learning_rate": 7.967346938775509e-07, "logits/chosen": -0.7631001472473145, "logits/rejected": -1.989106297492981, "logps/chosen": -1.4343900680541992, "logps/rejected": -1.3141846656799316, "loss": 4.206, "rewards/accuracies": 0.5, "rewards/chosen": -14.343899726867676, "rewards/margins": -1.2020537853240967, "rewards/rejected": -13.141845703125, "step": 732 }, { "epoch": 0.09980936819172113, "grad_norm": 53.67530694416586, "learning_rate": 7.978231292517007e-07, "logits/chosen": 1.0620880126953125, "logits/rejected": 1.9397039413452148, "logps/chosen": -1.5628434419631958, "logps/rejected": -1.9588817358016968, "loss": 4.3076, "rewards/accuracies": 0.75, "rewards/chosen": -15.628435134887695, "rewards/margins": 3.960383176803589, "rewards/rejected": -19.588817596435547, "step": 733 }, { "epoch": 0.09994553376906318, "grad_norm": 46.377241488582584, "learning_rate": 7.989115646258503e-07, "logits/chosen": 0.7463825345039368, "logits/rejected": 1.1524479389190674, "logps/chosen": -1.6604950428009033, "logps/rejected": -1.409916877746582, "loss": 4.2439, "rewards/accuracies": 0.0, "rewards/chosen": -16.604949951171875, "rewards/margins": -2.505781412124634, "rewards/rejected": -14.09916877746582, "step": 734 }, { "epoch": 0.10008169934640523, "grad_norm": 46.76161676021151, "learning_rate": 8e-07, "logits/chosen": -0.5108676552772522, "logits/rejected": 2.1995582580566406, "logps/chosen": -1.315900444984436, "logps/rejected": -2.013021469116211, "loss": 4.2627, "rewards/accuracies": 1.0, "rewards/chosen": -13.159004211425781, "rewards/margins": 6.9712114334106445, "rewards/rejected": -20.13021469116211, "step": 735 }, { "epoch": 0.10021786492374728, "grad_norm": 46.00742507247384, "learning_rate": 7.999999548083467e-07, "logits/chosen": -1.2346765995025635, "logits/rejected": 1.3776856660842896, "logps/chosen": -1.4780516624450684, "logps/rejected": -1.776558518409729, "loss": 4.087, "rewards/accuracies": 1.0, "rewards/chosen": -14.780516624450684, "rewards/margins": 2.9850690364837646, "rewards/rejected": -17.76558494567871, "step": 736 }, { "epoch": 0.10035403050108932, "grad_norm": 44.83102340729846, "learning_rate": 7.999998192333973e-07, "logits/chosen": -0.897120475769043, "logits/rejected": -0.11550307273864746, "logps/chosen": -1.0464494228363037, "logps/rejected": -1.2399767637252808, "loss": 4.2674, "rewards/accuracies": 0.75, "rewards/chosen": -10.464494705200195, "rewards/margins": 1.9352738857269287, "rewards/rejected": -12.39976692199707, "step": 737 }, { "epoch": 0.10049019607843138, "grad_norm": 40.90720255300272, "learning_rate": 7.999995932751822e-07, "logits/chosen": 1.2781093120574951, "logits/rejected": 0.14575031399726868, "logps/chosen": -2.104914426803589, "logps/rejected": -2.0117335319519043, "loss": 5.0062, "rewards/accuracies": 0.25, "rewards/chosen": -21.049144744873047, "rewards/margins": -0.9318099021911621, "rewards/rejected": -20.117332458496094, "step": 738 }, { "epoch": 0.10062636165577342, "grad_norm": 45.984388498758534, "learning_rate": 7.999992769337527e-07, "logits/chosen": 1.0772892236709595, "logits/rejected": 2.489861488342285, "logps/chosen": -1.359615445137024, "logps/rejected": -2.0309252738952637, "loss": 4.5872, "rewards/accuracies": 1.0, "rewards/chosen": -13.59615421295166, "rewards/margins": 6.713099479675293, "rewards/rejected": -20.309253692626953, "step": 739 }, { "epoch": 0.10076252723311546, "grad_norm": 45.95873104154777, "learning_rate": 7.999988702091802e-07, "logits/chosen": -0.042368821799755096, "logits/rejected": 0.18819379806518555, "logps/chosen": -1.2321606874465942, "logps/rejected": -1.896632432937622, "loss": 4.5198, "rewards/accuracies": 0.75, "rewards/chosen": -12.321606636047363, "rewards/margins": 6.644718170166016, "rewards/rejected": -18.966323852539062, "step": 740 }, { "epoch": 0.10089869281045752, "grad_norm": 45.023536913008556, "learning_rate": 7.999983731015564e-07, "logits/chosen": -0.47716349363327026, "logits/rejected": 0.13738465309143066, "logps/chosen": -1.3479433059692383, "logps/rejected": -1.5097033977508545, "loss": 4.5081, "rewards/accuracies": 0.5, "rewards/chosen": -13.479432106018066, "rewards/margins": 1.6176012754440308, "rewards/rejected": -15.097034454345703, "step": 741 }, { "epoch": 0.10103485838779956, "grad_norm": 40.49516626038055, "learning_rate": 7.999977856109937e-07, "logits/chosen": 0.322273850440979, "logits/rejected": 0.4344635009765625, "logps/chosen": -1.593541145324707, "logps/rejected": -1.6536016464233398, "loss": 4.0212, "rewards/accuracies": 0.75, "rewards/chosen": -15.93541145324707, "rewards/margins": 0.6006042957305908, "rewards/rejected": -16.5360164642334, "step": 742 }, { "epoch": 0.10117102396514162, "grad_norm": 44.527218753838554, "learning_rate": 7.999971077376251e-07, "logits/chosen": -0.2034805417060852, "logits/rejected": 0.4198349118232727, "logps/chosen": -1.2776243686676025, "logps/rejected": -1.5847891569137573, "loss": 4.002, "rewards/accuracies": 0.75, "rewards/chosen": -12.776244163513184, "rewards/margins": 3.0716471672058105, "rewards/rejected": -15.847890853881836, "step": 743 }, { "epoch": 0.10130718954248366, "grad_norm": 38.68334160457821, "learning_rate": 7.999963394816036e-07, "logits/chosen": -0.906157374382019, "logits/rejected": 0.4318722188472748, "logps/chosen": -1.2594432830810547, "logps/rejected": -1.5900355577468872, "loss": 4.2958, "rewards/accuracies": 0.75, "rewards/chosen": -12.59443187713623, "rewards/margins": 3.3059232234954834, "rewards/rejected": -15.900355339050293, "step": 744 }, { "epoch": 0.1014433551198257, "grad_norm": 51.353343806018685, "learning_rate": 7.999954808431027e-07, "logits/chosen": 0.23294037580490112, "logits/rejected": 1.7872135639190674, "logps/chosen": -1.533708095550537, "logps/rejected": -1.9631006717681885, "loss": 4.6974, "rewards/accuracies": 0.75, "rewards/chosen": -15.337081909179688, "rewards/margins": 4.293926239013672, "rewards/rejected": -19.63100814819336, "step": 745 }, { "epoch": 0.10157952069716776, "grad_norm": 44.48947245200595, "learning_rate": 7.999945318223166e-07, "logits/chosen": 0.280547559261322, "logits/rejected": 0.9209766387939453, "logps/chosen": -1.6304137706756592, "logps/rejected": -1.8409937620162964, "loss": 4.3922, "rewards/accuracies": 0.75, "rewards/chosen": -16.304136276245117, "rewards/margins": 2.1058011054992676, "rewards/rejected": -18.40993881225586, "step": 746 }, { "epoch": 0.1017156862745098, "grad_norm": 47.2469834727621, "learning_rate": 7.999934924194596e-07, "logits/chosen": 0.8606135845184326, "logits/rejected": 0.13383632898330688, "logps/chosen": -1.469611406326294, "logps/rejected": -1.4864981174468994, "loss": 4.1793, "rewards/accuracies": 0.5, "rewards/chosen": -14.696113586425781, "rewards/margins": 0.16886639595031738, "rewards/rejected": -14.864980697631836, "step": 747 }, { "epoch": 0.10185185185185185, "grad_norm": 51.173090754640626, "learning_rate": 7.999923626347666e-07, "logits/chosen": 0.5592957735061646, "logits/rejected": 0.9642077684402466, "logps/chosen": -1.8894916772842407, "logps/rejected": -1.557434320449829, "loss": 4.952, "rewards/accuracies": 0.25, "rewards/chosen": -18.894916534423828, "rewards/margins": -3.320573091506958, "rewards/rejected": -15.574344635009766, "step": 748 }, { "epoch": 0.1019880174291939, "grad_norm": 51.40699700530561, "learning_rate": 7.999911424684928e-07, "logits/chosen": 0.005308866500854492, "logits/rejected": -0.1343148946762085, "logps/chosen": -1.7332133054733276, "logps/rejected": -1.8322079181671143, "loss": 4.6041, "rewards/accuracies": 0.25, "rewards/chosen": -17.33213233947754, "rewards/margins": 0.9899454116821289, "rewards/rejected": -18.32207679748535, "step": 749 }, { "epoch": 0.10212418300653595, "grad_norm": 51.959298063086024, "learning_rate": 7.99989831920914e-07, "logits/chosen": 1.6166810989379883, "logits/rejected": -0.24686861038208008, "logps/chosen": -1.3326783180236816, "logps/rejected": -1.5660514831542969, "loss": 4.5576, "rewards/accuracies": 0.75, "rewards/chosen": -13.326783180236816, "rewards/margins": 2.3337321281433105, "rewards/rejected": -15.660514831542969, "step": 750 }, { "epoch": 0.102260348583878, "grad_norm": 42.361849392431196, "learning_rate": 7.999884309923265e-07, "logits/chosen": 0.16631072759628296, "logits/rejected": 1.6443976163864136, "logps/chosen": -1.6128628253936768, "logps/rejected": -1.7088284492492676, "loss": 3.7831, "rewards/accuracies": 0.75, "rewards/chosen": -16.12862777709961, "rewards/margins": 0.9596550464630127, "rewards/rejected": -17.08828353881836, "step": 751 }, { "epoch": 0.10239651416122005, "grad_norm": 52.02982656435625, "learning_rate": 7.999869396830466e-07, "logits/chosen": 0.5750711560249329, "logits/rejected": 0.8159037828445435, "logps/chosen": -2.030202627182007, "logps/rejected": -2.0596227645874023, "loss": 4.6521, "rewards/accuracies": 0.5, "rewards/chosen": -20.30202865600586, "rewards/margins": 0.294201135635376, "rewards/rejected": -20.596229553222656, "step": 752 }, { "epoch": 0.10253267973856209, "grad_norm": 38.9372409643313, "learning_rate": 7.999853579934114e-07, "logits/chosen": 0.7347349524497986, "logits/rejected": 1.8001577854156494, "logps/chosen": -1.752373456954956, "logps/rejected": -2.0807955265045166, "loss": 4.3954, "rewards/accuracies": 1.0, "rewards/chosen": -17.52373504638672, "rewards/margins": 3.2842206954956055, "rewards/rejected": -20.807954788208008, "step": 753 }, { "epoch": 0.10266884531590414, "grad_norm": 45.351511068571305, "learning_rate": 7.999836859237781e-07, "logits/chosen": -0.40798765420913696, "logits/rejected": 0.32381778955459595, "logps/chosen": -1.5169373750686646, "logps/rejected": -1.6966947317123413, "loss": 4.2296, "rewards/accuracies": 0.5, "rewards/chosen": -15.169373512268066, "rewards/margins": 1.797574758529663, "rewards/rejected": -16.966949462890625, "step": 754 }, { "epoch": 0.10280501089324619, "grad_norm": 42.02374519505962, "learning_rate": 7.999819234745248e-07, "logits/chosen": -0.1300109475851059, "logits/rejected": 0.8792860507965088, "logps/chosen": -1.367758870124817, "logps/rejected": -1.5508662462234497, "loss": 4.2657, "rewards/accuracies": 0.75, "rewards/chosen": -13.67758846282959, "rewards/margins": 1.8310739994049072, "rewards/rejected": -15.508663177490234, "step": 755 }, { "epoch": 0.10294117647058823, "grad_norm": 45.45263087837593, "learning_rate": 7.999800706460496e-07, "logits/chosen": 0.2849912941455841, "logits/rejected": 1.5089614391326904, "logps/chosen": -1.3870457410812378, "logps/rejected": -1.4939966201782227, "loss": 4.5786, "rewards/accuracies": 0.5, "rewards/chosen": -13.87045669555664, "rewards/margins": 1.0695087909698486, "rewards/rejected": -14.939966201782227, "step": 756 }, { "epoch": 0.10307734204793029, "grad_norm": 37.58335066523679, "learning_rate": 7.99978127438771e-07, "logits/chosen": 0.5464581251144409, "logits/rejected": 1.3172847032546997, "logps/chosen": -1.5842032432556152, "logps/rejected": -1.8092522621154785, "loss": 4.2965, "rewards/accuracies": 0.75, "rewards/chosen": -15.842032432556152, "rewards/margins": 2.2504894733428955, "rewards/rejected": -18.09252166748047, "step": 757 }, { "epoch": 0.10321350762527233, "grad_norm": 46.20894705300342, "learning_rate": 7.999760938531286e-07, "logits/chosen": -1.9875504970550537, "logits/rejected": -1.254569172859192, "logps/chosen": -1.3618805408477783, "logps/rejected": -1.3991626501083374, "loss": 4.7427, "rewards/accuracies": 0.75, "rewards/chosen": -13.618805885314941, "rewards/margins": 0.3728206157684326, "rewards/rejected": -13.991626739501953, "step": 758 }, { "epoch": 0.10334967320261437, "grad_norm": 44.371694476003775, "learning_rate": 7.999739698895813e-07, "logits/chosen": 1.33803129196167, "logits/rejected": 1.050602912902832, "logps/chosen": -1.406922459602356, "logps/rejected": -1.416944980621338, "loss": 4.2984, "rewards/accuracies": 0.5, "rewards/chosen": -14.069225311279297, "rewards/margins": 0.10022521018981934, "rewards/rejected": -14.169449806213379, "step": 759 }, { "epoch": 0.10348583877995643, "grad_norm": 39.75838869756418, "learning_rate": 7.999717555486093e-07, "logits/chosen": 1.103134036064148, "logits/rejected": 0.5269545316696167, "logps/chosen": -1.5032625198364258, "logps/rejected": -1.833913803100586, "loss": 4.5646, "rewards/accuracies": 0.75, "rewards/chosen": -15.032626152038574, "rewards/margins": 3.3065123558044434, "rewards/rejected": -18.33913803100586, "step": 760 }, { "epoch": 0.10362200435729847, "grad_norm": 46.043815750364395, "learning_rate": 7.99969450830713e-07, "logits/chosen": 1.8028452396392822, "logits/rejected": 2.311917304992676, "logps/chosen": -2.0503485202789307, "logps/rejected": -1.8697985410690308, "loss": 4.5417, "rewards/accuracies": 0.25, "rewards/chosen": -20.50348472595215, "rewards/margins": -1.8054988384246826, "rewards/rejected": -18.697986602783203, "step": 761 }, { "epoch": 0.10375816993464053, "grad_norm": 49.922291645644115, "learning_rate": 7.999670557364131e-07, "logits/chosen": -1.1546406745910645, "logits/rejected": 2.3604073524475098, "logps/chosen": -1.467210054397583, "logps/rejected": -1.9923110008239746, "loss": 4.3286, "rewards/accuracies": 0.75, "rewards/chosen": -14.672100067138672, "rewards/margins": 5.251008987426758, "rewards/rejected": -19.923110961914062, "step": 762 }, { "epoch": 0.10389433551198257, "grad_norm": 38.64410605510715, "learning_rate": 7.999645702662507e-07, "logits/chosen": -0.6993436813354492, "logits/rejected": 1.7371671199798584, "logps/chosen": -1.0313720703125, "logps/rejected": -1.3027536869049072, "loss": 4.4024, "rewards/accuracies": 0.75, "rewards/chosen": -10.313720703125, "rewards/margins": 2.713815450668335, "rewards/rejected": -13.027536392211914, "step": 763 }, { "epoch": 0.10403050108932461, "grad_norm": 44.94948151741874, "learning_rate": 7.999619944207876e-07, "logits/chosen": -0.8879281878471375, "logits/rejected": -0.38571667671203613, "logps/chosen": -1.8033292293548584, "logps/rejected": -1.5244849920272827, "loss": 5.0339, "rewards/accuracies": 0.25, "rewards/chosen": -18.033292770385742, "rewards/margins": -2.7884411811828613, "rewards/rejected": -15.244851112365723, "step": 764 }, { "epoch": 0.10416666666666667, "grad_norm": 47.90447331375952, "learning_rate": 7.999593282006057e-07, "logits/chosen": 1.0220603942871094, "logits/rejected": 2.5652663707733154, "logps/chosen": -1.6571190357208252, "logps/rejected": -2.234567880630493, "loss": 4.4294, "rewards/accuracies": 1.0, "rewards/chosen": -16.571189880371094, "rewards/margins": 5.77448844909668, "rewards/rejected": -22.345680236816406, "step": 765 }, { "epoch": 0.10430283224400871, "grad_norm": 42.93746027939843, "learning_rate": 7.999565716063075e-07, "logits/chosen": 1.7471057176589966, "logits/rejected": 0.5052383542060852, "logps/chosen": -1.8894391059875488, "logps/rejected": -1.9978009462356567, "loss": 4.2417, "rewards/accuracies": 0.5, "rewards/chosen": -18.894390106201172, "rewards/margins": 1.0836181640625, "rewards/rejected": -19.978008270263672, "step": 766 }, { "epoch": 0.10443899782135076, "grad_norm": 44.75308521872794, "learning_rate": 7.999537246385159e-07, "logits/chosen": 1.391936182975769, "logits/rejected": 1.979264736175537, "logps/chosen": -1.3208458423614502, "logps/rejected": -1.7198090553283691, "loss": 4.8704, "rewards/accuracies": 1.0, "rewards/chosen": -13.208457946777344, "rewards/margins": 3.989633083343506, "rewards/rejected": -17.198091506958008, "step": 767 }, { "epoch": 0.10457516339869281, "grad_norm": 42.3844555561235, "learning_rate": 7.999507872978741e-07, "logits/chosen": 0.08571749925613403, "logits/rejected": 1.41682767868042, "logps/chosen": -1.3275012969970703, "logps/rejected": -1.4475387334823608, "loss": 4.6326, "rewards/accuracies": 0.75, "rewards/chosen": -13.275012969970703, "rewards/margins": 1.2003741264343262, "rewards/rejected": -14.475387573242188, "step": 768 }, { "epoch": 0.10471132897603486, "grad_norm": 42.452747529948, "learning_rate": 7.99947759585046e-07, "logits/chosen": 0.07330568879842758, "logits/rejected": 0.7793669700622559, "logps/chosen": -1.2890889644622803, "logps/rejected": -1.373905897140503, "loss": 4.8448, "rewards/accuracies": 0.5, "rewards/chosen": -12.890890121459961, "rewards/margins": 0.8481690883636475, "rewards/rejected": -13.739059448242188, "step": 769 }, { "epoch": 0.10484749455337691, "grad_norm": 42.84668596349429, "learning_rate": 7.999446415007156e-07, "logits/chosen": 1.4900578260421753, "logits/rejected": -0.04155588150024414, "logps/chosen": -1.42629873752594, "logps/rejected": -1.1759291887283325, "loss": 4.4047, "rewards/accuracies": 0.5, "rewards/chosen": -14.26298713684082, "rewards/margins": -2.5036957263946533, "rewards/rejected": -11.759291648864746, "step": 770 }, { "epoch": 0.10498366013071896, "grad_norm": 47.603886428155384, "learning_rate": 7.999414330455873e-07, "logits/chosen": 2.1358940601348877, "logits/rejected": 2.920032501220703, "logps/chosen": -1.5609328746795654, "logps/rejected": -1.7076072692871094, "loss": 4.7882, "rewards/accuracies": 0.75, "rewards/chosen": -15.609329223632812, "rewards/margins": 1.4667434692382812, "rewards/rejected": -17.076072692871094, "step": 771 }, { "epoch": 0.105119825708061, "grad_norm": 41.585290740790875, "learning_rate": 7.999381342203864e-07, "logits/chosen": 2.3426434993743896, "logits/rejected": 3.2454495429992676, "logps/chosen": -1.9075422286987305, "logps/rejected": -2.384535312652588, "loss": 4.1084, "rewards/accuracies": 1.0, "rewards/chosen": -19.075424194335938, "rewards/margins": 4.769930362701416, "rewards/rejected": -23.845354080200195, "step": 772 }, { "epoch": 0.10525599128540306, "grad_norm": 54.621441368346126, "learning_rate": 7.999347450258582e-07, "logits/chosen": 0.0966683030128479, "logits/rejected": 1.137557029724121, "logps/chosen": -1.5221836566925049, "logps/rejected": -1.8010807037353516, "loss": 4.4663, "rewards/accuracies": 0.75, "rewards/chosen": -15.221835136413574, "rewards/margins": 2.788971424102783, "rewards/rejected": -18.010807037353516, "step": 773 }, { "epoch": 0.1053921568627451, "grad_norm": 42.046096561995164, "learning_rate": 7.999312654627684e-07, "logits/chosen": 1.1910027265548706, "logits/rejected": 1.0521016120910645, "logps/chosen": -1.6117336750030518, "logps/rejected": -1.5488284826278687, "loss": 4.1598, "rewards/accuracies": 0.75, "rewards/chosen": -16.11733627319336, "rewards/margins": -0.629051685333252, "rewards/rejected": -15.488285064697266, "step": 774 }, { "epoch": 0.10552832244008714, "grad_norm": 46.125004807518955, "learning_rate": 7.999276955319033e-07, "logits/chosen": 1.328837275505066, "logits/rejected": -0.3747066259384155, "logps/chosen": -2.042919635772705, "logps/rejected": -1.6462650299072266, "loss": 5.1737, "rewards/accuracies": 0.0, "rewards/chosen": -20.429195404052734, "rewards/margins": -3.9665462970733643, "rewards/rejected": -16.462648391723633, "step": 775 }, { "epoch": 0.1056644880174292, "grad_norm": 43.65934784339586, "learning_rate": 7.999240352340695e-07, "logits/chosen": -1.2119096517562866, "logits/rejected": 0.33803147077560425, "logps/chosen": -1.4546700716018677, "logps/rejected": -1.9309070110321045, "loss": 4.6109, "rewards/accuracies": 1.0, "rewards/chosen": -14.546700477600098, "rewards/margins": 4.7623701095581055, "rewards/rejected": -19.309070587158203, "step": 776 }, { "epoch": 0.10580065359477124, "grad_norm": 43.33889183914095, "learning_rate": 7.999202845700942e-07, "logits/chosen": -0.30413782596588135, "logits/rejected": -0.15817609429359436, "logps/chosen": -1.2539868354797363, "logps/rejected": -1.5481176376342773, "loss": 3.8188, "rewards/accuracies": 0.75, "rewards/chosen": -12.53986930847168, "rewards/margins": 2.9413070678710938, "rewards/rejected": -15.481176376342773, "step": 777 }, { "epoch": 0.10593681917211328, "grad_norm": 49.62453040028167, "learning_rate": 7.999164435408249e-07, "logits/chosen": 1.7273705005645752, "logits/rejected": 0.14834052324295044, "logps/chosen": -1.9452265501022339, "logps/rejected": -1.6694021224975586, "loss": 5.061, "rewards/accuracies": 0.25, "rewards/chosen": -19.452266693115234, "rewards/margins": -2.7582433223724365, "rewards/rejected": -16.69402313232422, "step": 778 }, { "epoch": 0.10607298474945534, "grad_norm": 44.798494189277086, "learning_rate": 7.999125121471293e-07, "logits/chosen": 3.3573546409606934, "logits/rejected": 3.5657966136932373, "logps/chosen": -1.9543980360031128, "logps/rejected": -2.419301986694336, "loss": 4.9165, "rewards/accuracies": 1.0, "rewards/chosen": -19.543981552124023, "rewards/margins": 4.649039268493652, "rewards/rejected": -24.19301986694336, "step": 779 }, { "epoch": 0.10620915032679738, "grad_norm": 51.24940232612136, "learning_rate": 7.99908490389896e-07, "logits/chosen": 2.394583225250244, "logits/rejected": 1.549010157585144, "logps/chosen": -1.5958573818206787, "logps/rejected": -1.6589035987854004, "loss": 4.6981, "rewards/accuracies": 0.5, "rewards/chosen": -15.958574295043945, "rewards/margins": 0.6304608583450317, "rewards/rejected": -16.589035034179688, "step": 780 }, { "epoch": 0.10634531590413944, "grad_norm": 46.396893674282545, "learning_rate": 7.999043782700334e-07, "logits/chosen": 1.9348478317260742, "logits/rejected": 2.2909460067749023, "logps/chosen": -1.9175128936767578, "logps/rejected": -2.0046744346618652, "loss": 4.2051, "rewards/accuracies": 0.5, "rewards/chosen": -19.175128936767578, "rewards/margins": 0.871612548828125, "rewards/rejected": -20.046741485595703, "step": 781 }, { "epoch": 0.10648148148148148, "grad_norm": 50.77244359722997, "learning_rate": 7.999001757884712e-07, "logits/chosen": 1.060349464416504, "logits/rejected": 1.4278526306152344, "logps/chosen": -1.6032949686050415, "logps/rejected": -1.7755917310714722, "loss": 4.315, "rewards/accuracies": 0.75, "rewards/chosen": -16.032949447631836, "rewards/margins": 1.7229681015014648, "rewards/rejected": -17.755916595458984, "step": 782 }, { "epoch": 0.10661764705882353, "grad_norm": 52.01786774384421, "learning_rate": 7.998958829461585e-07, "logits/chosen": 1.2047566175460815, "logits/rejected": 1.6644797325134277, "logps/chosen": -1.7201473712921143, "logps/rejected": -1.6446311473846436, "loss": 4.5462, "rewards/accuracies": 0.5, "rewards/chosen": -17.201473236083984, "rewards/margins": -0.7551627159118652, "rewards/rejected": -16.446311950683594, "step": 783 }, { "epoch": 0.10675381263616558, "grad_norm": 42.120326637715934, "learning_rate": 7.998914997440655e-07, "logits/chosen": 0.7624495029449463, "logits/rejected": 1.4675722122192383, "logps/chosen": -1.5322215557098389, "logps/rejected": -1.7168982028961182, "loss": 3.981, "rewards/accuracies": 0.75, "rewards/chosen": -15.322216033935547, "rewards/margins": 1.8467655181884766, "rewards/rejected": -17.168981552124023, "step": 784 }, { "epoch": 0.10688997821350762, "grad_norm": 46.47588457544468, "learning_rate": 7.998870261831825e-07, "logits/chosen": 1.7723464965820312, "logits/rejected": 0.5840928554534912, "logps/chosen": -1.2676290273666382, "logps/rejected": -1.2153995037078857, "loss": 4.3321, "rewards/accuracies": 0.25, "rewards/chosen": -12.676290512084961, "rewards/margins": -0.5222959518432617, "rewards/rejected": -12.1539945602417, "step": 785 }, { "epoch": 0.10702614379084967, "grad_norm": 44.602173593610786, "learning_rate": 7.998824622645205e-07, "logits/chosen": 0.02989286184310913, "logits/rejected": -0.07318854331970215, "logps/chosen": -1.132688283920288, "logps/rejected": -1.4367401599884033, "loss": 4.3376, "rewards/accuracies": 0.75, "rewards/chosen": -11.326883316040039, "rewards/margins": 3.040518283843994, "rewards/rejected": -14.367401123046875, "step": 786 }, { "epoch": 0.10716230936819172, "grad_norm": 48.237451877159515, "learning_rate": 7.998778079891108e-07, "logits/chosen": 1.5195659399032593, "logits/rejected": 2.7613930702209473, "logps/chosen": -1.5405583381652832, "logps/rejected": -1.5658700466156006, "loss": 4.9749, "rewards/accuracies": 0.75, "rewards/chosen": -15.405582427978516, "rewards/margins": 0.25311827659606934, "rewards/rejected": -15.65870189666748, "step": 787 }, { "epoch": 0.10729847494553377, "grad_norm": 46.34053153547859, "learning_rate": 7.998730633580049e-07, "logits/chosen": 1.7733864784240723, "logits/rejected": 1.9103527069091797, "logps/chosen": -1.448975682258606, "logps/rejected": -1.4454541206359863, "loss": 4.5179, "rewards/accuracies": 0.5, "rewards/chosen": -14.489757537841797, "rewards/margins": -0.03521597385406494, "rewards/rejected": -14.45454216003418, "step": 788 }, { "epoch": 0.10743464052287582, "grad_norm": 47.4707477579344, "learning_rate": 7.998682283722749e-07, "logits/chosen": -0.34334930777549744, "logits/rejected": -0.2384556531906128, "logps/chosen": -1.589897632598877, "logps/rejected": -1.6089590787887573, "loss": 4.5898, "rewards/accuracies": 0.5, "rewards/chosen": -15.898975372314453, "rewards/margins": 0.1906147003173828, "rewards/rejected": -16.08959197998047, "step": 789 }, { "epoch": 0.10757080610021787, "grad_norm": 43.728134658446514, "learning_rate": 7.998633030330134e-07, "logits/chosen": 0.7546036243438721, "logits/rejected": 1.1618421077728271, "logps/chosen": -1.5561189651489258, "logps/rejected": -1.8480472564697266, "loss": 3.5626, "rewards/accuracies": 1.0, "rewards/chosen": -15.561190605163574, "rewards/margins": 2.9192826747894287, "rewards/rejected": -18.480472564697266, "step": 790 }, { "epoch": 0.10770697167755991, "grad_norm": 52.06956691020298, "learning_rate": 7.998582873413332e-07, "logits/chosen": 2.343764543533325, "logits/rejected": 2.720688819885254, "logps/chosen": -2.3316140174865723, "logps/rejected": -2.1328868865966797, "loss": 4.5931, "rewards/accuracies": 0.75, "rewards/chosen": -23.31614112854004, "rewards/margins": -1.9872727394104004, "rewards/rejected": -21.328866958618164, "step": 791 }, { "epoch": 0.10784313725490197, "grad_norm": 55.673332739264985, "learning_rate": 7.998531812983677e-07, "logits/chosen": 2.534262180328369, "logits/rejected": 2.119086980819702, "logps/chosen": -1.784118890762329, "logps/rejected": -1.610933780670166, "loss": 4.9427, "rewards/accuracies": 0.25, "rewards/chosen": -17.841188430786133, "rewards/margins": -1.7318520545959473, "rewards/rejected": -16.109336853027344, "step": 792 }, { "epoch": 0.10797930283224401, "grad_norm": 48.37927409531654, "learning_rate": 7.998479849052709e-07, "logits/chosen": 1.4289524555206299, "logits/rejected": 2.2518985271453857, "logps/chosen": -1.4624829292297363, "logps/rejected": -1.9227702617645264, "loss": 4.6585, "rewards/accuracies": 0.75, "rewards/chosen": -14.62483024597168, "rewards/margins": 4.6028733253479, "rewards/rejected": -19.227703094482422, "step": 793 }, { "epoch": 0.10811546840958605, "grad_norm": 39.60917882011979, "learning_rate": 7.998426981632164e-07, "logits/chosen": 1.9969346523284912, "logits/rejected": 3.416038751602173, "logps/chosen": -1.7362511157989502, "logps/rejected": -1.9682178497314453, "loss": 4.3581, "rewards/accuracies": 0.75, "rewards/chosen": -17.362510681152344, "rewards/margins": 2.319667100906372, "rewards/rejected": -19.682178497314453, "step": 794 }, { "epoch": 0.10825163398692811, "grad_norm": 42.0380064537613, "learning_rate": 7.998373210733992e-07, "logits/chosen": 1.56708824634552, "logits/rejected": 3.3601059913635254, "logps/chosen": -1.669676661491394, "logps/rejected": -2.1744823455810547, "loss": 4.1041, "rewards/accuracies": 1.0, "rewards/chosen": -16.696765899658203, "rewards/margins": 5.048057556152344, "rewards/rejected": -21.744823455810547, "step": 795 }, { "epoch": 0.10838779956427015, "grad_norm": 47.791787644649645, "learning_rate": 7.998318536370344e-07, "logits/chosen": 1.3217236995697021, "logits/rejected": 1.537630558013916, "logps/chosen": -2.0219311714172363, "logps/rejected": -1.9770113229751587, "loss": 4.8442, "rewards/accuracies": 0.25, "rewards/chosen": -20.219310760498047, "rewards/margins": -0.44919776916503906, "rewards/rejected": -19.770111083984375, "step": 796 }, { "epoch": 0.1085239651416122, "grad_norm": 44.749519460276936, "learning_rate": 7.998262958553571e-07, "logits/chosen": -0.9626508951187134, "logits/rejected": 0.8505517244338989, "logps/chosen": -1.3389331102371216, "logps/rejected": -1.4971683025360107, "loss": 4.68, "rewards/accuracies": 1.0, "rewards/chosen": -13.389331817626953, "rewards/margins": 1.5823519229888916, "rewards/rejected": -14.971683502197266, "step": 797 }, { "epoch": 0.10866013071895425, "grad_norm": 71.07956538400349, "learning_rate": 7.998206477296233e-07, "logits/chosen": -0.6165209412574768, "logits/rejected": -0.512959897518158, "logps/chosen": -1.3097622394561768, "logps/rejected": -1.1828570365905762, "loss": 5.1712, "rewards/accuracies": 0.5, "rewards/chosen": -13.097623825073242, "rewards/margins": -1.2690527439117432, "rewards/rejected": -11.828569412231445, "step": 798 }, { "epoch": 0.1087962962962963, "grad_norm": 49.962577648855664, "learning_rate": 7.998149092611092e-07, "logits/chosen": 2.796869993209839, "logits/rejected": 2.792573928833008, "logps/chosen": -2.012871503829956, "logps/rejected": -1.8833363056182861, "loss": 4.2909, "rewards/accuracies": 0.5, "rewards/chosen": -20.12871551513672, "rewards/margins": -1.2953510284423828, "rewards/rejected": -18.833362579345703, "step": 799 }, { "epoch": 0.10893246187363835, "grad_norm": 42.03416020266132, "learning_rate": 7.998090804511114e-07, "logits/chosen": 1.2368518114089966, "logits/rejected": 1.657493233680725, "logps/chosen": -1.4247905015945435, "logps/rejected": -1.5928130149841309, "loss": 5.063, "rewards/accuracies": 0.75, "rewards/chosen": -14.247905731201172, "rewards/margins": 1.6802233457565308, "rewards/rejected": -15.928129196166992, "step": 800 }, { "epoch": 0.1090686274509804, "grad_norm": 45.11474063518245, "learning_rate": 7.99803161300947e-07, "logits/chosen": 3.178882598876953, "logits/rejected": 2.3935508728027344, "logps/chosen": -1.7421722412109375, "logps/rejected": -1.6688203811645508, "loss": 4.5186, "rewards/accuracies": 0.5, "rewards/chosen": -17.421722412109375, "rewards/margins": -0.7335193157196045, "rewards/rejected": -16.688203811645508, "step": 801 }, { "epoch": 0.10920479302832244, "grad_norm": 47.572846063372005, "learning_rate": 7.997971518119536e-07, "logits/chosen": -0.13488692045211792, "logits/rejected": 1.5874485969543457, "logps/chosen": -1.198693037033081, "logps/rejected": -1.5744705200195312, "loss": 5.0111, "rewards/accuracies": 1.0, "rewards/chosen": -11.986930847167969, "rewards/margins": 3.7577755451202393, "rewards/rejected": -15.744706153869629, "step": 802 }, { "epoch": 0.10934095860566449, "grad_norm": 46.50829352148976, "learning_rate": 7.997910519854888e-07, "logits/chosen": -1.6249284744262695, "logits/rejected": -0.8311495780944824, "logps/chosen": -1.041746973991394, "logps/rejected": -1.1036317348480225, "loss": 4.4458, "rewards/accuracies": 0.5, "rewards/chosen": -10.417469024658203, "rewards/margins": 0.6188468933105469, "rewards/rejected": -11.03631591796875, "step": 803 }, { "epoch": 0.10947712418300654, "grad_norm": 43.54769042923091, "learning_rate": 7.997848618229312e-07, "logits/chosen": -1.8649637699127197, "logits/rejected": -0.04868261516094208, "logps/chosen": -1.0966370105743408, "logps/rejected": -1.2237751483917236, "loss": 4.4914, "rewards/accuracies": 0.75, "rewards/chosen": -10.96636962890625, "rewards/margins": 1.2713816165924072, "rewards/rejected": -12.237751960754395, "step": 804 }, { "epoch": 0.10961328976034858, "grad_norm": 38.32800554753021, "learning_rate": 7.997785813256795e-07, "logits/chosen": 0.9221013188362122, "logits/rejected": 2.054861068725586, "logps/chosen": -1.3827762603759766, "logps/rejected": -1.6495500802993774, "loss": 4.2179, "rewards/accuracies": 1.0, "rewards/chosen": -13.827762603759766, "rewards/margins": 2.667738199234009, "rewards/rejected": -16.495500564575195, "step": 805 }, { "epoch": 0.10974945533769064, "grad_norm": 42.06339709962767, "learning_rate": 7.997722104951527e-07, "logits/chosen": 1.0834029912948608, "logits/rejected": 1.3104156255722046, "logps/chosen": -1.7429174184799194, "logps/rejected": -1.667360782623291, "loss": 4.5296, "rewards/accuracies": 0.0, "rewards/chosen": -17.429174423217773, "rewards/margins": -0.755565881729126, "rewards/rejected": -16.673608779907227, "step": 806 }, { "epoch": 0.10988562091503268, "grad_norm": 35.34577236272051, "learning_rate": 7.997657493327904e-07, "logits/chosen": -0.10088402032852173, "logits/rejected": 1.5857336521148682, "logps/chosen": -1.3145780563354492, "logps/rejected": -1.5547659397125244, "loss": 4.2937, "rewards/accuracies": 0.75, "rewards/chosen": -13.145780563354492, "rewards/margins": 2.4018783569335938, "rewards/rejected": -15.547659873962402, "step": 807 }, { "epoch": 0.11002178649237472, "grad_norm": 41.15252531048562, "learning_rate": 7.997591978400525e-07, "logits/chosen": 1.1905560493469238, "logits/rejected": -0.5407991409301758, "logps/chosen": -1.365112066268921, "logps/rejected": -1.343386173248291, "loss": 4.6042, "rewards/accuracies": 0.5, "rewards/chosen": -13.651121139526367, "rewards/margins": -0.21725869178771973, "rewards/rejected": -13.433862686157227, "step": 808 }, { "epoch": 0.11015795206971678, "grad_norm": 49.586219293875615, "learning_rate": 7.997525560184194e-07, "logits/chosen": -0.5955168008804321, "logits/rejected": -0.3867112398147583, "logps/chosen": -1.6349142789840698, "logps/rejected": -1.4682430028915405, "loss": 4.2502, "rewards/accuracies": 0.25, "rewards/chosen": -16.34914207458496, "rewards/margins": -1.6667118072509766, "rewards/rejected": -14.682430267333984, "step": 809 }, { "epoch": 0.11029411764705882, "grad_norm": 43.19229647146315, "learning_rate": 7.997458238693919e-07, "logits/chosen": 1.6922736167907715, "logits/rejected": 2.4774770736694336, "logps/chosen": -1.5199089050292969, "logps/rejected": -1.572237253189087, "loss": 4.5586, "rewards/accuracies": 0.75, "rewards/chosen": -15.199089050292969, "rewards/margins": 0.5232837200164795, "rewards/rejected": -15.722373008728027, "step": 810 }, { "epoch": 0.11043028322440088, "grad_norm": 40.5196345591091, "learning_rate": 7.997390013944912e-07, "logits/chosen": -0.2795543074607849, "logits/rejected": 2.336671829223633, "logps/chosen": -1.3175506591796875, "logps/rejected": -1.6040173768997192, "loss": 4.9312, "rewards/accuracies": 0.75, "rewards/chosen": -13.175506591796875, "rewards/margins": 2.864666700363159, "rewards/rejected": -16.040172576904297, "step": 811 }, { "epoch": 0.11056644880174292, "grad_norm": 42.66296498026488, "learning_rate": 7.997320885952587e-07, "logits/chosen": 2.6417489051818848, "logits/rejected": 2.220654249191284, "logps/chosen": -1.9035792350769043, "logps/rejected": -1.911035180091858, "loss": 4.3155, "rewards/accuracies": 0.5, "rewards/chosen": -19.03579330444336, "rewards/margins": 0.07455945014953613, "rewards/rejected": -19.1103515625, "step": 812 }, { "epoch": 0.11070261437908496, "grad_norm": 53.457458493564744, "learning_rate": 7.997250854732567e-07, "logits/chosen": 0.9791847467422485, "logits/rejected": 2.389136791229248, "logps/chosen": -1.683013916015625, "logps/rejected": -2.0108375549316406, "loss": 4.2544, "rewards/accuracies": 1.0, "rewards/chosen": -16.83013916015625, "rewards/margins": 3.278236150741577, "rewards/rejected": -20.108375549316406, "step": 813 }, { "epoch": 0.11083877995642702, "grad_norm": 37.36550821896362, "learning_rate": 7.997179920300675e-07, "logits/chosen": 0.5718583464622498, "logits/rejected": -0.5942140817642212, "logps/chosen": -1.3514230251312256, "logps/rejected": -1.2181285619735718, "loss": 3.8591, "rewards/accuracies": 0.5, "rewards/chosen": -13.514229774475098, "rewards/margins": -1.332944393157959, "rewards/rejected": -12.181285858154297, "step": 814 }, { "epoch": 0.11097494553376906, "grad_norm": 86.95836167678615, "learning_rate": 7.997108082672939e-07, "logits/chosen": -1.129970908164978, "logits/rejected": -0.19228291511535645, "logps/chosen": -1.185614824295044, "logps/rejected": -1.3346645832061768, "loss": 5.2188, "rewards/accuracies": 0.75, "rewards/chosen": -11.856147766113281, "rewards/margins": 1.4904981851577759, "rewards/rejected": -13.34664535522461, "step": 815 }, { "epoch": 0.1111111111111111, "grad_norm": 44.28843438066439, "learning_rate": 7.99703534186559e-07, "logits/chosen": 1.2686960697174072, "logits/rejected": 3.173292398452759, "logps/chosen": -1.4345290660858154, "logps/rejected": -1.7055399417877197, "loss": 4.1895, "rewards/accuracies": 0.75, "rewards/chosen": -14.345291137695312, "rewards/margins": 2.7101078033447266, "rewards/rejected": -17.055400848388672, "step": 816 }, { "epoch": 0.11124727668845316, "grad_norm": 42.69582083380421, "learning_rate": 7.996961697895066e-07, "logits/chosen": 1.1418542861938477, "logits/rejected": 3.2451326847076416, "logps/chosen": -1.3706974983215332, "logps/rejected": -1.6633399724960327, "loss": 4.5434, "rewards/accuracies": 0.75, "rewards/chosen": -13.706974983215332, "rewards/margins": 2.926424741744995, "rewards/rejected": -16.633399963378906, "step": 817 }, { "epoch": 0.1113834422657952, "grad_norm": 43.83274390275088, "learning_rate": 7.996887150778008e-07, "logits/chosen": 1.2151288986206055, "logits/rejected": 2.7889676094055176, "logps/chosen": -1.630746841430664, "logps/rejected": -1.852696418762207, "loss": 4.146, "rewards/accuracies": 0.5, "rewards/chosen": -16.30746841430664, "rewards/margins": 2.219496488571167, "rewards/rejected": -18.52696418762207, "step": 818 }, { "epoch": 0.11151960784313726, "grad_norm": 39.95008659989285, "learning_rate": 7.99681170053126e-07, "logits/chosen": 2.2801849842071533, "logits/rejected": 3.9853124618530273, "logps/chosen": -1.5699419975280762, "logps/rejected": -1.7378953695297241, "loss": 4.4334, "rewards/accuracies": 0.75, "rewards/chosen": -15.699420928955078, "rewards/margins": 1.6795332431793213, "rewards/rejected": -17.37895393371582, "step": 819 }, { "epoch": 0.1116557734204793, "grad_norm": 42.850387788794556, "learning_rate": 7.996735347171869e-07, "logits/chosen": -1.4442167282104492, "logits/rejected": -0.21529269218444824, "logps/chosen": -1.5274709463119507, "logps/rejected": -1.895316481590271, "loss": 4.4, "rewards/accuracies": 0.75, "rewards/chosen": -15.274709701538086, "rewards/margins": 3.6784555912017822, "rewards/rejected": -18.95316505432129, "step": 820 }, { "epoch": 0.11179193899782135, "grad_norm": 38.261268811596004, "learning_rate": 7.996658090717091e-07, "logits/chosen": 2.9251387119293213, "logits/rejected": 2.5341591835021973, "logps/chosen": -1.5982091426849365, "logps/rejected": -1.6341652870178223, "loss": 4.1409, "rewards/accuracies": 0.5, "rewards/chosen": -15.982090950012207, "rewards/margins": 0.3595621585845947, "rewards/rejected": -16.34165382385254, "step": 821 }, { "epoch": 0.1119281045751634, "grad_norm": 44.588141087168076, "learning_rate": 7.996579931184378e-07, "logits/chosen": 2.2755961418151855, "logits/rejected": 2.2831172943115234, "logps/chosen": -1.613018274307251, "logps/rejected": -1.8200173377990723, "loss": 4.7425, "rewards/accuracies": 0.75, "rewards/chosen": -16.130184173583984, "rewards/margins": 2.069991111755371, "rewards/rejected": -18.20017433166504, "step": 822 }, { "epoch": 0.11206427015250545, "grad_norm": 39.60525844493314, "learning_rate": 7.996500868591395e-07, "logits/chosen": -0.23103633522987366, "logits/rejected": 1.9989025592803955, "logps/chosen": -1.2241528034210205, "logps/rejected": -1.5441884994506836, "loss": 4.2931, "rewards/accuracies": 1.0, "rewards/chosen": -12.24152946472168, "rewards/margins": 3.2003560066223145, "rewards/rejected": -15.441884994506836, "step": 823 }, { "epoch": 0.11220043572984749, "grad_norm": 44.264652928431026, "learning_rate": 7.996420902956006e-07, "logits/chosen": 0.9468441605567932, "logits/rejected": 2.2608304023742676, "logps/chosen": -1.3547320365905762, "logps/rejected": -1.7074940204620361, "loss": 4.1671, "rewards/accuracies": 1.0, "rewards/chosen": -13.547320365905762, "rewards/margins": 3.527620792388916, "rewards/rejected": -17.074939727783203, "step": 824 }, { "epoch": 0.11233660130718955, "grad_norm": 48.128852729169424, "learning_rate": 7.996340034296277e-07, "logits/chosen": 1.7506422996520996, "logits/rejected": 2.495896577835083, "logps/chosen": -1.4920201301574707, "logps/rejected": -1.5008001327514648, "loss": 4.386, "rewards/accuracies": 0.75, "rewards/chosen": -14.92020034790039, "rewards/margins": 0.08780074119567871, "rewards/rejected": -15.008001327514648, "step": 825 }, { "epoch": 0.11247276688453159, "grad_norm": 44.265687670636545, "learning_rate": 7.996258262630485e-07, "logits/chosen": 2.421787738800049, "logits/rejected": 3.4354467391967773, "logps/chosen": -1.657911777496338, "logps/rejected": -1.8589850664138794, "loss": 3.9308, "rewards/accuracies": 0.5, "rewards/chosen": -16.579118728637695, "rewards/margins": 2.0107316970825195, "rewards/rejected": -18.58985137939453, "step": 826 }, { "epoch": 0.11260893246187363, "grad_norm": 39.685908143113394, "learning_rate": 7.996175587977104e-07, "logits/chosen": 1.752053141593933, "logits/rejected": 0.7705234289169312, "logps/chosen": -1.4807262420654297, "logps/rejected": -1.29343581199646, "loss": 4.4555, "rewards/accuracies": 0.25, "rewards/chosen": -14.80726146697998, "rewards/margins": -1.8729026317596436, "rewards/rejected": -12.934358596801758, "step": 827 }, { "epoch": 0.11274509803921569, "grad_norm": 47.84132394142951, "learning_rate": 7.996092010354817e-07, "logits/chosen": 2.135287046432495, "logits/rejected": 2.1528279781341553, "logps/chosen": -1.538193702697754, "logps/rejected": -1.5659382343292236, "loss": 4.6281, "rewards/accuracies": 0.75, "rewards/chosen": -15.381937980651855, "rewards/margins": 0.27744460105895996, "rewards/rejected": -15.659381866455078, "step": 828 }, { "epoch": 0.11288126361655773, "grad_norm": 41.72236335878688, "learning_rate": 7.996007529782508e-07, "logits/chosen": 3.0798511505126953, "logits/rejected": 3.3644068241119385, "logps/chosen": -2.07153058052063, "logps/rejected": -2.362374782562256, "loss": 3.8606, "rewards/accuracies": 0.75, "rewards/chosen": -20.715307235717773, "rewards/margins": 2.908442497253418, "rewards/rejected": -23.623748779296875, "step": 829 }, { "epoch": 0.11301742919389979, "grad_norm": 44.572744468730576, "learning_rate": 7.995922146279267e-07, "logits/chosen": 1.6180964708328247, "logits/rejected": 2.2122201919555664, "logps/chosen": -1.9738147258758545, "logps/rejected": -2.022217035293579, "loss": 4.3597, "rewards/accuracies": 0.5, "rewards/chosen": -19.738147735595703, "rewards/margins": 0.4840216636657715, "rewards/rejected": -20.22216796875, "step": 830 }, { "epoch": 0.11315359477124183, "grad_norm": 47.3185262969345, "learning_rate": 7.995835859864385e-07, "logits/chosen": 2.955887794494629, "logits/rejected": 0.3346899151802063, "logps/chosen": -1.395180106163025, "logps/rejected": -1.255450963973999, "loss": 4.9122, "rewards/accuracies": 0.25, "rewards/chosen": -13.951801300048828, "rewards/margins": -1.3972911834716797, "rewards/rejected": -12.554510116577148, "step": 831 }, { "epoch": 0.11328976034858387, "grad_norm": 59.227715455255606, "learning_rate": 7.995748670557361e-07, "logits/chosen": 4.161682605743408, "logits/rejected": 3.509000062942505, "logps/chosen": -2.5619614124298096, "logps/rejected": -2.225311756134033, "loss": 5.0492, "rewards/accuracies": 0.5, "rewards/chosen": -25.619613647460938, "rewards/margins": -3.3664956092834473, "rewards/rejected": -22.25311851501465, "step": 832 }, { "epoch": 0.11342592592592593, "grad_norm": 50.53096079337673, "learning_rate": 7.995660578377897e-07, "logits/chosen": 2.8084771633148193, "logits/rejected": 2.370011568069458, "logps/chosen": -1.3440096378326416, "logps/rejected": -1.364070177078247, "loss": 4.4065, "rewards/accuracies": 0.5, "rewards/chosen": -13.440096855163574, "rewards/margins": 0.2006053924560547, "rewards/rejected": -13.640701293945312, "step": 833 }, { "epoch": 0.11356209150326797, "grad_norm": 44.0042944948044, "learning_rate": 7.995571583345896e-07, "logits/chosen": 2.738004207611084, "logits/rejected": 4.2232818603515625, "logps/chosen": -1.8931374549865723, "logps/rejected": -1.5626740455627441, "loss": 4.6314, "rewards/accuracies": 0.0, "rewards/chosen": -18.931373596191406, "rewards/margins": -3.3046324253082275, "rewards/rejected": -15.626741409301758, "step": 834 }, { "epoch": 0.11369825708061002, "grad_norm": 41.03197631932184, "learning_rate": 7.995481685481467e-07, "logits/chosen": 1.3312965631484985, "logits/rejected": 2.5171711444854736, "logps/chosen": -1.4905967712402344, "logps/rejected": -1.7261977195739746, "loss": 4.1587, "rewards/accuracies": 1.0, "rewards/chosen": -14.905967712402344, "rewards/margins": 2.356008291244507, "rewards/rejected": -17.26197624206543, "step": 835 }, { "epoch": 0.11383442265795207, "grad_norm": 46.90304311271346, "learning_rate": 7.995390884804925e-07, "logits/chosen": 2.896207571029663, "logits/rejected": 3.531282424926758, "logps/chosen": -1.8912467956542969, "logps/rejected": -1.9229722023010254, "loss": 4.4022, "rewards/accuracies": 0.75, "rewards/chosen": -18.91246795654297, "rewards/margins": 0.31725406646728516, "rewards/rejected": -19.229721069335938, "step": 836 }, { "epoch": 0.11397058823529412, "grad_norm": 46.07314751988449, "learning_rate": 7.995299181336787e-07, "logits/chosen": 1.9982068538665771, "logits/rejected": 3.4526779651641846, "logps/chosen": -1.5250093936920166, "logps/rejected": -1.8938385248184204, "loss": 4.4738, "rewards/accuracies": 1.0, "rewards/chosen": -15.250094413757324, "rewards/margins": 3.68829083442688, "rewards/rejected": -18.938385009765625, "step": 837 }, { "epoch": 0.11410675381263617, "grad_norm": 43.47874445265805, "learning_rate": 7.995206575097774e-07, "logits/chosen": 2.553567886352539, "logits/rejected": 3.0127320289611816, "logps/chosen": -1.7549365758895874, "logps/rejected": -1.881333589553833, "loss": 4.5364, "rewards/accuracies": 0.5, "rewards/chosen": -17.549365997314453, "rewards/margins": 1.263970136642456, "rewards/rejected": -18.813335418701172, "step": 838 }, { "epoch": 0.11424291938997821, "grad_norm": 64.58009094122423, "learning_rate": 7.995113066108809e-07, "logits/chosen": 0.9528110027313232, "logits/rejected": 2.5350341796875, "logps/chosen": -1.5465189218521118, "logps/rejected": -1.6440314054489136, "loss": 4.4812, "rewards/accuracies": 0.75, "rewards/chosen": -15.465188980102539, "rewards/margins": 0.975125789642334, "rewards/rejected": -16.44031524658203, "step": 839 }, { "epoch": 0.11437908496732026, "grad_norm": 47.44221624778511, "learning_rate": 7.995018654391023e-07, "logits/chosen": 2.2601823806762695, "logits/rejected": 3.117448329925537, "logps/chosen": -1.259955644607544, "logps/rejected": -1.7267506122589111, "loss": 4.1959, "rewards/accuracies": 1.0, "rewards/chosen": -12.599555969238281, "rewards/margins": 4.667950630187988, "rewards/rejected": -17.267505645751953, "step": 840 }, { "epoch": 0.11451525054466231, "grad_norm": 41.98831457302339, "learning_rate": 7.99492333996575e-07, "logits/chosen": 1.1728336811065674, "logits/rejected": 3.7065720558166504, "logps/chosen": -1.482170581817627, "logps/rejected": -1.8663612604141235, "loss": 4.251, "rewards/accuracies": 1.0, "rewards/chosen": -14.821704864501953, "rewards/margins": 3.841907501220703, "rewards/rejected": -18.663612365722656, "step": 841 }, { "epoch": 0.11465141612200436, "grad_norm": 46.70692654102102, "learning_rate": 7.994827122854523e-07, "logits/chosen": 1.1253788471221924, "logits/rejected": 2.6127429008483887, "logps/chosen": -1.398816466331482, "logps/rejected": -1.6793115139007568, "loss": 4.4975, "rewards/accuracies": 0.75, "rewards/chosen": -13.988163948059082, "rewards/margins": 2.8049514293670654, "rewards/rejected": -16.793115615844727, "step": 842 }, { "epoch": 0.1147875816993464, "grad_norm": 47.70774277497766, "learning_rate": 7.994730003079089e-07, "logits/chosen": 2.3115875720977783, "logits/rejected": 0.31275153160095215, "logps/chosen": -1.9772289991378784, "logps/rejected": -1.419812560081482, "loss": 4.4677, "rewards/accuracies": 0.0, "rewards/chosen": -19.772289276123047, "rewards/margins": -5.574163913726807, "rewards/rejected": -14.198125839233398, "step": 843 }, { "epoch": 0.11492374727668846, "grad_norm": 49.57295789355476, "learning_rate": 7.994631980661389e-07, "logits/chosen": 1.6446104049682617, "logits/rejected": 2.8558950424194336, "logps/chosen": -1.72159743309021, "logps/rejected": -2.329660177230835, "loss": 4.6061, "rewards/accuracies": 0.75, "rewards/chosen": -17.215972900390625, "rewards/margins": 6.08062744140625, "rewards/rejected": -23.296602249145508, "step": 844 }, { "epoch": 0.1150599128540305, "grad_norm": 46.61842888218592, "learning_rate": 7.994533055623573e-07, "logits/chosen": 0.8966684341430664, "logits/rejected": 3.727719306945801, "logps/chosen": -1.4842970371246338, "logps/rejected": -1.8008421659469604, "loss": 4.1523, "rewards/accuracies": 0.75, "rewards/chosen": -14.842970848083496, "rewards/margins": 3.165449857711792, "rewards/rejected": -18.008420944213867, "step": 845 }, { "epoch": 0.11519607843137254, "grad_norm": 54.85005246381479, "learning_rate": 7.994433227987996e-07, "logits/chosen": -0.49391043186187744, "logits/rejected": -0.2893037796020508, "logps/chosen": -1.4544470310211182, "logps/rejected": -1.7277920246124268, "loss": 3.8415, "rewards/accuracies": 1.0, "rewards/chosen": -14.544469833374023, "rewards/margins": 2.733449935913086, "rewards/rejected": -17.27791976928711, "step": 846 }, { "epoch": 0.1153322440087146, "grad_norm": 41.24009811127357, "learning_rate": 7.994332497777209e-07, "logits/chosen": 0.6691161394119263, "logits/rejected": 4.294005393981934, "logps/chosen": -1.7547008991241455, "logps/rejected": -1.9339414834976196, "loss": 4.3354, "rewards/accuracies": 0.75, "rewards/chosen": -17.547008514404297, "rewards/margins": 1.7924070358276367, "rewards/rejected": -19.339414596557617, "step": 847 }, { "epoch": 0.11546840958605664, "grad_norm": 47.64465821751729, "learning_rate": 7.994230865013979e-07, "logits/chosen": 1.414803147315979, "logits/rejected": 1.7338522672653198, "logps/chosen": -1.7955043315887451, "logps/rejected": -1.9913398027420044, "loss": 3.5741, "rewards/accuracies": 0.75, "rewards/chosen": -17.95504379272461, "rewards/margins": 1.9583549499511719, "rewards/rejected": -19.91339874267578, "step": 848 }, { "epoch": 0.1156045751633987, "grad_norm": 54.4368666940105, "learning_rate": 7.994128329721269e-07, "logits/chosen": 0.42532259225845337, "logits/rejected": 1.560398817062378, "logps/chosen": -1.1815216541290283, "logps/rejected": -1.2621080875396729, "loss": 4.0454, "rewards/accuracies": 0.75, "rewards/chosen": -11.815216064453125, "rewards/margins": 0.8058650493621826, "rewards/rejected": -12.621081352233887, "step": 849 }, { "epoch": 0.11574074074074074, "grad_norm": 55.58384417072855, "learning_rate": 7.994024891922245e-07, "logits/chosen": 2.568765878677368, "logits/rejected": 1.9539084434509277, "logps/chosen": -1.7975943088531494, "logps/rejected": -1.793654441833496, "loss": 4.6728, "rewards/accuracies": 0.5, "rewards/chosen": -17.975942611694336, "rewards/margins": -0.039398908615112305, "rewards/rejected": -17.93654441833496, "step": 850 }, { "epoch": 0.11587690631808278, "grad_norm": 48.33937415352387, "learning_rate": 7.993920551640283e-07, "logits/chosen": 2.2358486652374268, "logits/rejected": 3.8478171825408936, "logps/chosen": -2.0244622230529785, "logps/rejected": -2.4460487365722656, "loss": 3.681, "rewards/accuracies": 0.75, "rewards/chosen": -20.2446231842041, "rewards/margins": 4.215863227844238, "rewards/rejected": -24.460487365722656, "step": 851 }, { "epoch": 0.11601307189542484, "grad_norm": 52.4684859792536, "learning_rate": 7.993815308898958e-07, "logits/chosen": 1.90043044090271, "logits/rejected": 2.0515055656433105, "logps/chosen": -2.0020904541015625, "logps/rejected": -2.0331709384918213, "loss": 4.8639, "rewards/accuracies": 0.5, "rewards/chosen": -20.020904541015625, "rewards/margins": 0.3108038902282715, "rewards/rejected": -20.331708908081055, "step": 852 }, { "epoch": 0.11614923747276688, "grad_norm": 47.364749392662674, "learning_rate": 7.993709163722051e-07, "logits/chosen": 2.676320791244507, "logits/rejected": 2.269941806793213, "logps/chosen": -1.8744984865188599, "logps/rejected": -1.4837815761566162, "loss": 4.7014, "rewards/accuracies": 0.25, "rewards/chosen": -18.744983673095703, "rewards/margins": -3.9071688652038574, "rewards/rejected": -14.83781623840332, "step": 853 }, { "epoch": 0.11628540305010893, "grad_norm": 51.238591292055965, "learning_rate": 7.993602116133546e-07, "logits/chosen": 3.5753331184387207, "logits/rejected": 1.9594500064849854, "logps/chosen": -1.8629615306854248, "logps/rejected": -1.8128989934921265, "loss": 5.0569, "rewards/accuracies": 0.5, "rewards/chosen": -18.629615783691406, "rewards/margins": -0.5006241798400879, "rewards/rejected": -18.128990173339844, "step": 854 }, { "epoch": 0.11642156862745098, "grad_norm": 49.77425740841202, "learning_rate": 7.993494166157631e-07, "logits/chosen": 2.4493346214294434, "logits/rejected": 1.9973634481430054, "logps/chosen": -1.9341917037963867, "logps/rejected": -1.799575924873352, "loss": 4.6408, "rewards/accuracies": 0.25, "rewards/chosen": -19.3419189453125, "rewards/margins": -1.3461594581604004, "rewards/rejected": -17.995758056640625, "step": 855 }, { "epoch": 0.11655773420479303, "grad_norm": 53.52736875719838, "learning_rate": 7.993385313818699e-07, "logits/chosen": 2.469193935394287, "logits/rejected": 3.577054738998413, "logps/chosen": -1.7958850860595703, "logps/rejected": -2.034409523010254, "loss": 4.2955, "rewards/accuracies": 0.75, "rewards/chosen": -17.958850860595703, "rewards/margins": 2.385244131088257, "rewards/rejected": -20.344093322753906, "step": 856 }, { "epoch": 0.11669389978213508, "grad_norm": 46.08058959205388, "learning_rate": 7.993275559141346e-07, "logits/chosen": 0.19053813815116882, "logits/rejected": 1.5932462215423584, "logps/chosen": -1.6488566398620605, "logps/rejected": -1.9421136379241943, "loss": 3.7655, "rewards/accuracies": 1.0, "rewards/chosen": -16.488567352294922, "rewards/margins": 2.932569980621338, "rewards/rejected": -19.4211368560791, "step": 857 }, { "epoch": 0.11683006535947713, "grad_norm": 54.95300365220007, "learning_rate": 7.993164902150371e-07, "logits/chosen": 3.1094419956207275, "logits/rejected": 3.263901710510254, "logps/chosen": -2.2033824920654297, "logps/rejected": -2.1520156860351562, "loss": 4.6461, "rewards/accuracies": 0.25, "rewards/chosen": -22.033824920654297, "rewards/margins": -0.5136651992797852, "rewards/rejected": -21.520158767700195, "step": 858 }, { "epoch": 0.11696623093681917, "grad_norm": 67.77367294866737, "learning_rate": 7.993053342870779e-07, "logits/chosen": 3.9570960998535156, "logits/rejected": 3.9210338592529297, "logps/chosen": -2.2754478454589844, "logps/rejected": -2.448413133621216, "loss": 4.9505, "rewards/accuracies": 0.75, "rewards/chosen": -22.75447654724121, "rewards/margins": 1.7296555042266846, "rewards/rejected": -24.484132766723633, "step": 859 }, { "epoch": 0.11710239651416122, "grad_norm": 50.15388855892387, "learning_rate": 7.992940881327778e-07, "logits/chosen": 4.34453010559082, "logits/rejected": 4.8185529708862305, "logps/chosen": -2.21602201461792, "logps/rejected": -2.227503776550293, "loss": 4.2153, "rewards/accuracies": 0.5, "rewards/chosen": -22.160221099853516, "rewards/margins": 0.11481428146362305, "rewards/rejected": -22.275035858154297, "step": 860 }, { "epoch": 0.11723856209150327, "grad_norm": 45.30773443856015, "learning_rate": 7.992827517546777e-07, "logits/chosen": 2.8476738929748535, "logits/rejected": 4.658045768737793, "logps/chosen": -2.219628095626831, "logps/rejected": -2.4782512187957764, "loss": 4.7001, "rewards/accuracies": 1.0, "rewards/chosen": -22.19628143310547, "rewards/margins": 2.5862321853637695, "rewards/rejected": -24.782512664794922, "step": 861 }, { "epoch": 0.11737472766884531, "grad_norm": 45.81868163563817, "learning_rate": 7.992713251553395e-07, "logits/chosen": 1.3500837087631226, "logits/rejected": 2.9237475395202637, "logps/chosen": -1.6426435708999634, "logps/rejected": -1.7314834594726562, "loss": 4.5684, "rewards/accuracies": 0.5, "rewards/chosen": -16.426435470581055, "rewards/margins": 0.8883979916572571, "rewards/rejected": -17.314834594726562, "step": 862 }, { "epoch": 0.11751089324618737, "grad_norm": 49.39613878450213, "learning_rate": 7.992598083373449e-07, "logits/chosen": -0.139217346906662, "logits/rejected": 2.7749881744384766, "logps/chosen": -1.4882128238677979, "logps/rejected": -2.088547468185425, "loss": 4.6251, "rewards/accuracies": 1.0, "rewards/chosen": -14.88212776184082, "rewards/margins": 6.0033464431762695, "rewards/rejected": -20.885475158691406, "step": 863 }, { "epoch": 0.11764705882352941, "grad_norm": 52.45370535794899, "learning_rate": 7.992482013032963e-07, "logits/chosen": 4.00998592376709, "logits/rejected": 4.236054420471191, "logps/chosen": -2.163886070251465, "logps/rejected": -2.6769449710845947, "loss": 5.272, "rewards/accuracies": 0.5, "rewards/chosen": -21.638858795166016, "rewards/margins": 5.130589485168457, "rewards/rejected": -26.76944923400879, "step": 864 }, { "epoch": 0.11778322440087145, "grad_norm": 50.65548901387453, "learning_rate": 7.992365040558164e-07, "logits/chosen": 2.4232351779937744, "logits/rejected": 3.499790668487549, "logps/chosen": -1.7854571342468262, "logps/rejected": -1.9066252708435059, "loss": 4.5199, "rewards/accuracies": 0.5, "rewards/chosen": -17.854572296142578, "rewards/margins": 1.2116804122924805, "rewards/rejected": -19.066251754760742, "step": 865 }, { "epoch": 0.11791938997821351, "grad_norm": 46.67765654054406, "learning_rate": 7.992247165975483e-07, "logits/chosen": 1.8912698030471802, "logits/rejected": 1.1794350147247314, "logps/chosen": -1.8825395107269287, "logps/rejected": -1.8339974880218506, "loss": 4.7948, "rewards/accuracies": 0.5, "rewards/chosen": -18.825395584106445, "rewards/margins": -0.48541975021362305, "rewards/rejected": -18.339975357055664, "step": 866 }, { "epoch": 0.11805555555555555, "grad_norm": 56.28742389107345, "learning_rate": 7.992128389311554e-07, "logits/chosen": 1.952441930770874, "logits/rejected": 1.540048360824585, "logps/chosen": -1.8311275243759155, "logps/rejected": -1.7946739196777344, "loss": 5.2516, "rewards/accuracies": 0.25, "rewards/chosen": -18.311275482177734, "rewards/margins": -0.36453723907470703, "rewards/rejected": -17.946739196777344, "step": 867 }, { "epoch": 0.11819172113289761, "grad_norm": 44.948134359417, "learning_rate": 7.992008710593216e-07, "logits/chosen": 2.1039044857025146, "logits/rejected": 1.2545994520187378, "logps/chosen": -1.6411654949188232, "logps/rejected": -1.7921127080917358, "loss": 4.5962, "rewards/accuracies": 0.5, "rewards/chosen": -16.41165542602539, "rewards/margins": 1.5094714164733887, "rewards/rejected": -17.921127319335938, "step": 868 }, { "epoch": 0.11832788671023965, "grad_norm": 52.49721816137179, "learning_rate": 7.991888129847513e-07, "logits/chosen": 4.327672004699707, "logits/rejected": 3.7563371658325195, "logps/chosen": -2.566938877105713, "logps/rejected": -2.1195712089538574, "loss": 4.5933, "rewards/accuracies": 0.25, "rewards/chosen": -25.669389724731445, "rewards/margins": -4.473678112030029, "rewards/rejected": -21.195711135864258, "step": 869 }, { "epoch": 0.1184640522875817, "grad_norm": 45.67079604215105, "learning_rate": 7.991766647101688e-07, "logits/chosen": 3.1962695121765137, "logits/rejected": 3.223931312561035, "logps/chosen": -1.8188562393188477, "logps/rejected": -1.7390127182006836, "loss": 4.132, "rewards/accuracies": 0.25, "rewards/chosen": -18.188560485839844, "rewards/margins": -0.7984356880187988, "rewards/rejected": -17.390125274658203, "step": 870 }, { "epoch": 0.11860021786492375, "grad_norm": 51.43311252007988, "learning_rate": 7.991644262383194e-07, "logits/chosen": 1.478072166442871, "logits/rejected": 3.210827112197876, "logps/chosen": -1.4893360137939453, "logps/rejected": -1.6730726957321167, "loss": 4.6882, "rewards/accuracies": 0.75, "rewards/chosen": -14.893360137939453, "rewards/margins": 1.8373669385910034, "rewards/rejected": -16.73072624206543, "step": 871 }, { "epoch": 0.1187363834422658, "grad_norm": 53.0222831201789, "learning_rate": 7.991520975719684e-07, "logits/chosen": 5.356977939605713, "logits/rejected": 3.766334295272827, "logps/chosen": -2.2170310020446777, "logps/rejected": -2.3270926475524902, "loss": 4.5255, "rewards/accuracies": 0.25, "rewards/chosen": -22.170310974121094, "rewards/margins": 1.1006131172180176, "rewards/rejected": -23.270923614501953, "step": 872 }, { "epoch": 0.11887254901960784, "grad_norm": 44.27896418638206, "learning_rate": 7.991396787139013e-07, "logits/chosen": 4.165711402893066, "logits/rejected": 5.513090133666992, "logps/chosen": -2.0428590774536133, "logps/rejected": -2.3191142082214355, "loss": 4.0964, "rewards/accuracies": 0.75, "rewards/chosen": -20.4285888671875, "rewards/margins": 2.762552499771118, "rewards/rejected": -23.191143035888672, "step": 873 }, { "epoch": 0.1190087145969499, "grad_norm": 50.79791203670637, "learning_rate": 7.991271696669247e-07, "logits/chosen": 3.4533634185791016, "logits/rejected": 2.5262489318847656, "logps/chosen": -2.085223436355591, "logps/rejected": -2.0119123458862305, "loss": 3.8653, "rewards/accuracies": 0.75, "rewards/chosen": -20.85223388671875, "rewards/margins": -0.7331125736236572, "rewards/rejected": -20.119121551513672, "step": 874 }, { "epoch": 0.11914488017429194, "grad_norm": 51.68379894757839, "learning_rate": 7.991145704338649e-07, "logits/chosen": 2.1606600284576416, "logits/rejected": 3.633030414581299, "logps/chosen": -1.6110084056854248, "logps/rejected": -1.5138835906982422, "loss": 4.8811, "rewards/accuracies": 0.5, "rewards/chosen": -16.110084533691406, "rewards/margins": -0.9712471961975098, "rewards/rejected": -15.138836860656738, "step": 875 }, { "epoch": 0.119281045751634, "grad_norm": 46.75667664439345, "learning_rate": 7.991018810175687e-07, "logits/chosen": 2.224560022354126, "logits/rejected": 4.6029253005981445, "logps/chosen": -1.7181895971298218, "logps/rejected": -1.9294828176498413, "loss": 3.8069, "rewards/accuracies": 1.0, "rewards/chosen": -17.181896209716797, "rewards/margins": 2.1129326820373535, "rewards/rejected": -19.294828414916992, "step": 876 }, { "epoch": 0.11941721132897604, "grad_norm": 51.38507299413752, "learning_rate": 7.990891014209034e-07, "logits/chosen": 2.006201982498169, "logits/rejected": 2.8657727241516113, "logps/chosen": -1.6579669713974, "logps/rejected": -1.7703289985656738, "loss": 3.9919, "rewards/accuracies": 0.75, "rewards/chosen": -16.579669952392578, "rewards/margins": 1.123619556427002, "rewards/rejected": -17.703289031982422, "step": 877 }, { "epoch": 0.11955337690631808, "grad_norm": 43.93299669649472, "learning_rate": 7.990762316467568e-07, "logits/chosen": 5.223365306854248, "logits/rejected": 4.129323959350586, "logps/chosen": -2.056187391281128, "logps/rejected": -2.463162899017334, "loss": 4.5957, "rewards/accuracies": 1.0, "rewards/chosen": -20.561874389648438, "rewards/margins": 4.069756507873535, "rewards/rejected": -24.631629943847656, "step": 878 }, { "epoch": 0.11968954248366014, "grad_norm": 43.30022442770633, "learning_rate": 7.99063271698037e-07, "logits/chosen": 2.8848280906677246, "logits/rejected": 4.5443501472473145, "logps/chosen": -1.8444262742996216, "logps/rejected": -2.1167306900024414, "loss": 3.4545, "rewards/accuracies": 1.0, "rewards/chosen": -18.444263458251953, "rewards/margins": 2.7230446338653564, "rewards/rejected": -21.167306900024414, "step": 879 }, { "epoch": 0.11982570806100218, "grad_norm": 51.14096622681969, "learning_rate": 7.990502215776722e-07, "logits/chosen": 2.321112871170044, "logits/rejected": 3.750701427459717, "logps/chosen": -2.2624361515045166, "logps/rejected": -2.180265426635742, "loss": 4.2435, "rewards/accuracies": 0.5, "rewards/chosen": -22.624361038208008, "rewards/margins": -0.8217084407806396, "rewards/rejected": -21.802654266357422, "step": 880 }, { "epoch": 0.11996187363834422, "grad_norm": 48.13673785603665, "learning_rate": 7.990370812886113e-07, "logits/chosen": 3.8476743698120117, "logits/rejected": 3.423062562942505, "logps/chosen": -2.239699363708496, "logps/rejected": -2.046610116958618, "loss": 4.7771, "rewards/accuracies": 0.5, "rewards/chosen": -22.39699363708496, "rewards/margins": -1.930891752243042, "rewards/rejected": -20.466102600097656, "step": 881 }, { "epoch": 0.12009803921568628, "grad_norm": 45.84602063857524, "learning_rate": 7.990238508338232e-07, "logits/chosen": 4.163022041320801, "logits/rejected": 2.378523349761963, "logps/chosen": -1.6412121057510376, "logps/rejected": -1.704577922821045, "loss": 4.7697, "rewards/accuracies": 0.25, "rewards/chosen": -16.412120819091797, "rewards/margins": 0.6336579322814941, "rewards/rejected": -17.045780181884766, "step": 882 }, { "epoch": 0.12023420479302832, "grad_norm": 53.435184353368484, "learning_rate": 7.990105302162978e-07, "logits/chosen": 2.2128028869628906, "logits/rejected": 4.717723369598389, "logps/chosen": -1.3731321096420288, "logps/rejected": -1.8178997039794922, "loss": 4.3784, "rewards/accuracies": 0.75, "rewards/chosen": -13.731321334838867, "rewards/margins": 4.447676181793213, "rewards/rejected": -18.178997039794922, "step": 883 }, { "epoch": 0.12037037037037036, "grad_norm": 42.96505701420603, "learning_rate": 7.989971194390447e-07, "logits/chosen": 2.2748289108276367, "logits/rejected": 3.827136993408203, "logps/chosen": -2.036902904510498, "logps/rejected": -2.082481622695923, "loss": 4.7382, "rewards/accuracies": 0.25, "rewards/chosen": -20.369028091430664, "rewards/margins": 0.45578885078430176, "rewards/rejected": -20.824817657470703, "step": 884 }, { "epoch": 0.12050653594771242, "grad_norm": 45.494367035058495, "learning_rate": 7.989836185050945e-07, "logits/chosen": 1.94386887550354, "logits/rejected": 2.29274320602417, "logps/chosen": -1.5400676727294922, "logps/rejected": -1.64557945728302, "loss": 4.2838, "rewards/accuracies": 0.75, "rewards/chosen": -15.400675773620605, "rewards/margins": 1.0551190376281738, "rewards/rejected": -16.455795288085938, "step": 885 }, { "epoch": 0.12064270152505446, "grad_norm": 55.142472546080825, "learning_rate": 7.989700274174976e-07, "logits/chosen": 3.5493879318237305, "logits/rejected": 3.47529935836792, "logps/chosen": -1.8077670335769653, "logps/rejected": -1.9385488033294678, "loss": 4.0408, "rewards/accuracies": 0.75, "rewards/chosen": -18.07767105102539, "rewards/margins": 1.3078162670135498, "rewards/rejected": -19.385486602783203, "step": 886 }, { "epoch": 0.12077886710239652, "grad_norm": 48.05095540562677, "learning_rate": 7.989563461793251e-07, "logits/chosen": 2.8617262840270996, "logits/rejected": 3.3611364364624023, "logps/chosen": -1.7528488636016846, "logps/rejected": -1.9175481796264648, "loss": 4.1522, "rewards/accuracies": 0.75, "rewards/chosen": -17.528488159179688, "rewards/margins": 1.6469941139221191, "rewards/rejected": -19.17548179626465, "step": 887 }, { "epoch": 0.12091503267973856, "grad_norm": 48.01245428873479, "learning_rate": 7.989425747936683e-07, "logits/chosen": 2.8180088996887207, "logits/rejected": 4.4854326248168945, "logps/chosen": -2.363223075866699, "logps/rejected": -2.2070374488830566, "loss": 4.0442, "rewards/accuracies": 0.5, "rewards/chosen": -23.632230758666992, "rewards/margins": -1.5618562698364258, "rewards/rejected": -22.07037353515625, "step": 888 }, { "epoch": 0.1210511982570806, "grad_norm": 45.462362447491415, "learning_rate": 7.989287132636392e-07, "logits/chosen": 2.4991650581359863, "logits/rejected": 4.11154842376709, "logps/chosen": -1.7081477642059326, "logps/rejected": -2.1405749320983887, "loss": 4.0091, "rewards/accuracies": 0.75, "rewards/chosen": -17.081478118896484, "rewards/margins": 4.3242716789245605, "rewards/rejected": -21.40574836730957, "step": 889 }, { "epoch": 0.12118736383442266, "grad_norm": 62.971862620617564, "learning_rate": 7.989147615923695e-07, "logits/chosen": 2.6527466773986816, "logits/rejected": 2.257628917694092, "logps/chosen": -1.8904260396957397, "logps/rejected": -1.9785504341125488, "loss": 4.5339, "rewards/accuracies": 0.5, "rewards/chosen": -18.904260635375977, "rewards/margins": 0.8812432289123535, "rewards/rejected": -19.785503387451172, "step": 890 }, { "epoch": 0.1213235294117647, "grad_norm": 42.08856413694379, "learning_rate": 7.98900719783012e-07, "logits/chosen": 3.3762714862823486, "logits/rejected": 3.2567319869995117, "logps/chosen": -1.915075421333313, "logps/rejected": -1.9884480237960815, "loss": 4.2604, "rewards/accuracies": 0.5, "rewards/chosen": -19.150754928588867, "rewards/margins": 0.7337253093719482, "rewards/rejected": -19.884479522705078, "step": 891 }, { "epoch": 0.12145969498910675, "grad_norm": 40.49045877241789, "learning_rate": 7.988865878387398e-07, "logits/chosen": 4.846908092498779, "logits/rejected": 3.8207530975341797, "logps/chosen": -1.9089422225952148, "logps/rejected": -1.8505258560180664, "loss": 3.8749, "rewards/accuracies": 0.5, "rewards/chosen": -19.08942222595215, "rewards/margins": -0.5841648578643799, "rewards/rejected": -18.50525665283203, "step": 892 }, { "epoch": 0.1215958605664488, "grad_norm": 48.5246369801411, "learning_rate": 7.988723657627457e-07, "logits/chosen": 2.7413411140441895, "logits/rejected": 3.0456862449645996, "logps/chosen": -1.7761399745941162, "logps/rejected": -2.1849045753479004, "loss": 4.5538, "rewards/accuracies": 0.75, "rewards/chosen": -17.761398315429688, "rewards/margins": 4.087647438049316, "rewards/rejected": -21.849044799804688, "step": 893 }, { "epoch": 0.12173202614379085, "grad_norm": 44.20358965151766, "learning_rate": 7.988580535582434e-07, "logits/chosen": 1.4866293668746948, "logits/rejected": 2.080658435821533, "logps/chosen": -1.5370550155639648, "logps/rejected": -1.7706012725830078, "loss": 4.4478, "rewards/accuracies": 1.0, "rewards/chosen": -15.370550155639648, "rewards/margins": 2.3354616165161133, "rewards/rejected": -17.706012725830078, "step": 894 }, { "epoch": 0.1218681917211329, "grad_norm": 48.4984903478355, "learning_rate": 7.988436512284667e-07, "logits/chosen": 4.371905326843262, "logits/rejected": 4.849971771240234, "logps/chosen": -2.0095033645629883, "logps/rejected": -2.010392189025879, "loss": 4.764, "rewards/accuracies": 0.5, "rewards/chosen": -20.095033645629883, "rewards/margins": 0.008887290954589844, "rewards/rejected": -20.103919982910156, "step": 895 }, { "epoch": 0.12200435729847495, "grad_norm": 43.76130854317068, "learning_rate": 7.988291587766704e-07, "logits/chosen": 0.7869336605072021, "logits/rejected": 2.588344097137451, "logps/chosen": -1.580883264541626, "logps/rejected": -1.7331383228302002, "loss": 4.8237, "rewards/accuracies": 0.5, "rewards/chosen": -15.808831214904785, "rewards/margins": 1.522552490234375, "rewards/rejected": -17.331382751464844, "step": 896 }, { "epoch": 0.12214052287581699, "grad_norm": 44.384715063683636, "learning_rate": 7.98814576206129e-07, "logits/chosen": 3.339961051940918, "logits/rejected": 4.212775230407715, "logps/chosen": -2.334468364715576, "logps/rejected": -2.3197569847106934, "loss": 4.6602, "rewards/accuracies": 0.5, "rewards/chosen": -23.344684600830078, "rewards/margins": -0.14711618423461914, "rewards/rejected": -23.197568893432617, "step": 897 }, { "epoch": 0.12227668845315905, "grad_norm": 51.73671002062397, "learning_rate": 7.987999035201373e-07, "logits/chosen": 3.159456729888916, "logits/rejected": 3.190122365951538, "logps/chosen": -1.5092206001281738, "logps/rejected": -1.710134744644165, "loss": 4.3063, "rewards/accuracies": 0.5, "rewards/chosen": -15.092205047607422, "rewards/margins": 2.009141445159912, "rewards/rejected": -17.101346969604492, "step": 898 }, { "epoch": 0.12241285403050109, "grad_norm": 40.743187994496395, "learning_rate": 7.987851407220109e-07, "logits/chosen": 2.180572032928467, "logits/rejected": 3.5586190223693848, "logps/chosen": -1.9326659440994263, "logps/rejected": -2.1298985481262207, "loss": 4.2739, "rewards/accuracies": 0.5, "rewards/chosen": -19.32666015625, "rewards/margins": 1.972327709197998, "rewards/rejected": -21.298988342285156, "step": 899 }, { "epoch": 0.12254901960784313, "grad_norm": 47.16739713679448, "learning_rate": 7.987702878150855e-07, "logits/chosen": 2.317399263381958, "logits/rejected": 3.876195192337036, "logps/chosen": -1.8095216751098633, "logps/rejected": -1.820688009262085, "loss": 4.4463, "rewards/accuracies": 0.5, "rewards/chosen": -18.095216751098633, "rewards/margins": 0.1116633415222168, "rewards/rejected": -18.206880569458008, "step": 900 }, { "epoch": 0.12268518518518519, "grad_norm": 40.59960760452764, "learning_rate": 7.987553448027174e-07, "logits/chosen": 3.4409563541412354, "logits/rejected": 2.4700558185577393, "logps/chosen": -1.8249249458312988, "logps/rejected": -1.6153151988983154, "loss": 4.4296, "rewards/accuracies": 0.5, "rewards/chosen": -18.249248504638672, "rewards/margins": -2.0960965156555176, "rewards/rejected": -16.153152465820312, "step": 901 }, { "epoch": 0.12282135076252723, "grad_norm": 44.82086177962484, "learning_rate": 7.987403116882831e-07, "logits/chosen": 5.743502140045166, "logits/rejected": 3.320540428161621, "logps/chosen": -2.15918231010437, "logps/rejected": -2.076713800430298, "loss": 5.0782, "rewards/accuracies": 0.75, "rewards/chosen": -21.59182357788086, "rewards/margins": -0.8246865272521973, "rewards/rejected": -20.76713752746582, "step": 902 }, { "epoch": 0.12295751633986927, "grad_norm": 40.395762940091544, "learning_rate": 7.987251884751792e-07, "logits/chosen": 1.206485629081726, "logits/rejected": 2.824019432067871, "logps/chosen": -1.591003179550171, "logps/rejected": -2.1346194744110107, "loss": 4.3669, "rewards/accuracies": 1.0, "rewards/chosen": -15.910032272338867, "rewards/margins": 5.436162948608398, "rewards/rejected": -21.346195220947266, "step": 903 }, { "epoch": 0.12309368191721133, "grad_norm": 45.460139565193195, "learning_rate": 7.98709975166823e-07, "logits/chosen": 3.094592571258545, "logits/rejected": 4.961325645446777, "logps/chosen": -1.9894088506698608, "logps/rejected": -2.1329660415649414, "loss": 4.9829, "rewards/accuracies": 0.75, "rewards/chosen": -19.894088745117188, "rewards/margins": 1.435572624206543, "rewards/rejected": -21.329660415649414, "step": 904 }, { "epoch": 0.12322984749455337, "grad_norm": 43.101541917476865, "learning_rate": 7.986946717666523e-07, "logits/chosen": 1.9508311748504639, "logits/rejected": 2.878726005554199, "logps/chosen": -1.7555562257766724, "logps/rejected": -1.8282430171966553, "loss": 4.2331, "rewards/accuracies": 0.75, "rewards/chosen": -17.55556297302246, "rewards/margins": 0.7268669605255127, "rewards/rejected": -18.282428741455078, "step": 905 }, { "epoch": 0.12336601307189543, "grad_norm": 50.64435205642499, "learning_rate": 7.986792782781248e-07, "logits/chosen": 3.4693427085876465, "logits/rejected": 3.4220943450927734, "logps/chosen": -1.8354898691177368, "logps/rejected": -1.7637406587600708, "loss": 4.9392, "rewards/accuracies": 0.5, "rewards/chosen": -18.35489845275879, "rewards/margins": -0.7174921035766602, "rewards/rejected": -17.637407302856445, "step": 906 }, { "epoch": 0.12350217864923747, "grad_norm": 39.771995550123705, "learning_rate": 7.986637947047188e-07, "logits/chosen": 0.5849190354347229, "logits/rejected": 2.1833252906799316, "logps/chosen": -1.4270265102386475, "logps/rejected": -1.547895073890686, "loss": 4.3305, "rewards/accuracies": 0.5, "rewards/chosen": -14.270265579223633, "rewards/margins": 1.2086851596832275, "rewards/rejected": -15.478950500488281, "step": 907 }, { "epoch": 0.12363834422657952, "grad_norm": 43.592645517112615, "learning_rate": 7.986482210499332e-07, "logits/chosen": 4.370053291320801, "logits/rejected": 4.596684455871582, "logps/chosen": -1.8080695867538452, "logps/rejected": -2.0375077724456787, "loss": 4.4211, "rewards/accuracies": 0.75, "rewards/chosen": -18.08069610595703, "rewards/margins": 2.2943825721740723, "rewards/rejected": -20.375078201293945, "step": 908 }, { "epoch": 0.12377450980392157, "grad_norm": 43.84896067572066, "learning_rate": 7.986325573172866e-07, "logits/chosen": 3.0842809677124023, "logits/rejected": 3.274156332015991, "logps/chosen": -1.8091762065887451, "logps/rejected": -2.082444429397583, "loss": 4.3271, "rewards/accuracies": 1.0, "rewards/chosen": -18.09176254272461, "rewards/margins": 2.732680320739746, "rewards/rejected": -20.824443817138672, "step": 909 }, { "epoch": 0.12391067538126362, "grad_norm": 40.11645101472523, "learning_rate": 7.986168035103185e-07, "logits/chosen": 4.297245025634766, "logits/rejected": 4.371959209442139, "logps/chosen": -2.070082426071167, "logps/rejected": -2.072909355163574, "loss": 4.7035, "rewards/accuracies": 0.25, "rewards/chosen": -20.700822830200195, "rewards/margins": 0.028270721435546875, "rewards/rejected": -20.729095458984375, "step": 910 }, { "epoch": 0.12404684095860566, "grad_norm": 39.67394136578868, "learning_rate": 7.986009596325889e-07, "logits/chosen": 4.212724685668945, "logits/rejected": 3.879843235015869, "logps/chosen": -2.0414607524871826, "logps/rejected": -1.9579627513885498, "loss": 4.1132, "rewards/accuracies": 0.25, "rewards/chosen": -20.414608001708984, "rewards/margins": -0.8349790573120117, "rewards/rejected": -19.579627990722656, "step": 911 }, { "epoch": 0.12418300653594772, "grad_norm": 59.82111093893077, "learning_rate": 7.985850256876774e-07, "logits/chosen": 2.5144758224487305, "logits/rejected": 4.730952739715576, "logps/chosen": -1.6218390464782715, "logps/rejected": -1.7873131036758423, "loss": 4.8935, "rewards/accuracies": 0.75, "rewards/chosen": -16.21839141845703, "rewards/margins": 1.654740810394287, "rewards/rejected": -17.873130798339844, "step": 912 }, { "epoch": 0.12431917211328976, "grad_norm": 41.40315968563615, "learning_rate": 7.985690016791846e-07, "logits/chosen": 2.8049492835998535, "logits/rejected": 4.845367431640625, "logps/chosen": -1.5186257362365723, "logps/rejected": -1.8233811855316162, "loss": 3.9798, "rewards/accuracies": 0.75, "rewards/chosen": -15.186257362365723, "rewards/margins": 3.0475544929504395, "rewards/rejected": -18.23381233215332, "step": 913 }, { "epoch": 0.12445533769063181, "grad_norm": 41.19206872192094, "learning_rate": 7.985528876107314e-07, "logits/chosen": 4.0677266120910645, "logits/rejected": 5.641735076904297, "logps/chosen": -1.913643717765808, "logps/rejected": -2.222106456756592, "loss": 4.522, "rewards/accuracies": 0.75, "rewards/chosen": -19.136436462402344, "rewards/margins": 3.0846285820007324, "rewards/rejected": -22.221065521240234, "step": 914 }, { "epoch": 0.12459150326797386, "grad_norm": 46.27470077262356, "learning_rate": 7.985366834859586e-07, "logits/chosen": 3.1562342643737793, "logits/rejected": 4.442729949951172, "logps/chosen": -2.051252841949463, "logps/rejected": -2.3018441200256348, "loss": 4.558, "rewards/accuracies": 0.75, "rewards/chosen": -20.512529373168945, "rewards/margins": 2.5059123039245605, "rewards/rejected": -23.01844024658203, "step": 915 }, { "epoch": 0.1247276688453159, "grad_norm": 59.58670618214668, "learning_rate": 7.985203893085281e-07, "logits/chosen": 3.825368881225586, "logits/rejected": 4.587506294250488, "logps/chosen": -2.081829786300659, "logps/rejected": -2.191189765930176, "loss": 5.2282, "rewards/accuracies": 0.75, "rewards/chosen": -20.81829833984375, "rewards/margins": 1.0935986042022705, "rewards/rejected": -21.911895751953125, "step": 916 }, { "epoch": 0.12486383442265796, "grad_norm": 43.31430337550887, "learning_rate": 7.985040050821211e-07, "logits/chosen": 2.2694787979125977, "logits/rejected": 4.983431816101074, "logps/chosen": -1.842294454574585, "logps/rejected": -2.190483808517456, "loss": 4.1616, "rewards/accuracies": 0.75, "rewards/chosen": -18.422945022583008, "rewards/margins": 3.4818921089172363, "rewards/rejected": -21.904836654663086, "step": 917 }, { "epoch": 0.125, "grad_norm": 41.62399718441401, "learning_rate": 7.984875308104403e-07, "logits/chosen": 3.727170705795288, "logits/rejected": 4.325830459594727, "logps/chosen": -1.8539658784866333, "logps/rejected": -2.1135666370391846, "loss": 4.5098, "rewards/accuracies": 0.75, "rewards/chosen": -18.53965950012207, "rewards/margins": 2.5960068702697754, "rewards/rejected": -21.135665893554688, "step": 918 }, { "epoch": 0.12513616557734206, "grad_norm": 40.91313677416503, "learning_rate": 7.984709664972079e-07, "logits/chosen": 4.328149795532227, "logits/rejected": 5.034106254577637, "logps/chosen": -1.8038724660873413, "logps/rejected": -2.1280770301818848, "loss": 4.4271, "rewards/accuracies": 1.0, "rewards/chosen": -18.038726806640625, "rewards/margins": 3.242043972015381, "rewards/rejected": -21.28076934814453, "step": 919 }, { "epoch": 0.12527233115468409, "grad_norm": 39.7630957263521, "learning_rate": 7.984543121461669e-07, "logits/chosen": 1.633331298828125, "logits/rejected": 2.8702194690704346, "logps/chosen": -1.8553555011749268, "logps/rejected": -2.0308690071105957, "loss": 4.3341, "rewards/accuracies": 0.25, "rewards/chosen": -18.55355453491211, "rewards/margins": 1.7551379203796387, "rewards/rejected": -20.308692932128906, "step": 920 }, { "epoch": 0.12540849673202614, "grad_norm": 43.501872062677734, "learning_rate": 7.984375677610804e-07, "logits/chosen": 2.4156532287597656, "logits/rejected": 2.3160037994384766, "logps/chosen": -1.4887292385101318, "logps/rejected": -1.6244280338287354, "loss": 4.5213, "rewards/accuracies": 0.75, "rewards/chosen": -14.88729190826416, "rewards/margins": 1.356987714767456, "rewards/rejected": -16.244279861450195, "step": 921 }, { "epoch": 0.1255446623093682, "grad_norm": 46.11879372838668, "learning_rate": 7.984207333457318e-07, "logits/chosen": 4.600252628326416, "logits/rejected": 3.799196720123291, "logps/chosen": -2.3106746673583984, "logps/rejected": -2.2340214252471924, "loss": 4.5697, "rewards/accuracies": 0.5, "rewards/chosen": -23.106748580932617, "rewards/margins": -0.7665324211120605, "rewards/rejected": -22.340213775634766, "step": 922 }, { "epoch": 0.12568082788671023, "grad_norm": 46.306341529327185, "learning_rate": 7.984038089039254e-07, "logits/chosen": 3.3711540699005127, "logits/rejected": 3.850059986114502, "logps/chosen": -2.374791145324707, "logps/rejected": -2.4938666820526123, "loss": 4.9497, "rewards/accuracies": 0.5, "rewards/chosen": -23.74791145324707, "rewards/margins": 1.1907548904418945, "rewards/rejected": -24.93866729736328, "step": 923 }, { "epoch": 0.12581699346405228, "grad_norm": 44.24729207793318, "learning_rate": 7.98386794439485e-07, "logits/chosen": 3.6166205406188965, "logits/rejected": 5.774544715881348, "logps/chosen": -1.6187400817871094, "logps/rejected": -2.0852341651916504, "loss": 4.5065, "rewards/accuracies": 0.75, "rewards/chosen": -16.187400817871094, "rewards/margins": 4.664943218231201, "rewards/rejected": -20.852344512939453, "step": 924 }, { "epoch": 0.12595315904139434, "grad_norm": 61.83353528418289, "learning_rate": 7.983696899562552e-07, "logits/chosen": 3.6685948371887207, "logits/rejected": 5.299312591552734, "logps/chosen": -2.044760227203369, "logps/rejected": -2.317211866378784, "loss": 4.1921, "rewards/accuracies": 0.75, "rewards/chosen": -20.447601318359375, "rewards/margins": 2.7245187759399414, "rewards/rejected": -23.172119140625, "step": 925 }, { "epoch": 0.12608932461873637, "grad_norm": 40.423299588018615, "learning_rate": 7.98352495458101e-07, "logits/chosen": 5.379737377166748, "logits/rejected": 5.089036464691162, "logps/chosen": -1.9949156045913696, "logps/rejected": -1.771653175354004, "loss": 4.7274, "rewards/accuracies": 0.0, "rewards/chosen": -19.949155807495117, "rewards/margins": -2.232623815536499, "rewards/rejected": -17.71653175354004, "step": 926 }, { "epoch": 0.12622549019607843, "grad_norm": 43.324004807766144, "learning_rate": 7.983352109489077e-07, "logits/chosen": 5.466368675231934, "logits/rejected": 5.654951095581055, "logps/chosen": -2.083843469619751, "logps/rejected": -2.349411964416504, "loss": 4.2948, "rewards/accuracies": 0.75, "rewards/chosen": -20.83843421936035, "rewards/margins": 2.655683994293213, "rewards/rejected": -23.494117736816406, "step": 927 }, { "epoch": 0.12636165577342048, "grad_norm": 44.561381100304466, "learning_rate": 7.983178364325808e-07, "logits/chosen": 5.512731552124023, "logits/rejected": 4.444767951965332, "logps/chosen": -2.056936740875244, "logps/rejected": -1.9662474393844604, "loss": 4.5719, "rewards/accuracies": 0.5, "rewards/chosen": -20.569366455078125, "rewards/margins": -0.9068922996520996, "rewards/rejected": -19.6624755859375, "step": 928 }, { "epoch": 0.12649782135076254, "grad_norm": 44.079269882628275, "learning_rate": 7.983003719130464e-07, "logits/chosen": 2.812741994857788, "logits/rejected": 3.123182773590088, "logps/chosen": -1.5605435371398926, "logps/rejected": -1.3674390316009521, "loss": 4.5808, "rewards/accuracies": 0.25, "rewards/chosen": -15.60543441772461, "rewards/margins": -1.931044578552246, "rewards/rejected": -13.674389839172363, "step": 929 }, { "epoch": 0.12663398692810457, "grad_norm": 45.567117510456114, "learning_rate": 7.982828173942503e-07, "logits/chosen": 5.040852069854736, "logits/rejected": 6.083684921264648, "logps/chosen": -2.250911235809326, "logps/rejected": -2.1568360328674316, "loss": 4.6982, "rewards/accuracies": 0.5, "rewards/chosen": -22.509113311767578, "rewards/margins": -0.9407525062561035, "rewards/rejected": -21.568359375, "step": 930 }, { "epoch": 0.12677015250544663, "grad_norm": 55.29693268431634, "learning_rate": 7.982651728801596e-07, "logits/chosen": 4.920167922973633, "logits/rejected": 5.806859016418457, "logps/chosen": -2.155095338821411, "logps/rejected": -2.1734275817871094, "loss": 5.3683, "rewards/accuracies": 0.75, "rewards/chosen": -21.550952911376953, "rewards/margins": 0.18332242965698242, "rewards/rejected": -21.73427391052246, "step": 931 }, { "epoch": 0.12690631808278868, "grad_norm": 51.151944700143495, "learning_rate": 7.982474383747608e-07, "logits/chosen": 4.080419063568115, "logits/rejected": 4.0843119621276855, "logps/chosen": -1.3504643440246582, "logps/rejected": -1.4400324821472168, "loss": 4.6042, "rewards/accuracies": 0.75, "rewards/chosen": -13.504644393920898, "rewards/margins": 0.8956809043884277, "rewards/rejected": -14.400325775146484, "step": 932 }, { "epoch": 0.1270424836601307, "grad_norm": 48.38644999493428, "learning_rate": 7.982296138820615e-07, "logits/chosen": 2.381006956100464, "logits/rejected": 3.5152487754821777, "logps/chosen": -1.9553923606872559, "logps/rejected": -2.5064501762390137, "loss": 4.1871, "rewards/accuracies": 0.75, "rewards/chosen": -19.553924560546875, "rewards/margins": 5.510580062866211, "rewards/rejected": -25.064502716064453, "step": 933 }, { "epoch": 0.12717864923747277, "grad_norm": 49.50639520994033, "learning_rate": 7.982116994060891e-07, "logits/chosen": 3.1999707221984863, "logits/rejected": 3.5983245372772217, "logps/chosen": -1.6864445209503174, "logps/rejected": -1.81197988986969, "loss": 4.9187, "rewards/accuracies": 0.75, "rewards/chosen": -16.864444732666016, "rewards/margins": 1.255354642868042, "rewards/rejected": -18.11979866027832, "step": 934 }, { "epoch": 0.12731481481481483, "grad_norm": 43.97113194708808, "learning_rate": 7.981936949508915e-07, "logits/chosen": 3.466364622116089, "logits/rejected": 4.342974662780762, "logps/chosen": -1.775575876235962, "logps/rejected": -1.9006186723709106, "loss": 4.3883, "rewards/accuracies": 0.75, "rewards/chosen": -17.75575828552246, "rewards/margins": 1.2504279613494873, "rewards/rejected": -19.006187438964844, "step": 935 }, { "epoch": 0.12745098039215685, "grad_norm": 45.40915686192073, "learning_rate": 7.98175600520537e-07, "logits/chosen": 5.069168567657471, "logits/rejected": 4.011317253112793, "logps/chosen": -2.0801730155944824, "logps/rejected": -1.785546064376831, "loss": 3.9337, "rewards/accuracies": 0.25, "rewards/chosen": -20.801733016967773, "rewards/margins": -2.946272373199463, "rewards/rejected": -17.85546112060547, "step": 936 }, { "epoch": 0.1275871459694989, "grad_norm": 48.88985452599935, "learning_rate": 7.981574161191144e-07, "logits/chosen": 4.479645729064941, "logits/rejected": 4.119283676147461, "logps/chosen": -2.008878707885742, "logps/rejected": -1.8627779483795166, "loss": 4.6163, "rewards/accuracies": 0.25, "rewards/chosen": -20.08878517150879, "rewards/margins": -1.461005449295044, "rewards/rejected": -18.62778091430664, "step": 937 }, { "epoch": 0.12772331154684097, "grad_norm": 41.72454425731878, "learning_rate": 7.981391417507323e-07, "logits/chosen": 4.24578857421875, "logits/rejected": 5.56094217300415, "logps/chosen": -1.6242228746414185, "logps/rejected": -1.9850709438323975, "loss": 3.7823, "rewards/accuracies": 0.75, "rewards/chosen": -16.242229461669922, "rewards/margins": 3.6084814071655273, "rewards/rejected": -19.850711822509766, "step": 938 }, { "epoch": 0.127859477124183, "grad_norm": 41.542219631526024, "learning_rate": 7.981207774195201e-07, "logits/chosen": 2.388550281524658, "logits/rejected": 2.7481040954589844, "logps/chosen": -1.4094196557998657, "logps/rejected": -1.4209145307540894, "loss": 4.2736, "rewards/accuracies": 0.5, "rewards/chosen": -14.094196319580078, "rewards/margins": 0.11494946479797363, "rewards/rejected": -14.209145545959473, "step": 939 }, { "epoch": 0.12799564270152505, "grad_norm": 42.33347511290077, "learning_rate": 7.981023231296273e-07, "logits/chosen": 4.54443359375, "logits/rejected": 4.933149814605713, "logps/chosen": -1.74971604347229, "logps/rejected": -2.0567898750305176, "loss": 4.2645, "rewards/accuracies": 0.75, "rewards/chosen": -17.497159957885742, "rewards/margins": 3.0707364082336426, "rewards/rejected": -20.56789779663086, "step": 940 }, { "epoch": 0.1281318082788671, "grad_norm": 42.64312780411096, "learning_rate": 7.980837788852239e-07, "logits/chosen": 5.820315361022949, "logits/rejected": 6.398536682128906, "logps/chosen": -2.0151822566986084, "logps/rejected": -2.3736934661865234, "loss": 4.4991, "rewards/accuracies": 0.5, "rewards/chosen": -20.15182113647461, "rewards/margins": 3.585111618041992, "rewards/rejected": -23.736934661865234, "step": 941 }, { "epoch": 0.12826797385620914, "grad_norm": 44.548206904479265, "learning_rate": 7.980651446905e-07, "logits/chosen": 3.0311245918273926, "logits/rejected": 4.375387191772461, "logps/chosen": -1.335904598236084, "logps/rejected": -1.7065558433532715, "loss": 4.3803, "rewards/accuracies": 1.0, "rewards/chosen": -13.359046936035156, "rewards/margins": 3.706512451171875, "rewards/rejected": -17.06555938720703, "step": 942 }, { "epoch": 0.1284041394335512, "grad_norm": 46.03212187017097, "learning_rate": 7.980464205496662e-07, "logits/chosen": 5.804676055908203, "logits/rejected": 5.911588668823242, "logps/chosen": -2.4242725372314453, "logps/rejected": -2.5432591438293457, "loss": 4.2232, "rewards/accuracies": 0.75, "rewards/chosen": -24.242725372314453, "rewards/margins": 1.1898655891418457, "rewards/rejected": -25.43259048461914, "step": 943 }, { "epoch": 0.12854030501089325, "grad_norm": 47.039969645893656, "learning_rate": 7.980276064669535e-07, "logits/chosen": 5.90254545211792, "logits/rejected": 6.204745292663574, "logps/chosen": -2.2665328979492188, "logps/rejected": -2.3097662925720215, "loss": 4.0287, "rewards/accuracies": 0.25, "rewards/chosen": -22.665328979492188, "rewards/margins": 0.43233394622802734, "rewards/rejected": -23.09766387939453, "step": 944 }, { "epoch": 0.12867647058823528, "grad_norm": 47.16362541189174, "learning_rate": 7.98008702446613e-07, "logits/chosen": 3.9191551208496094, "logits/rejected": 4.215607643127441, "logps/chosen": -1.9037729501724243, "logps/rejected": -1.7615230083465576, "loss": 3.7495, "rewards/accuracies": 0.25, "rewards/chosen": -19.037729263305664, "rewards/margins": -1.4224982261657715, "rewards/rejected": -17.615230560302734, "step": 945 }, { "epoch": 0.12881263616557734, "grad_norm": 45.466338393784646, "learning_rate": 7.979897084929162e-07, "logits/chosen": 3.6212191581726074, "logits/rejected": 4.43524169921875, "logps/chosen": -1.8950848579406738, "logps/rejected": -2.1814417839050293, "loss": 4.0856, "rewards/accuracies": 0.75, "rewards/chosen": -18.950849533081055, "rewards/margins": 2.8635671138763428, "rewards/rejected": -21.814414978027344, "step": 946 }, { "epoch": 0.1289488017429194, "grad_norm": 45.474767826035944, "learning_rate": 7.979706246101548e-07, "logits/chosen": 5.143566608428955, "logits/rejected": 3.793743133544922, "logps/chosen": -2.099146842956543, "logps/rejected": -2.0233609676361084, "loss": 4.2086, "rewards/accuracies": 0.25, "rewards/chosen": -20.991470336914062, "rewards/margins": -0.757859468460083, "rewards/rejected": -20.233610153198242, "step": 947 }, { "epoch": 0.12908496732026145, "grad_norm": 47.15164148430484, "learning_rate": 7.979514508026412e-07, "logits/chosen": 4.842136383056641, "logits/rejected": 3.687164068222046, "logps/chosen": -1.9615323543548584, "logps/rejected": -1.7279993295669556, "loss": 4.36, "rewards/accuracies": 0.25, "rewards/chosen": -19.61532211303711, "rewards/margins": -2.335330009460449, "rewards/rejected": -17.27999496459961, "step": 948 }, { "epoch": 0.12922113289760348, "grad_norm": 40.7896737394388, "learning_rate": 7.979321870747078e-07, "logits/chosen": 5.142605781555176, "logits/rejected": 4.8444390296936035, "logps/chosen": -1.7940646409988403, "logps/rejected": -1.937427043914795, "loss": 4.3378, "rewards/accuracies": 0.75, "rewards/chosen": -17.94064712524414, "rewards/margins": 1.4336223602294922, "rewards/rejected": -19.374269485473633, "step": 949 }, { "epoch": 0.12935729847494554, "grad_norm": 49.14240480685074, "learning_rate": 7.979128334307073e-07, "logits/chosen": 4.571870803833008, "logits/rejected": 5.114515781402588, "logps/chosen": -2.0677504539489746, "logps/rejected": -2.2272913455963135, "loss": 4.2954, "rewards/accuracies": 0.75, "rewards/chosen": -20.67750358581543, "rewards/margins": 1.5954079627990723, "rewards/rejected": -22.272911071777344, "step": 950 }, { "epoch": 0.1294934640522876, "grad_norm": 46.27407035808989, "learning_rate": 7.978933898750132e-07, "logits/chosen": 4.751191139221191, "logits/rejected": 4.3973846435546875, "logps/chosen": -1.9540538787841797, "logps/rejected": -2.0736680030822754, "loss": 4.0757, "rewards/accuracies": 0.5, "rewards/chosen": -19.540538787841797, "rewards/margins": 1.1961417198181152, "rewards/rejected": -20.73668098449707, "step": 951 }, { "epoch": 0.12962962962962962, "grad_norm": 50.67797364152998, "learning_rate": 7.978738564120183e-07, "logits/chosen": 5.365047454833984, "logits/rejected": 5.75999641418457, "logps/chosen": -2.2765026092529297, "logps/rejected": -2.4693546295166016, "loss": 3.9793, "rewards/accuracies": 0.75, "rewards/chosen": -22.765024185180664, "rewards/margins": 1.9285249710083008, "rewards/rejected": -24.69355010986328, "step": 952 }, { "epoch": 0.12976579520697168, "grad_norm": 46.82204641961534, "learning_rate": 7.978542330461368e-07, "logits/chosen": 2.54077410697937, "logits/rejected": 3.8147473335266113, "logps/chosen": -1.5841829776763916, "logps/rejected": -1.629895567893982, "loss": 4.5747, "rewards/accuracies": 0.5, "rewards/chosen": -15.841829299926758, "rewards/margins": 0.4571256637573242, "rewards/rejected": -16.2989559173584, "step": 953 }, { "epoch": 0.12990196078431374, "grad_norm": 43.51786447562013, "learning_rate": 7.978345197818027e-07, "logits/chosen": 3.595370292663574, "logits/rejected": 3.6262831687927246, "logps/chosen": -1.5203224420547485, "logps/rejected": -1.9724345207214355, "loss": 4.1412, "rewards/accuracies": 0.5, "rewards/chosen": -15.203225135803223, "rewards/margins": 4.521120071411133, "rewards/rejected": -19.72434425354004, "step": 954 }, { "epoch": 0.13003812636165576, "grad_norm": 50.07509463239467, "learning_rate": 7.978147166234702e-07, "logits/chosen": 3.1539669036865234, "logits/rejected": 5.040071487426758, "logps/chosen": -1.7023606300354004, "logps/rejected": -2.127298355102539, "loss": 4.6013, "rewards/accuracies": 0.75, "rewards/chosen": -17.023605346679688, "rewards/margins": 4.24937629699707, "rewards/rejected": -21.27298355102539, "step": 955 }, { "epoch": 0.13017429193899782, "grad_norm": 48.697989978331805, "learning_rate": 7.977948235756142e-07, "logits/chosen": 4.769252777099609, "logits/rejected": 4.8115997314453125, "logps/chosen": -2.163222312927246, "logps/rejected": -2.003490686416626, "loss": 4.3809, "rewards/accuracies": 0.5, "rewards/chosen": -21.632221221923828, "rewards/margins": -1.5973162651062012, "rewards/rejected": -20.0349063873291, "step": 956 }, { "epoch": 0.13031045751633988, "grad_norm": 40.401717915591334, "learning_rate": 7.977748406427297e-07, "logits/chosen": 3.9239730834960938, "logits/rejected": 4.2219038009643555, "logps/chosen": -2.070563793182373, "logps/rejected": -2.1642794609069824, "loss": 4.1236, "rewards/accuracies": 0.5, "rewards/chosen": -20.705636978149414, "rewards/margins": 0.9371569156646729, "rewards/rejected": -21.64279556274414, "step": 957 }, { "epoch": 0.1304466230936819, "grad_norm": 49.083535581767336, "learning_rate": 7.977547678293318e-07, "logits/chosen": 4.473104000091553, "logits/rejected": 3.4383156299591064, "logps/chosen": -1.9833264350891113, "logps/rejected": -2.0197336673736572, "loss": 4.6419, "rewards/accuracies": 0.25, "rewards/chosen": -19.83326530456543, "rewards/margins": 0.36407041549682617, "rewards/rejected": -20.19733428955078, "step": 958 }, { "epoch": 0.13058278867102396, "grad_norm": 51.227079984744044, "learning_rate": 7.977346051399563e-07, "logits/chosen": 5.350119590759277, "logits/rejected": 5.465251445770264, "logps/chosen": -1.9534590244293213, "logps/rejected": -2.1739351749420166, "loss": 4.1825, "rewards/accuracies": 1.0, "rewards/chosen": -19.534591674804688, "rewards/margins": 2.204761505126953, "rewards/rejected": -21.739351272583008, "step": 959 }, { "epoch": 0.13071895424836602, "grad_norm": 47.36466187060154, "learning_rate": 7.97714352579159e-07, "logits/chosen": 3.092315196990967, "logits/rejected": 2.7317256927490234, "logps/chosen": -2.1760711669921875, "logps/rejected": -1.6117217540740967, "loss": 4.5603, "rewards/accuracies": 0.0, "rewards/chosen": -21.760711669921875, "rewards/margins": -5.643494606018066, "rewards/rejected": -16.117218017578125, "step": 960 }, { "epoch": 0.13085511982570805, "grad_norm": 50.31096595796402, "learning_rate": 7.976940101515161e-07, "logits/chosen": 4.896127700805664, "logits/rejected": 6.660222053527832, "logps/chosen": -2.306648015975952, "logps/rejected": -2.4038524627685547, "loss": 4.3529, "rewards/accuracies": 0.75, "rewards/chosen": -23.066478729248047, "rewards/margins": 0.9720456600189209, "rewards/rejected": -24.03852653503418, "step": 961 }, { "epoch": 0.1309912854030501, "grad_norm": 39.03432470282297, "learning_rate": 7.976735778616243e-07, "logits/chosen": 4.2287750244140625, "logits/rejected": 4.561674118041992, "logps/chosen": -2.042038917541504, "logps/rejected": -2.19973087310791, "loss": 3.6604, "rewards/accuracies": 0.75, "rewards/chosen": -20.42038917541504, "rewards/margins": 1.5769217014312744, "rewards/rejected": -21.997310638427734, "step": 962 }, { "epoch": 0.13112745098039216, "grad_norm": 46.242176918362205, "learning_rate": 7.976530557141005e-07, "logits/chosen": 2.236238479614258, "logits/rejected": 2.8560032844543457, "logps/chosen": -2.059316396713257, "logps/rejected": -2.5558269023895264, "loss": 4.3322, "rewards/accuracies": 0.5, "rewards/chosen": -20.593162536621094, "rewards/margins": 4.965105056762695, "rewards/rejected": -25.55826759338379, "step": 963 }, { "epoch": 0.1312636165577342, "grad_norm": 50.88213264810947, "learning_rate": 7.976324437135816e-07, "logits/chosen": 3.77366304397583, "logits/rejected": 4.560412406921387, "logps/chosen": -1.5472891330718994, "logps/rejected": -1.8739361763000488, "loss": 5.2659, "rewards/accuracies": 0.75, "rewards/chosen": -15.472890853881836, "rewards/margins": 3.2664694786071777, "rewards/rejected": -18.739360809326172, "step": 964 }, { "epoch": 0.13139978213507625, "grad_norm": 47.876548895451904, "learning_rate": 7.976117418647252e-07, "logits/chosen": 3.4880549907684326, "logits/rejected": 4.130314350128174, "logps/chosen": -1.4403774738311768, "logps/rejected": -1.7024507522583008, "loss": 3.8921, "rewards/accuracies": 0.75, "rewards/chosen": -14.403775215148926, "rewards/margins": 2.6207315921783447, "rewards/rejected": -17.024507522583008, "step": 965 }, { "epoch": 0.1315359477124183, "grad_norm": 50.99959510175533, "learning_rate": 7.975909501722091e-07, "logits/chosen": 2.7523601055145264, "logits/rejected": 3.877437114715576, "logps/chosen": -1.8566656112670898, "logps/rejected": -1.6848039627075195, "loss": 4.8541, "rewards/accuracies": 0.5, "rewards/chosen": -18.56665802001953, "rewards/margins": -1.7186167240142822, "rewards/rejected": -16.848039627075195, "step": 966 }, { "epoch": 0.13167211328976036, "grad_norm": 50.956380927826146, "learning_rate": 7.975700686407312e-07, "logits/chosen": 3.3729968070983887, "logits/rejected": 4.822307586669922, "logps/chosen": -1.8159701824188232, "logps/rejected": -2.2293167114257812, "loss": 4.543, "rewards/accuracies": 1.0, "rewards/chosen": -18.15970230102539, "rewards/margins": 4.133465766906738, "rewards/rejected": -22.293167114257812, "step": 967 }, { "epoch": 0.1318082788671024, "grad_norm": 54.749953128116665, "learning_rate": 7.9754909727501e-07, "logits/chosen": 3.351588487625122, "logits/rejected": 5.281325340270996, "logps/chosen": -2.151978015899658, "logps/rejected": -2.2518444061279297, "loss": 4.2842, "rewards/accuracies": 0.75, "rewards/chosen": -21.519779205322266, "rewards/margins": 0.998664379119873, "rewards/rejected": -22.518444061279297, "step": 968 }, { "epoch": 0.13194444444444445, "grad_norm": 43.080488756247696, "learning_rate": 7.975280360797841e-07, "logits/chosen": 4.246837139129639, "logits/rejected": 4.487598896026611, "logps/chosen": -2.3251090049743652, "logps/rejected": -2.337202310562134, "loss": 4.4683, "rewards/accuracies": 0.75, "rewards/chosen": -23.25109100341797, "rewards/margins": 0.12093162536621094, "rewards/rejected": -23.37202262878418, "step": 969 }, { "epoch": 0.1320806100217865, "grad_norm": 47.76658154043028, "learning_rate": 7.975068850598125e-07, "logits/chosen": 4.291533470153809, "logits/rejected": 3.0182411670684814, "logps/chosen": -2.1051673889160156, "logps/rejected": -1.8428406715393066, "loss": 4.2182, "rewards/accuracies": 0.25, "rewards/chosen": -21.051673889160156, "rewards/margins": -2.62326717376709, "rewards/rejected": -18.428407669067383, "step": 970 }, { "epoch": 0.13221677559912853, "grad_norm": 57.444594265324795, "learning_rate": 7.974856442198743e-07, "logits/chosen": 4.133232116699219, "logits/rejected": 4.52957820892334, "logps/chosen": -1.9346314668655396, "logps/rejected": -1.835188627243042, "loss": 4.3984, "rewards/accuracies": 0.25, "rewards/chosen": -19.3463134765625, "rewards/margins": -0.9944281578063965, "rewards/rejected": -18.351886749267578, "step": 971 }, { "epoch": 0.1323529411764706, "grad_norm": 60.7068250230704, "learning_rate": 7.974643135647692e-07, "logits/chosen": 5.3910088539123535, "logits/rejected": 2.681448459625244, "logps/chosen": -2.036872386932373, "logps/rejected": -1.6932172775268555, "loss": 4.5738, "rewards/accuracies": 0.25, "rewards/chosen": -20.368724822998047, "rewards/margins": -3.436553716659546, "rewards/rejected": -16.932170867919922, "step": 972 }, { "epoch": 0.13248910675381265, "grad_norm": 48.84333161134421, "learning_rate": 7.97442893099317e-07, "logits/chosen": 3.5312976837158203, "logits/rejected": 3.8547539710998535, "logps/chosen": -1.820877194404602, "logps/rejected": -1.8509085178375244, "loss": 4.1499, "rewards/accuracies": 0.5, "rewards/chosen": -18.208770751953125, "rewards/margins": 0.3003120422363281, "rewards/rejected": -18.509082794189453, "step": 973 }, { "epoch": 0.13262527233115468, "grad_norm": 56.94220670747304, "learning_rate": 7.974213828283577e-07, "logits/chosen": 4.5804829597473145, "logits/rejected": 4.682823181152344, "logps/chosen": -2.1328482627868652, "logps/rejected": -2.189065933227539, "loss": 4.6415, "rewards/accuracies": 0.5, "rewards/chosen": -21.328481674194336, "rewards/margins": 0.5621762275695801, "rewards/rejected": -21.890657424926758, "step": 974 }, { "epoch": 0.13276143790849673, "grad_norm": 48.43445876643324, "learning_rate": 7.973997827567519e-07, "logits/chosen": 4.169370174407959, "logits/rejected": 5.91585636138916, "logps/chosen": -1.8856592178344727, "logps/rejected": -2.4391183853149414, "loss": 4.2587, "rewards/accuracies": 1.0, "rewards/chosen": -18.85659408569336, "rewards/margins": 5.534591197967529, "rewards/rejected": -24.391185760498047, "step": 975 }, { "epoch": 0.1328976034858388, "grad_norm": 45.08184627361692, "learning_rate": 7.973780928893802e-07, "logits/chosen": 3.8775973320007324, "logits/rejected": 5.359223365783691, "logps/chosen": -1.919114351272583, "logps/rejected": -2.533646583557129, "loss": 3.8817, "rewards/accuracies": 1.0, "rewards/chosen": -19.191143035888672, "rewards/margins": 6.145323276519775, "rewards/rejected": -25.336467742919922, "step": 976 }, { "epoch": 0.13303376906318082, "grad_norm": 56.09289219192801, "learning_rate": 7.973563132311437e-07, "logits/chosen": 2.984923839569092, "logits/rejected": 3.725620746612549, "logps/chosen": -2.4187049865722656, "logps/rejected": -2.253016948699951, "loss": 4.307, "rewards/accuracies": 0.5, "rewards/chosen": -24.187049865722656, "rewards/margins": -1.6568799018859863, "rewards/rejected": -22.530170440673828, "step": 977 }, { "epoch": 0.13316993464052287, "grad_norm": 52.605489467796964, "learning_rate": 7.973344437869636e-07, "logits/chosen": 4.163606643676758, "logits/rejected": 4.523069381713867, "logps/chosen": -1.6992753744125366, "logps/rejected": -2.365288496017456, "loss": 4.691, "rewards/accuracies": 0.75, "rewards/chosen": -16.992753982543945, "rewards/margins": 6.66013240814209, "rewards/rejected": -23.65288543701172, "step": 978 }, { "epoch": 0.13330610021786493, "grad_norm": 47.88216664115121, "learning_rate": 7.973124845617815e-07, "logits/chosen": 4.534308433532715, "logits/rejected": 5.856690406799316, "logps/chosen": -2.352555990219116, "logps/rejected": -2.4972190856933594, "loss": 4.7095, "rewards/accuracies": 0.75, "rewards/chosen": -23.525558471679688, "rewards/margins": 1.4466314315795898, "rewards/rejected": -24.972190856933594, "step": 979 }, { "epoch": 0.13344226579520696, "grad_norm": 213.90874589908165, "learning_rate": 7.972904355605594e-07, "logits/chosen": 5.879633903503418, "logits/rejected": 3.6448092460632324, "logps/chosen": -2.0529088973999023, "logps/rejected": -1.9756548404693604, "loss": 3.879, "rewards/accuracies": 0.25, "rewards/chosen": -20.529088973999023, "rewards/margins": -0.7725400924682617, "rewards/rejected": -19.756547927856445, "step": 980 }, { "epoch": 0.13357843137254902, "grad_norm": 41.382381397155505, "learning_rate": 7.972682967882793e-07, "logits/chosen": 4.053157806396484, "logits/rejected": 3.6222245693206787, "logps/chosen": -2.054159641265869, "logps/rejected": -1.9190062284469604, "loss": 4.2723, "rewards/accuracies": 0.5, "rewards/chosen": -20.541595458984375, "rewards/margins": -1.3515324592590332, "rewards/rejected": -19.1900634765625, "step": 981 }, { "epoch": 0.13371459694989107, "grad_norm": 42.675774649923795, "learning_rate": 7.972460682499436e-07, "logits/chosen": 2.7811765670776367, "logits/rejected": 6.186593055725098, "logps/chosen": -2.1404130458831787, "logps/rejected": -2.3828067779541016, "loss": 4.3807, "rewards/accuracies": 0.5, "rewards/chosen": -21.404132843017578, "rewards/margins": 2.4239344596862793, "rewards/rejected": -23.828065872192383, "step": 982 }, { "epoch": 0.1338507625272331, "grad_norm": 50.197445627127294, "learning_rate": 7.972237499505752e-07, "logits/chosen": 4.719696521759033, "logits/rejected": 3.5258288383483887, "logps/chosen": -2.1296708583831787, "logps/rejected": -2.0927040576934814, "loss": 4.2228, "rewards/accuracies": 0.5, "rewards/chosen": -21.296710968017578, "rewards/margins": -0.36966943740844727, "rewards/rejected": -20.927040100097656, "step": 983 }, { "epoch": 0.13398692810457516, "grad_norm": 48.87911605940029, "learning_rate": 7.972013418952171e-07, "logits/chosen": 3.941441059112549, "logits/rejected": 4.572856426239014, "logps/chosen": -1.6212220191955566, "logps/rejected": -2.0114712715148926, "loss": 4.2577, "rewards/accuracies": 0.75, "rewards/chosen": -16.21221923828125, "rewards/margins": 3.902493715286255, "rewards/rejected": -20.114713668823242, "step": 984 }, { "epoch": 0.13412309368191722, "grad_norm": 49.102846358808854, "learning_rate": 7.971788440889324e-07, "logits/chosen": 5.568404197692871, "logits/rejected": 5.142353057861328, "logps/chosen": -2.0520334243774414, "logps/rejected": -2.043212413787842, "loss": 5.5378, "rewards/accuracies": 0.75, "rewards/chosen": -20.52033233642578, "rewards/margins": -0.0882103443145752, "rewards/rejected": -20.4321231842041, "step": 985 }, { "epoch": 0.13425925925925927, "grad_norm": 47.130618195538986, "learning_rate": 7.971562565368048e-07, "logits/chosen": 4.325529098510742, "logits/rejected": 4.223384857177734, "logps/chosen": -2.0123467445373535, "logps/rejected": -2.2217302322387695, "loss": 4.1463, "rewards/accuracies": 0.5, "rewards/chosen": -20.123470306396484, "rewards/margins": 2.0938305854797363, "rewards/rejected": -22.217300415039062, "step": 986 }, { "epoch": 0.1343954248366013, "grad_norm": 56.85513994092024, "learning_rate": 7.971335792439381e-07, "logits/chosen": 3.0165677070617676, "logits/rejected": 3.8728694915771484, "logps/chosen": -1.7131550312042236, "logps/rejected": -1.9218038320541382, "loss": 4.317, "rewards/accuracies": 0.5, "rewards/chosen": -17.131549835205078, "rewards/margins": 2.086489200592041, "rewards/rejected": -19.218036651611328, "step": 987 }, { "epoch": 0.13453159041394336, "grad_norm": 43.19919892905289, "learning_rate": 7.971108122154564e-07, "logits/chosen": 2.199279308319092, "logits/rejected": 3.4707608222961426, "logps/chosen": -1.775047779083252, "logps/rejected": -2.2052979469299316, "loss": 4.0318, "rewards/accuracies": 0.75, "rewards/chosen": -17.750476837158203, "rewards/margins": 4.30250358581543, "rewards/rejected": -22.052980422973633, "step": 988 }, { "epoch": 0.13466775599128541, "grad_norm": 46.923278013458386, "learning_rate": 7.970879554565041e-07, "logits/chosen": 4.047567367553711, "logits/rejected": 1.979766845703125, "logps/chosen": -2.0801734924316406, "logps/rejected": -2.026052236557007, "loss": 3.8622, "rewards/accuracies": 0.5, "rewards/chosen": -20.801734924316406, "rewards/margins": -0.5412132740020752, "rewards/rejected": -20.260522842407227, "step": 989 }, { "epoch": 0.13480392156862744, "grad_norm": 45.60032744736615, "learning_rate": 7.970650089722459e-07, "logits/chosen": 5.783294677734375, "logits/rejected": 5.452220916748047, "logps/chosen": -2.1028800010681152, "logps/rejected": -2.3600974082946777, "loss": 3.8532, "rewards/accuracies": 0.5, "rewards/chosen": -21.02880096435547, "rewards/margins": 2.5721747875213623, "rewards/rejected": -23.600975036621094, "step": 990 }, { "epoch": 0.1349400871459695, "grad_norm": 49.89439065257801, "learning_rate": 7.970419727678669e-07, "logits/chosen": 5.072588920593262, "logits/rejected": 6.236075401306152, "logps/chosen": -2.3951921463012695, "logps/rejected": -2.711578369140625, "loss": 4.6813, "rewards/accuracies": 1.0, "rewards/chosen": -23.951919555664062, "rewards/margins": 3.1638622283935547, "rewards/rejected": -27.11578369140625, "step": 991 }, { "epoch": 0.13507625272331156, "grad_norm": 45.106554757655005, "learning_rate": 7.970188468485719e-07, "logits/chosen": 5.099736213684082, "logits/rejected": 5.608115196228027, "logps/chosen": -2.3975818157196045, "logps/rejected": -2.559650421142578, "loss": 4.282, "rewards/accuracies": 0.75, "rewards/chosen": -23.97581672668457, "rewards/margins": 1.6206855773925781, "rewards/rejected": -25.59650230407715, "step": 992 }, { "epoch": 0.1352124183006536, "grad_norm": 45.58925512914784, "learning_rate": 7.969956312195868e-07, "logits/chosen": 5.022174835205078, "logits/rejected": 5.886713981628418, "logps/chosen": -2.6145336627960205, "logps/rejected": -2.6634230613708496, "loss": 4.4021, "rewards/accuracies": 0.5, "rewards/chosen": -26.145336151123047, "rewards/margins": 0.4888935089111328, "rewards/rejected": -26.63422966003418, "step": 993 }, { "epoch": 0.13534858387799564, "grad_norm": 46.04679495583251, "learning_rate": 7.969723258861573e-07, "logits/chosen": 2.5523691177368164, "logits/rejected": 3.9369258880615234, "logps/chosen": -2.185417652130127, "logps/rejected": -2.3597724437713623, "loss": 3.8463, "rewards/accuracies": 0.25, "rewards/chosen": -21.854175567626953, "rewards/margins": 1.743549108505249, "rewards/rejected": -23.59772491455078, "step": 994 }, { "epoch": 0.1354847494553377, "grad_norm": 47.8656962906433, "learning_rate": 7.969489308535494e-07, "logits/chosen": 3.4875621795654297, "logits/rejected": 3.8863320350646973, "logps/chosen": -1.6470462083816528, "logps/rejected": -1.8548824787139893, "loss": 4.1377, "rewards/accuracies": 0.75, "rewards/chosen": -16.470462799072266, "rewards/margins": 2.0783629417419434, "rewards/rejected": -18.548824310302734, "step": 995 }, { "epoch": 0.13562091503267973, "grad_norm": 61.6778311320352, "learning_rate": 7.969254461270493e-07, "logits/chosen": 4.330343246459961, "logits/rejected": 3.9407095909118652, "logps/chosen": -1.8384894132614136, "logps/rejected": -2.284733772277832, "loss": 5.0839, "rewards/accuracies": 1.0, "rewards/chosen": -18.38489532470703, "rewards/margins": 4.4624433517456055, "rewards/rejected": -22.84733772277832, "step": 996 }, { "epoch": 0.13575708061002179, "grad_norm": 49.574676440694454, "learning_rate": 7.969018717119635e-07, "logits/chosen": 6.398221492767334, "logits/rejected": 6.9847259521484375, "logps/chosen": -2.1014816761016846, "logps/rejected": -2.2102534770965576, "loss": 4.0394, "rewards/accuracies": 0.75, "rewards/chosen": -21.014816284179688, "rewards/margins": 1.0877175331115723, "rewards/rejected": -22.102535247802734, "step": 997 }, { "epoch": 0.13589324618736384, "grad_norm": 44.96095200950198, "learning_rate": 7.968782076136191e-07, "logits/chosen": 2.930283308029175, "logits/rejected": 5.049941062927246, "logps/chosen": -2.171353816986084, "logps/rejected": -2.4129629135131836, "loss": 4.1089, "rewards/accuracies": 0.5, "rewards/chosen": -21.713539123535156, "rewards/margins": 2.416090488433838, "rewards/rejected": -24.129629135131836, "step": 998 }, { "epoch": 0.13602941176470587, "grad_norm": 46.29333695563555, "learning_rate": 7.968544538373631e-07, "logits/chosen": 4.166629314422607, "logits/rejected": 6.42094087600708, "logps/chosen": -1.9230842590332031, "logps/rejected": -2.419991970062256, "loss": 3.8556, "rewards/accuracies": 0.75, "rewards/chosen": -19.23084259033203, "rewards/margins": 4.969076156616211, "rewards/rejected": -24.199918746948242, "step": 999 }, { "epoch": 0.13616557734204793, "grad_norm": 57.42655314203598, "learning_rate": 7.968306103885627e-07, "logits/chosen": 4.926385402679443, "logits/rejected": 6.983389377593994, "logps/chosen": -2.073366403579712, "logps/rejected": -2.279418468475342, "loss": 3.9182, "rewards/accuracies": 0.75, "rewards/chosen": -20.733665466308594, "rewards/margins": 2.060520648956299, "rewards/rejected": -22.794185638427734, "step": 1000 }, { "epoch": 0.13630174291938998, "grad_norm": 50.10276318976019, "learning_rate": 7.968066772726057e-07, "logits/chosen": 5.755542755126953, "logits/rejected": 6.5435333251953125, "logps/chosen": -2.1833250522613525, "logps/rejected": -2.3083083629608154, "loss": 4.1271, "rewards/accuracies": 0.5, "rewards/chosen": -21.833250045776367, "rewards/margins": 1.2498326301574707, "rewards/rejected": -23.083084106445312, "step": 1001 }, { "epoch": 0.136437908496732, "grad_norm": 46.78896321831273, "learning_rate": 7.967826544949e-07, "logits/chosen": 4.221368312835693, "logits/rejected": 5.376340866088867, "logps/chosen": -2.0299232006073, "logps/rejected": -2.463860034942627, "loss": 3.4598, "rewards/accuracies": 1.0, "rewards/chosen": -20.299232482910156, "rewards/margins": 4.339366436004639, "rewards/rejected": -24.63859748840332, "step": 1002 }, { "epoch": 0.13657407407407407, "grad_norm": 53.29676810790914, "learning_rate": 7.967585420608735e-07, "logits/chosen": 5.54328727722168, "logits/rejected": 6.363629341125488, "logps/chosen": -2.3394711017608643, "logps/rejected": -2.529414176940918, "loss": 4.3776, "rewards/accuracies": 0.75, "rewards/chosen": -23.394710540771484, "rewards/margins": 1.899430274963379, "rewards/rejected": -25.294139862060547, "step": 1003 }, { "epoch": 0.13671023965141613, "grad_norm": 49.281302584666655, "learning_rate": 7.96734339975975e-07, "logits/chosen": 6.152274131774902, "logits/rejected": 5.562819004058838, "logps/chosen": -2.431075096130371, "logps/rejected": -2.544400215148926, "loss": 4.0262, "rewards/accuracies": 0.75, "rewards/chosen": -24.310752868652344, "rewards/margins": 1.133251667022705, "rewards/rejected": -25.44400405883789, "step": 1004 }, { "epoch": 0.13684640522875818, "grad_norm": 46.67309596342532, "learning_rate": 7.967100482456726e-07, "logits/chosen": 6.460958480834961, "logits/rejected": 7.138529300689697, "logps/chosen": -2.206467628479004, "logps/rejected": -2.4913711547851562, "loss": 4.1087, "rewards/accuracies": 1.0, "rewards/chosen": -22.06467628479004, "rewards/margins": 2.8490347862243652, "rewards/rejected": -24.913711547851562, "step": 1005 }, { "epoch": 0.1369825708061002, "grad_norm": 45.71079213082513, "learning_rate": 7.966856668754559e-07, "logits/chosen": 6.183473587036133, "logits/rejected": 7.865950584411621, "logps/chosen": -2.276963233947754, "logps/rejected": -2.698779582977295, "loss": 4.1645, "rewards/accuracies": 1.0, "rewards/chosen": -22.769634246826172, "rewards/margins": 4.218163967132568, "rewards/rejected": -26.987796783447266, "step": 1006 }, { "epoch": 0.13711873638344227, "grad_norm": 53.352208155819916, "learning_rate": 7.966611958708337e-07, "logits/chosen": 5.227699279785156, "logits/rejected": 6.491503715515137, "logps/chosen": -2.195051670074463, "logps/rejected": -2.3064403533935547, "loss": 4.5424, "rewards/accuracies": 0.5, "rewards/chosen": -21.950515747070312, "rewards/margins": 1.1138887405395508, "rewards/rejected": -23.064403533935547, "step": 1007 }, { "epoch": 0.13725490196078433, "grad_norm": 43.79303774941913, "learning_rate": 7.966366352373354e-07, "logits/chosen": 6.17009973526001, "logits/rejected": 5.970820426940918, "logps/chosen": -2.2414960861206055, "logps/rejected": -2.3855643272399902, "loss": 4.4723, "rewards/accuracies": 0.5, "rewards/chosen": -22.414958953857422, "rewards/margins": 1.4406819343566895, "rewards/rejected": -23.855640411376953, "step": 1008 }, { "epoch": 0.13739106753812635, "grad_norm": 46.06948484389758, "learning_rate": 7.966119849805107e-07, "logits/chosen": 4.792661666870117, "logits/rejected": 5.642295837402344, "logps/chosen": -2.5291213989257812, "logps/rejected": -2.5257763862609863, "loss": 4.5125, "rewards/accuracies": 0.25, "rewards/chosen": -25.291213989257812, "rewards/margins": -0.03345012664794922, "rewards/rejected": -25.25776481628418, "step": 1009 }, { "epoch": 0.1375272331154684, "grad_norm": 47.47757072924291, "learning_rate": 7.965872451059295e-07, "logits/chosen": 4.543857574462891, "logits/rejected": 6.1334967613220215, "logps/chosen": -2.167302131652832, "logps/rejected": -2.574005365371704, "loss": 4.2466, "rewards/accuracies": 1.0, "rewards/chosen": -21.673019409179688, "rewards/margins": 4.067032814025879, "rewards/rejected": -25.740053176879883, "step": 1010 }, { "epoch": 0.13766339869281047, "grad_norm": 50.56296088076588, "learning_rate": 7.965624156191822e-07, "logits/chosen": 5.582206726074219, "logits/rejected": 6.183130741119385, "logps/chosen": -1.9676964282989502, "logps/rejected": -2.230074882507324, "loss": 4.5111, "rewards/accuracies": 1.0, "rewards/chosen": -19.676963806152344, "rewards/margins": 2.6237823963165283, "rewards/rejected": -22.300748825073242, "step": 1011 }, { "epoch": 0.1377995642701525, "grad_norm": 50.99114821270767, "learning_rate": 7.96537496525879e-07, "logits/chosen": 5.342528343200684, "logits/rejected": 3.897219657897949, "logps/chosen": -1.6309785842895508, "logps/rejected": -1.7863633632659912, "loss": 4.5261, "rewards/accuracies": 0.5, "rewards/chosen": -16.309785842895508, "rewards/margins": 1.5538480281829834, "rewards/rejected": -17.863632202148438, "step": 1012 }, { "epoch": 0.13793572984749455, "grad_norm": 48.06573562515427, "learning_rate": 7.965124878316506e-07, "logits/chosen": 6.778437614440918, "logits/rejected": 6.482697486877441, "logps/chosen": -2.3960983753204346, "logps/rejected": -2.43254017829895, "loss": 3.8697, "rewards/accuracies": 0.5, "rewards/chosen": -23.960983276367188, "rewards/margins": 0.36441850662231445, "rewards/rejected": -24.325403213500977, "step": 1013 }, { "epoch": 0.1380718954248366, "grad_norm": 60.82513587153696, "learning_rate": 7.96487389542148e-07, "logits/chosen": 6.637144088745117, "logits/rejected": 6.1011247634887695, "logps/chosen": -2.1287920475006104, "logps/rejected": -2.3270628452301025, "loss": 4.2687, "rewards/accuracies": 0.75, "rewards/chosen": -21.287919998168945, "rewards/margins": 1.98270845413208, "rewards/rejected": -23.270627975463867, "step": 1014 }, { "epoch": 0.13820806100217864, "grad_norm": 43.800269398271496, "learning_rate": 7.964622016630424e-07, "logits/chosen": 5.1817779541015625, "logits/rejected": 7.03879451751709, "logps/chosen": -1.9167916774749756, "logps/rejected": -2.4773428440093994, "loss": 3.7978, "rewards/accuracies": 1.0, "rewards/chosen": -19.167919158935547, "rewards/margins": 5.6055097579956055, "rewards/rejected": -24.773427963256836, "step": 1015 }, { "epoch": 0.1383442265795207, "grad_norm": 45.77397809471599, "learning_rate": 7.964369242000252e-07, "logits/chosen": 6.1734700202941895, "logits/rejected": 6.780320167541504, "logps/chosen": -2.265819549560547, "logps/rejected": -2.3198344707489014, "loss": 4.5411, "rewards/accuracies": 0.5, "rewards/chosen": -22.65819549560547, "rewards/margins": 0.5401511192321777, "rewards/rejected": -23.198345184326172, "step": 1016 }, { "epoch": 0.13848039215686275, "grad_norm": 46.58457261057992, "learning_rate": 7.964115571588078e-07, "logits/chosen": 2.1859750747680664, "logits/rejected": 5.170090675354004, "logps/chosen": -1.4195713996887207, "logps/rejected": -1.754122257232666, "loss": 4.2616, "rewards/accuracies": 0.75, "rewards/chosen": -14.195713996887207, "rewards/margins": 3.3455092906951904, "rewards/rejected": -17.541223526000977, "step": 1017 }, { "epoch": 0.13861655773420478, "grad_norm": 52.968906504164046, "learning_rate": 7.963861005451224e-07, "logits/chosen": 2.756232738494873, "logits/rejected": 4.040573596954346, "logps/chosen": -1.527591586112976, "logps/rejected": -1.7065008878707886, "loss": 4.4288, "rewards/accuracies": 1.0, "rewards/chosen": -15.27591609954834, "rewards/margins": 1.789092779159546, "rewards/rejected": -17.06501007080078, "step": 1018 }, { "epoch": 0.13875272331154684, "grad_norm": 43.29654005476456, "learning_rate": 7.96360554364721e-07, "logits/chosen": 5.291211128234863, "logits/rejected": 5.492145538330078, "logps/chosen": -2.1507720947265625, "logps/rejected": -2.3138866424560547, "loss": 4.4598, "rewards/accuracies": 0.75, "rewards/chosen": -21.507720947265625, "rewards/margins": 1.631143569946289, "rewards/rejected": -23.138864517211914, "step": 1019 }, { "epoch": 0.1388888888888889, "grad_norm": 43.163696062814275, "learning_rate": 7.963349186233759e-07, "logits/chosen": 5.445135116577148, "logits/rejected": 5.999203681945801, "logps/chosen": -1.610558032989502, "logps/rejected": -1.541162133216858, "loss": 4.4669, "rewards/accuracies": 0.5, "rewards/chosen": -16.105581283569336, "rewards/margins": -0.6939592361450195, "rewards/rejected": -15.41162109375, "step": 1020 }, { "epoch": 0.13902505446623092, "grad_norm": 41.63457806352326, "learning_rate": 7.9630919332688e-07, "logits/chosen": 5.658180236816406, "logits/rejected": 4.146244049072266, "logps/chosen": -1.9695402383804321, "logps/rejected": -2.053072929382324, "loss": 4.2603, "rewards/accuracies": 0.5, "rewards/chosen": -19.695402145385742, "rewards/margins": 0.8353266716003418, "rewards/rejected": -20.530729293823242, "step": 1021 }, { "epoch": 0.13916122004357298, "grad_norm": 47.7237513497023, "learning_rate": 7.962833784810457e-07, "logits/chosen": 4.687623023986816, "logits/rejected": 4.470863342285156, "logps/chosen": -1.7590824365615845, "logps/rejected": -1.708630084991455, "loss": 3.9614, "rewards/accuracies": 0.5, "rewards/chosen": -17.590824127197266, "rewards/margins": -0.5045232772827148, "rewards/rejected": -17.086299896240234, "step": 1022 }, { "epoch": 0.13929738562091504, "grad_norm": 45.41193362160696, "learning_rate": 7.962574740917066e-07, "logits/chosen": 5.4449687004089355, "logits/rejected": 7.119078159332275, "logps/chosen": -2.281217575073242, "logps/rejected": -2.2781388759613037, "loss": 4.0513, "rewards/accuracies": 0.25, "rewards/chosen": -22.812175750732422, "rewards/margins": -0.030787944793701172, "rewards/rejected": -22.781389236450195, "step": 1023 }, { "epoch": 0.1394335511982571, "grad_norm": 49.39149155745676, "learning_rate": 7.962314801647157e-07, "logits/chosen": 2.9478116035461426, "logits/rejected": 1.6148114204406738, "logps/chosen": -1.9713932275772095, "logps/rejected": -1.8362253904342651, "loss": 4.8189, "rewards/accuracies": 0.5, "rewards/chosen": -19.713932037353516, "rewards/margins": -1.3516778945922852, "rewards/rejected": -18.362253189086914, "step": 1024 }, { "epoch": 0.13956971677559912, "grad_norm": 55.34554632467121, "learning_rate": 7.962053967059464e-07, "logits/chosen": 4.417041778564453, "logits/rejected": 6.113276481628418, "logps/chosen": -1.9175881147384644, "logps/rejected": -2.4994046688079834, "loss": 5.2489, "rewards/accuracies": 0.75, "rewards/chosen": -19.175880432128906, "rewards/margins": 5.818165302276611, "rewards/rejected": -24.99404525756836, "step": 1025 }, { "epoch": 0.13970588235294118, "grad_norm": 46.4626396253127, "learning_rate": 7.961792237212927e-07, "logits/chosen": 5.216333389282227, "logits/rejected": 6.551816940307617, "logps/chosen": -2.26210355758667, "logps/rejected": -2.6473398208618164, "loss": 4.1438, "rewards/accuracies": 1.0, "rewards/chosen": -22.621036529541016, "rewards/margins": 3.8523635864257812, "rewards/rejected": -26.473400115966797, "step": 1026 }, { "epoch": 0.13984204793028324, "grad_norm": 46.7082167760848, "learning_rate": 7.961529612166685e-07, "logits/chosen": 5.4353346824646, "logits/rejected": 5.138882637023926, "logps/chosen": -1.9646060466766357, "logps/rejected": -1.9841561317443848, "loss": 4.2783, "rewards/accuracies": 0.25, "rewards/chosen": -19.646060943603516, "rewards/margins": 0.19550180435180664, "rewards/rejected": -19.841562271118164, "step": 1027 }, { "epoch": 0.13997821350762527, "grad_norm": 48.03628874425826, "learning_rate": 7.961266091980082e-07, "logits/chosen": 5.660122871398926, "logits/rejected": 6.692634582519531, "logps/chosen": -2.158141613006592, "logps/rejected": -2.1116585731506348, "loss": 4.0108, "rewards/accuracies": 0.5, "rewards/chosen": -21.581417083740234, "rewards/margins": -0.4648292064666748, "rewards/rejected": -21.116588592529297, "step": 1028 }, { "epoch": 0.14011437908496732, "grad_norm": 47.91736998471385, "learning_rate": 7.96100167671266e-07, "logits/chosen": 1.9720420837402344, "logits/rejected": 3.2649435997009277, "logps/chosen": -1.712989091873169, "logps/rejected": -2.118776798248291, "loss": 4.9325, "rewards/accuracies": 0.75, "rewards/chosen": -17.12989044189453, "rewards/margins": 4.057876110076904, "rewards/rejected": -21.187767028808594, "step": 1029 }, { "epoch": 0.14025054466230938, "grad_norm": 53.416966297615424, "learning_rate": 7.960736366424167e-07, "logits/chosen": 4.360380172729492, "logits/rejected": 5.6599836349487305, "logps/chosen": -1.7232075929641724, "logps/rejected": -1.766570806503296, "loss": 4.4062, "rewards/accuracies": 0.5, "rewards/chosen": -17.232074737548828, "rewards/margins": 0.43363165855407715, "rewards/rejected": -17.665706634521484, "step": 1030 }, { "epoch": 0.1403867102396514, "grad_norm": 45.99151588969897, "learning_rate": 7.960470161174555e-07, "logits/chosen": 5.360307693481445, "logits/rejected": 6.683546543121338, "logps/chosen": -2.0309877395629883, "logps/rejected": -2.4405713081359863, "loss": 4.0633, "rewards/accuracies": 0.75, "rewards/chosen": -20.30987548828125, "rewards/margins": 4.0958356857299805, "rewards/rejected": -24.405712127685547, "step": 1031 }, { "epoch": 0.14052287581699346, "grad_norm": 48.811371456147484, "learning_rate": 7.96020306102397e-07, "logits/chosen": 4.017085075378418, "logits/rejected": 4.3793206214904785, "logps/chosen": -1.8930926322937012, "logps/rejected": -1.748248815536499, "loss": 4.2363, "rewards/accuracies": 0.5, "rewards/chosen": -18.930927276611328, "rewards/margins": -1.448439359664917, "rewards/rejected": -17.482486724853516, "step": 1032 }, { "epoch": 0.14065904139433552, "grad_norm": 55.164416923691704, "learning_rate": 7.959935066032769e-07, "logits/chosen": 5.667030334472656, "logits/rejected": 6.86879825592041, "logps/chosen": -2.0046916007995605, "logps/rejected": -2.2825822830200195, "loss": 4.6109, "rewards/accuracies": 0.5, "rewards/chosen": -20.04691505432129, "rewards/margins": 2.778907299041748, "rewards/rejected": -22.825820922851562, "step": 1033 }, { "epoch": 0.14079520697167755, "grad_norm": 53.013148700535915, "learning_rate": 7.959666176261507e-07, "logits/chosen": 7.663473129272461, "logits/rejected": 5.569563865661621, "logps/chosen": -2.16133451461792, "logps/rejected": -2.110358476638794, "loss": 4.4247, "rewards/accuracies": 0.25, "rewards/chosen": -21.613346099853516, "rewards/margins": -0.5097613334655762, "rewards/rejected": -21.10358428955078, "step": 1034 }, { "epoch": 0.1409313725490196, "grad_norm": 55.56316849255676, "learning_rate": 7.95939639177094e-07, "logits/chosen": 4.902765274047852, "logits/rejected": 5.226705551147461, "logps/chosen": -2.3151988983154297, "logps/rejected": -2.3405163288116455, "loss": 4.7719, "rewards/accuracies": 0.75, "rewards/chosen": -23.151988983154297, "rewards/margins": 0.2531752586364746, "rewards/rejected": -23.405162811279297, "step": 1035 }, { "epoch": 0.14106753812636166, "grad_norm": 112.27927500679068, "learning_rate": 7.95912571262203e-07, "logits/chosen": 5.342618942260742, "logits/rejected": 4.995650768280029, "logps/chosen": -2.194971799850464, "logps/rejected": -2.2112390995025635, "loss": 3.8678, "rewards/accuracies": 0.5, "rewards/chosen": -21.949718475341797, "rewards/margins": 0.16267156600952148, "rewards/rejected": -22.112388610839844, "step": 1036 }, { "epoch": 0.1412037037037037, "grad_norm": 46.612563127848894, "learning_rate": 7.958854138875941e-07, "logits/chosen": 6.459105491638184, "logits/rejected": 7.88980770111084, "logps/chosen": -1.9008677005767822, "logps/rejected": -2.267080783843994, "loss": 3.9206, "rewards/accuracies": 0.75, "rewards/chosen": -19.00867462158203, "rewards/margins": 3.662132740020752, "rewards/rejected": -22.670808792114258, "step": 1037 }, { "epoch": 0.14133986928104575, "grad_norm": 49.82071199310295, "learning_rate": 7.958581670594032e-07, "logits/chosen": 8.312299728393555, "logits/rejected": 6.331727981567383, "logps/chosen": -2.4803104400634766, "logps/rejected": -2.206653118133545, "loss": 4.9175, "rewards/accuracies": 0.25, "rewards/chosen": -24.803104400634766, "rewards/margins": -2.736574649810791, "rewards/rejected": -22.066532135009766, "step": 1038 }, { "epoch": 0.1414760348583878, "grad_norm": 53.8069959455329, "learning_rate": 7.958308307837873e-07, "logits/chosen": 4.937051773071289, "logits/rejected": 4.528910160064697, "logps/chosen": -2.327319383621216, "logps/rejected": -2.2350826263427734, "loss": 4.154, "rewards/accuracies": 0.25, "rewards/chosen": -23.273193359375, "rewards/margins": -0.9223687648773193, "rewards/rejected": -22.350826263427734, "step": 1039 }, { "epoch": 0.14161220043572983, "grad_norm": 50.95442164833872, "learning_rate": 7.958034050669234e-07, "logits/chosen": 7.363531112670898, "logits/rejected": 6.030301094055176, "logps/chosen": -2.1089093685150146, "logps/rejected": -1.845641851425171, "loss": 4.7639, "rewards/accuracies": 0.0, "rewards/chosen": -21.089094161987305, "rewards/margins": -2.6326744556427, "rewards/rejected": -18.456418991088867, "step": 1040 }, { "epoch": 0.1417483660130719, "grad_norm": 52.87572588158923, "learning_rate": 7.957758899150083e-07, "logits/chosen": 4.971278190612793, "logits/rejected": 5.681962013244629, "logps/chosen": -1.9657094478607178, "logps/rejected": -1.9676095247268677, "loss": 4.9401, "rewards/accuracies": 0.5, "rewards/chosen": -19.657094955444336, "rewards/margins": 0.019000768661499023, "rewards/rejected": -19.676095962524414, "step": 1041 }, { "epoch": 0.14188453159041395, "grad_norm": 45.13563157673828, "learning_rate": 7.957482853342593e-07, "logits/chosen": 6.339360237121582, "logits/rejected": 5.772627353668213, "logps/chosen": -2.3745903968811035, "logps/rejected": -1.9021775722503662, "loss": 4.5425, "rewards/accuracies": 0.5, "rewards/chosen": -23.74590301513672, "rewards/margins": -4.724127769470215, "rewards/rejected": -19.021774291992188, "step": 1042 }, { "epoch": 0.142020697167756, "grad_norm": 48.895414631112985, "learning_rate": 7.95720591330914e-07, "logits/chosen": 5.814421653747559, "logits/rejected": 5.6527910232543945, "logps/chosen": -2.48193359375, "logps/rejected": -2.5209484100341797, "loss": 4.292, "rewards/accuracies": 0.5, "rewards/chosen": -24.819334030151367, "rewards/margins": 0.3901495933532715, "rewards/rejected": -25.209484100341797, "step": 1043 }, { "epoch": 0.14215686274509803, "grad_norm": 50.44315156646931, "learning_rate": 7.9569280791123e-07, "logits/chosen": 3.232903480529785, "logits/rejected": 4.902839183807373, "logps/chosen": -1.8420944213867188, "logps/rejected": -2.1924145221710205, "loss": 4.4135, "rewards/accuracies": 1.0, "rewards/chosen": -18.420944213867188, "rewards/margins": 3.503201961517334, "rewards/rejected": -21.92414665222168, "step": 1044 }, { "epoch": 0.1422930283224401, "grad_norm": 50.08086545975584, "learning_rate": 7.956649350814853e-07, "logits/chosen": 4.4003095626831055, "logits/rejected": 3.843292713165283, "logps/chosen": -2.4307634830474854, "logps/rejected": -2.248175859451294, "loss": 4.3023, "rewards/accuracies": 0.5, "rewards/chosen": -24.307636260986328, "rewards/margins": -1.8258755207061768, "rewards/rejected": -22.48175811767578, "step": 1045 }, { "epoch": 0.14242919389978215, "grad_norm": 54.42429294230667, "learning_rate": 7.956369728479778e-07, "logits/chosen": 5.983796119689941, "logits/rejected": 6.95272159576416, "logps/chosen": -2.0249714851379395, "logps/rejected": -2.1813859939575195, "loss": 4.4494, "rewards/accuracies": 1.0, "rewards/chosen": -20.249713897705078, "rewards/margins": 1.5641446113586426, "rewards/rejected": -21.813858032226562, "step": 1046 }, { "epoch": 0.14256535947712418, "grad_norm": 47.05940398165884, "learning_rate": 7.956089212170261e-07, "logits/chosen": 7.150551795959473, "logits/rejected": 7.135979652404785, "logps/chosen": -2.066507339477539, "logps/rejected": -2.2773332595825195, "loss": 4.168, "rewards/accuracies": 0.5, "rewards/chosen": -20.66507339477539, "rewards/margins": 2.108257293701172, "rewards/rejected": -22.773330688476562, "step": 1047 }, { "epoch": 0.14270152505446623, "grad_norm": 46.76371971720704, "learning_rate": 7.955807801949682e-07, "logits/chosen": 4.851590156555176, "logits/rejected": 5.81767463684082, "logps/chosen": -1.9255645275115967, "logps/rejected": -2.295163869857788, "loss": 3.7745, "rewards/accuracies": 1.0, "rewards/chosen": -19.255645751953125, "rewards/margins": 3.6959927082061768, "rewards/rejected": -22.951637268066406, "step": 1048 }, { "epoch": 0.1428376906318083, "grad_norm": 62.81963865833847, "learning_rate": 7.955525497881633e-07, "logits/chosen": 7.785830020904541, "logits/rejected": 7.584619522094727, "logps/chosen": -2.357166051864624, "logps/rejected": -2.7048888206481934, "loss": 3.9996, "rewards/accuracies": 0.5, "rewards/chosen": -23.5716609954834, "rewards/margins": 3.4772276878356934, "rewards/rejected": -27.04888916015625, "step": 1049 }, { "epoch": 0.14297385620915032, "grad_norm": 57.87172141769583, "learning_rate": 7.955242300029901e-07, "logits/chosen": 6.423809051513672, "logits/rejected": 6.134947776794434, "logps/chosen": -1.7155506610870361, "logps/rejected": -1.6229510307312012, "loss": 4.4466, "rewards/accuracies": 0.25, "rewards/chosen": -17.155506134033203, "rewards/margins": -0.9259970188140869, "rewards/rejected": -16.229511260986328, "step": 1050 }, { "epoch": 0.14311002178649238, "grad_norm": 53.52587933815226, "learning_rate": 7.954958208458478e-07, "logits/chosen": 5.869946479797363, "logits/rejected": 7.458627223968506, "logps/chosen": -2.0281543731689453, "logps/rejected": -2.051058292388916, "loss": 4.6205, "rewards/accuracies": 0.75, "rewards/chosen": -20.281543731689453, "rewards/margins": 0.2290358543395996, "rewards/rejected": -20.510581970214844, "step": 1051 }, { "epoch": 0.14324618736383443, "grad_norm": 48.41694371324118, "learning_rate": 7.954673223231553e-07, "logits/chosen": 7.997488975524902, "logits/rejected": 6.775203704833984, "logps/chosen": -2.3698830604553223, "logps/rejected": -2.470629930496216, "loss": 4.3586, "rewards/accuracies": 0.5, "rewards/chosen": -23.69883155822754, "rewards/margins": 1.0074687004089355, "rewards/rejected": -24.706298828125, "step": 1052 }, { "epoch": 0.14338235294117646, "grad_norm": 54.2673952754752, "learning_rate": 7.954387344413525e-07, "logits/chosen": 6.109495162963867, "logits/rejected": 7.391427993774414, "logps/chosen": -1.8037078380584717, "logps/rejected": -2.335245132446289, "loss": 4.443, "rewards/accuracies": 0.75, "rewards/chosen": -18.037078857421875, "rewards/margins": 5.315374374389648, "rewards/rejected": -23.352453231811523, "step": 1053 }, { "epoch": 0.14351851851851852, "grad_norm": 54.73790098865087, "learning_rate": 7.95410057206899e-07, "logits/chosen": 6.068110942840576, "logits/rejected": 6.894179344177246, "logps/chosen": -2.0141701698303223, "logps/rejected": -2.241074562072754, "loss": 3.9224, "rewards/accuracies": 0.75, "rewards/chosen": -20.141700744628906, "rewards/margins": 2.269045829772949, "rewards/rejected": -22.410747528076172, "step": 1054 }, { "epoch": 0.14365468409586057, "grad_norm": 51.523280818690424, "learning_rate": 7.953812906262745e-07, "logits/chosen": 8.136614799499512, "logits/rejected": 6.949071884155273, "logps/chosen": -2.009533166885376, "logps/rejected": -2.0068891048431396, "loss": 4.5898, "rewards/accuracies": 0.5, "rewards/chosen": -20.0953311920166, "rewards/margins": -0.026439666748046875, "rewards/rejected": -20.068891525268555, "step": 1055 }, { "epoch": 0.1437908496732026, "grad_norm": 46.60694964097246, "learning_rate": 7.953524347059792e-07, "logits/chosen": 5.627121925354004, "logits/rejected": 5.8312458992004395, "logps/chosen": -2.1889595985412598, "logps/rejected": -2.198256254196167, "loss": 4.3221, "rewards/accuracies": 0.25, "rewards/chosen": -21.88959503173828, "rewards/margins": 0.09296679496765137, "rewards/rejected": -21.982563018798828, "step": 1056 }, { "epoch": 0.14392701525054466, "grad_norm": 57.244252134995314, "learning_rate": 7.953234894525333e-07, "logits/chosen": 5.919942855834961, "logits/rejected": 7.810980796813965, "logps/chosen": -1.8680756092071533, "logps/rejected": -2.3732359409332275, "loss": 3.8517, "rewards/accuracies": 0.75, "rewards/chosen": -18.680755615234375, "rewards/margins": 5.051602363586426, "rewards/rejected": -23.732358932495117, "step": 1057 }, { "epoch": 0.14406318082788672, "grad_norm": 47.88189192125238, "learning_rate": 7.952944548724771e-07, "logits/chosen": 6.280356407165527, "logits/rejected": 5.733226776123047, "logps/chosen": -1.6158535480499268, "logps/rejected": -1.6675394773483276, "loss": 4.3499, "rewards/accuracies": 0.5, "rewards/chosen": -16.158536911010742, "rewards/margins": 0.5168585777282715, "rewards/rejected": -16.675395965576172, "step": 1058 }, { "epoch": 0.14419934640522875, "grad_norm": 46.40847013570618, "learning_rate": 7.952653309723713e-07, "logits/chosen": 7.231493949890137, "logits/rejected": 9.152608871459961, "logps/chosen": -2.271376609802246, "logps/rejected": -2.5555365085601807, "loss": 4.4584, "rewards/accuracies": 0.75, "rewards/chosen": -22.71376609802246, "rewards/margins": 2.8415989875793457, "rewards/rejected": -25.55536460876465, "step": 1059 }, { "epoch": 0.1443355119825708, "grad_norm": 45.04418380012262, "learning_rate": 7.952361177587966e-07, "logits/chosen": 8.854448318481445, "logits/rejected": 8.031278610229492, "logps/chosen": -2.5517873764038086, "logps/rejected": -2.603154182434082, "loss": 4.1387, "rewards/accuracies": 0.5, "rewards/chosen": -25.51787567138672, "rewards/margins": 0.5136675834655762, "rewards/rejected": -26.03154182434082, "step": 1060 }, { "epoch": 0.14447167755991286, "grad_norm": 48.01856991270235, "learning_rate": 7.952068152383541e-07, "logits/chosen": 5.296724319458008, "logits/rejected": 8.466856002807617, "logps/chosen": -2.237086772918701, "logps/rejected": -2.349339008331299, "loss": 4.0515, "rewards/accuracies": 0.25, "rewards/chosen": -22.370868682861328, "rewards/margins": 1.1225228309631348, "rewards/rejected": -23.493389129638672, "step": 1061 }, { "epoch": 0.14460784313725492, "grad_norm": 50.2037623374664, "learning_rate": 7.951774234176648e-07, "logits/chosen": 8.926631927490234, "logits/rejected": 9.181978225708008, "logps/chosen": -2.703066825866699, "logps/rejected": -2.8754844665527344, "loss": 4.2356, "rewards/accuracies": 0.75, "rewards/chosen": -27.030670166015625, "rewards/margins": 1.7241759300231934, "rewards/rejected": -28.754844665527344, "step": 1062 }, { "epoch": 0.14474400871459694, "grad_norm": 44.10919740522181, "learning_rate": 7.951479423033703e-07, "logits/chosen": 7.9544525146484375, "logits/rejected": 7.274389266967773, "logps/chosen": -2.7099621295928955, "logps/rejected": -3.3282530307769775, "loss": 4.4225, "rewards/accuracies": 0.5, "rewards/chosen": -27.099620819091797, "rewards/margins": 6.18290901184082, "rewards/rejected": -33.28253173828125, "step": 1063 }, { "epoch": 0.144880174291939, "grad_norm": 65.97325798282887, "learning_rate": 7.951183719021318e-07, "logits/chosen": 7.5790276527404785, "logits/rejected": 7.778615951538086, "logps/chosen": -2.239041805267334, "logps/rejected": -2.28788685798645, "loss": 4.8338, "rewards/accuracies": 0.25, "rewards/chosen": -22.390419006347656, "rewards/margins": 0.4884495735168457, "rewards/rejected": -22.878870010375977, "step": 1064 }, { "epoch": 0.14501633986928106, "grad_norm": 49.44487860414385, "learning_rate": 7.950887122206311e-07, "logits/chosen": 7.517751693725586, "logits/rejected": 6.650003910064697, "logps/chosen": -2.2033753395080566, "logps/rejected": -1.9669626951217651, "loss": 4.4342, "rewards/accuracies": 0.25, "rewards/chosen": -22.03375244140625, "rewards/margins": -2.3641250133514404, "rewards/rejected": -19.669628143310547, "step": 1065 }, { "epoch": 0.1451525054466231, "grad_norm": 52.988796636741974, "learning_rate": 7.950589632655699e-07, "logits/chosen": 6.1148834228515625, "logits/rejected": 7.54592227935791, "logps/chosen": -2.0307931900024414, "logps/rejected": -2.441746711730957, "loss": 4.2769, "rewards/accuracies": 1.0, "rewards/chosen": -20.30792999267578, "rewards/margins": 4.109536170959473, "rewards/rejected": -24.41746711730957, "step": 1066 }, { "epoch": 0.14528867102396514, "grad_norm": 49.34808127045025, "learning_rate": 7.950291250436706e-07, "logits/chosen": 6.636176109313965, "logits/rejected": 8.931753158569336, "logps/chosen": -2.0897319316864014, "logps/rejected": -3.1704888343811035, "loss": 4.1627, "rewards/accuracies": 1.0, "rewards/chosen": -20.897319793701172, "rewards/margins": 10.80756664276123, "rewards/rejected": -31.704885482788086, "step": 1067 }, { "epoch": 0.1454248366013072, "grad_norm": 49.22649118586078, "learning_rate": 7.94999197561675e-07, "logits/chosen": 8.083562850952148, "logits/rejected": 7.8664751052856445, "logps/chosen": -2.298187255859375, "logps/rejected": -2.1232364177703857, "loss": 4.7241, "rewards/accuracies": 0.25, "rewards/chosen": -22.98187255859375, "rewards/margins": -1.749507188796997, "rewards/rejected": -21.232364654541016, "step": 1068 }, { "epoch": 0.14556100217864923, "grad_norm": 49.57396964875262, "learning_rate": 7.949691808263457e-07, "logits/chosen": 5.4989447593688965, "logits/rejected": 6.136642932891846, "logps/chosen": -1.9881917238235474, "logps/rejected": -1.9571750164031982, "loss": 3.8864, "rewards/accuracies": 0.5, "rewards/chosen": -19.88191795349121, "rewards/margins": -0.3101668357849121, "rewards/rejected": -19.57175064086914, "step": 1069 }, { "epoch": 0.14569716775599129, "grad_norm": 43.9429953509117, "learning_rate": 7.94939074844465e-07, "logits/chosen": 7.3009257316589355, "logits/rejected": 8.207815170288086, "logps/chosen": -2.5520060062408447, "logps/rejected": -2.751284599304199, "loss": 4.1217, "rewards/accuracies": 0.75, "rewards/chosen": -25.520061492919922, "rewards/margins": 1.9927849769592285, "rewards/rejected": -27.512845993041992, "step": 1070 }, { "epoch": 0.14583333333333334, "grad_norm": 49.66341172533865, "learning_rate": 7.94908879622836e-07, "logits/chosen": 7.165170669555664, "logits/rejected": 7.030969142913818, "logps/chosen": -2.5283141136169434, "logps/rejected": -2.5711522102355957, "loss": 4.2115, "rewards/accuracies": 0.5, "rewards/chosen": -25.28314208984375, "rewards/margins": 0.4283790588378906, "rewards/rejected": -25.71152114868164, "step": 1071 }, { "epoch": 0.14596949891067537, "grad_norm": 44.01069996157368, "learning_rate": 7.94878595168281e-07, "logits/chosen": 6.776277542114258, "logits/rejected": 7.165320873260498, "logps/chosen": -2.179622173309326, "logps/rejected": -2.472015380859375, "loss": 3.8833, "rewards/accuracies": 0.75, "rewards/chosen": -21.796220779418945, "rewards/margins": 2.9239354133605957, "rewards/rejected": -24.720157623291016, "step": 1072 }, { "epoch": 0.14610566448801743, "grad_norm": 47.29593574104192, "learning_rate": 7.948482214876434e-07, "logits/chosen": 6.462275505065918, "logits/rejected": 8.455435752868652, "logps/chosen": -2.169266700744629, "logps/rejected": -2.7503933906555176, "loss": 4.2996, "rewards/accuracies": 1.0, "rewards/chosen": -21.692665100097656, "rewards/margins": 5.8112688064575195, "rewards/rejected": -27.503934860229492, "step": 1073 }, { "epoch": 0.14624183006535948, "grad_norm": 43.385282204503184, "learning_rate": 7.948177585877865e-07, "logits/chosen": 5.360551357269287, "logits/rejected": 6.941818714141846, "logps/chosen": -2.0499110221862793, "logps/rejected": -2.096334934234619, "loss": 3.9148, "rewards/accuracies": 0.5, "rewards/chosen": -20.499109268188477, "rewards/margins": 0.46423816680908203, "rewards/rejected": -20.963348388671875, "step": 1074 }, { "epoch": 0.1463779956427015, "grad_norm": 63.85741215348618, "learning_rate": 7.947872064755932e-07, "logits/chosen": 6.103431701660156, "logits/rejected": 7.028664588928223, "logps/chosen": -2.318885326385498, "logps/rejected": -2.5804271697998047, "loss": 4.6535, "rewards/accuracies": 0.75, "rewards/chosen": -23.188854217529297, "rewards/margins": 2.6154165267944336, "rewards/rejected": -25.804269790649414, "step": 1075 }, { "epoch": 0.14651416122004357, "grad_norm": 46.99543886219949, "learning_rate": 7.947565651579673e-07, "logits/chosen": 2.7179949283599854, "logits/rejected": 6.859549522399902, "logps/chosen": -1.9620617628097534, "logps/rejected": -2.3632497787475586, "loss": 4.5087, "rewards/accuracies": 0.75, "rewards/chosen": -19.620616912841797, "rewards/margins": 4.0118794441223145, "rewards/rejected": -23.632497787475586, "step": 1076 }, { "epoch": 0.14665032679738563, "grad_norm": 44.72689943456375, "learning_rate": 7.947258346418325e-07, "logits/chosen": 6.847975730895996, "logits/rejected": 8.044841766357422, "logps/chosen": -2.453902244567871, "logps/rejected": -2.642956256866455, "loss": 4.1691, "rewards/accuracies": 0.75, "rewards/chosen": -24.539024353027344, "rewards/margins": 1.8905396461486816, "rewards/rejected": -26.429563522338867, "step": 1077 }, { "epoch": 0.14678649237472766, "grad_norm": 46.16589186920076, "learning_rate": 7.946950149341326e-07, "logits/chosen": 7.992119789123535, "logits/rejected": 6.882905960083008, "logps/chosen": -2.754000663757324, "logps/rejected": -2.6131105422973633, "loss": 4.3165, "rewards/accuracies": 0.25, "rewards/chosen": -27.540008544921875, "rewards/margins": -1.4089035987854004, "rewards/rejected": -26.131103515625, "step": 1078 }, { "epoch": 0.1469226579520697, "grad_norm": 52.89344515144318, "learning_rate": 7.946641060418313e-07, "logits/chosen": 7.981333255767822, "logits/rejected": 9.242244720458984, "logps/chosen": -2.2177023887634277, "logps/rejected": -2.3474793434143066, "loss": 4.4859, "rewards/accuracies": 0.5, "rewards/chosen": -22.177024841308594, "rewards/margins": 1.2977678775787354, "rewards/rejected": -23.47479248046875, "step": 1079 }, { "epoch": 0.14705882352941177, "grad_norm": 53.659622231242196, "learning_rate": 7.94633107971913e-07, "logits/chosen": 7.2666826248168945, "logits/rejected": 6.468663215637207, "logps/chosen": -2.156695604324341, "logps/rejected": -2.2692136764526367, "loss": 4.3751, "rewards/accuracies": 0.5, "rewards/chosen": -21.56695556640625, "rewards/margins": 1.1251811981201172, "rewards/rejected": -22.692136764526367, "step": 1080 }, { "epoch": 0.14719498910675383, "grad_norm": 52.022904290368764, "learning_rate": 7.946020207313819e-07, "logits/chosen": 6.663586616516113, "logits/rejected": 8.251152992248535, "logps/chosen": -2.227090358734131, "logps/rejected": -2.5086073875427246, "loss": 4.2394, "rewards/accuracies": 1.0, "rewards/chosen": -22.270904541015625, "rewards/margins": 2.8151702880859375, "rewards/rejected": -25.086074829101562, "step": 1081 }, { "epoch": 0.14733115468409586, "grad_norm": 50.57773220808332, "learning_rate": 7.945708443272624e-07, "logits/chosen": 7.459784507751465, "logits/rejected": 8.710752487182617, "logps/chosen": -2.5463266372680664, "logps/rejected": -2.5229549407958984, "loss": 4.2698, "rewards/accuracies": 0.75, "rewards/chosen": -25.463266372680664, "rewards/margins": -0.2337179183959961, "rewards/rejected": -25.229549407958984, "step": 1082 }, { "epoch": 0.1474673202614379, "grad_norm": 52.354911731285405, "learning_rate": 7.94539578766599e-07, "logits/chosen": 7.949219703674316, "logits/rejected": 7.473289489746094, "logps/chosen": -2.375026226043701, "logps/rejected": -2.180941581726074, "loss": 4.462, "rewards/accuracies": 0.5, "rewards/chosen": -23.750263214111328, "rewards/margins": -1.9408464431762695, "rewards/rejected": -21.809417724609375, "step": 1083 }, { "epoch": 0.14760348583877997, "grad_norm": 54.9454158290904, "learning_rate": 7.945082240564566e-07, "logits/chosen": 5.836798191070557, "logits/rejected": 6.279282569885254, "logps/chosen": -1.8878028392791748, "logps/rejected": -2.2218551635742188, "loss": 3.6206, "rewards/accuracies": 1.0, "rewards/chosen": -18.878028869628906, "rewards/margins": 3.3405239582061768, "rewards/rejected": -22.218551635742188, "step": 1084 }, { "epoch": 0.147739651416122, "grad_norm": 53.11280363162564, "learning_rate": 7.944767802039199e-07, "logits/chosen": 7.140092849731445, "logits/rejected": 9.398331642150879, "logps/chosen": -2.4658021926879883, "logps/rejected": -2.9945878982543945, "loss": 4.2228, "rewards/accuracies": 0.75, "rewards/chosen": -24.658021926879883, "rewards/margins": 5.287858963012695, "rewards/rejected": -29.945880889892578, "step": 1085 }, { "epoch": 0.14787581699346405, "grad_norm": 62.989455055247255, "learning_rate": 7.94445247216094e-07, "logits/chosen": 5.750793933868408, "logits/rejected": 6.506831169128418, "logps/chosen": -1.8794132471084595, "logps/rejected": -2.1250860691070557, "loss": 4.0752, "rewards/accuracies": 1.0, "rewards/chosen": -18.794132232666016, "rewards/margins": 2.4567270278930664, "rewards/rejected": -21.2508602142334, "step": 1086 }, { "epoch": 0.1480119825708061, "grad_norm": 47.020988516773876, "learning_rate": 7.944136251001038e-07, "logits/chosen": 6.356416702270508, "logits/rejected": 7.606756687164307, "logps/chosen": -2.17345929145813, "logps/rejected": -2.496551990509033, "loss": 3.9951, "rewards/accuracies": 0.75, "rewards/chosen": -21.73459243774414, "rewards/margins": 3.2309279441833496, "rewards/rejected": -24.96552085876465, "step": 1087 }, { "epoch": 0.14814814814814814, "grad_norm": 52.2176491055644, "learning_rate": 7.943819138630948e-07, "logits/chosen": 7.370484828948975, "logits/rejected": 7.982266426086426, "logps/chosen": -2.439173936843872, "logps/rejected": -2.609348773956299, "loss": 4.5097, "rewards/accuracies": 0.5, "rewards/chosen": -24.391738891601562, "rewards/margins": 1.7017502784729004, "rewards/rejected": -26.093488693237305, "step": 1088 }, { "epoch": 0.1482843137254902, "grad_norm": 51.322399086481276, "learning_rate": 7.943501135122324e-07, "logits/chosen": 6.305922985076904, "logits/rejected": 7.985579490661621, "logps/chosen": -2.3960118293762207, "logps/rejected": -2.708390712738037, "loss": 3.9419, "rewards/accuracies": 1.0, "rewards/chosen": -23.96011734008789, "rewards/margins": 3.1237893104553223, "rewards/rejected": -27.083908081054688, "step": 1089 }, { "epoch": 0.14842047930283225, "grad_norm": 42.81584451525843, "learning_rate": 7.943182240547021e-07, "logits/chosen": 8.12923812866211, "logits/rejected": 7.334025859832764, "logps/chosen": -2.4688851833343506, "logps/rejected": -2.5602047443389893, "loss": 3.9281, "rewards/accuracies": 0.5, "rewards/chosen": -24.688852310180664, "rewards/margins": 0.9131970405578613, "rewards/rejected": -25.602048873901367, "step": 1090 }, { "epoch": 0.14855664488017428, "grad_norm": 53.1864489113875, "learning_rate": 7.942862454977097e-07, "logits/chosen": 7.5830254554748535, "logits/rejected": 5.893628120422363, "logps/chosen": -2.4135351181030273, "logps/rejected": -2.240065336227417, "loss": 4.2581, "rewards/accuracies": 0.25, "rewards/chosen": -24.135351181030273, "rewards/margins": -1.7346978187561035, "rewards/rejected": -22.400651931762695, "step": 1091 }, { "epoch": 0.14869281045751634, "grad_norm": 48.10441769018277, "learning_rate": 7.942541778484809e-07, "logits/chosen": 8.192827224731445, "logits/rejected": 8.32555103302002, "logps/chosen": -2.4188528060913086, "logps/rejected": -2.623684883117676, "loss": 3.9498, "rewards/accuracies": 0.5, "rewards/chosen": -24.18852996826172, "rewards/margins": 2.0483217239379883, "rewards/rejected": -26.23685073852539, "step": 1092 }, { "epoch": 0.1488289760348584, "grad_norm": 46.30544669999431, "learning_rate": 7.942220211142616e-07, "logits/chosen": 7.191508769989014, "logits/rejected": 7.424618244171143, "logps/chosen": -2.0988006591796875, "logps/rejected": -2.2501749992370605, "loss": 3.9488, "rewards/accuracies": 0.75, "rewards/chosen": -20.988006591796875, "rewards/margins": 1.5137426853179932, "rewards/rejected": -22.501750946044922, "step": 1093 }, { "epoch": 0.14896514161220042, "grad_norm": 46.35059022799355, "learning_rate": 7.94189775302318e-07, "logits/chosen": 7.1982340812683105, "logits/rejected": 7.17783260345459, "logps/chosen": -2.1847338676452637, "logps/rejected": -2.4242565631866455, "loss": 4.6753, "rewards/accuracies": 0.75, "rewards/chosen": -21.84733772277832, "rewards/margins": 2.3952271938323975, "rewards/rejected": -24.242565155029297, "step": 1094 }, { "epoch": 0.14910130718954248, "grad_norm": 64.00973027342964, "learning_rate": 7.941574404199362e-07, "logits/chosen": 8.047845840454102, "logits/rejected": 8.519121170043945, "logps/chosen": -2.7377214431762695, "logps/rejected": -2.75407075881958, "loss": 4.6196, "rewards/accuracies": 0.5, "rewards/chosen": -27.377212524414062, "rewards/margins": 0.16349315643310547, "rewards/rejected": -27.540706634521484, "step": 1095 }, { "epoch": 0.14923747276688454, "grad_norm": 44.39031053745535, "learning_rate": 7.941250164744227e-07, "logits/chosen": 6.542208194732666, "logits/rejected": 7.782904148101807, "logps/chosen": -2.3145689964294434, "logps/rejected": -2.5056004524230957, "loss": 4.3399, "rewards/accuracies": 0.75, "rewards/chosen": -23.14569091796875, "rewards/margins": 1.9103164672851562, "rewards/rejected": -25.056007385253906, "step": 1096 }, { "epoch": 0.14937363834422657, "grad_norm": 50.91196655464987, "learning_rate": 7.940925034731039e-07, "logits/chosen": 6.246726036071777, "logits/rejected": 7.371548652648926, "logps/chosen": -2.3633217811584473, "logps/rejected": -2.643650531768799, "loss": 4.4569, "rewards/accuracies": 0.5, "rewards/chosen": -23.633214950561523, "rewards/margins": 2.8032898902893066, "rewards/rejected": -26.436504364013672, "step": 1097 }, { "epoch": 0.14950980392156862, "grad_norm": 66.7589244773527, "learning_rate": 7.940599014233262e-07, "logits/chosen": 7.501989841461182, "logits/rejected": 7.347443580627441, "logps/chosen": -2.573467254638672, "logps/rejected": -2.526434898376465, "loss": 4.901, "rewards/accuracies": 0.5, "rewards/chosen": -25.73467254638672, "rewards/margins": -0.4703240394592285, "rewards/rejected": -25.264347076416016, "step": 1098 }, { "epoch": 0.14964596949891068, "grad_norm": 49.11678691541122, "learning_rate": 7.940272103324565e-07, "logits/chosen": 5.274229049682617, "logits/rejected": 6.564269065856934, "logps/chosen": -1.8250707387924194, "logps/rejected": -2.3409523963928223, "loss": 4.2356, "rewards/accuracies": 0.75, "rewards/chosen": -18.250707626342773, "rewards/margins": 5.158818244934082, "rewards/rejected": -23.409526824951172, "step": 1099 }, { "epoch": 0.14978213507625274, "grad_norm": 45.17340211492503, "learning_rate": 7.939944302078815e-07, "logits/chosen": 5.029417037963867, "logits/rejected": 7.333371162414551, "logps/chosen": -2.1967899799346924, "logps/rejected": -2.566066265106201, "loss": 4.1908, "rewards/accuracies": 1.0, "rewards/chosen": -21.967899322509766, "rewards/margins": 3.692763328552246, "rewards/rejected": -25.660663604736328, "step": 1100 }, { "epoch": 0.14991830065359477, "grad_norm": 51.09328456097132, "learning_rate": 7.939615610570083e-07, "logits/chosen": 7.795248985290527, "logits/rejected": 8.13749885559082, "logps/chosen": -2.3356683254241943, "logps/rejected": -2.4591879844665527, "loss": 4.395, "rewards/accuracies": 0.5, "rewards/chosen": -23.356685638427734, "rewards/margins": 1.2351970672607422, "rewards/rejected": -24.591880798339844, "step": 1101 }, { "epoch": 0.15005446623093682, "grad_norm": 50.86595888324888, "learning_rate": 7.939286028872639e-07, "logits/chosen": 8.02786636352539, "logits/rejected": 9.317289352416992, "logps/chosen": -2.394430160522461, "logps/rejected": -2.7985663414001465, "loss": 3.6633, "rewards/accuracies": 1.0, "rewards/chosen": -23.94430160522461, "rewards/margins": 4.041362762451172, "rewards/rejected": -27.98566436767578, "step": 1102 }, { "epoch": 0.15019063180827888, "grad_norm": 48.36951874962446, "learning_rate": 7.938955557060952e-07, "logits/chosen": 6.669107437133789, "logits/rejected": 5.768198490142822, "logps/chosen": -2.100468635559082, "logps/rejected": -2.302006244659424, "loss": 4.1433, "rewards/accuracies": 1.0, "rewards/chosen": -21.00468635559082, "rewards/margins": 2.0153756141662598, "rewards/rejected": -23.020061492919922, "step": 1103 }, { "epoch": 0.1503267973856209, "grad_norm": 49.90270158710396, "learning_rate": 7.938624195209699e-07, "logits/chosen": 6.170908451080322, "logits/rejected": 7.345170974731445, "logps/chosen": -2.045766830444336, "logps/rejected": -2.5369343757629395, "loss": 3.965, "rewards/accuracies": 1.0, "rewards/chosen": -20.45766830444336, "rewards/margins": 4.911676406860352, "rewards/rejected": -25.36934471130371, "step": 1104 }, { "epoch": 0.15046296296296297, "grad_norm": 112.79819865223507, "learning_rate": 7.938291943393751e-07, "logits/chosen": 6.920950889587402, "logits/rejected": 7.6525044441223145, "logps/chosen": -2.226163864135742, "logps/rejected": -2.603524923324585, "loss": 4.2349, "rewards/accuracies": 0.75, "rewards/chosen": -22.261638641357422, "rewards/margins": 3.7736124992370605, "rewards/rejected": -26.035249710083008, "step": 1105 }, { "epoch": 0.15059912854030502, "grad_norm": 47.689283478423064, "learning_rate": 7.937958801688185e-07, "logits/chosen": 5.754385948181152, "logits/rejected": 6.442047595977783, "logps/chosen": -1.9331166744232178, "logps/rejected": -2.396372079849243, "loss": 4.4762, "rewards/accuracies": 1.0, "rewards/chosen": -19.331167221069336, "rewards/margins": 4.6325531005859375, "rewards/rejected": -23.963720321655273, "step": 1106 }, { "epoch": 0.15073529411764705, "grad_norm": 44.934132527470545, "learning_rate": 7.937624770168277e-07, "logits/chosen": 6.897212028503418, "logits/rejected": 8.690450668334961, "logps/chosen": -2.1974709033966064, "logps/rejected": -2.800395965576172, "loss": 3.9997, "rewards/accuracies": 1.0, "rewards/chosen": -21.974708557128906, "rewards/margins": 6.029249668121338, "rewards/rejected": -28.00395965576172, "step": 1107 }, { "epoch": 0.1508714596949891, "grad_norm": 49.322838333305896, "learning_rate": 7.937289848909503e-07, "logits/chosen": 8.424612045288086, "logits/rejected": 9.701778411865234, "logps/chosen": -2.5336403846740723, "logps/rejected": -2.88767671585083, "loss": 3.9546, "rewards/accuracies": 1.0, "rewards/chosen": -25.33640480041504, "rewards/margins": 3.540363311767578, "rewards/rejected": -28.876768112182617, "step": 1108 }, { "epoch": 0.15100762527233116, "grad_norm": 47.07457819344444, "learning_rate": 7.93695403798754e-07, "logits/chosen": 7.704772472381592, "logits/rejected": 7.941378593444824, "logps/chosen": -2.5284812450408936, "logps/rejected": -2.596478223800659, "loss": 4.4841, "rewards/accuracies": 0.75, "rewards/chosen": -25.284812927246094, "rewards/margins": 0.6799702644348145, "rewards/rejected": -25.96478271484375, "step": 1109 }, { "epoch": 0.1511437908496732, "grad_norm": 49.2277240822838, "learning_rate": 7.936617337478271e-07, "logits/chosen": 7.591097831726074, "logits/rejected": 7.918282508850098, "logps/chosen": -2.483613967895508, "logps/rejected": -2.4923453330993652, "loss": 4.5781, "rewards/accuracies": 0.75, "rewards/chosen": -24.83614158630371, "rewards/margins": 0.0873112678527832, "rewards/rejected": -24.923452377319336, "step": 1110 }, { "epoch": 0.15127995642701525, "grad_norm": 45.35641379312098, "learning_rate": 7.936279747457773e-07, "logits/chosen": 6.42563533782959, "logits/rejected": 8.74050235748291, "logps/chosen": -2.607151508331299, "logps/rejected": -2.837583065032959, "loss": 4.1067, "rewards/accuracies": 1.0, "rewards/chosen": -26.071516036987305, "rewards/margins": 2.3043155670166016, "rewards/rejected": -28.375831604003906, "step": 1111 }, { "epoch": 0.1514161220043573, "grad_norm": 50.185986943253845, "learning_rate": 7.935941268002329e-07, "logits/chosen": 7.601758003234863, "logits/rejected": 6.955576419830322, "logps/chosen": -2.4935624599456787, "logps/rejected": -2.444077968597412, "loss": 4.5314, "rewards/accuracies": 0.25, "rewards/chosen": -24.935623168945312, "rewards/margins": -0.4948453903198242, "rewards/rejected": -24.440776824951172, "step": 1112 }, { "epoch": 0.15155228758169934, "grad_norm": 46.24471501159778, "learning_rate": 7.935601899188421e-07, "logits/chosen": 7.707123279571533, "logits/rejected": 7.249208450317383, "logps/chosen": -2.537623882293701, "logps/rejected": -2.6062886714935303, "loss": 4.1572, "rewards/accuracies": 0.75, "rewards/chosen": -25.376239776611328, "rewards/margins": 0.6866464614868164, "rewards/rejected": -26.062885284423828, "step": 1113 }, { "epoch": 0.1516884531590414, "grad_norm": 51.45445301329037, "learning_rate": 7.935261641092731e-07, "logits/chosen": 7.124772071838379, "logits/rejected": 8.023130416870117, "logps/chosen": -2.7843029499053955, "logps/rejected": -2.6473846435546875, "loss": 4.5359, "rewards/accuracies": 0.5, "rewards/chosen": -27.843029022216797, "rewards/margins": -1.3691821098327637, "rewards/rejected": -26.473846435546875, "step": 1114 }, { "epoch": 0.15182461873638345, "grad_norm": 51.242410038880706, "learning_rate": 7.934920493792145e-07, "logits/chosen": 6.778059005737305, "logits/rejected": 8.734140396118164, "logps/chosen": -2.330320119857788, "logps/rejected": -2.818174362182617, "loss": 4.6206, "rewards/accuracies": 0.75, "rewards/chosen": -23.30320167541504, "rewards/margins": 4.878541469573975, "rewards/rejected": -28.181743621826172, "step": 1115 }, { "epoch": 0.15196078431372548, "grad_norm": 46.392036626980186, "learning_rate": 7.934578457363746e-07, "logits/chosen": 6.030731201171875, "logits/rejected": 7.659158706665039, "logps/chosen": -2.4740757942199707, "logps/rejected": -2.85720157623291, "loss": 4.4774, "rewards/accuracies": 1.0, "rewards/chosen": -24.740758895874023, "rewards/margins": 3.8312559127807617, "rewards/rejected": -28.5720157623291, "step": 1116 }, { "epoch": 0.15209694989106753, "grad_norm": 49.603371506961395, "learning_rate": 7.934235531884821e-07, "logits/chosen": 8.230222702026367, "logits/rejected": 7.546130180358887, "logps/chosen": -2.6970059871673584, "logps/rejected": -2.327589511871338, "loss": 3.8638, "rewards/accuracies": 0.5, "rewards/chosen": -26.970060348510742, "rewards/margins": -3.6941633224487305, "rewards/rejected": -23.275896072387695, "step": 1117 }, { "epoch": 0.1522331154684096, "grad_norm": 151.85189418127254, "learning_rate": 7.933891717432858e-07, "logits/chosen": 6.6564788818359375, "logits/rejected": 7.013362884521484, "logps/chosen": -2.709073066711426, "logps/rejected": -2.368361473083496, "loss": 4.4376, "rewards/accuracies": 0.25, "rewards/chosen": -27.090730667114258, "rewards/margins": -3.407116413116455, "rewards/rejected": -23.68361473083496, "step": 1118 }, { "epoch": 0.15236928104575165, "grad_norm": 49.54338452532694, "learning_rate": 7.933547014085542e-07, "logits/chosen": 6.162973403930664, "logits/rejected": 6.243196487426758, "logps/chosen": -2.1460914611816406, "logps/rejected": -2.2738068103790283, "loss": 4.2464, "rewards/accuracies": 0.5, "rewards/chosen": -21.460914611816406, "rewards/margins": 1.2771544456481934, "rewards/rejected": -22.73807144165039, "step": 1119 }, { "epoch": 0.15250544662309368, "grad_norm": 49.915947944812025, "learning_rate": 7.933201421920765e-07, "logits/chosen": 5.742565155029297, "logits/rejected": 7.938054084777832, "logps/chosen": -2.14505934715271, "logps/rejected": -2.6224310398101807, "loss": 3.7136, "rewards/accuracies": 1.0, "rewards/chosen": -21.450593948364258, "rewards/margins": 4.773715496063232, "rewards/rejected": -26.224308013916016, "step": 1120 }, { "epoch": 0.15264161220043573, "grad_norm": 57.32973494657131, "learning_rate": 7.932854941016613e-07, "logits/chosen": 7.69840145111084, "logits/rejected": 9.112159729003906, "logps/chosen": -2.642568349838257, "logps/rejected": -2.803447723388672, "loss": 4.7546, "rewards/accuracies": 0.5, "rewards/chosen": -26.425682067871094, "rewards/margins": 1.6087932586669922, "rewards/rejected": -28.03447723388672, "step": 1121 }, { "epoch": 0.1527777777777778, "grad_norm": 46.362386037243, "learning_rate": 7.932507571451378e-07, "logits/chosen": 7.834908485412598, "logits/rejected": 8.630746841430664, "logps/chosen": -2.73953914642334, "logps/rejected": -2.7583911418914795, "loss": 4.7127, "rewards/accuracies": 0.5, "rewards/chosen": -27.3953914642334, "rewards/margins": 0.18851947784423828, "rewards/rejected": -27.583911895751953, "step": 1122 }, { "epoch": 0.15291394335511982, "grad_norm": 45.31969585844831, "learning_rate": 7.932159313303551e-07, "logits/chosen": 7.902040958404541, "logits/rejected": 8.441766738891602, "logps/chosen": -2.3378429412841797, "logps/rejected": -2.529228687286377, "loss": 4.1786, "rewards/accuracies": 0.75, "rewards/chosen": -23.378429412841797, "rewards/margins": 1.913856029510498, "rewards/rejected": -25.292285919189453, "step": 1123 }, { "epoch": 0.15305010893246188, "grad_norm": 46.99139205170852, "learning_rate": 7.931810166651824e-07, "logits/chosen": 8.447710990905762, "logits/rejected": 6.321667671203613, "logps/chosen": -2.977449893951416, "logps/rejected": -2.708930015563965, "loss": 4.0207, "rewards/accuracies": 0.25, "rewards/chosen": -29.774497985839844, "rewards/margins": -2.68519926071167, "rewards/rejected": -27.08930015563965, "step": 1124 }, { "epoch": 0.15318627450980393, "grad_norm": 46.969959929297254, "learning_rate": 7.931460131575089e-07, "logits/chosen": 6.86187219619751, "logits/rejected": 8.685192108154297, "logps/chosen": -2.5303664207458496, "logps/rejected": -2.632661819458008, "loss": 3.9065, "rewards/accuracies": 0.75, "rewards/chosen": -25.303665161132812, "rewards/margins": 1.0229530334472656, "rewards/rejected": -26.326618194580078, "step": 1125 }, { "epoch": 0.15332244008714596, "grad_norm": 56.45543900469734, "learning_rate": 7.931109208152439e-07, "logits/chosen": 8.241204261779785, "logits/rejected": 8.769966125488281, "logps/chosen": -2.6888444423675537, "logps/rejected": -2.796113967895508, "loss": 4.3655, "rewards/accuracies": 0.75, "rewards/chosen": -26.888442993164062, "rewards/margins": 1.0726947784423828, "rewards/rejected": -27.961139678955078, "step": 1126 }, { "epoch": 0.15345860566448802, "grad_norm": 49.403036291259575, "learning_rate": 7.930757396463169e-07, "logits/chosen": 7.73856258392334, "logits/rejected": 7.6672444343566895, "logps/chosen": -2.765505075454712, "logps/rejected": -2.7455334663391113, "loss": 4.4813, "rewards/accuracies": 0.5, "rewards/chosen": -27.655052185058594, "rewards/margins": -0.19971799850463867, "rewards/rejected": -27.455333709716797, "step": 1127 }, { "epoch": 0.15359477124183007, "grad_norm": 50.74015346394014, "learning_rate": 7.930404696586773e-07, "logits/chosen": 6.836971282958984, "logits/rejected": 6.899444580078125, "logps/chosen": -2.5004751682281494, "logps/rejected": -2.7284443378448486, "loss": 4.3618, "rewards/accuracies": 0.75, "rewards/chosen": -25.00475311279297, "rewards/margins": 2.279691219329834, "rewards/rejected": -27.284442901611328, "step": 1128 }, { "epoch": 0.1537309368191721, "grad_norm": 53.63044915186758, "learning_rate": 7.930051108602947e-07, "logits/chosen": 7.844524383544922, "logits/rejected": 7.467277526855469, "logps/chosen": -2.5502562522888184, "logps/rejected": -2.5992136001586914, "loss": 3.9788, "rewards/accuracies": 0.5, "rewards/chosen": -25.5025634765625, "rewards/margins": 0.4895758628845215, "rewards/rejected": -25.992137908935547, "step": 1129 }, { "epoch": 0.15386710239651416, "grad_norm": 47.3364762701843, "learning_rate": 7.929696632591588e-07, "logits/chosen": 8.901281356811523, "logits/rejected": 8.651849746704102, "logps/chosen": -2.6522576808929443, "logps/rejected": -2.745893955230713, "loss": 4.6854, "rewards/accuracies": 0.75, "rewards/chosen": -26.52257537841797, "rewards/margins": 0.9363641738891602, "rewards/rejected": -27.458942413330078, "step": 1130 }, { "epoch": 0.15400326797385622, "grad_norm": 47.37456909991962, "learning_rate": 7.929341268632789e-07, "logits/chosen": 8.784614562988281, "logits/rejected": 8.703961372375488, "logps/chosen": -2.4677441120147705, "logps/rejected": -2.686199903488159, "loss": 4.1312, "rewards/accuracies": 0.75, "rewards/chosen": -24.67744255065918, "rewards/margins": 2.1845569610595703, "rewards/rejected": -26.86199951171875, "step": 1131 }, { "epoch": 0.15413943355119825, "grad_norm": 54.066181969384445, "learning_rate": 7.928985016806851e-07, "logits/chosen": 8.977182388305664, "logits/rejected": 9.019474029541016, "logps/chosen": -3.155921459197998, "logps/rejected": -3.062601089477539, "loss": 4.2599, "rewards/accuracies": 0.25, "rewards/chosen": -31.559213638305664, "rewards/margins": -0.9332046508789062, "rewards/rejected": -30.62601089477539, "step": 1132 }, { "epoch": 0.1542755991285403, "grad_norm": 86.78083326579426, "learning_rate": 7.928627877194273e-07, "logits/chosen": 5.983113765716553, "logits/rejected": 7.942171096801758, "logps/chosen": -2.2089171409606934, "logps/rejected": -2.3919990062713623, "loss": 3.9949, "rewards/accuracies": 0.75, "rewards/chosen": -22.089170455932617, "rewards/margins": 1.8308186531066895, "rewards/rejected": -23.91999053955078, "step": 1133 }, { "epoch": 0.15441176470588236, "grad_norm": 47.2587679646544, "learning_rate": 7.92826984987575e-07, "logits/chosen": 7.849818229675293, "logits/rejected": 7.750422477722168, "logps/chosen": -2.7677550315856934, "logps/rejected": -2.8587520122528076, "loss": 4.4254, "rewards/accuracies": 0.75, "rewards/chosen": -27.677549362182617, "rewards/margins": 0.9099702835083008, "rewards/rejected": -28.587520599365234, "step": 1134 }, { "epoch": 0.1545479302832244, "grad_norm": 56.406906040793395, "learning_rate": 7.927910934932183e-07, "logits/chosen": 8.777715682983398, "logits/rejected": 8.962930679321289, "logps/chosen": -2.3650498390197754, "logps/rejected": -3.2889227867126465, "loss": 3.8443, "rewards/accuracies": 1.0, "rewards/chosen": -23.650497436523438, "rewards/margins": 9.238730430603027, "rewards/rejected": -32.88922882080078, "step": 1135 }, { "epoch": 0.15468409586056645, "grad_norm": 51.054472800171006, "learning_rate": 7.927551132444673e-07, "logits/chosen": 8.215749740600586, "logits/rejected": 8.148689270019531, "logps/chosen": -2.899407148361206, "logps/rejected": -2.931394338607788, "loss": 4.0624, "rewards/accuracies": 0.5, "rewards/chosen": -28.99407196044922, "rewards/margins": 0.3198733329772949, "rewards/rejected": -29.313945770263672, "step": 1136 }, { "epoch": 0.1548202614379085, "grad_norm": 68.34305562845441, "learning_rate": 7.927190442494518e-07, "logits/chosen": 7.497902870178223, "logits/rejected": 8.772741317749023, "logps/chosen": -2.5817089080810547, "logps/rejected": -3.24771785736084, "loss": 4.1149, "rewards/accuracies": 1.0, "rewards/chosen": -25.81709098815918, "rewards/margins": 6.660086631774902, "rewards/rejected": -32.477176666259766, "step": 1137 }, { "epoch": 0.15495642701525056, "grad_norm": 59.04209899681186, "learning_rate": 7.926828865163221e-07, "logits/chosen": 9.030056953430176, "logits/rejected": 9.271724700927734, "logps/chosen": -3.422837018966675, "logps/rejected": -3.244077682495117, "loss": 4.1488, "rewards/accuracies": 0.5, "rewards/chosen": -34.228370666503906, "rewards/margins": -1.787592887878418, "rewards/rejected": -32.44077682495117, "step": 1138 }, { "epoch": 0.1550925925925926, "grad_norm": 47.57728498954993, "learning_rate": 7.926466400532481e-07, "logits/chosen": 7.150096893310547, "logits/rejected": 7.626458168029785, "logps/chosen": -2.5810799598693848, "logps/rejected": -2.974435567855835, "loss": 4.1206, "rewards/accuracies": 0.75, "rewards/chosen": -25.81079864501953, "rewards/margins": 3.933558940887451, "rewards/rejected": -29.744356155395508, "step": 1139 }, { "epoch": 0.15522875816993464, "grad_norm": 55.758746427706164, "learning_rate": 7.926103048684203e-07, "logits/chosen": 9.020283699035645, "logits/rejected": 8.913803100585938, "logps/chosen": -2.7098047733306885, "logps/rejected": -2.978079080581665, "loss": 4.3018, "rewards/accuracies": 0.75, "rewards/chosen": -27.098047256469727, "rewards/margins": 2.6827425956726074, "rewards/rejected": -29.78078842163086, "step": 1140 }, { "epoch": 0.1553649237472767, "grad_norm": 58.934998450541656, "learning_rate": 7.925738809700487e-07, "logits/chosen": 9.630571365356445, "logits/rejected": 9.850543975830078, "logps/chosen": -2.8658831119537354, "logps/rejected": -3.0272743701934814, "loss": 4.5418, "rewards/accuracies": 0.75, "rewards/chosen": -28.658832550048828, "rewards/margins": 1.6139121055603027, "rewards/rejected": -30.272743225097656, "step": 1141 }, { "epoch": 0.15550108932461873, "grad_norm": 50.964282096668484, "learning_rate": 7.925373683663636e-07, "logits/chosen": 8.024306297302246, "logits/rejected": 8.45549488067627, "logps/chosen": -2.9211411476135254, "logps/rejected": -2.9055113792419434, "loss": 4.3757, "rewards/accuracies": 0.25, "rewards/chosen": -29.211410522460938, "rewards/margins": -0.15629863739013672, "rewards/rejected": -29.055112838745117, "step": 1142 }, { "epoch": 0.1556372549019608, "grad_norm": 61.643445504659546, "learning_rate": 7.925007670656154e-07, "logits/chosen": 8.99154281616211, "logits/rejected": 8.213747024536133, "logps/chosen": -2.614159107208252, "logps/rejected": -2.236635684967041, "loss": 4.6637, "rewards/accuracies": 0.25, "rewards/chosen": -26.141590118408203, "rewards/margins": -3.775233745574951, "rewards/rejected": -22.366355895996094, "step": 1143 }, { "epoch": 0.15577342047930284, "grad_norm": 49.75153118050264, "learning_rate": 7.924640770760744e-07, "logits/chosen": 9.58445930480957, "logits/rejected": 9.427666664123535, "logps/chosen": -3.076221466064453, "logps/rejected": -2.901731014251709, "loss": 4.3745, "rewards/accuracies": 0.5, "rewards/chosen": -30.76221466064453, "rewards/margins": -1.7449064254760742, "rewards/rejected": -29.01730728149414, "step": 1144 }, { "epoch": 0.15590958605664487, "grad_norm": 69.16819945620071, "learning_rate": 7.924272984060311e-07, "logits/chosen": 7.850988864898682, "logits/rejected": 7.994434833526611, "logps/chosen": -2.350342273712158, "logps/rejected": -2.6547465324401855, "loss": 4.479, "rewards/accuracies": 0.75, "rewards/chosen": -23.503421783447266, "rewards/margins": 3.044044017791748, "rewards/rejected": -26.547466278076172, "step": 1145 }, { "epoch": 0.15604575163398693, "grad_norm": 47.031101483768936, "learning_rate": 7.923904310637959e-07, "logits/chosen": 8.806947708129883, "logits/rejected": 8.718597412109375, "logps/chosen": -2.759979724884033, "logps/rejected": -2.2833003997802734, "loss": 4.3061, "rewards/accuracies": 0.0, "rewards/chosen": -27.599796295166016, "rewards/margins": -4.766791820526123, "rewards/rejected": -22.833003997802734, "step": 1146 }, { "epoch": 0.15618191721132899, "grad_norm": 48.324880654922914, "learning_rate": 7.923534750576993e-07, "logits/chosen": 8.138442993164062, "logits/rejected": 9.023494720458984, "logps/chosen": -2.663228988647461, "logps/rejected": -2.7861099243164062, "loss": 4.3852, "rewards/accuracies": 0.75, "rewards/chosen": -26.63228988647461, "rewards/margins": 1.2288074493408203, "rewards/rejected": -27.861099243164062, "step": 1147 }, { "epoch": 0.15631808278867101, "grad_norm": 57.28120689697348, "learning_rate": 7.923164303960917e-07, "logits/chosen": 6.756883144378662, "logits/rejected": 6.45073127746582, "logps/chosen": -2.3643574714660645, "logps/rejected": -2.410245895385742, "loss": 4.5004, "rewards/accuracies": 0.5, "rewards/chosen": -23.64357566833496, "rewards/margins": 0.45888185501098633, "rewards/rejected": -24.102458953857422, "step": 1148 }, { "epoch": 0.15645424836601307, "grad_norm": 48.81630190264293, "learning_rate": 7.922792970873438e-07, "logits/chosen": 8.601554870605469, "logits/rejected": 9.803306579589844, "logps/chosen": -2.717217445373535, "logps/rejected": -2.9079816341400146, "loss": 3.8294, "rewards/accuracies": 0.75, "rewards/chosen": -27.172176361083984, "rewards/margins": 1.9076428413391113, "rewards/rejected": -29.079818725585938, "step": 1149 }, { "epoch": 0.15659041394335513, "grad_norm": 44.32909859831822, "learning_rate": 7.922420751398461e-07, "logits/chosen": 7.246589660644531, "logits/rejected": 7.8504638671875, "logps/chosen": -2.499854564666748, "logps/rejected": -2.6981639862060547, "loss": 4.5723, "rewards/accuracies": 0.75, "rewards/chosen": -24.998546600341797, "rewards/margins": 1.983095645904541, "rewards/rejected": -26.98164176940918, "step": 1150 }, { "epoch": 0.15672657952069716, "grad_norm": 51.98989417406794, "learning_rate": 7.92204764562009e-07, "logits/chosen": 7.35845947265625, "logits/rejected": 8.105576515197754, "logps/chosen": -2.5289154052734375, "logps/rejected": -2.4815585613250732, "loss": 4.3632, "rewards/accuracies": 0.5, "rewards/chosen": -25.289154052734375, "rewards/margins": -0.473569393157959, "rewards/rejected": -24.81558609008789, "step": 1151 }, { "epoch": 0.1568627450980392, "grad_norm": 45.000878727782236, "learning_rate": 7.921673653622636e-07, "logits/chosen": 6.709641456604004, "logits/rejected": 6.770277976989746, "logps/chosen": -2.5377397537231445, "logps/rejected": -2.6440114974975586, "loss": 4.4532, "rewards/accuracies": 0.5, "rewards/chosen": -25.377395629882812, "rewards/margins": 1.0627202987670898, "rewards/rejected": -26.44011688232422, "step": 1152 }, { "epoch": 0.15699891067538127, "grad_norm": 42.89860838454658, "learning_rate": 7.921298775490603e-07, "logits/chosen": 7.3571953773498535, "logits/rejected": 6.208561420440674, "logps/chosen": -2.181574821472168, "logps/rejected": -2.2339630126953125, "loss": 4.2665, "rewards/accuracies": 0.5, "rewards/chosen": -21.815746307373047, "rewards/margins": 0.5238857269287109, "rewards/rejected": -22.339632034301758, "step": 1153 }, { "epoch": 0.1571350762527233, "grad_norm": 41.910420091319956, "learning_rate": 7.920923011308696e-07, "logits/chosen": 8.677886962890625, "logits/rejected": 8.830910682678223, "logps/chosen": -2.702261209487915, "logps/rejected": -2.7495505809783936, "loss": 4.4031, "rewards/accuracies": 0.5, "rewards/chosen": -27.022611618041992, "rewards/margins": 0.47289514541625977, "rewards/rejected": -27.495506286621094, "step": 1154 }, { "epoch": 0.15727124183006536, "grad_norm": 53.6697562783065, "learning_rate": 7.920546361161825e-07, "logits/chosen": 6.923252105712891, "logits/rejected": 8.0449800491333, "logps/chosen": -2.3969736099243164, "logps/rejected": -2.6189417839050293, "loss": 4.1884, "rewards/accuracies": 0.75, "rewards/chosen": -23.96973419189453, "rewards/margins": 2.2196831703186035, "rewards/rejected": -26.18941879272461, "step": 1155 }, { "epoch": 0.1574074074074074, "grad_norm": 46.52516995193828, "learning_rate": 7.920168825135097e-07, "logits/chosen": 7.208144187927246, "logits/rejected": 8.907424926757812, "logps/chosen": -2.173461437225342, "logps/rejected": -2.7350268363952637, "loss": 4.0512, "rewards/accuracies": 1.0, "rewards/chosen": -21.734615325927734, "rewards/margins": 5.615652561187744, "rewards/rejected": -27.350269317626953, "step": 1156 }, { "epoch": 0.15754357298474944, "grad_norm": 42.96021207984754, "learning_rate": 7.919790403313818e-07, "logits/chosen": 7.093186378479004, "logits/rejected": 8.397819519042969, "logps/chosen": -2.3753325939178467, "logps/rejected": -2.803098678588867, "loss": 4.6365, "rewards/accuracies": 0.75, "rewards/chosen": -23.753326416015625, "rewards/margins": 4.2776618003845215, "rewards/rejected": -28.030988693237305, "step": 1157 }, { "epoch": 0.1576797385620915, "grad_norm": 45.579057963422144, "learning_rate": 7.919411095783496e-07, "logits/chosen": 6.433524131774902, "logits/rejected": 4.88902473449707, "logps/chosen": -2.356855869293213, "logps/rejected": -2.1626136302948, "loss": 4.2855, "rewards/accuracies": 0.25, "rewards/chosen": -23.568559646606445, "rewards/margins": -1.942422866821289, "rewards/rejected": -21.626136779785156, "step": 1158 }, { "epoch": 0.15781590413943355, "grad_norm": 41.602316780962795, "learning_rate": 7.919030902629838e-07, "logits/chosen": 7.8141703605651855, "logits/rejected": 7.589683532714844, "logps/chosen": -2.381171703338623, "logps/rejected": -2.464498281478882, "loss": 3.8406, "rewards/accuracies": 0.5, "rewards/chosen": -23.811716079711914, "rewards/margins": 0.8332662582397461, "rewards/rejected": -24.644981384277344, "step": 1159 }, { "epoch": 0.1579520697167756, "grad_norm": 45.29050301304745, "learning_rate": 7.918649823938753e-07, "logits/chosen": 6.128301620483398, "logits/rejected": 7.212432861328125, "logps/chosen": -2.262805938720703, "logps/rejected": -2.518038749694824, "loss": 4.3103, "rewards/accuracies": 0.75, "rewards/chosen": -22.62805938720703, "rewards/margins": 2.552328586578369, "rewards/rejected": -25.180387496948242, "step": 1160 }, { "epoch": 0.15808823529411764, "grad_norm": 46.704486187484896, "learning_rate": 7.91826785979635e-07, "logits/chosen": 6.594202995300293, "logits/rejected": 8.302248001098633, "logps/chosen": -2.257922887802124, "logps/rejected": -2.498197078704834, "loss": 4.2209, "rewards/accuracies": 0.5, "rewards/chosen": -22.57923126220703, "rewards/margins": 2.4027395248413086, "rewards/rejected": -24.981969833374023, "step": 1161 }, { "epoch": 0.1582244008714597, "grad_norm": 52.59163419949607, "learning_rate": 7.917885010288933e-07, "logits/chosen": 7.484555244445801, "logits/rejected": 8.504622459411621, "logps/chosen": -2.356009006500244, "logps/rejected": -2.698099136352539, "loss": 4.23, "rewards/accuracies": 1.0, "rewards/chosen": -23.560091018676758, "rewards/margins": 3.420900344848633, "rewards/rejected": -26.98099136352539, "step": 1162 }, { "epoch": 0.15836056644880175, "grad_norm": 53.975488010465355, "learning_rate": 7.917501275503013e-07, "logits/chosen": 8.263418197631836, "logits/rejected": 7.491490364074707, "logps/chosen": -2.434109926223755, "logps/rejected": -2.321136236190796, "loss": 4.5125, "rewards/accuracies": 0.25, "rewards/chosen": -24.34109878540039, "rewards/margins": -1.1297359466552734, "rewards/rejected": -23.211362838745117, "step": 1163 }, { "epoch": 0.15849673202614378, "grad_norm": 120.60595522712494, "learning_rate": 7.917116655525298e-07, "logits/chosen": 7.946958541870117, "logits/rejected": 7.257849216461182, "logps/chosen": -2.445608615875244, "logps/rejected": -2.308995246887207, "loss": 4.2458, "rewards/accuracies": 0.25, "rewards/chosen": -24.456085205078125, "rewards/margins": -1.3661317825317383, "rewards/rejected": -23.08995246887207, "step": 1164 }, { "epoch": 0.15863289760348584, "grad_norm": 48.479937339378594, "learning_rate": 7.916731150442695e-07, "logits/chosen": 6.287302017211914, "logits/rejected": 7.566799640655518, "logps/chosen": -1.948386788368225, "logps/rejected": -2.5497307777404785, "loss": 3.5612, "rewards/accuracies": 1.0, "rewards/chosen": -19.483867645263672, "rewards/margins": 6.013441562652588, "rewards/rejected": -25.497310638427734, "step": 1165 }, { "epoch": 0.1587690631808279, "grad_norm": 73.91803110764468, "learning_rate": 7.916344760342312e-07, "logits/chosen": 6.695956230163574, "logits/rejected": 6.149774551391602, "logps/chosen": -1.9673153162002563, "logps/rejected": -1.9376832246780396, "loss": 4.0544, "rewards/accuracies": 0.5, "rewards/chosen": -19.673152923583984, "rewards/margins": -0.29632115364074707, "rewards/rejected": -19.3768310546875, "step": 1166 }, { "epoch": 0.15890522875816993, "grad_norm": 45.25870613727244, "learning_rate": 7.915957485311459e-07, "logits/chosen": 8.94240951538086, "logits/rejected": 6.721093654632568, "logps/chosen": -2.410639762878418, "logps/rejected": -2.2027604579925537, "loss": 4.2034, "rewards/accuracies": 0.0, "rewards/chosen": -24.10639762878418, "rewards/margins": -2.0787925720214844, "rewards/rejected": -22.027605056762695, "step": 1167 }, { "epoch": 0.15904139433551198, "grad_norm": 37.803461474109625, "learning_rate": 7.915569325437641e-07, "logits/chosen": 6.479024410247803, "logits/rejected": 8.297042846679688, "logps/chosen": -2.4594805240631104, "logps/rejected": -3.0445265769958496, "loss": 3.8663, "rewards/accuracies": 0.75, "rewards/chosen": -24.594806671142578, "rewards/margins": 5.850459098815918, "rewards/rejected": -30.44526481628418, "step": 1168 }, { "epoch": 0.15917755991285404, "grad_norm": 44.09733028884694, "learning_rate": 7.915180280808568e-07, "logits/chosen": 7.621018409729004, "logits/rejected": 7.980587959289551, "logps/chosen": -2.0270071029663086, "logps/rejected": -2.49847674369812, "loss": 4.0494, "rewards/accuracies": 0.75, "rewards/chosen": -20.27007293701172, "rewards/margins": 4.714694023132324, "rewards/rejected": -24.98476791381836, "step": 1169 }, { "epoch": 0.15931372549019607, "grad_norm": 86.59342694179934, "learning_rate": 7.914790351512149e-07, "logits/chosen": 6.637066841125488, "logits/rejected": 6.507599353790283, "logps/chosen": -2.3773319721221924, "logps/rejected": -2.253164768218994, "loss": 4.3538, "rewards/accuracies": 0.5, "rewards/chosen": -23.773319244384766, "rewards/margins": -1.2416696548461914, "rewards/rejected": -22.531648635864258, "step": 1170 }, { "epoch": 0.15944989106753812, "grad_norm": 40.7453829604734, "learning_rate": 7.914399537636488e-07, "logits/chosen": 7.014645576477051, "logits/rejected": 8.437039375305176, "logps/chosen": -2.335275411605835, "logps/rejected": -2.430802583694458, "loss": 3.8622, "rewards/accuracies": 0.25, "rewards/chosen": -23.352754592895508, "rewards/margins": 0.9552712440490723, "rewards/rejected": -24.308025360107422, "step": 1171 }, { "epoch": 0.15958605664488018, "grad_norm": 45.022700546935035, "learning_rate": 7.914007839269896e-07, "logits/chosen": 6.360730171203613, "logits/rejected": 7.078437805175781, "logps/chosen": -1.984074592590332, "logps/rejected": -2.27953839302063, "loss": 4.2962, "rewards/accuracies": 1.0, "rewards/chosen": -19.84074592590332, "rewards/margins": 2.954638957977295, "rewards/rejected": -22.79538345336914, "step": 1172 }, { "epoch": 0.1597222222222222, "grad_norm": 41.909326635525765, "learning_rate": 7.913615256500878e-07, "logits/chosen": 7.0528788566589355, "logits/rejected": 6.934502601623535, "logps/chosen": -2.1906280517578125, "logps/rejected": -2.29599666595459, "loss": 4.0461, "rewards/accuracies": 0.75, "rewards/chosen": -21.90628433227539, "rewards/margins": 1.0536837577819824, "rewards/rejected": -22.9599666595459, "step": 1173 }, { "epoch": 0.15985838779956427, "grad_norm": 46.329822336090686, "learning_rate": 7.913221789418143e-07, "logits/chosen": 6.043361186981201, "logits/rejected": 5.8570098876953125, "logps/chosen": -2.4215264320373535, "logps/rejected": -2.362422466278076, "loss": 4.4577, "rewards/accuracies": 0.5, "rewards/chosen": -24.21526336669922, "rewards/margins": -0.5910391807556152, "rewards/rejected": -23.624225616455078, "step": 1174 }, { "epoch": 0.15999455337690632, "grad_norm": 112.43945349378095, "learning_rate": 7.912827438110598e-07, "logits/chosen": 6.285729885101318, "logits/rejected": 7.648681640625, "logps/chosen": -1.9528017044067383, "logps/rejected": -2.3003039360046387, "loss": 3.9756, "rewards/accuracies": 0.5, "rewards/chosen": -19.528017044067383, "rewards/margins": 3.475022792816162, "rewards/rejected": -23.003040313720703, "step": 1175 }, { "epoch": 0.16013071895424835, "grad_norm": 73.40927627416218, "learning_rate": 7.91243220266735e-07, "logits/chosen": 7.827475547790527, "logits/rejected": 7.428185939788818, "logps/chosen": -2.3474299907684326, "logps/rejected": -2.472999095916748, "loss": 4.6108, "rewards/accuracies": 0.5, "rewards/chosen": -23.474300384521484, "rewards/margins": 1.2556920051574707, "rewards/rejected": -24.729991912841797, "step": 1176 }, { "epoch": 0.1602668845315904, "grad_norm": 44.881310500042396, "learning_rate": 7.912036083177704e-07, "logits/chosen": 5.332381725311279, "logits/rejected": 7.056471824645996, "logps/chosen": -2.7831692695617676, "logps/rejected": -2.534198760986328, "loss": 4.5152, "rewards/accuracies": 0.5, "rewards/chosen": -27.83169174194336, "rewards/margins": -2.489704132080078, "rewards/rejected": -25.34198760986328, "step": 1177 }, { "epoch": 0.16040305010893247, "grad_norm": 47.12114089810392, "learning_rate": 7.911639079731169e-07, "logits/chosen": 5.191059112548828, "logits/rejected": 7.330502510070801, "logps/chosen": -2.1851978302001953, "logps/rejected": -2.576646566390991, "loss": 4.095, "rewards/accuracies": 0.75, "rewards/chosen": -21.85197639465332, "rewards/margins": 3.914490222930908, "rewards/rejected": -25.76646614074707, "step": 1178 }, { "epoch": 0.16053921568627452, "grad_norm": 47.82847122731908, "learning_rate": 7.911241192417449e-07, "logits/chosen": 7.1996307373046875, "logits/rejected": 6.333563804626465, "logps/chosen": -2.295835494995117, "logps/rejected": -2.1645596027374268, "loss": 4.0986, "rewards/accuracies": 0.25, "rewards/chosen": -22.958356857299805, "rewards/margins": -1.312760829925537, "rewards/rejected": -21.64559555053711, "step": 1179 }, { "epoch": 0.16067538126361655, "grad_norm": 47.74071286374484, "learning_rate": 7.910842421326451e-07, "logits/chosen": 6.49576997756958, "logits/rejected": 7.53598690032959, "logps/chosen": -2.258139133453369, "logps/rejected": -2.4827632904052734, "loss": 4.5998, "rewards/accuracies": 0.5, "rewards/chosen": -22.581390380859375, "rewards/margins": 2.2462430000305176, "rewards/rejected": -24.827632904052734, "step": 1180 }, { "epoch": 0.1608115468409586, "grad_norm": 44.05895840055343, "learning_rate": 7.910442766548282e-07, "logits/chosen": 8.00619888305664, "logits/rejected": 7.319178104400635, "logps/chosen": -2.363656997680664, "logps/rejected": -2.2660961151123047, "loss": 4.0233, "rewards/accuracies": 0.75, "rewards/chosen": -23.63656997680664, "rewards/margins": -0.9756088256835938, "rewards/rejected": -22.660961151123047, "step": 1181 }, { "epoch": 0.16094771241830066, "grad_norm": 51.516086250994434, "learning_rate": 7.910042228173244e-07, "logits/chosen": 6.717377185821533, "logits/rejected": 6.513887405395508, "logps/chosen": -2.6532795429229736, "logps/rejected": -2.381054639816284, "loss": 4.24, "rewards/accuracies": 0.0, "rewards/chosen": -26.532794952392578, "rewards/margins": -2.7222485542297363, "rewards/rejected": -23.810546875, "step": 1182 }, { "epoch": 0.1610838779956427, "grad_norm": 47.96265291149307, "learning_rate": 7.909640806291845e-07, "logits/chosen": 6.607083320617676, "logits/rejected": 6.585633277893066, "logps/chosen": -2.414853572845459, "logps/rejected": -2.5216073989868164, "loss": 4.3339, "rewards/accuracies": 0.5, "rewards/chosen": -24.148536682128906, "rewards/margins": 1.0675373077392578, "rewards/rejected": -25.216073989868164, "step": 1183 }, { "epoch": 0.16122004357298475, "grad_norm": 49.007532621296455, "learning_rate": 7.909238500994789e-07, "logits/chosen": 8.669445991516113, "logits/rejected": 8.155569076538086, "logps/chosen": -2.768871307373047, "logps/rejected": -2.5854012966156006, "loss": 4.0204, "rewards/accuracies": 0.5, "rewards/chosen": -27.6887149810791, "rewards/margins": -1.8347015380859375, "rewards/rejected": -25.854013442993164, "step": 1184 }, { "epoch": 0.1613562091503268, "grad_norm": 41.83991076644753, "learning_rate": 7.908835312372978e-07, "logits/chosen": 5.679642677307129, "logits/rejected": 8.094038009643555, "logps/chosen": -2.1750242710113525, "logps/rejected": -2.642012119293213, "loss": 4.0093, "rewards/accuracies": 1.0, "rewards/chosen": -21.750242233276367, "rewards/margins": 4.66987943649292, "rewards/rejected": -26.420120239257812, "step": 1185 }, { "epoch": 0.16149237472766884, "grad_norm": 45.63234660026179, "learning_rate": 7.908431240517518e-07, "logits/chosen": 8.941490173339844, "logits/rejected": 8.6273832321167, "logps/chosen": -2.554060935974121, "logps/rejected": -2.451082706451416, "loss": 3.7838, "rewards/accuracies": 0.25, "rewards/chosen": -25.540611267089844, "rewards/margins": -1.0297846794128418, "rewards/rejected": -24.510826110839844, "step": 1186 }, { "epoch": 0.1616285403050109, "grad_norm": 48.15756651429638, "learning_rate": 7.908026285519712e-07, "logits/chosen": 6.583603858947754, "logits/rejected": 8.57951545715332, "logps/chosen": -2.2965054512023926, "logps/rejected": -2.4612135887145996, "loss": 4.3091, "rewards/accuracies": 0.5, "rewards/chosen": -22.96505355834961, "rewards/margins": 1.6470818519592285, "rewards/rejected": -24.61213493347168, "step": 1187 }, { "epoch": 0.16176470588235295, "grad_norm": 47.49097854801176, "learning_rate": 7.907620447471062e-07, "logits/chosen": 8.438037872314453, "logits/rejected": 8.336353302001953, "logps/chosen": -2.589360237121582, "logps/rejected": -2.7006521224975586, "loss": 4.0086, "rewards/accuracies": 0.75, "rewards/chosen": -25.893600463867188, "rewards/margins": 1.1129217147827148, "rewards/rejected": -27.00652313232422, "step": 1188 }, { "epoch": 0.16190087145969498, "grad_norm": 43.15643147442864, "learning_rate": 7.907213726463271e-07, "logits/chosen": 8.2874755859375, "logits/rejected": 7.103033065795898, "logps/chosen": -2.4536635875701904, "logps/rejected": -2.499533176422119, "loss": 4.201, "rewards/accuracies": 0.5, "rewards/chosen": -24.536636352539062, "rewards/margins": 0.4586958885192871, "rewards/rejected": -24.995330810546875, "step": 1189 }, { "epoch": 0.16203703703703703, "grad_norm": 49.07868778831616, "learning_rate": 7.906806122588242e-07, "logits/chosen": 7.477006912231445, "logits/rejected": 9.121700286865234, "logps/chosen": -3.001270294189453, "logps/rejected": -3.225698471069336, "loss": 4.516, "rewards/accuracies": 0.5, "rewards/chosen": -30.012704849243164, "rewards/margins": 2.2442784309387207, "rewards/rejected": -32.25698471069336, "step": 1190 }, { "epoch": 0.1621732026143791, "grad_norm": 44.903447553217724, "learning_rate": 7.906397635938076e-07, "logits/chosen": 9.041082382202148, "logits/rejected": 8.58409309387207, "logps/chosen": -2.6652073860168457, "logps/rejected": -2.6395771503448486, "loss": 4.1845, "rewards/accuracies": 0.25, "rewards/chosen": -26.65207290649414, "rewards/margins": -0.2563014030456543, "rewards/rejected": -26.395771026611328, "step": 1191 }, { "epoch": 0.16230936819172112, "grad_norm": 50.49946387061741, "learning_rate": 7.905988266605073e-07, "logits/chosen": 8.729927062988281, "logits/rejected": 9.01923942565918, "logps/chosen": -2.797328472137451, "logps/rejected": -3.0078065395355225, "loss": 4.4266, "rewards/accuracies": 0.5, "rewards/chosen": -27.973285675048828, "rewards/margins": 2.1047801971435547, "rewards/rejected": -30.078065872192383, "step": 1192 }, { "epoch": 0.16244553376906318, "grad_norm": 47.95281237059657, "learning_rate": 7.905578014681733e-07, "logits/chosen": 8.266969680786133, "logits/rejected": 9.694924354553223, "logps/chosen": -2.5821752548217773, "logps/rejected": -2.792140245437622, "loss": 4.0594, "rewards/accuracies": 0.75, "rewards/chosen": -25.821752548217773, "rewards/margins": 2.099648952484131, "rewards/rejected": -27.921401977539062, "step": 1193 }, { "epoch": 0.16258169934640523, "grad_norm": 67.59343231449338, "learning_rate": 7.90516688026076e-07, "logits/chosen": 7.296021938323975, "logits/rejected": 7.417969703674316, "logps/chosen": -2.7244749069213867, "logps/rejected": -2.6164660453796387, "loss": 3.7746, "rewards/accuracies": 0.25, "rewards/chosen": -27.2447509765625, "rewards/margins": -1.080091953277588, "rewards/rejected": -26.16465950012207, "step": 1194 }, { "epoch": 0.16271786492374726, "grad_norm": 49.76640421656704, "learning_rate": 7.904754863435046e-07, "logits/chosen": 7.338853359222412, "logits/rejected": 6.818391799926758, "logps/chosen": -2.5227701663970947, "logps/rejected": -2.4308886528015137, "loss": 3.8773, "rewards/accuracies": 0.5, "rewards/chosen": -25.227703094482422, "rewards/margins": -0.9188144207000732, "rewards/rejected": -24.308887481689453, "step": 1195 }, { "epoch": 0.16285403050108932, "grad_norm": 50.52994901848754, "learning_rate": 7.904341964297696e-07, "logits/chosen": 9.858491897583008, "logits/rejected": 10.366353988647461, "logps/chosen": -3.1529717445373535, "logps/rejected": -3.459007740020752, "loss": 3.7069, "rewards/accuracies": 0.5, "rewards/chosen": -31.52971649169922, "rewards/margins": 3.0603599548339844, "rewards/rejected": -34.59008026123047, "step": 1196 }, { "epoch": 0.16299019607843138, "grad_norm": 51.25176595537686, "learning_rate": 7.903928182942005e-07, "logits/chosen": 9.343969345092773, "logits/rejected": 10.811040878295898, "logps/chosen": -2.8082683086395264, "logps/rejected": -3.213130235671997, "loss": 4.203, "rewards/accuracies": 1.0, "rewards/chosen": -28.082683563232422, "rewards/margins": 4.048619747161865, "rewards/rejected": -32.13130187988281, "step": 1197 }, { "epoch": 0.16312636165577343, "grad_norm": 50.04293124930519, "learning_rate": 7.90351351946147e-07, "logits/chosen": 9.274725914001465, "logits/rejected": 9.910178184509277, "logps/chosen": -2.905477523803711, "logps/rejected": -3.230508327484131, "loss": 4.0452, "rewards/accuracies": 1.0, "rewards/chosen": -29.05477523803711, "rewards/margins": 3.250309467315674, "rewards/rejected": -32.305084228515625, "step": 1198 }, { "epoch": 0.16326252723311546, "grad_norm": 59.32269237189041, "learning_rate": 7.903097973949789e-07, "logits/chosen": 9.157251358032227, "logits/rejected": 9.477829933166504, "logps/chosen": -3.1268322467803955, "logps/rejected": -3.0040948390960693, "loss": 4.6708, "rewards/accuracies": 0.25, "rewards/chosen": -31.268321990966797, "rewards/margins": -1.227372646331787, "rewards/rejected": -30.040950775146484, "step": 1199 }, { "epoch": 0.16339869281045752, "grad_norm": 59.74957943468478, "learning_rate": 7.902681546500858e-07, "logits/chosen": 10.02657699584961, "logits/rejected": 9.473548889160156, "logps/chosen": -3.270781993865967, "logps/rejected": -3.3306610584259033, "loss": 4.2222, "rewards/accuracies": 0.25, "rewards/chosen": -32.70781707763672, "rewards/margins": 0.598790168762207, "rewards/rejected": -33.306610107421875, "step": 1200 }, { "epoch": 0.16353485838779958, "grad_norm": 52.15960484890637, "learning_rate": 7.902264237208771e-07, "logits/chosen": 6.745089530944824, "logits/rejected": 7.813010215759277, "logps/chosen": -2.1293115615844727, "logps/rejected": -2.5862135887145996, "loss": 4.4992, "rewards/accuracies": 0.75, "rewards/chosen": -21.293115615844727, "rewards/margins": 4.569021224975586, "rewards/rejected": -25.862136840820312, "step": 1201 }, { "epoch": 0.1636710239651416, "grad_norm": 45.135291273159055, "learning_rate": 7.901846046167824e-07, "logits/chosen": 9.162870407104492, "logits/rejected": 8.631814002990723, "logps/chosen": -2.9310011863708496, "logps/rejected": -2.6789236068725586, "loss": 4.2694, "rewards/accuracies": 0.25, "rewards/chosen": -29.310012817382812, "rewards/margins": -2.5207743644714355, "rewards/rejected": -26.78923797607422, "step": 1202 }, { "epoch": 0.16380718954248366, "grad_norm": 46.006261436924035, "learning_rate": 7.901426973472509e-07, "logits/chosen": 9.08969497680664, "logits/rejected": 9.744539260864258, "logps/chosen": -3.1854746341705322, "logps/rejected": -3.2322418689727783, "loss": 4.1807, "rewards/accuracies": 0.5, "rewards/chosen": -31.854747772216797, "rewards/margins": 0.46767234802246094, "rewards/rejected": -32.322418212890625, "step": 1203 }, { "epoch": 0.16394335511982572, "grad_norm": 72.39720041634392, "learning_rate": 7.901007019217519e-07, "logits/chosen": 8.303030014038086, "logits/rejected": 9.55474853515625, "logps/chosen": -2.9824090003967285, "logps/rejected": -2.8661532402038574, "loss": 3.9521, "rewards/accuracies": 0.25, "rewards/chosen": -29.82408905029297, "rewards/margins": -1.1625571250915527, "rewards/rejected": -28.661531448364258, "step": 1204 }, { "epoch": 0.16407952069716775, "grad_norm": 52.334219968929375, "learning_rate": 7.900586183497748e-07, "logits/chosen": 8.41949462890625, "logits/rejected": 8.077463150024414, "logps/chosen": -3.028280258178711, "logps/rejected": -2.9924492835998535, "loss": 4.7317, "rewards/accuracies": 0.75, "rewards/chosen": -30.28280258178711, "rewards/margins": -0.358309268951416, "rewards/rejected": -29.92449378967285, "step": 1205 }, { "epoch": 0.1642156862745098, "grad_norm": 51.68009646797139, "learning_rate": 7.900164466408288e-07, "logits/chosen": 8.171339988708496, "logits/rejected": 9.551666259765625, "logps/chosen": -2.7435102462768555, "logps/rejected": -2.860262393951416, "loss": 4.3852, "rewards/accuracies": 0.5, "rewards/chosen": -27.435104370117188, "rewards/margins": 1.1675186157226562, "rewards/rejected": -28.602622985839844, "step": 1206 }, { "epoch": 0.16435185185185186, "grad_norm": 46.767504930253054, "learning_rate": 7.899741868044426e-07, "logits/chosen": 8.578557014465332, "logits/rejected": 10.075246810913086, "logps/chosen": -2.935159921646118, "logps/rejected": -3.049600124359131, "loss": 3.9812, "rewards/accuracies": 0.5, "rewards/chosen": -29.351600646972656, "rewards/margins": 1.144399642944336, "rewards/rejected": -30.49599838256836, "step": 1207 }, { "epoch": 0.1644880174291939, "grad_norm": 68.08325597927762, "learning_rate": 7.899318388501653e-07, "logits/chosen": 8.984088897705078, "logits/rejected": 8.250326156616211, "logps/chosen": -2.4821114540100098, "logps/rejected": -2.831770658493042, "loss": 4.1688, "rewards/accuracies": 0.75, "rewards/chosen": -24.821115493774414, "rewards/margins": 3.496591567993164, "rewards/rejected": -28.317707061767578, "step": 1208 }, { "epoch": 0.16462418300653595, "grad_norm": 46.9827469930005, "learning_rate": 7.898894027875659e-07, "logits/chosen": 7.4706621170043945, "logits/rejected": 7.465971946716309, "logps/chosen": -2.5216708183288574, "logps/rejected": -2.4594779014587402, "loss": 4.4317, "rewards/accuracies": 0.5, "rewards/chosen": -25.21670913696289, "rewards/margins": -0.6219301223754883, "rewards/rejected": -24.594776153564453, "step": 1209 }, { "epoch": 0.164760348583878, "grad_norm": 52.908902344603845, "learning_rate": 7.89846878626233e-07, "logits/chosen": 8.659454345703125, "logits/rejected": 9.533832550048828, "logps/chosen": -2.796104907989502, "logps/rejected": -2.9316117763519287, "loss": 4.5952, "rewards/accuracies": 0.75, "rewards/chosen": -27.961048126220703, "rewards/margins": 1.3550701141357422, "rewards/rejected": -29.316118240356445, "step": 1210 }, { "epoch": 0.16489651416122003, "grad_norm": 48.09817706043864, "learning_rate": 7.898042663757754e-07, "logits/chosen": 8.338525772094727, "logits/rejected": 8.739020347595215, "logps/chosen": -2.693481922149658, "logps/rejected": -3.0165553092956543, "loss": 3.9702, "rewards/accuracies": 0.75, "rewards/chosen": -26.9348201751709, "rewards/margins": 3.230733871459961, "rewards/rejected": -30.16555404663086, "step": 1211 }, { "epoch": 0.1650326797385621, "grad_norm": 52.118172075585704, "learning_rate": 7.897615660458216e-07, "logits/chosen": 8.778643608093262, "logits/rejected": 9.218008041381836, "logps/chosen": -2.767887592315674, "logps/rejected": -2.9571924209594727, "loss": 4.2072, "rewards/accuracies": 0.75, "rewards/chosen": -27.67887306213379, "rewards/margins": 1.893049716949463, "rewards/rejected": -29.571924209594727, "step": 1212 }, { "epoch": 0.16516884531590414, "grad_norm": 47.18662843699371, "learning_rate": 7.897187776460202e-07, "logits/chosen": 7.036987781524658, "logits/rejected": 8.346046447753906, "logps/chosen": -2.7886505126953125, "logps/rejected": -2.9355268478393555, "loss": 4.5341, "rewards/accuracies": 0.75, "rewards/chosen": -27.886505126953125, "rewards/margins": 1.4687633514404297, "rewards/rejected": -29.355268478393555, "step": 1213 }, { "epoch": 0.16530501089324617, "grad_norm": 46.35566796372013, "learning_rate": 7.896759011860396e-07, "logits/chosen": 6.91854190826416, "logits/rejected": 6.903120517730713, "logps/chosen": -2.467074394226074, "logps/rejected": -2.51928973197937, "loss": 4.3486, "rewards/accuracies": 0.5, "rewards/chosen": -24.67074203491211, "rewards/margins": 0.5221548080444336, "rewards/rejected": -25.19289779663086, "step": 1214 }, { "epoch": 0.16544117647058823, "grad_norm": 54.04858328492999, "learning_rate": 7.896329366755679e-07, "logits/chosen": 10.285880088806152, "logits/rejected": 8.779707908630371, "logps/chosen": -3.0571486949920654, "logps/rejected": -2.730156660079956, "loss": 4.4935, "rewards/accuracies": 0.25, "rewards/chosen": -30.571487426757812, "rewards/margins": -3.2699198722839355, "rewards/rejected": -27.30156707763672, "step": 1215 }, { "epoch": 0.1655773420479303, "grad_norm": 49.88482621022518, "learning_rate": 7.895898841243136e-07, "logits/chosen": 9.636751174926758, "logits/rejected": 8.826343536376953, "logps/chosen": -3.091158628463745, "logps/rejected": -2.683889865875244, "loss": 4.8564, "rewards/accuracies": 0.25, "rewards/chosen": -30.91158676147461, "rewards/margins": -4.072689056396484, "rewards/rejected": -26.838897705078125, "step": 1216 }, { "epoch": 0.16571350762527234, "grad_norm": 57.7389181557458, "learning_rate": 7.895467435420045e-07, "logits/chosen": 8.21391487121582, "logits/rejected": 8.936031341552734, "logps/chosen": -2.9287195205688477, "logps/rejected": -2.758406639099121, "loss": 5.166, "rewards/accuracies": 0.5, "rewards/chosen": -29.28719711303711, "rewards/margins": -1.703132152557373, "rewards/rejected": -27.584064483642578, "step": 1217 }, { "epoch": 0.16584967320261437, "grad_norm": 51.05621070131691, "learning_rate": 7.895035149383886e-07, "logits/chosen": 7.776633262634277, "logits/rejected": 8.606831550598145, "logps/chosen": -2.5223183631896973, "logps/rejected": -3.2068605422973633, "loss": 3.6597, "rewards/accuracies": 0.75, "rewards/chosen": -25.223182678222656, "rewards/margins": 6.8454203605651855, "rewards/rejected": -32.068603515625, "step": 1218 }, { "epoch": 0.16598583877995643, "grad_norm": 42.41399148289667, "learning_rate": 7.89460198323234e-07, "logits/chosen": 9.009593963623047, "logits/rejected": 8.807266235351562, "logps/chosen": -2.5317530632019043, "logps/rejected": -2.5955069065093994, "loss": 3.7383, "rewards/accuracies": 0.5, "rewards/chosen": -25.317529678344727, "rewards/margins": 0.637540340423584, "rewards/rejected": -25.95507049560547, "step": 1219 }, { "epoch": 0.1661220043572985, "grad_norm": 42.73635448954191, "learning_rate": 7.894167937063281e-07, "logits/chosen": 7.047861099243164, "logits/rejected": 9.425230026245117, "logps/chosen": -2.4128899574279785, "logps/rejected": -2.8917031288146973, "loss": 4.6091, "rewards/accuracies": 0.75, "rewards/chosen": -24.128902435302734, "rewards/margins": 4.7881293296813965, "rewards/rejected": -28.917030334472656, "step": 1220 }, { "epoch": 0.16625816993464052, "grad_norm": 47.75601635837362, "learning_rate": 7.893733010974788e-07, "logits/chosen": 8.043566703796387, "logits/rejected": 7.648934364318848, "logps/chosen": -2.450763702392578, "logps/rejected": -2.311603307723999, "loss": 3.9592, "rewards/accuracies": 0.5, "rewards/chosen": -24.50763702392578, "rewards/margins": -1.3916053771972656, "rewards/rejected": -23.116031646728516, "step": 1221 }, { "epoch": 0.16639433551198257, "grad_norm": 44.026606136342174, "learning_rate": 7.893297205065135e-07, "logits/chosen": 8.013726234436035, "logits/rejected": 8.154206275939941, "logps/chosen": -2.5639395713806152, "logps/rejected": -2.6715142726898193, "loss": 4.7265, "rewards/accuracies": 0.75, "rewards/chosen": -25.63939666748047, "rewards/margins": 1.0757455825805664, "rewards/rejected": -26.71514320373535, "step": 1222 }, { "epoch": 0.16653050108932463, "grad_norm": 43.788507543876264, "learning_rate": 7.892860519432796e-07, "logits/chosen": 7.779584884643555, "logits/rejected": 6.248539447784424, "logps/chosen": -2.6361396312713623, "logps/rejected": -2.2719714641571045, "loss": 4.7316, "rewards/accuracies": 0.5, "rewards/chosen": -26.36139678955078, "rewards/margins": -3.64168119430542, "rewards/rejected": -22.719715118408203, "step": 1223 }, { "epoch": 0.16666666666666666, "grad_norm": 53.768593501835404, "learning_rate": 7.892422954176444e-07, "logits/chosen": 8.936152458190918, "logits/rejected": 7.725457668304443, "logps/chosen": -2.898293972015381, "logps/rejected": -2.615774631500244, "loss": 4.5047, "rewards/accuracies": 0.25, "rewards/chosen": -28.982938766479492, "rewards/margins": -2.8251938819885254, "rewards/rejected": -26.157745361328125, "step": 1224 }, { "epoch": 0.16680283224400871, "grad_norm": 44.36608637473741, "learning_rate": 7.891984509394952e-07, "logits/chosen": 8.487771034240723, "logits/rejected": 10.188762664794922, "logps/chosen": -2.7043280601501465, "logps/rejected": -2.9450626373291016, "loss": 4.3369, "rewards/accuracies": 0.75, "rewards/chosen": -27.04328155517578, "rewards/margins": 2.4073429107666016, "rewards/rejected": -29.450626373291016, "step": 1225 }, { "epoch": 0.16693899782135077, "grad_norm": 48.32628632996881, "learning_rate": 7.891545185187386e-07, "logits/chosen": 7.650729179382324, "logits/rejected": 7.475862979888916, "logps/chosen": -2.329695224761963, "logps/rejected": -2.3355753421783447, "loss": 4.5582, "rewards/accuracies": 0.5, "rewards/chosen": -23.296951293945312, "rewards/margins": 0.05880117416381836, "rewards/rejected": -23.35575294494629, "step": 1226 }, { "epoch": 0.1670751633986928, "grad_norm": 44.07832537228781, "learning_rate": 7.891104981653019e-07, "logits/chosen": 6.920581340789795, "logits/rejected": 8.416048049926758, "logps/chosen": -2.120213031768799, "logps/rejected": -2.3374996185302734, "loss": 4.3348, "rewards/accuracies": 0.75, "rewards/chosen": -21.202133178710938, "rewards/margins": 2.1728639602661133, "rewards/rejected": -23.374996185302734, "step": 1227 }, { "epoch": 0.16721132897603486, "grad_norm": 49.0774077827626, "learning_rate": 7.890663898891318e-07, "logits/chosen": 7.015523910522461, "logits/rejected": 8.179105758666992, "logps/chosen": -2.0620737075805664, "logps/rejected": -2.2772297859191895, "loss": 4.0157, "rewards/accuracies": 0.75, "rewards/chosen": -20.620737075805664, "rewards/margins": 2.1515610218048096, "rewards/rejected": -22.77229881286621, "step": 1228 }, { "epoch": 0.1673474945533769, "grad_norm": 41.64311580433287, "learning_rate": 7.890221937001946e-07, "logits/chosen": 5.694972991943359, "logits/rejected": 7.951999664306641, "logps/chosen": -2.241795539855957, "logps/rejected": -2.6675844192504883, "loss": 4.5521, "rewards/accuracies": 0.75, "rewards/chosen": -22.417953491210938, "rewards/margins": 4.2578887939453125, "rewards/rejected": -26.67584228515625, "step": 1229 }, { "epoch": 0.16748366013071894, "grad_norm": 49.620329792745174, "learning_rate": 7.889779096084772e-07, "logits/chosen": 7.153855323791504, "logits/rejected": 8.499265670776367, "logps/chosen": -2.3969154357910156, "logps/rejected": -2.8528056144714355, "loss": 3.9698, "rewards/accuracies": 0.75, "rewards/chosen": -23.969154357910156, "rewards/margins": 4.558901309967041, "rewards/rejected": -28.52805519104004, "step": 1230 }, { "epoch": 0.167619825708061, "grad_norm": 44.1820851859682, "learning_rate": 7.88933537623986e-07, "logits/chosen": 8.961554527282715, "logits/rejected": 8.377254486083984, "logps/chosen": -2.749741315841675, "logps/rejected": -3.0290141105651855, "loss": 3.8906, "rewards/accuracies": 0.75, "rewards/chosen": -27.497413635253906, "rewards/margins": 2.7927279472351074, "rewards/rejected": -30.290142059326172, "step": 1231 }, { "epoch": 0.16775599128540306, "grad_norm": 48.87841493015513, "learning_rate": 7.888890777567467e-07, "logits/chosen": 6.144126892089844, "logits/rejected": 6.580039978027344, "logps/chosen": -2.236231803894043, "logps/rejected": -2.279569387435913, "loss": 4.4336, "rewards/accuracies": 0.5, "rewards/chosen": -22.362316131591797, "rewards/margins": 0.4333772659301758, "rewards/rejected": -22.79569435119629, "step": 1232 }, { "epoch": 0.16789215686274508, "grad_norm": 41.24625752594887, "learning_rate": 7.888445300168058e-07, "logits/chosen": 7.0895795822143555, "logits/rejected": 5.980898857116699, "logps/chosen": -2.0515217781066895, "logps/rejected": -2.15792179107666, "loss": 4.0467, "rewards/accuracies": 1.0, "rewards/chosen": -20.515220642089844, "rewards/margins": 1.063999891281128, "rewards/rejected": -21.5792179107666, "step": 1233 }, { "epoch": 0.16802832244008714, "grad_norm": 45.460971287694534, "learning_rate": 7.887998944142291e-07, "logits/chosen": 8.357107162475586, "logits/rejected": 9.19843578338623, "logps/chosen": -2.408684253692627, "logps/rejected": -2.801273822784424, "loss": 4.5419, "rewards/accuracies": 1.0, "rewards/chosen": -24.086843490600586, "rewards/margins": 3.9258971214294434, "rewards/rejected": -28.012741088867188, "step": 1234 }, { "epoch": 0.1681644880174292, "grad_norm": 50.358764603349144, "learning_rate": 7.887551709591024e-07, "logits/chosen": 8.702383995056152, "logits/rejected": 7.625736236572266, "logps/chosen": -3.0411338806152344, "logps/rejected": -2.4814095497131348, "loss": 4.4466, "rewards/accuracies": 0.25, "rewards/chosen": -30.411340713500977, "rewards/margins": -5.597244739532471, "rewards/rejected": -24.814098358154297, "step": 1235 }, { "epoch": 0.16830065359477125, "grad_norm": 46.18588228762737, "learning_rate": 7.887103596615315e-07, "logits/chosen": 7.76688289642334, "logits/rejected": 8.8538236618042, "logps/chosen": -2.1207587718963623, "logps/rejected": -2.404035806655884, "loss": 4.5288, "rewards/accuracies": 1.0, "rewards/chosen": -21.20758628845215, "rewards/margins": 2.8327717781066895, "rewards/rejected": -24.040359497070312, "step": 1236 }, { "epoch": 0.16843681917211328, "grad_norm": 45.938285758476354, "learning_rate": 7.886654605316415e-07, "logits/chosen": 7.792182922363281, "logits/rejected": 8.150588989257812, "logps/chosen": -2.811627149581909, "logps/rejected": -2.5907046794891357, "loss": 4.3697, "rewards/accuracies": 0.25, "rewards/chosen": -28.116270065307617, "rewards/margins": -2.2092232704162598, "rewards/rejected": -25.907047271728516, "step": 1237 }, { "epoch": 0.16857298474945534, "grad_norm": 53.69942973120506, "learning_rate": 7.886204735795781e-07, "logits/chosen": 7.269331932067871, "logits/rejected": 8.399460792541504, "logps/chosen": -2.0979690551757812, "logps/rejected": -2.345423698425293, "loss": 4.3467, "rewards/accuracies": 0.75, "rewards/chosen": -20.979690551757812, "rewards/margins": 2.4745473861694336, "rewards/rejected": -23.454238891601562, "step": 1238 }, { "epoch": 0.1687091503267974, "grad_norm": 45.97907821012185, "learning_rate": 7.885753988155062e-07, "logits/chosen": 7.879973411560059, "logits/rejected": 7.962778091430664, "logps/chosen": -2.4362170696258545, "logps/rejected": -2.3817994594573975, "loss": 4.0893, "rewards/accuracies": 0.5, "rewards/chosen": -24.362171173095703, "rewards/margins": -0.5441751480102539, "rewards/rejected": -23.817995071411133, "step": 1239 }, { "epoch": 0.16884531590413943, "grad_norm": 44.57826643096699, "learning_rate": 7.88530236249611e-07, "logits/chosen": 7.4560227394104, "logits/rejected": 7.998175621032715, "logps/chosen": -2.039416790008545, "logps/rejected": -2.317472219467163, "loss": 3.9888, "rewards/accuracies": 0.75, "rewards/chosen": -20.394166946411133, "rewards/margins": 2.780555486679077, "rewards/rejected": -23.17472267150879, "step": 1240 }, { "epoch": 0.16898148148148148, "grad_norm": 43.37307168807983, "learning_rate": 7.884849858920973e-07, "logits/chosen": 8.428056716918945, "logits/rejected": 9.207789421081543, "logps/chosen": -2.7131471633911133, "logps/rejected": -2.889676570892334, "loss": 4.4153, "rewards/accuracies": 0.5, "rewards/chosen": -27.131471633911133, "rewards/margins": 1.765294075012207, "rewards/rejected": -28.896766662597656, "step": 1241 }, { "epoch": 0.16911764705882354, "grad_norm": 50.801493316275256, "learning_rate": 7.884396477531898e-07, "logits/chosen": 8.581502914428711, "logits/rejected": 8.88255500793457, "logps/chosen": -2.381821632385254, "logps/rejected": -2.3830666542053223, "loss": 4.5063, "rewards/accuracies": 0.5, "rewards/chosen": -23.818214416503906, "rewards/margins": 0.012451648712158203, "rewards/rejected": -23.83066749572754, "step": 1242 }, { "epoch": 0.16925381263616557, "grad_norm": 53.15003206183536, "learning_rate": 7.88394221843133e-07, "logits/chosen": 6.5563249588012695, "logits/rejected": 8.292549133300781, "logps/chosen": -2.8236265182495117, "logps/rejected": -2.8322436809539795, "loss": 4.5841, "rewards/accuracies": 0.5, "rewards/chosen": -28.236265182495117, "rewards/margins": 0.08617115020751953, "rewards/rejected": -28.322437286376953, "step": 1243 }, { "epoch": 0.16938997821350762, "grad_norm": 153.41596665809257, "learning_rate": 7.883487081721913e-07, "logits/chosen": 8.879186630249023, "logits/rejected": 9.821548461914062, "logps/chosen": -2.4280247688293457, "logps/rejected": -2.559257984161377, "loss": 3.7734, "rewards/accuracies": 0.75, "rewards/chosen": -24.28024673461914, "rewards/margins": 1.3123326301574707, "rewards/rejected": -25.592578887939453, "step": 1244 }, { "epoch": 0.16952614379084968, "grad_norm": 47.69058208584124, "learning_rate": 7.883031067506488e-07, "logits/chosen": 8.811670303344727, "logits/rejected": 8.875321388244629, "logps/chosen": -3.086681365966797, "logps/rejected": -2.863330125808716, "loss": 4.337, "rewards/accuracies": 0.5, "rewards/chosen": -30.866811752319336, "rewards/margins": -2.2335095405578613, "rewards/rejected": -28.63330078125, "step": 1245 }, { "epoch": 0.1696623093681917, "grad_norm": 49.07536647496085, "learning_rate": 7.882574175888097e-07, "logits/chosen": 9.223796844482422, "logits/rejected": 8.314952850341797, "logps/chosen": -2.696385383605957, "logps/rejected": -2.6426055431365967, "loss": 4.557, "rewards/accuracies": 0.5, "rewards/chosen": -26.963851928710938, "rewards/margins": -0.5377945899963379, "rewards/rejected": -26.426055908203125, "step": 1246 }, { "epoch": 0.16979847494553377, "grad_norm": 46.50167934635103, "learning_rate": 7.882116406969976e-07, "logits/chosen": 8.85234546661377, "logits/rejected": 8.774535179138184, "logps/chosen": -2.6351304054260254, "logps/rejected": -2.747889280319214, "loss": 4.3997, "rewards/accuracies": 0.75, "rewards/chosen": -26.351303100585938, "rewards/margins": 1.1275901794433594, "rewards/rejected": -27.478893280029297, "step": 1247 }, { "epoch": 0.16993464052287582, "grad_norm": 50.598801418117986, "learning_rate": 7.881657760855563e-07, "logits/chosen": 7.395277976989746, "logits/rejected": 7.679974555969238, "logps/chosen": -2.6558921337127686, "logps/rejected": -2.831717014312744, "loss": 3.9732, "rewards/accuracies": 0.75, "rewards/chosen": -26.558921813964844, "rewards/margins": 1.7582497596740723, "rewards/rejected": -28.31717300415039, "step": 1248 }, { "epoch": 0.17007080610021785, "grad_norm": 44.31420553699995, "learning_rate": 7.881198237648494e-07, "logits/chosen": 9.681211471557617, "logits/rejected": 9.889016151428223, "logps/chosen": -2.9749889373779297, "logps/rejected": -2.8897032737731934, "loss": 4.4517, "rewards/accuracies": 0.25, "rewards/chosen": -29.749889373779297, "rewards/margins": -0.8528556823730469, "rewards/rejected": -28.89703369140625, "step": 1249 }, { "epoch": 0.1702069716775599, "grad_norm": 43.90390026727833, "learning_rate": 7.880737837452601e-07, "logits/chosen": 6.805121421813965, "logits/rejected": 7.625019550323486, "logps/chosen": -2.1204776763916016, "logps/rejected": -2.3624167442321777, "loss": 4.4329, "rewards/accuracies": 0.75, "rewards/chosen": -21.204776763916016, "rewards/margins": 2.4193906784057617, "rewards/rejected": -23.624168395996094, "step": 1250 }, { "epoch": 0.17034313725490197, "grad_norm": 43.86884204459576, "learning_rate": 7.880276560371914e-07, "logits/chosen": 8.064530372619629, "logits/rejected": 8.549880981445312, "logps/chosen": -2.4170472621917725, "logps/rejected": -2.5492300987243652, "loss": 4.1941, "rewards/accuracies": 0.75, "rewards/chosen": -24.170473098754883, "rewards/margins": 1.3218283653259277, "rewards/rejected": -25.492300033569336, "step": 1251 }, { "epoch": 0.170479302832244, "grad_norm": 44.23836474314249, "learning_rate": 7.879814406510664e-07, "logits/chosen": 9.785752296447754, "logits/rejected": 10.254776000976562, "logps/chosen": -2.8627383708953857, "logps/rejected": -3.0096795558929443, "loss": 4.3745, "rewards/accuracies": 0.75, "rewards/chosen": -28.627384185791016, "rewards/margins": 1.4694123268127441, "rewards/rejected": -30.09679412841797, "step": 1252 }, { "epoch": 0.17061546840958605, "grad_norm": 46.37567185099621, "learning_rate": 7.879351375973277e-07, "logits/chosen": 10.073427200317383, "logits/rejected": 10.412683486938477, "logps/chosen": -3.100381851196289, "logps/rejected": -3.030806064605713, "loss": 4.2376, "rewards/accuracies": 0.25, "rewards/chosen": -31.00381851196289, "rewards/margins": -0.6957559585571289, "rewards/rejected": -30.308059692382812, "step": 1253 }, { "epoch": 0.1707516339869281, "grad_norm": 54.448929215891106, "learning_rate": 7.87888746886438e-07, "logits/chosen": 9.418987274169922, "logits/rejected": 9.749025344848633, "logps/chosen": -2.7990148067474365, "logps/rejected": -3.2331364154815674, "loss": 3.9862, "rewards/accuracies": 0.75, "rewards/chosen": -27.99014663696289, "rewards/margins": 4.341217517852783, "rewards/rejected": -32.331363677978516, "step": 1254 }, { "epoch": 0.17088779956427017, "grad_norm": 43.81995296698345, "learning_rate": 7.878422685288799e-07, "logits/chosen": 7.687395095825195, "logits/rejected": 8.436609268188477, "logps/chosen": -2.6987977027893066, "logps/rejected": -2.5478081703186035, "loss": 4.6088, "rewards/accuracies": 0.5, "rewards/chosen": -26.98797607421875, "rewards/margins": -1.5098934173583984, "rewards/rejected": -25.47808265686035, "step": 1255 }, { "epoch": 0.1710239651416122, "grad_norm": 57.338079638293244, "learning_rate": 7.87795702535155e-07, "logits/chosen": 9.217778205871582, "logits/rejected": 9.713062286376953, "logps/chosen": -2.6679155826568604, "logps/rejected": -3.01013445854187, "loss": 4.1246, "rewards/accuracies": 1.0, "rewards/chosen": -26.679155349731445, "rewards/margins": 3.422189235687256, "rewards/rejected": -30.10134506225586, "step": 1256 }, { "epoch": 0.17116013071895425, "grad_norm": 55.39925917911861, "learning_rate": 7.877490489157855e-07, "logits/chosen": 9.393472671508789, "logits/rejected": 9.5245361328125, "logps/chosen": -2.9064230918884277, "logps/rejected": -2.967705488204956, "loss": 4.2233, "rewards/accuracies": 0.5, "rewards/chosen": -29.06422996520996, "rewards/margins": 0.6128249168395996, "rewards/rejected": -29.67705535888672, "step": 1257 }, { "epoch": 0.1712962962962963, "grad_norm": 54.605133188045855, "learning_rate": 7.877023076813134e-07, "logits/chosen": 8.3372802734375, "logits/rejected": 9.02685260772705, "logps/chosen": -2.834113359451294, "logps/rejected": -2.8951687812805176, "loss": 4.3057, "rewards/accuracies": 0.75, "rewards/chosen": -28.34113311767578, "rewards/margins": 0.6105523109436035, "rewards/rejected": -28.95168685913086, "step": 1258 }, { "epoch": 0.17143246187363834, "grad_norm": 46.060207559447825, "learning_rate": 7.876554788423e-07, "logits/chosen": 8.272743225097656, "logits/rejected": 7.979445457458496, "logps/chosen": -2.617460250854492, "logps/rejected": -2.5245399475097656, "loss": 3.897, "rewards/accuracies": 0.75, "rewards/chosen": -26.174602508544922, "rewards/margins": -0.9292035102844238, "rewards/rejected": -25.245397567749023, "step": 1259 }, { "epoch": 0.1715686274509804, "grad_norm": 62.066302174605326, "learning_rate": 7.876085624093268e-07, "logits/chosen": 8.099092483520508, "logits/rejected": 8.358476638793945, "logps/chosen": -2.5927271842956543, "logps/rejected": -2.7553932666778564, "loss": 4.116, "rewards/accuracies": 0.75, "rewards/chosen": -25.92727279663086, "rewards/margins": 1.626659870147705, "rewards/rejected": -27.553932189941406, "step": 1260 }, { "epoch": 0.17170479302832245, "grad_norm": 45.390657632075104, "learning_rate": 7.875615583929949e-07, "logits/chosen": 9.040811538696289, "logits/rejected": 9.581607818603516, "logps/chosen": -2.680461883544922, "logps/rejected": -2.4680802822113037, "loss": 3.9829, "rewards/accuracies": 0.25, "rewards/chosen": -26.80462074279785, "rewards/margins": -2.1238174438476562, "rewards/rejected": -24.680803298950195, "step": 1261 }, { "epoch": 0.17184095860566448, "grad_norm": 41.870558006577696, "learning_rate": 7.875144668039254e-07, "logits/chosen": 7.779206275939941, "logits/rejected": 7.793803691864014, "logps/chosen": -2.4872636795043945, "logps/rejected": -2.5717577934265137, "loss": 4.205, "rewards/accuracies": 0.5, "rewards/chosen": -24.872638702392578, "rewards/margins": 0.8449387550354004, "rewards/rejected": -25.717575073242188, "step": 1262 }, { "epoch": 0.17197712418300654, "grad_norm": 45.43700307649122, "learning_rate": 7.874672876527586e-07, "logits/chosen": 7.339555740356445, "logits/rejected": 10.506726264953613, "logps/chosen": -2.7479023933410645, "logps/rejected": -3.1536033153533936, "loss": 4.2973, "rewards/accuracies": 0.75, "rewards/chosen": -27.479022979736328, "rewards/margins": 4.057011127471924, "rewards/rejected": -31.536033630371094, "step": 1263 }, { "epoch": 0.1721132897603486, "grad_norm": 40.5787813248941, "learning_rate": 7.874200209501557e-07, "logits/chosen": 8.096565246582031, "logits/rejected": 9.300224304199219, "logps/chosen": -2.404423713684082, "logps/rejected": -2.6108412742614746, "loss": 3.9104, "rewards/accuracies": 0.5, "rewards/chosen": -24.04423713684082, "rewards/margins": 2.0641751289367676, "rewards/rejected": -26.108413696289062, "step": 1264 }, { "epoch": 0.17224945533769062, "grad_norm": 47.03969386192103, "learning_rate": 7.873726667067964e-07, "logits/chosen": 9.031967163085938, "logits/rejected": 9.853141784667969, "logps/chosen": -2.6336379051208496, "logps/rejected": -2.7906033992767334, "loss": 4.5596, "rewards/accuracies": 0.75, "rewards/chosen": -26.336376190185547, "rewards/margins": 1.5696558952331543, "rewards/rejected": -27.90603256225586, "step": 1265 }, { "epoch": 0.17238562091503268, "grad_norm": 45.13550065432039, "learning_rate": 7.87325224933381e-07, "logits/chosen": 7.262442588806152, "logits/rejected": 9.117935180664062, "logps/chosen": -2.3063859939575195, "logps/rejected": -2.4270410537719727, "loss": 4.051, "rewards/accuracies": 0.75, "rewards/chosen": -23.063858032226562, "rewards/margins": 1.2065496444702148, "rewards/rejected": -24.270408630371094, "step": 1266 }, { "epoch": 0.17252178649237473, "grad_norm": 44.05990227433554, "learning_rate": 7.872776956406294e-07, "logits/chosen": 6.902603626251221, "logits/rejected": 8.421066284179688, "logps/chosen": -2.5088655948638916, "logps/rejected": -2.8376498222351074, "loss": 4.3647, "rewards/accuracies": 1.0, "rewards/chosen": -25.08865737915039, "rewards/margins": 3.287842273712158, "rewards/rejected": -28.376497268676758, "step": 1267 }, { "epoch": 0.17265795206971676, "grad_norm": 44.222192201938974, "learning_rate": 7.872300788392811e-07, "logits/chosen": 8.868678092956543, "logits/rejected": 6.73671293258667, "logps/chosen": -2.810734987258911, "logps/rejected": -2.527515411376953, "loss": 4.2934, "rewards/accuracies": 0.0, "rewards/chosen": -28.107349395751953, "rewards/margins": -2.8321971893310547, "rewards/rejected": -25.27515411376953, "step": 1268 }, { "epoch": 0.17279411764705882, "grad_norm": 42.54489006423341, "learning_rate": 7.871823745400957e-07, "logits/chosen": 7.041839599609375, "logits/rejected": 8.131906509399414, "logps/chosen": -2.4303340911865234, "logps/rejected": -2.6957552433013916, "loss": 4.145, "rewards/accuracies": 0.75, "rewards/chosen": -24.303342819213867, "rewards/margins": 2.6542091369628906, "rewards/rejected": -26.957551956176758, "step": 1269 }, { "epoch": 0.17293028322440088, "grad_norm": 52.257487975443276, "learning_rate": 7.871345827538524e-07, "logits/chosen": 10.07999324798584, "logits/rejected": 9.793353080749512, "logps/chosen": -2.9309346675872803, "logps/rejected": -2.833347797393799, "loss": 4.3229, "rewards/accuracies": 0.25, "rewards/chosen": -29.309349060058594, "rewards/margins": -0.975867748260498, "rewards/rejected": -28.333478927612305, "step": 1270 }, { "epoch": 0.1730664488017429, "grad_norm": 45.18753771697076, "learning_rate": 7.870867034913498e-07, "logits/chosen": 9.54024887084961, "logits/rejected": 9.604001998901367, "logps/chosen": -2.8010237216949463, "logps/rejected": -3.0867996215820312, "loss": 4.2038, "rewards/accuracies": 0.75, "rewards/chosen": -28.010238647460938, "rewards/margins": 2.8577589988708496, "rewards/rejected": -30.867996215820312, "step": 1271 }, { "epoch": 0.17320261437908496, "grad_norm": 52.88839516501259, "learning_rate": 7.87038736763407e-07, "logits/chosen": 7.4955620765686035, "logits/rejected": 9.33792781829834, "logps/chosen": -2.363649845123291, "logps/rejected": -2.729494571685791, "loss": 4.2488, "rewards/accuracies": 0.75, "rewards/chosen": -23.636497497558594, "rewards/margins": 3.658446788787842, "rewards/rejected": -27.294944763183594, "step": 1272 }, { "epoch": 0.17333877995642702, "grad_norm": 45.39770761230631, "learning_rate": 7.869906825808623e-07, "logits/chosen": 7.4617204666137695, "logits/rejected": 7.897047519683838, "logps/chosen": -2.1216259002685547, "logps/rejected": -2.2860023975372314, "loss": 4.0704, "rewards/accuracies": 0.75, "rewards/chosen": -21.21626091003418, "rewards/margins": 1.643763542175293, "rewards/rejected": -22.860023498535156, "step": 1273 }, { "epoch": 0.17347494553376908, "grad_norm": 42.78392741972152, "learning_rate": 7.86942540954574e-07, "logits/chosen": 7.167069435119629, "logits/rejected": 7.469264030456543, "logps/chosen": -2.3732123374938965, "logps/rejected": -2.6532299518585205, "loss": 4.1496, "rewards/accuracies": 0.5, "rewards/chosen": -23.73212432861328, "rewards/margins": 2.8001761436462402, "rewards/rejected": -26.53230094909668, "step": 1274 }, { "epoch": 0.1736111111111111, "grad_norm": 50.63192019805002, "learning_rate": 7.868943118954202e-07, "logits/chosen": 8.573953628540039, "logits/rejected": 8.716363906860352, "logps/chosen": -2.984598398208618, "logps/rejected": -2.739996910095215, "loss": 4.5563, "rewards/accuracies": 0.25, "rewards/chosen": -29.845983505249023, "rewards/margins": -2.446016311645508, "rewards/rejected": -27.399967193603516, "step": 1275 }, { "epoch": 0.17374727668845316, "grad_norm": 43.267714793111686, "learning_rate": 7.868459954142982e-07, "logits/chosen": 8.899083137512207, "logits/rejected": 9.139083862304688, "logps/chosen": -2.926910400390625, "logps/rejected": -2.842679977416992, "loss": 4.5476, "rewards/accuracies": 0.25, "rewards/chosen": -29.26910400390625, "rewards/margins": -0.8423061370849609, "rewards/rejected": -28.426799774169922, "step": 1276 }, { "epoch": 0.17388344226579522, "grad_norm": 44.08110738477169, "learning_rate": 7.867975915221261e-07, "logits/chosen": 8.02815055847168, "logits/rejected": 8.239347457885742, "logps/chosen": -2.2561330795288086, "logps/rejected": -2.4364914894104004, "loss": 4.0396, "rewards/accuracies": 0.75, "rewards/chosen": -22.56133270263672, "rewards/margins": 1.8035826683044434, "rewards/rejected": -24.364913940429688, "step": 1277 }, { "epoch": 0.17401960784313725, "grad_norm": 38.91111310459993, "learning_rate": 7.867491002298408e-07, "logits/chosen": 9.380010604858398, "logits/rejected": 9.687565803527832, "logps/chosen": -3.120779514312744, "logps/rejected": -3.0078184604644775, "loss": 3.8274, "rewards/accuracies": 0.25, "rewards/chosen": -31.207796096801758, "rewards/margins": -1.129612922668457, "rewards/rejected": -30.078182220458984, "step": 1278 }, { "epoch": 0.1741557734204793, "grad_norm": 42.470646509213985, "learning_rate": 7.867005215483995e-07, "logits/chosen": 8.140865325927734, "logits/rejected": 6.529858589172363, "logps/chosen": -2.5664031505584717, "logps/rejected": -2.6206235885620117, "loss": 4.4001, "rewards/accuracies": 0.5, "rewards/chosen": -25.664030075073242, "rewards/margins": 0.5422048568725586, "rewards/rejected": -26.206233978271484, "step": 1279 }, { "epoch": 0.17429193899782136, "grad_norm": 51.004563214725096, "learning_rate": 7.866518554887787e-07, "logits/chosen": 9.436775207519531, "logits/rejected": 9.724409103393555, "logps/chosen": -2.977006673812866, "logps/rejected": -3.149815320968628, "loss": 3.8908, "rewards/accuracies": 0.75, "rewards/chosen": -29.770065307617188, "rewards/margins": 1.7280874252319336, "rewards/rejected": -31.498153686523438, "step": 1280 }, { "epoch": 0.1744281045751634, "grad_norm": 47.855841610895816, "learning_rate": 7.866031020619752e-07, "logits/chosen": 7.776568412780762, "logits/rejected": 9.014801979064941, "logps/chosen": -2.42738676071167, "logps/rejected": -3.0635900497436523, "loss": 4.1363, "rewards/accuracies": 1.0, "rewards/chosen": -24.273866653442383, "rewards/margins": 6.362033843994141, "rewards/rejected": -30.63589859008789, "step": 1281 }, { "epoch": 0.17456427015250545, "grad_norm": 43.1940178295898, "learning_rate": 7.86554261279005e-07, "logits/chosen": 7.536350250244141, "logits/rejected": 8.498577117919922, "logps/chosen": -2.5875492095947266, "logps/rejected": -2.7458057403564453, "loss": 4.2222, "rewards/accuracies": 0.75, "rewards/chosen": -25.875492095947266, "rewards/margins": 1.5825653076171875, "rewards/rejected": -27.458057403564453, "step": 1282 }, { "epoch": 0.1747004357298475, "grad_norm": 47.3303968722751, "learning_rate": 7.865053331509042e-07, "logits/chosen": 7.753451347351074, "logits/rejected": 8.951570510864258, "logps/chosen": -2.148693561553955, "logps/rejected": -2.6130423545837402, "loss": 4.0872, "rewards/accuracies": 0.75, "rewards/chosen": -21.486934661865234, "rewards/margins": 4.643489837646484, "rewards/rejected": -26.13042449951172, "step": 1283 }, { "epoch": 0.17483660130718953, "grad_norm": 43.10843006824928, "learning_rate": 7.864563176887286e-07, "logits/chosen": 8.511268615722656, "logits/rejected": 10.282013893127441, "logps/chosen": -2.7144768238067627, "logps/rejected": -3.022519588470459, "loss": 4.2106, "rewards/accuracies": 0.75, "rewards/chosen": -27.1447696685791, "rewards/margins": 3.080428123474121, "rewards/rejected": -30.225196838378906, "step": 1284 }, { "epoch": 0.1749727668845316, "grad_norm": 52.11064450552697, "learning_rate": 7.864072149035534e-07, "logits/chosen": 8.398311614990234, "logits/rejected": 9.460896492004395, "logps/chosen": -2.9201114177703857, "logps/rejected": -3.2004878520965576, "loss": 3.5389, "rewards/accuracies": 0.75, "rewards/chosen": -29.201114654541016, "rewards/margins": 2.803762912750244, "rewards/rejected": -32.004878997802734, "step": 1285 }, { "epoch": 0.17510893246187365, "grad_norm": 46.28275138180318, "learning_rate": 7.863580248064739e-07, "logits/chosen": 7.398241996765137, "logits/rejected": 7.874755859375, "logps/chosen": -2.6052942276000977, "logps/rejected": -2.578432559967041, "loss": 3.957, "rewards/accuracies": 0.5, "rewards/chosen": -26.05294418334961, "rewards/margins": -0.268618106842041, "rewards/rejected": -25.784324645996094, "step": 1286 }, { "epoch": 0.17524509803921567, "grad_norm": 62.77776920112825, "learning_rate": 7.863087474086051e-07, "logits/chosen": 8.974807739257812, "logits/rejected": 8.285686492919922, "logps/chosen": -2.7583703994750977, "logps/rejected": -2.577122449874878, "loss": 4.6429, "rewards/accuracies": 0.25, "rewards/chosen": -27.58370590209961, "rewards/margins": -1.8124804496765137, "rewards/rejected": -25.771224975585938, "step": 1287 }, { "epoch": 0.17538126361655773, "grad_norm": 59.96848977197629, "learning_rate": 7.862593827210815e-07, "logits/chosen": 8.073121070861816, "logits/rejected": 8.476629257202148, "logps/chosen": -2.2979440689086914, "logps/rejected": -2.3704824447631836, "loss": 4.2531, "rewards/accuracies": 0.5, "rewards/chosen": -22.979440689086914, "rewards/margins": 0.7253842353820801, "rewards/rejected": -23.704824447631836, "step": 1288 }, { "epoch": 0.1755174291938998, "grad_norm": 49.165421605557945, "learning_rate": 7.862099307550576e-07, "logits/chosen": 9.130033493041992, "logits/rejected": 8.285297393798828, "logps/chosen": -2.769033432006836, "logps/rejected": -2.5796380043029785, "loss": 4.6114, "rewards/accuracies": 0.25, "rewards/chosen": -27.690332412719727, "rewards/margins": -1.8939533233642578, "rewards/rejected": -25.79637908935547, "step": 1289 }, { "epoch": 0.17565359477124182, "grad_norm": 45.6798249012203, "learning_rate": 7.861603915217074e-07, "logits/chosen": 9.135185241699219, "logits/rejected": 10.020565032958984, "logps/chosen": -2.9188218116760254, "logps/rejected": -2.9648261070251465, "loss": 4.4247, "rewards/accuracies": 0.5, "rewards/chosen": -29.188217163085938, "rewards/margins": 0.46004533767700195, "rewards/rejected": -29.64826202392578, "step": 1290 }, { "epoch": 0.17578976034858387, "grad_norm": 44.09716494214744, "learning_rate": 7.861107650322246e-07, "logits/chosen": 9.419549942016602, "logits/rejected": 9.117006301879883, "logps/chosen": -2.746277093887329, "logps/rejected": -2.7407469749450684, "loss": 4.0879, "rewards/accuracies": 0.25, "rewards/chosen": -27.462772369384766, "rewards/margins": -0.05530261993408203, "rewards/rejected": -27.407468795776367, "step": 1291 }, { "epoch": 0.17592592592592593, "grad_norm": 45.7489076596425, "learning_rate": 7.860610512978229e-07, "logits/chosen": 8.09412956237793, "logits/rejected": 8.063135147094727, "logps/chosen": -2.4566221237182617, "logps/rejected": -2.702737808227539, "loss": 3.8923, "rewards/accuracies": 0.75, "rewards/chosen": -24.56622314453125, "rewards/margins": 2.46115779876709, "rewards/rejected": -27.027379989624023, "step": 1292 }, { "epoch": 0.176062091503268, "grad_norm": 41.81169717652493, "learning_rate": 7.860112503297354e-07, "logits/chosen": 9.53032398223877, "logits/rejected": 8.557223320007324, "logps/chosen": -2.7690269947052, "logps/rejected": -2.5029349327087402, "loss": 3.8499, "rewards/accuracies": 0.25, "rewards/chosen": -27.690269470214844, "rewards/margins": -2.660921096801758, "rewards/rejected": -25.029348373413086, "step": 1293 }, { "epoch": 0.17619825708061002, "grad_norm": 41.680187800960304, "learning_rate": 7.859613621392152e-07, "logits/chosen": 6.981788635253906, "logits/rejected": 8.917654037475586, "logps/chosen": -2.446352243423462, "logps/rejected": -2.926018238067627, "loss": 3.431, "rewards/accuracies": 1.0, "rewards/chosen": -24.46352195739746, "rewards/margins": 4.796660900115967, "rewards/rejected": -29.260181427001953, "step": 1294 }, { "epoch": 0.17633442265795207, "grad_norm": 49.61999104231989, "learning_rate": 7.859113867375347e-07, "logits/chosen": 9.17999267578125, "logits/rejected": 9.326842308044434, "logps/chosen": -2.711887836456299, "logps/rejected": -2.877936840057373, "loss": 4.5265, "rewards/accuracies": 0.5, "rewards/chosen": -27.118877410888672, "rewards/margins": 1.6604886054992676, "rewards/rejected": -28.77936553955078, "step": 1295 }, { "epoch": 0.17647058823529413, "grad_norm": 53.941215269367675, "learning_rate": 7.858613241359864e-07, "logits/chosen": 9.258852005004883, "logits/rejected": 10.28135871887207, "logps/chosen": -2.642005443572998, "logps/rejected": -2.8773343563079834, "loss": 3.7006, "rewards/accuracies": 0.75, "rewards/chosen": -26.420053482055664, "rewards/margins": 2.353290557861328, "rewards/rejected": -28.773344039916992, "step": 1296 }, { "epoch": 0.17660675381263616, "grad_norm": 109.58684251218743, "learning_rate": 7.858111743458823e-07, "logits/chosen": 8.554024696350098, "logits/rejected": 8.024055480957031, "logps/chosen": -2.5552148818969727, "logps/rejected": -2.6692795753479004, "loss": 4.6528, "rewards/accuracies": 0.5, "rewards/chosen": -25.55215072631836, "rewards/margins": 1.1406440734863281, "rewards/rejected": -26.692792892456055, "step": 1297 }, { "epoch": 0.17674291938997821, "grad_norm": 43.499354363175065, "learning_rate": 7.857609373785544e-07, "logits/chosen": 8.092500686645508, "logits/rejected": 8.62445068359375, "logps/chosen": -2.1386170387268066, "logps/rejected": -2.503025531768799, "loss": 3.8612, "rewards/accuracies": 0.75, "rewards/chosen": -21.38616943359375, "rewards/margins": 3.6440839767456055, "rewards/rejected": -25.030254364013672, "step": 1298 }, { "epoch": 0.17687908496732027, "grad_norm": 45.239434040990055, "learning_rate": 7.857106132453539e-07, "logits/chosen": 8.840289115905762, "logits/rejected": 9.56712818145752, "logps/chosen": -2.6726150512695312, "logps/rejected": -2.9231860637664795, "loss": 3.9784, "rewards/accuracies": 0.75, "rewards/chosen": -26.72614860534668, "rewards/margins": 2.5057129859924316, "rewards/rejected": -29.231861114501953, "step": 1299 }, { "epoch": 0.1770152505446623, "grad_norm": 61.069007347352795, "learning_rate": 7.856602019576521e-07, "logits/chosen": 7.837902069091797, "logits/rejected": 9.962900161743164, "logps/chosen": -2.6074588298797607, "logps/rejected": -2.8111348152160645, "loss": 4.1215, "rewards/accuracies": 1.0, "rewards/chosen": -26.074586868286133, "rewards/margins": 2.03676176071167, "rewards/rejected": -28.11134910583496, "step": 1300 }, { "epoch": 0.17715141612200436, "grad_norm": 62.052110501248826, "learning_rate": 7.856097035268396e-07, "logits/chosen": 10.124494552612305, "logits/rejected": 8.665399551391602, "logps/chosen": -3.0453388690948486, "logps/rejected": -2.928804874420166, "loss": 4.5699, "rewards/accuracies": 0.5, "rewards/chosen": -30.45339012145996, "rewards/margins": -1.1653385162353516, "rewards/rejected": -29.288049697875977, "step": 1301 }, { "epoch": 0.1772875816993464, "grad_norm": 52.74532976561218, "learning_rate": 7.855591179643271e-07, "logits/chosen": 9.235734939575195, "logits/rejected": 9.929137229919434, "logps/chosen": -2.0321879386901855, "logps/rejected": -2.695380449295044, "loss": 4.5087, "rewards/accuracies": 1.0, "rewards/chosen": -20.32187843322754, "rewards/margins": 6.631926536560059, "rewards/rejected": -26.95380401611328, "step": 1302 }, { "epoch": 0.17742374727668844, "grad_norm": 45.050085220380446, "learning_rate": 7.855084452815448e-07, "logits/chosen": 6.724666118621826, "logits/rejected": 7.504486083984375, "logps/chosen": -2.327885150909424, "logps/rejected": -2.364794969558716, "loss": 3.817, "rewards/accuracies": 0.25, "rewards/chosen": -23.278852462768555, "rewards/margins": 0.36909961700439453, "rewards/rejected": -23.647953033447266, "step": 1303 }, { "epoch": 0.1775599128540305, "grad_norm": 40.87873966697163, "learning_rate": 7.854576854899428e-07, "logits/chosen": 6.865413665771484, "logits/rejected": 7.068605422973633, "logps/chosen": -2.3312220573425293, "logps/rejected": -2.5891919136047363, "loss": 4.1407, "rewards/accuracies": 0.75, "rewards/chosen": -23.31222152709961, "rewards/margins": 2.5796985626220703, "rewards/rejected": -25.89192008972168, "step": 1304 }, { "epoch": 0.17769607843137256, "grad_norm": 47.506153559959266, "learning_rate": 7.854068386009905e-07, "logits/chosen": 8.405677795410156, "logits/rejected": 7.841905117034912, "logps/chosen": -2.792055368423462, "logps/rejected": -2.7799105644226074, "loss": 3.7432, "rewards/accuracies": 0.75, "rewards/chosen": -27.92055320739746, "rewards/margins": -0.12144660949707031, "rewards/rejected": -27.79910659790039, "step": 1305 }, { "epoch": 0.17783224400871459, "grad_norm": 45.72083639026132, "learning_rate": 7.853559046261771e-07, "logits/chosen": 8.674888610839844, "logits/rejected": 9.786585807800293, "logps/chosen": -2.6774773597717285, "logps/rejected": -2.879871368408203, "loss": 4.2575, "rewards/accuracies": 0.75, "rewards/chosen": -26.77477264404297, "rewards/margins": 2.023941993713379, "rewards/rejected": -28.79871368408203, "step": 1306 }, { "epoch": 0.17796840958605664, "grad_norm": 46.9559790980623, "learning_rate": 7.853048835770118e-07, "logits/chosen": 8.365705490112305, "logits/rejected": 9.559419631958008, "logps/chosen": -2.927828788757324, "logps/rejected": -3.098748207092285, "loss": 4.1183, "rewards/accuracies": 0.75, "rewards/chosen": -29.27828598022461, "rewards/margins": 1.709195613861084, "rewards/rejected": -30.98748016357422, "step": 1307 }, { "epoch": 0.1781045751633987, "grad_norm": 48.08949693119786, "learning_rate": 7.852537754650229e-07, "logits/chosen": 6.351641654968262, "logits/rejected": 9.48910140991211, "logps/chosen": -2.1510510444641113, "logps/rejected": -2.85921573638916, "loss": 4.2861, "rewards/accuracies": 1.0, "rewards/chosen": -21.510509490966797, "rewards/margins": 7.0816473960876465, "rewards/rejected": -28.592159271240234, "step": 1308 }, { "epoch": 0.17824074074074073, "grad_norm": 48.80477660900612, "learning_rate": 7.852025803017591e-07, "logits/chosen": 7.145428657531738, "logits/rejected": 8.007759094238281, "logps/chosen": -2.553997039794922, "logps/rejected": -2.6925857067108154, "loss": 4.1981, "rewards/accuracies": 0.75, "rewards/chosen": -25.53997230529785, "rewards/margins": 1.385885238647461, "rewards/rejected": -26.925857543945312, "step": 1309 }, { "epoch": 0.17837690631808278, "grad_norm": 51.40864024665004, "learning_rate": 7.851512980987882e-07, "logits/chosen": 8.42803955078125, "logits/rejected": 8.233053207397461, "logps/chosen": -2.8885626792907715, "logps/rejected": -3.254716634750366, "loss": 3.7062, "rewards/accuracies": 1.0, "rewards/chosen": -28.88562774658203, "rewards/margins": 3.6615395545959473, "rewards/rejected": -32.54716873168945, "step": 1310 }, { "epoch": 0.17851307189542484, "grad_norm": 44.981465900161545, "learning_rate": 7.850999288676977e-07, "logits/chosen": 6.657149314880371, "logits/rejected": 8.212369918823242, "logps/chosen": -2.5987396240234375, "logps/rejected": -2.8679263591766357, "loss": 4.4908, "rewards/accuracies": 0.75, "rewards/chosen": -25.987396240234375, "rewards/margins": 2.691866874694824, "rewards/rejected": -28.679264068603516, "step": 1311 }, { "epoch": 0.1786492374727669, "grad_norm": 53.39894255618083, "learning_rate": 7.850484726200949e-07, "logits/chosen": 7.736566543579102, "logits/rejected": 8.722738265991211, "logps/chosen": -2.803856611251831, "logps/rejected": -3.1215980052948, "loss": 4.4102, "rewards/accuracies": 1.0, "rewards/chosen": -28.03856658935547, "rewards/margins": 3.1774134635925293, "rewards/rejected": -31.215978622436523, "step": 1312 }, { "epoch": 0.17878540305010893, "grad_norm": 44.523019625201734, "learning_rate": 7.849969293676071e-07, "logits/chosen": 6.920699596405029, "logits/rejected": 9.222671508789062, "logps/chosen": -2.6663639545440674, "logps/rejected": -3.144148349761963, "loss": 3.8392, "rewards/accuracies": 1.0, "rewards/chosen": -26.663639068603516, "rewards/margins": 4.777843952178955, "rewards/rejected": -31.441482543945312, "step": 1313 }, { "epoch": 0.17892156862745098, "grad_norm": 45.85058367157754, "learning_rate": 7.849452991218805e-07, "logits/chosen": 8.4578857421875, "logits/rejected": 8.157858848571777, "logps/chosen": -2.910830497741699, "logps/rejected": -2.6770071983337402, "loss": 4.116, "rewards/accuracies": 0.25, "rewards/chosen": -29.108304977416992, "rewards/margins": -2.3382339477539062, "rewards/rejected": -26.770071029663086, "step": 1314 }, { "epoch": 0.17905773420479304, "grad_norm": 45.896185066098795, "learning_rate": 7.848935818945817e-07, "logits/chosen": 7.495598793029785, "logits/rejected": 7.143111705780029, "logps/chosen": -2.8101274967193604, "logps/rejected": -2.9063949584960938, "loss": 4.3931, "rewards/accuracies": 0.75, "rewards/chosen": -28.101276397705078, "rewards/margins": 0.9626755714416504, "rewards/rejected": -29.06395149230957, "step": 1315 }, { "epoch": 0.17919389978213507, "grad_norm": 50.890884004713385, "learning_rate": 7.848417776973964e-07, "logits/chosen": 7.454838752746582, "logits/rejected": 8.585148811340332, "logps/chosen": -2.76766300201416, "logps/rejected": -3.226586103439331, "loss": 4.6034, "rewards/accuracies": 0.75, "rewards/chosen": -27.676631927490234, "rewards/margins": 4.589230537414551, "rewards/rejected": -32.26586151123047, "step": 1316 }, { "epoch": 0.17933006535947713, "grad_norm": 43.62225892375065, "learning_rate": 7.847898865420304e-07, "logits/chosen": 9.010976791381836, "logits/rejected": 9.043279647827148, "logps/chosen": -2.9888768196105957, "logps/rejected": -2.9623231887817383, "loss": 3.9612, "rewards/accuracies": 0.5, "rewards/chosen": -29.888769149780273, "rewards/margins": -0.26553916931152344, "rewards/rejected": -29.62322998046875, "step": 1317 }, { "epoch": 0.17946623093681918, "grad_norm": 46.69361365769182, "learning_rate": 7.847379084402088e-07, "logits/chosen": 8.905539512634277, "logits/rejected": 8.911345481872559, "logps/chosen": -2.996196746826172, "logps/rejected": -3.117677927017212, "loss": 4.0225, "rewards/accuracies": 0.75, "rewards/chosen": -29.96196746826172, "rewards/margins": 1.2148103713989258, "rewards/rejected": -31.17677879333496, "step": 1318 }, { "epoch": 0.1796023965141612, "grad_norm": 47.27157092937472, "learning_rate": 7.846858434036765e-07, "logits/chosen": 7.4549407958984375, "logits/rejected": 8.654488563537598, "logps/chosen": -2.224435329437256, "logps/rejected": -2.6568148136138916, "loss": 4.1923, "rewards/accuracies": 1.0, "rewards/chosen": -22.244354248046875, "rewards/margins": 4.323793411254883, "rewards/rejected": -26.568147659301758, "step": 1319 }, { "epoch": 0.17973856209150327, "grad_norm": 45.718205258546924, "learning_rate": 7.846336914441981e-07, "logits/chosen": 7.4532575607299805, "logits/rejected": 8.890929222106934, "logps/chosen": -2.504608154296875, "logps/rejected": -3.1981658935546875, "loss": 4.1436, "rewards/accuracies": 1.0, "rewards/chosen": -25.04608154296875, "rewards/margins": 6.935575485229492, "rewards/rejected": -31.981657028198242, "step": 1320 }, { "epoch": 0.17987472766884532, "grad_norm": 50.19217762917184, "learning_rate": 7.845814525735575e-07, "logits/chosen": 8.761255264282227, "logits/rejected": 9.202247619628906, "logps/chosen": -3.048856496810913, "logps/rejected": -4.036525726318359, "loss": 4.4761, "rewards/accuracies": 0.75, "rewards/chosen": -30.488567352294922, "rewards/margins": 9.876689910888672, "rewards/rejected": -40.365257263183594, "step": 1321 }, { "epoch": 0.18001089324618735, "grad_norm": 41.55632099144956, "learning_rate": 7.845291268035588e-07, "logits/chosen": 8.213475227355957, "logits/rejected": 7.727570056915283, "logps/chosen": -2.5220561027526855, "logps/rejected": -2.7667527198791504, "loss": 3.827, "rewards/accuracies": 0.5, "rewards/chosen": -25.220561981201172, "rewards/margins": 2.446967124938965, "rewards/rejected": -27.66752815246582, "step": 1322 }, { "epoch": 0.1801470588235294, "grad_norm": 45.11677956169044, "learning_rate": 7.844767141460254e-07, "logits/chosen": 8.702874183654785, "logits/rejected": 8.160661697387695, "logps/chosen": -2.6788415908813477, "logps/rejected": -2.645124912261963, "loss": 4.4758, "rewards/accuracies": 0.5, "rewards/chosen": -26.788415908813477, "rewards/margins": -0.33716726303100586, "rewards/rejected": -26.451248168945312, "step": 1323 }, { "epoch": 0.18028322440087147, "grad_norm": 42.49808533005021, "learning_rate": 7.844242146128003e-07, "logits/chosen": 8.29671573638916, "logits/rejected": 7.412016868591309, "logps/chosen": -3.0038845539093018, "logps/rejected": -2.8807661533355713, "loss": 4.3596, "rewards/accuracies": 0.5, "rewards/chosen": -30.038846969604492, "rewards/margins": -1.2311854362487793, "rewards/rejected": -28.807662963867188, "step": 1324 }, { "epoch": 0.1804193899782135, "grad_norm": 41.39707599995495, "learning_rate": 7.843716282157463e-07, "logits/chosen": 9.151952743530273, "logits/rejected": 9.610337257385254, "logps/chosen": -2.582691192626953, "logps/rejected": -2.664917469024658, "loss": 4.0686, "rewards/accuracies": 0.25, "rewards/chosen": -25.82691192626953, "rewards/margins": 0.8222641944885254, "rewards/rejected": -26.649173736572266, "step": 1325 }, { "epoch": 0.18055555555555555, "grad_norm": 47.70532505900321, "learning_rate": 7.843189549667456e-07, "logits/chosen": 9.510072708129883, "logits/rejected": 9.099336624145508, "logps/chosen": -2.9257149696350098, "logps/rejected": -2.9918694496154785, "loss": 4.5142, "rewards/accuracies": 0.75, "rewards/chosen": -29.257152557373047, "rewards/margins": 0.6615414619445801, "rewards/rejected": -29.91869354248047, "step": 1326 }, { "epoch": 0.1806917211328976, "grad_norm": 68.13798865521646, "learning_rate": 7.842661948777001e-07, "logits/chosen": 8.143758773803711, "logits/rejected": 8.874886512756348, "logps/chosen": -2.5131006240844727, "logps/rejected": -2.720393657684326, "loss": 3.7988, "rewards/accuracies": 0.5, "rewards/chosen": -25.13100814819336, "rewards/margins": 2.072927951812744, "rewards/rejected": -27.203937530517578, "step": 1327 }, { "epoch": 0.18082788671023964, "grad_norm": 51.92070656611664, "learning_rate": 7.842133479605316e-07, "logits/chosen": 9.308808326721191, "logits/rejected": 8.755733489990234, "logps/chosen": -2.4738636016845703, "logps/rejected": -2.487785816192627, "loss": 4.2037, "rewards/accuracies": 0.5, "rewards/chosen": -24.738636016845703, "rewards/margins": 0.1392207145690918, "rewards/rejected": -24.877857208251953, "step": 1328 }, { "epoch": 0.1809640522875817, "grad_norm": 46.22116352876123, "learning_rate": 7.841604142271812e-07, "logits/chosen": 6.767590522766113, "logits/rejected": 6.491622447967529, "logps/chosen": -2.760068655014038, "logps/rejected": -2.714332103729248, "loss": 4.5898, "rewards/accuracies": 0.75, "rewards/chosen": -27.60068702697754, "rewards/margins": -0.4573636054992676, "rewards/rejected": -27.143321990966797, "step": 1329 }, { "epoch": 0.18110021786492375, "grad_norm": 45.92481952723659, "learning_rate": 7.841073936896098e-07, "logits/chosen": 9.192646026611328, "logits/rejected": 10.15475082397461, "logps/chosen": -2.89776873588562, "logps/rejected": -3.0004661083221436, "loss": 3.6606, "rewards/accuracies": 0.5, "rewards/chosen": -28.97768783569336, "rewards/margins": 1.0269718170166016, "rewards/rejected": -30.004661560058594, "step": 1330 }, { "epoch": 0.1812363834422658, "grad_norm": 52.239694563164775, "learning_rate": 7.840542863597976e-07, "logits/chosen": 7.8696489334106445, "logits/rejected": 8.539838790893555, "logps/chosen": -2.2055013179779053, "logps/rejected": -2.734375476837158, "loss": 4.1538, "rewards/accuracies": 1.0, "rewards/chosen": -22.055015563964844, "rewards/margins": 5.288741588592529, "rewards/rejected": -27.3437557220459, "step": 1331 }, { "epoch": 0.18137254901960784, "grad_norm": 44.91381665723014, "learning_rate": 7.840010922497448e-07, "logits/chosen": 8.964459419250488, "logits/rejected": 8.420997619628906, "logps/chosen": -2.618955135345459, "logps/rejected": -2.6096746921539307, "loss": 3.987, "rewards/accuracies": 0.5, "rewards/chosen": -26.189552307128906, "rewards/margins": -0.09280633926391602, "rewards/rejected": -26.09674644470215, "step": 1332 }, { "epoch": 0.1815087145969499, "grad_norm": 51.331650312365326, "learning_rate": 7.83947811371471e-07, "logits/chosen": 7.704388618469238, "logits/rejected": 8.722124099731445, "logps/chosen": -2.5196685791015625, "logps/rejected": -2.7346115112304688, "loss": 4.4506, "rewards/accuracies": 1.0, "rewards/chosen": -25.196685791015625, "rewards/margins": 2.1494288444519043, "rewards/rejected": -27.346113204956055, "step": 1333 }, { "epoch": 0.18164488017429195, "grad_norm": 50.0627114000502, "learning_rate": 7.838944437370154e-07, "logits/chosen": 8.75192642211914, "logits/rejected": 9.142759323120117, "logps/chosen": -3.0864992141723633, "logps/rejected": -3.117485761642456, "loss": 4.2668, "rewards/accuracies": 0.5, "rewards/chosen": -30.864992141723633, "rewards/margins": 0.30986595153808594, "rewards/rejected": -31.17485809326172, "step": 1334 }, { "epoch": 0.18178104575163398, "grad_norm": 48.95629635532633, "learning_rate": 7.838409893584371e-07, "logits/chosen": 10.226247787475586, "logits/rejected": 10.083877563476562, "logps/chosen": -2.8664755821228027, "logps/rejected": -2.775639295578003, "loss": 4.1613, "rewards/accuracies": 0.25, "rewards/chosen": -28.66475486755371, "rewards/margins": -0.9083642959594727, "rewards/rejected": -27.756391525268555, "step": 1335 }, { "epoch": 0.18191721132897604, "grad_norm": 48.84805700124482, "learning_rate": 7.837874482478142e-07, "logits/chosen": 9.029748916625977, "logits/rejected": 9.765567779541016, "logps/chosen": -2.902815341949463, "logps/rejected": -3.1342432498931885, "loss": 3.8572, "rewards/accuracies": 0.75, "rewards/chosen": -29.028152465820312, "rewards/margins": 2.314279079437256, "rewards/rejected": -31.342432022094727, "step": 1336 }, { "epoch": 0.1820533769063181, "grad_norm": 47.17070431310902, "learning_rate": 7.837338204172452e-07, "logits/chosen": 8.329147338867188, "logits/rejected": 10.970727920532227, "logps/chosen": -2.4520320892333984, "logps/rejected": -2.8704562187194824, "loss": 3.8107, "rewards/accuracies": 1.0, "rewards/chosen": -24.52031898498535, "rewards/margins": 4.184244155883789, "rewards/rejected": -28.70456314086914, "step": 1337 }, { "epoch": 0.18218954248366012, "grad_norm": 51.42237487575839, "learning_rate": 7.836801058788472e-07, "logits/chosen": 9.377151489257812, "logits/rejected": 10.2683687210083, "logps/chosen": -3.179899215698242, "logps/rejected": -3.5107839107513428, "loss": 4.6189, "rewards/accuracies": 1.0, "rewards/chosen": -31.798992156982422, "rewards/margins": 3.3088459968566895, "rewards/rejected": -35.10783767700195, "step": 1338 }, { "epoch": 0.18232570806100218, "grad_norm": 50.35123147853976, "learning_rate": 7.83626304644758e-07, "logits/chosen": 8.86866283416748, "logits/rejected": 10.021623611450195, "logps/chosen": -2.7955210208892822, "logps/rejected": -2.9454197883605957, "loss": 4.1334, "rewards/accuracies": 0.5, "rewards/chosen": -27.955209732055664, "rewards/margins": 1.4989895820617676, "rewards/rejected": -29.454198837280273, "step": 1339 }, { "epoch": 0.18246187363834424, "grad_norm": 45.47732001204179, "learning_rate": 7.835724167271341e-07, "logits/chosen": 9.377507209777832, "logits/rejected": 9.243790626525879, "logps/chosen": -2.2575838565826416, "logps/rejected": -2.375469207763672, "loss": 3.5493, "rewards/accuracies": 0.75, "rewards/chosen": -22.57583999633789, "rewards/margins": 1.1788535118103027, "rewards/rejected": -23.75469207763672, "step": 1340 }, { "epoch": 0.18259803921568626, "grad_norm": 44.294934381268426, "learning_rate": 7.835184421381519e-07, "logits/chosen": 10.024651527404785, "logits/rejected": 11.095529556274414, "logps/chosen": -3.1198720932006836, "logps/rejected": -3.4575343132019043, "loss": 3.8707, "rewards/accuracies": 0.75, "rewards/chosen": -31.198720932006836, "rewards/margins": 3.376624584197998, "rewards/rejected": -34.57534408569336, "step": 1341 }, { "epoch": 0.18273420479302832, "grad_norm": 45.676698392534036, "learning_rate": 7.834643808900078e-07, "logits/chosen": 8.213788032531738, "logits/rejected": 9.936334609985352, "logps/chosen": -2.7168798446655273, "logps/rejected": -3.1228396892547607, "loss": 3.9742, "rewards/accuracies": 1.0, "rewards/chosen": -27.168798446655273, "rewards/margins": 4.059598922729492, "rewards/rejected": -31.228397369384766, "step": 1342 }, { "epoch": 0.18287037037037038, "grad_norm": 42.860982601863974, "learning_rate": 7.834102329949168e-07, "logits/chosen": 10.656793594360352, "logits/rejected": 9.172439575195312, "logps/chosen": -3.160860776901245, "logps/rejected": -3.3796777725219727, "loss": 4.0404, "rewards/accuracies": 0.5, "rewards/chosen": -31.608606338500977, "rewards/margins": 2.188171863555908, "rewards/rejected": -33.796775817871094, "step": 1343 }, { "epoch": 0.1830065359477124, "grad_norm": 50.26948355076146, "learning_rate": 7.833559984651144e-07, "logits/chosen": 8.361039161682129, "logits/rejected": 9.89084243774414, "logps/chosen": -2.49456787109375, "logps/rejected": -2.8497698307037354, "loss": 3.9482, "rewards/accuracies": 0.5, "rewards/chosen": -24.945676803588867, "rewards/margins": 3.552021026611328, "rewards/rejected": -28.497699737548828, "step": 1344 }, { "epoch": 0.18314270152505446, "grad_norm": 45.36727548523761, "learning_rate": 7.833016773128554e-07, "logits/chosen": 6.385775566101074, "logits/rejected": 8.227290153503418, "logps/chosen": -2.159787654876709, "logps/rejected": -2.6255011558532715, "loss": 4.0237, "rewards/accuracies": 1.0, "rewards/chosen": -21.597875595092773, "rewards/margins": 4.657135486602783, "rewards/rejected": -26.2550106048584, "step": 1345 }, { "epoch": 0.18327886710239652, "grad_norm": 45.00118085375051, "learning_rate": 7.832472695504139e-07, "logits/chosen": 10.006620407104492, "logits/rejected": 9.893082618713379, "logps/chosen": -2.8696696758270264, "logps/rejected": -3.3201773166656494, "loss": 4.1831, "rewards/accuracies": 0.75, "rewards/chosen": -28.69669532775879, "rewards/margins": 4.505077838897705, "rewards/rejected": -33.20177459716797, "step": 1346 }, { "epoch": 0.18341503267973855, "grad_norm": 49.96788501119676, "learning_rate": 7.831927751900838e-07, "logits/chosen": 9.192754745483398, "logits/rejected": 9.839814186096191, "logps/chosen": -3.013190984725952, "logps/rejected": -2.555265426635742, "loss": 3.8903, "rewards/accuracies": 0.25, "rewards/chosen": -30.13191032409668, "rewards/margins": -4.579254150390625, "rewards/rejected": -25.552658081054688, "step": 1347 }, { "epoch": 0.1835511982570806, "grad_norm": 52.002608277679016, "learning_rate": 7.831381942441789e-07, "logits/chosen": 9.200372695922852, "logits/rejected": 9.916423797607422, "logps/chosen": -2.872497081756592, "logps/rejected": -2.8624329566955566, "loss": 4.7161, "rewards/accuracies": 0.5, "rewards/chosen": -28.724971771240234, "rewards/margins": -0.10064172744750977, "rewards/rejected": -28.62432861328125, "step": 1348 }, { "epoch": 0.18368736383442266, "grad_norm": 46.4934677837979, "learning_rate": 7.830835267250317e-07, "logits/chosen": 9.307903289794922, "logits/rejected": 9.674690246582031, "logps/chosen": -2.7459330558776855, "logps/rejected": -3.207642078399658, "loss": 4.2233, "rewards/accuracies": 0.75, "rewards/chosen": -27.459331512451172, "rewards/margins": 4.617091178894043, "rewards/rejected": -32.07642364501953, "step": 1349 }, { "epoch": 0.18382352941176472, "grad_norm": 54.2459813340419, "learning_rate": 7.830287726449953e-07, "logits/chosen": 8.25355339050293, "logits/rejected": 9.525409698486328, "logps/chosen": -2.552483081817627, "logps/rejected": -3.1829190254211426, "loss": 3.9356, "rewards/accuracies": 1.0, "rewards/chosen": -25.524831771850586, "rewards/margins": 6.304357528686523, "rewards/rejected": -31.829191207885742, "step": 1350 }, { "epoch": 0.18395969498910675, "grad_norm": 62.0082745283847, "learning_rate": 7.829739320164414e-07, "logits/chosen": 9.7778902053833, "logits/rejected": 8.709953308105469, "logps/chosen": -3.0565435886383057, "logps/rejected": -2.75095272064209, "loss": 4.2508, "rewards/accuracies": 0.5, "rewards/chosen": -30.5654354095459, "rewards/margins": -3.0559072494506836, "rewards/rejected": -27.5095272064209, "step": 1351 }, { "epoch": 0.1840958605664488, "grad_norm": 49.19128845752809, "learning_rate": 7.829190048517619e-07, "logits/chosen": 9.239721298217773, "logits/rejected": 8.323897361755371, "logps/chosen": -2.6105918884277344, "logps/rejected": -2.5454399585723877, "loss": 4.2528, "rewards/accuracies": 0.25, "rewards/chosen": -26.10591697692871, "rewards/margins": -0.6515178680419922, "rewards/rejected": -25.45439910888672, "step": 1352 }, { "epoch": 0.18423202614379086, "grad_norm": 44.41426061964819, "learning_rate": 7.82863991163368e-07, "logits/chosen": 8.649269104003906, "logits/rejected": 8.787925720214844, "logps/chosen": -2.796745777130127, "logps/rejected": -2.979707717895508, "loss": 4.1048, "rewards/accuracies": 0.5, "rewards/chosen": -27.967458724975586, "rewards/margins": 1.8296175003051758, "rewards/rejected": -29.797077178955078, "step": 1353 }, { "epoch": 0.1843681917211329, "grad_norm": 47.37874154993972, "learning_rate": 7.828088909636906e-07, "logits/chosen": 8.630144119262695, "logits/rejected": 9.549753189086914, "logps/chosen": -2.6196188926696777, "logps/rejected": -2.755126714706421, "loss": 4.5156, "rewards/accuracies": 0.75, "rewards/chosen": -26.196189880371094, "rewards/margins": 1.3550777435302734, "rewards/rejected": -27.551267623901367, "step": 1354 }, { "epoch": 0.18450435729847495, "grad_norm": 52.85718106101924, "learning_rate": 7.827537042651798e-07, "logits/chosen": 8.592658042907715, "logits/rejected": 9.703048706054688, "logps/chosen": -2.6374661922454834, "logps/rejected": -2.8476407527923584, "loss": 4.5466, "rewards/accuracies": 0.75, "rewards/chosen": -26.374662399291992, "rewards/margins": 2.1017465591430664, "rewards/rejected": -28.476409912109375, "step": 1355 }, { "epoch": 0.184640522875817, "grad_norm": 47.58929497846208, "learning_rate": 7.826984310803057e-07, "logits/chosen": 9.783190727233887, "logits/rejected": 8.971866607666016, "logps/chosen": -3.033231258392334, "logps/rejected": -2.749701499938965, "loss": 4.2085, "rewards/accuracies": 0.0, "rewards/chosen": -30.332313537597656, "rewards/margins": -2.835300922393799, "rewards/rejected": -27.497013092041016, "step": 1356 }, { "epoch": 0.18477668845315903, "grad_norm": 46.29048732154646, "learning_rate": 7.826430714215576e-07, "logits/chosen": 7.93804931640625, "logits/rejected": 8.744694709777832, "logps/chosen": -2.1610753536224365, "logps/rejected": -2.402611255645752, "loss": 4.32, "rewards/accuracies": 0.75, "rewards/chosen": -21.61075210571289, "rewards/margins": 2.4153614044189453, "rewards/rejected": -24.02611541748047, "step": 1357 }, { "epoch": 0.1849128540305011, "grad_norm": 51.97315986773322, "learning_rate": 7.825876253014448e-07, "logits/chosen": 8.355912208557129, "logits/rejected": 10.405187606811523, "logps/chosen": -2.534590482711792, "logps/rejected": -2.8314738273620605, "loss": 3.9988, "rewards/accuracies": 0.75, "rewards/chosen": -25.345905303955078, "rewards/margins": 2.968832015991211, "rewards/rejected": -28.31473731994629, "step": 1358 }, { "epoch": 0.18504901960784315, "grad_norm": 46.80853572890712, "learning_rate": 7.825320927324954e-07, "logits/chosen": 7.842777252197266, "logits/rejected": 8.50648021697998, "logps/chosen": -2.4450814723968506, "logps/rejected": -2.6280994415283203, "loss": 4.2143, "rewards/accuracies": 0.5, "rewards/chosen": -24.450815200805664, "rewards/margins": 1.8301787376403809, "rewards/rejected": -26.28099250793457, "step": 1359 }, { "epoch": 0.18518518518518517, "grad_norm": 46.89013792571976, "learning_rate": 7.824764737272575e-07, "logits/chosen": 9.842931747436523, "logits/rejected": 9.76855182647705, "logps/chosen": -2.939788341522217, "logps/rejected": -2.675670623779297, "loss": 4.6146, "rewards/accuracies": 0.25, "rewards/chosen": -29.397884368896484, "rewards/margins": -2.6411805152893066, "rewards/rejected": -26.756704330444336, "step": 1360 }, { "epoch": 0.18532135076252723, "grad_norm": 49.48257531958281, "learning_rate": 7.82420768298299e-07, "logits/chosen": 7.768167018890381, "logits/rejected": 8.005392074584961, "logps/chosen": -2.1833877563476562, "logps/rejected": -2.514078140258789, "loss": 4.216, "rewards/accuracies": 1.0, "rewards/chosen": -21.833877563476562, "rewards/margins": 3.30690336227417, "rewards/rejected": -25.14078140258789, "step": 1361 }, { "epoch": 0.1854575163398693, "grad_norm": 44.638372178394306, "learning_rate": 7.823649764582066e-07, "logits/chosen": 10.087686538696289, "logits/rejected": 10.119205474853516, "logps/chosen": -3.147470474243164, "logps/rejected": -2.968679428100586, "loss": 4.7431, "rewards/accuracies": 0.25, "rewards/chosen": -31.474702835083008, "rewards/margins": -1.7879085540771484, "rewards/rejected": -29.68679428100586, "step": 1362 }, { "epoch": 0.18559368191721132, "grad_norm": 54.22296976769298, "learning_rate": 7.823090982195872e-07, "logits/chosen": 10.664529800415039, "logits/rejected": 9.415311813354492, "logps/chosen": -2.8362154960632324, "logps/rejected": -2.7257423400878906, "loss": 4.9346, "rewards/accuracies": 0.25, "rewards/chosen": -28.362154006958008, "rewards/margins": -1.1047301292419434, "rewards/rejected": -27.257423400878906, "step": 1363 }, { "epoch": 0.18572984749455337, "grad_norm": 43.130519280713756, "learning_rate": 7.822531335950669e-07, "logits/chosen": 8.380388259887695, "logits/rejected": 8.011178970336914, "logps/chosen": -2.596414089202881, "logps/rejected": -2.7349119186401367, "loss": 3.9165, "rewards/accuracies": 0.5, "rewards/chosen": -25.96413803100586, "rewards/margins": 1.3849806785583496, "rewards/rejected": -27.349119186401367, "step": 1364 }, { "epoch": 0.18586601307189543, "grad_norm": 45.91280317416887, "learning_rate": 7.821970825972913e-07, "logits/chosen": 8.75217342376709, "logits/rejected": 8.750120162963867, "logps/chosen": -2.3616199493408203, "logps/rejected": -2.749350070953369, "loss": 3.5624, "rewards/accuracies": 0.75, "rewards/chosen": -23.616199493408203, "rewards/margins": 3.8773021697998047, "rewards/rejected": -27.493499755859375, "step": 1365 }, { "epoch": 0.18600217864923746, "grad_norm": 65.91655199649813, "learning_rate": 7.821409452389255e-07, "logits/chosen": 8.653797149658203, "logits/rejected": 8.013898849487305, "logps/chosen": -2.673430919647217, "logps/rejected": -2.600984573364258, "loss": 3.7868, "rewards/accuracies": 0.25, "rewards/chosen": -26.734310150146484, "rewards/margins": -0.7244634628295898, "rewards/rejected": -26.009845733642578, "step": 1366 }, { "epoch": 0.18613834422657952, "grad_norm": 53.110418079591554, "learning_rate": 7.820847215326544e-07, "logits/chosen": 8.16276741027832, "logits/rejected": 8.409547805786133, "logps/chosen": -2.500516891479492, "logps/rejected": -2.8310563564300537, "loss": 4.5774, "rewards/accuracies": 0.75, "rewards/chosen": -25.00516700744629, "rewards/margins": 3.30539608001709, "rewards/rejected": -28.310564041137695, "step": 1367 }, { "epoch": 0.18627450980392157, "grad_norm": 44.585447059525684, "learning_rate": 7.820284114911822e-07, "logits/chosen": 9.096748352050781, "logits/rejected": 8.409261703491211, "logps/chosen": -2.6320183277130127, "logps/rejected": -2.5243077278137207, "loss": 4.2005, "rewards/accuracies": 0.25, "rewards/chosen": -26.32018280029297, "rewards/margins": -1.0771074295043945, "rewards/rejected": -25.24307632446289, "step": 1368 }, { "epoch": 0.18641067538126363, "grad_norm": 45.97959802199157, "learning_rate": 7.819720151272324e-07, "logits/chosen": 8.617650985717773, "logits/rejected": 8.394166946411133, "logps/chosen": -2.4000353813171387, "logps/rejected": -2.452425718307495, "loss": 4.53, "rewards/accuracies": 0.5, "rewards/chosen": -24.000350952148438, "rewards/margins": 0.5239050388336182, "rewards/rejected": -24.52425765991211, "step": 1369 }, { "epoch": 0.18654684095860566, "grad_norm": 42.924401891522926, "learning_rate": 7.819155324535484e-07, "logits/chosen": 8.874107360839844, "logits/rejected": 8.612081527709961, "logps/chosen": -2.6209945678710938, "logps/rejected": -2.770878791809082, "loss": 4.2494, "rewards/accuracies": 0.75, "rewards/chosen": -26.20994758605957, "rewards/margins": 1.49884033203125, "rewards/rejected": -27.708786010742188, "step": 1370 }, { "epoch": 0.18668300653594772, "grad_norm": 37.33258720531031, "learning_rate": 7.81858963482893e-07, "logits/chosen": 8.826988220214844, "logits/rejected": 9.436518669128418, "logps/chosen": -2.4627223014831543, "logps/rejected": -2.620418071746826, "loss": 4.0681, "rewards/accuracies": 0.75, "rewards/chosen": -24.62722396850586, "rewards/margins": 1.5769562721252441, "rewards/rejected": -26.204177856445312, "step": 1371 }, { "epoch": 0.18681917211328977, "grad_norm": 47.29974469290214, "learning_rate": 7.818023082280482e-07, "logits/chosen": 9.085323333740234, "logits/rejected": 10.096864700317383, "logps/chosen": -2.627293109893799, "logps/rejected": -2.6100072860717773, "loss": 5.0054, "rewards/accuracies": 0.5, "rewards/chosen": -26.272930145263672, "rewards/margins": -0.17285871505737305, "rewards/rejected": -26.10007095336914, "step": 1372 }, { "epoch": 0.1869553376906318, "grad_norm": 42.3272388392499, "learning_rate": 7.81745566701816e-07, "logits/chosen": 9.125432968139648, "logits/rejected": 10.469291687011719, "logps/chosen": -2.580941677093506, "logps/rejected": -2.948439121246338, "loss": 3.9506, "rewards/accuracies": 1.0, "rewards/chosen": -25.80941390991211, "rewards/margins": 3.6749744415283203, "rewards/rejected": -29.484390258789062, "step": 1373 }, { "epoch": 0.18709150326797386, "grad_norm": 42.05704841937899, "learning_rate": 7.816887389170174e-07, "logits/chosen": 7.218978404998779, "logits/rejected": 7.688824653625488, "logps/chosen": -2.72843337059021, "logps/rejected": -2.666595220565796, "loss": 4.4596, "rewards/accuracies": 0.25, "rewards/chosen": -27.28433609008789, "rewards/margins": -0.6183829307556152, "rewards/rejected": -26.665952682495117, "step": 1374 }, { "epoch": 0.18722766884531591, "grad_norm": 42.296080473509534, "learning_rate": 7.816318248864931e-07, "logits/chosen": 7.478150844573975, "logits/rejected": 7.458555221557617, "logps/chosen": -2.4774723052978516, "logps/rejected": -2.40573787689209, "loss": 3.87, "rewards/accuracies": 0.5, "rewards/chosen": -24.774723052978516, "rewards/margins": -0.7173457145690918, "rewards/rejected": -24.057376861572266, "step": 1375 }, { "epoch": 0.18736383442265794, "grad_norm": 41.79087411156741, "learning_rate": 7.815748246231035e-07, "logits/chosen": 9.311481475830078, "logits/rejected": 10.440689086914062, "logps/chosen": -2.7531378269195557, "logps/rejected": -2.7557458877563477, "loss": 4.6337, "rewards/accuracies": 0.5, "rewards/chosen": -27.5313777923584, "rewards/margins": 0.026081562042236328, "rewards/rejected": -27.55746078491211, "step": 1376 }, { "epoch": 0.1875, "grad_norm": 45.175729508362366, "learning_rate": 7.81517738139728e-07, "logits/chosen": 9.79432201385498, "logits/rejected": 8.911310195922852, "logps/chosen": -2.96187424659729, "logps/rejected": -2.716613531112671, "loss": 4.6225, "rewards/accuracies": 0.0, "rewards/chosen": -29.61874008178711, "rewards/margins": -2.4526052474975586, "rewards/rejected": -27.1661376953125, "step": 1377 }, { "epoch": 0.18763616557734206, "grad_norm": 43.23312816550281, "learning_rate": 7.81460565449266e-07, "logits/chosen": 8.945716857910156, "logits/rejected": 9.391773223876953, "logps/chosen": -2.364760398864746, "logps/rejected": -2.991215467453003, "loss": 3.7102, "rewards/accuracies": 0.75, "rewards/chosen": -23.647605895996094, "rewards/margins": 6.264550685882568, "rewards/rejected": -29.91215705871582, "step": 1378 }, { "epoch": 0.18777233115468409, "grad_norm": 42.92545723476784, "learning_rate": 7.81403306564636e-07, "logits/chosen": 9.729814529418945, "logits/rejected": 7.58364200592041, "logps/chosen": -2.972052574157715, "logps/rejected": -2.7157387733459473, "loss": 3.9305, "rewards/accuracies": 0.25, "rewards/chosen": -29.72052764892578, "rewards/margins": -2.563138008117676, "rewards/rejected": -27.15738868713379, "step": 1379 }, { "epoch": 0.18790849673202614, "grad_norm": 57.94400147881544, "learning_rate": 7.813459614987762e-07, "logits/chosen": 8.723336219787598, "logits/rejected": 9.397075653076172, "logps/chosen": -2.2538578510284424, "logps/rejected": -2.6886062622070312, "loss": 4.5505, "rewards/accuracies": 1.0, "rewards/chosen": -22.538578033447266, "rewards/margins": 4.347484111785889, "rewards/rejected": -26.886062622070312, "step": 1380 }, { "epoch": 0.1880446623093682, "grad_norm": 43.1633695441149, "learning_rate": 7.812885302646442e-07, "logits/chosen": 9.050418853759766, "logits/rejected": 8.292202949523926, "logps/chosen": -2.589184284210205, "logps/rejected": -2.6063199043273926, "loss": 3.8532, "rewards/accuracies": 0.5, "rewards/chosen": -25.891845703125, "rewards/margins": 0.1713552474975586, "rewards/rejected": -26.063199996948242, "step": 1381 }, { "epoch": 0.18818082788671023, "grad_norm": 46.86295105593622, "learning_rate": 7.81231012875217e-07, "logits/chosen": 9.805633544921875, "logits/rejected": 10.370157241821289, "logps/chosen": -2.9815802574157715, "logps/rejected": -3.1010775566101074, "loss": 3.62, "rewards/accuracies": 0.75, "rewards/chosen": -29.81580352783203, "rewards/margins": 1.1949715614318848, "rewards/rejected": -31.010772705078125, "step": 1382 }, { "epoch": 0.18831699346405228, "grad_norm": 42.515543002536404, "learning_rate": 7.811734093434911e-07, "logits/chosen": 7.625960350036621, "logits/rejected": 8.749202728271484, "logps/chosen": -2.6699259281158447, "logps/rejected": -2.9941227436065674, "loss": 4.5631, "rewards/accuracies": 0.75, "rewards/chosen": -26.69925880432129, "rewards/margins": 3.2419681549072266, "rewards/rejected": -29.941226959228516, "step": 1383 }, { "epoch": 0.18845315904139434, "grad_norm": 43.29156858891409, "learning_rate": 7.811157196824825e-07, "logits/chosen": 9.433557510375977, "logits/rejected": 11.061319351196289, "logps/chosen": -3.5109803676605225, "logps/rejected": -3.35960054397583, "loss": 4.043, "rewards/accuracies": 0.75, "rewards/chosen": -35.10980224609375, "rewards/margins": -1.513798713684082, "rewards/rejected": -33.596004486083984, "step": 1384 }, { "epoch": 0.18858932461873637, "grad_norm": 89.33560682386566, "learning_rate": 7.810579439052268e-07, "logits/chosen": 10.347248077392578, "logits/rejected": 9.100332260131836, "logps/chosen": -2.93642520904541, "logps/rejected": -2.7252237796783447, "loss": 4.387, "rewards/accuracies": 0.25, "rewards/chosen": -29.36425018310547, "rewards/margins": -2.1120128631591797, "rewards/rejected": -27.252239227294922, "step": 1385 }, { "epoch": 0.18872549019607843, "grad_norm": 42.67648024038107, "learning_rate": 7.810000820247788e-07, "logits/chosen": 9.748127937316895, "logits/rejected": 9.943740844726562, "logps/chosen": -2.678229808807373, "logps/rejected": -2.9406111240386963, "loss": 4.1484, "rewards/accuracies": 0.75, "rewards/chosen": -26.782299041748047, "rewards/margins": 2.6238112449645996, "rewards/rejected": -29.406110763549805, "step": 1386 }, { "epoch": 0.18886165577342048, "grad_norm": 43.234146212663084, "learning_rate": 7.809421340542128e-07, "logits/chosen": 9.822073936462402, "logits/rejected": 9.072529792785645, "logps/chosen": -2.958082675933838, "logps/rejected": -2.8875317573547363, "loss": 4.4801, "rewards/accuracies": 0.5, "rewards/chosen": -29.580827713012695, "rewards/margins": -0.7055096626281738, "rewards/rejected": -28.87531852722168, "step": 1387 }, { "epoch": 0.18899782135076254, "grad_norm": 48.195336490131965, "learning_rate": 7.808841000066229e-07, "logits/chosen": 9.823136329650879, "logits/rejected": 9.344535827636719, "logps/chosen": -2.9445207118988037, "logps/rejected": -3.1408636569976807, "loss": 4.2332, "rewards/accuracies": 0.75, "rewards/chosen": -29.445205688476562, "rewards/margins": 1.9634313583374023, "rewards/rejected": -31.40863800048828, "step": 1388 }, { "epoch": 0.18913398692810457, "grad_norm": 52.50782196209953, "learning_rate": 7.808259798951221e-07, "logits/chosen": 9.999702453613281, "logits/rejected": 9.678653717041016, "logps/chosen": -2.7815330028533936, "logps/rejected": -2.766636371612549, "loss": 3.5435, "rewards/accuracies": 0.0, "rewards/chosen": -27.815330505371094, "rewards/margins": -0.14896583557128906, "rewards/rejected": -27.666364669799805, "step": 1389 }, { "epoch": 0.18927015250544663, "grad_norm": 43.82292144746052, "learning_rate": 7.80767773732843e-07, "logits/chosen": 8.993041038513184, "logits/rejected": 8.925305366516113, "logps/chosen": -2.175720691680908, "logps/rejected": -2.465709686279297, "loss": 3.9442, "rewards/accuracies": 1.0, "rewards/chosen": -21.7572078704834, "rewards/margins": 2.899888515472412, "rewards/rejected": -24.65709686279297, "step": 1390 }, { "epoch": 0.18940631808278868, "grad_norm": 43.32116167938164, "learning_rate": 7.807094815329383e-07, "logits/chosen": 9.102913856506348, "logits/rejected": 9.494205474853516, "logps/chosen": -2.810126781463623, "logps/rejected": -2.688117504119873, "loss": 4.2184, "rewards/accuracies": 0.25, "rewards/chosen": -28.101266860961914, "rewards/margins": -1.2200918197631836, "rewards/rejected": -26.881175994873047, "step": 1391 }, { "epoch": 0.1895424836601307, "grad_norm": 45.96656869234648, "learning_rate": 7.80651103308579e-07, "logits/chosen": 9.692230224609375, "logits/rejected": 11.62797737121582, "logps/chosen": -2.9773666858673096, "logps/rejected": -3.2520313262939453, "loss": 4.2056, "rewards/accuracies": 0.5, "rewards/chosen": -29.773666381835938, "rewards/margins": 2.746647357940674, "rewards/rejected": -32.52031707763672, "step": 1392 }, { "epoch": 0.18967864923747277, "grad_norm": 59.96149892280143, "learning_rate": 7.805926390729566e-07, "logits/chosen": 10.099379539489746, "logits/rejected": 9.95226764678955, "logps/chosen": -3.2038283348083496, "logps/rejected": -3.240269899368286, "loss": 3.8774, "rewards/accuracies": 0.5, "rewards/chosen": -32.03828048706055, "rewards/margins": 0.36441707611083984, "rewards/rejected": -32.4026985168457, "step": 1393 }, { "epoch": 0.18981481481481483, "grad_norm": 48.31673133800967, "learning_rate": 7.805340888392813e-07, "logits/chosen": 7.861808776855469, "logits/rejected": 8.591614723205566, "logps/chosen": -2.6242589950561523, "logps/rejected": -2.8570899963378906, "loss": 4.1859, "rewards/accuracies": 0.75, "rewards/chosen": -26.242591857910156, "rewards/margins": 2.328310966491699, "rewards/rejected": -28.57090187072754, "step": 1394 }, { "epoch": 0.18995098039215685, "grad_norm": 41.87402849633946, "learning_rate": 7.804754526207831e-07, "logits/chosen": 7.5378923416137695, "logits/rejected": 10.257431030273438, "logps/chosen": -2.723419427871704, "logps/rejected": -2.966965675354004, "loss": 4.0505, "rewards/accuracies": 0.75, "rewards/chosen": -27.234195709228516, "rewards/margins": 2.43546199798584, "rewards/rejected": -29.66965675354004, "step": 1395 }, { "epoch": 0.1900871459694989, "grad_norm": 86.10279265399744, "learning_rate": 7.804167304307114e-07, "logits/chosen": 8.725882530212402, "logits/rejected": 8.09979248046875, "logps/chosen": -2.8583226203918457, "logps/rejected": -3.0016796588897705, "loss": 4.0532, "rewards/accuracies": 0.5, "rewards/chosen": -28.58322525024414, "rewards/margins": 1.4335699081420898, "rewards/rejected": -30.016796112060547, "step": 1396 }, { "epoch": 0.19022331154684097, "grad_norm": 46.90671181585447, "learning_rate": 7.803579222823348e-07, "logits/chosen": 9.399831771850586, "logits/rejected": 9.476181030273438, "logps/chosen": -2.688443660736084, "logps/rejected": -2.7199175357818604, "loss": 4.5068, "rewards/accuracies": 0.5, "rewards/chosen": -26.884437561035156, "rewards/margins": 0.31473779678344727, "rewards/rejected": -27.199174880981445, "step": 1397 }, { "epoch": 0.190359477124183, "grad_norm": 67.73947907843439, "learning_rate": 7.802990281889418e-07, "logits/chosen": 9.35826587677002, "logits/rejected": 7.349555969238281, "logps/chosen": -2.6064136028289795, "logps/rejected": -2.2576799392700195, "loss": 4.3551, "rewards/accuracies": 0.25, "rewards/chosen": -26.064136505126953, "rewards/margins": -3.487337112426758, "rewards/rejected": -22.576797485351562, "step": 1398 }, { "epoch": 0.19049564270152505, "grad_norm": 50.077934008171, "learning_rate": 7.802400481638396e-07, "logits/chosen": 9.501384735107422, "logits/rejected": 9.78630256652832, "logps/chosen": -2.9302515983581543, "logps/rejected": -2.721067428588867, "loss": 4.1413, "rewards/accuracies": 0.25, "rewards/chosen": -29.30251693725586, "rewards/margins": -2.091841220855713, "rewards/rejected": -27.210674285888672, "step": 1399 }, { "epoch": 0.1906318082788671, "grad_norm": 51.13738478553901, "learning_rate": 7.801809822203555e-07, "logits/chosen": 10.478143692016602, "logits/rejected": 8.527539253234863, "logps/chosen": -2.8020858764648438, "logps/rejected": -2.44986891746521, "loss": 4.8511, "rewards/accuracies": 0.25, "rewards/chosen": -28.020862579345703, "rewards/margins": -3.5221710205078125, "rewards/rejected": -24.498689651489258, "step": 1400 }, { "epoch": 0.19076797385620914, "grad_norm": 43.39662964301068, "learning_rate": 7.801218303718358e-07, "logits/chosen": 8.353782653808594, "logits/rejected": 9.88780403137207, "logps/chosen": -2.7663016319274902, "logps/rejected": -2.994288921356201, "loss": 3.7838, "rewards/accuracies": 0.75, "rewards/chosen": -27.66301727294922, "rewards/margins": 2.2798714637756348, "rewards/rejected": -29.942888259887695, "step": 1401 }, { "epoch": 0.1909041394335512, "grad_norm": 57.26502457660627, "learning_rate": 7.800625926316464e-07, "logits/chosen": 9.060561180114746, "logits/rejected": 9.371355056762695, "logps/chosen": -3.224858045578003, "logps/rejected": -3.3752622604370117, "loss": 4.1657, "rewards/accuracies": 0.75, "rewards/chosen": -32.24858093261719, "rewards/margins": 1.504042148590088, "rewards/rejected": -33.75262451171875, "step": 1402 }, { "epoch": 0.19104030501089325, "grad_norm": 43.97258058290861, "learning_rate": 7.800032690131727e-07, "logits/chosen": 7.542567729949951, "logits/rejected": 8.216724395751953, "logps/chosen": -2.5365030765533447, "logps/rejected": -2.915224075317383, "loss": 4.172, "rewards/accuracies": 0.75, "rewards/chosen": -25.365032196044922, "rewards/margins": 3.7872090339660645, "rewards/rejected": -29.152240753173828, "step": 1403 }, { "epoch": 0.19117647058823528, "grad_norm": 45.368781063616076, "learning_rate": 7.799438595298191e-07, "logits/chosen": 9.722738265991211, "logits/rejected": 10.005331039428711, "logps/chosen": -3.1675658226013184, "logps/rejected": -2.6878910064697266, "loss": 4.3691, "rewards/accuracies": 0.25, "rewards/chosen": -31.675655364990234, "rewards/margins": -4.7967448234558105, "rewards/rejected": -26.878910064697266, "step": 1404 }, { "epoch": 0.19131263616557734, "grad_norm": 43.66169404979031, "learning_rate": 7.798843641950098e-07, "logits/chosen": 9.980880737304688, "logits/rejected": 8.40610122680664, "logps/chosen": -2.744546413421631, "logps/rejected": -2.6146116256713867, "loss": 4.1704, "rewards/accuracies": 0.25, "rewards/chosen": -27.445463180541992, "rewards/margins": -1.299346923828125, "rewards/rejected": -26.146116256713867, "step": 1405 }, { "epoch": 0.1914488017429194, "grad_norm": 51.640331040210484, "learning_rate": 7.798247830221883e-07, "logits/chosen": 8.331908226013184, "logits/rejected": 8.517950057983398, "logps/chosen": -2.8054895401000977, "logps/rejected": -2.877342462539673, "loss": 3.8711, "rewards/accuracies": 0.5, "rewards/chosen": -28.054893493652344, "rewards/margins": 0.718529224395752, "rewards/rejected": -28.773422241210938, "step": 1406 }, { "epoch": 0.19158496732026145, "grad_norm": 41.2078213542268, "learning_rate": 7.797651160248173e-07, "logits/chosen": 5.775514125823975, "logits/rejected": 8.85209846496582, "logps/chosen": -2.249507427215576, "logps/rejected": -2.738999605178833, "loss": 3.6027, "rewards/accuracies": 1.0, "rewards/chosen": -22.495075225830078, "rewards/margins": 4.894920349121094, "rewards/rejected": -27.389995574951172, "step": 1407 }, { "epoch": 0.19172113289760348, "grad_norm": 42.66123039610185, "learning_rate": 7.797053632163793e-07, "logits/chosen": 9.147993087768555, "logits/rejected": 9.000064849853516, "logps/chosen": -2.7639076709747314, "logps/rejected": -2.9875082969665527, "loss": 3.8751, "rewards/accuracies": 0.75, "rewards/chosen": -27.639076232910156, "rewards/margins": 2.236003875732422, "rewards/rejected": -29.875080108642578, "step": 1408 }, { "epoch": 0.19185729847494554, "grad_norm": 57.01921316372514, "learning_rate": 7.796455246103757e-07, "logits/chosen": 9.547250747680664, "logits/rejected": 8.21285629272461, "logps/chosen": -2.775984764099121, "logps/rejected": -2.7259721755981445, "loss": 4.5345, "rewards/accuracies": 0.25, "rewards/chosen": -27.75984764099121, "rewards/margins": -0.500126838684082, "rewards/rejected": -27.259721755981445, "step": 1409 }, { "epoch": 0.1919934640522876, "grad_norm": 48.92675108255035, "learning_rate": 7.795856002203278e-07, "logits/chosen": 7.852705001831055, "logits/rejected": 10.022195816040039, "logps/chosen": -2.7554802894592285, "logps/rejected": -3.042891502380371, "loss": 3.8342, "rewards/accuracies": 0.5, "rewards/chosen": -27.5548038482666, "rewards/margins": 2.8741111755371094, "rewards/rejected": -30.42891502380371, "step": 1410 }, { "epoch": 0.19212962962962962, "grad_norm": 45.25088483410498, "learning_rate": 7.795255900597757e-07, "logits/chosen": 8.349921226501465, "logits/rejected": 9.276391983032227, "logps/chosen": -2.9113292694091797, "logps/rejected": -3.197622299194336, "loss": 4.2461, "rewards/accuracies": 0.75, "rewards/chosen": -29.113292694091797, "rewards/margins": 2.8629302978515625, "rewards/rejected": -31.97622299194336, "step": 1411 }, { "epoch": 0.19226579520697168, "grad_norm": 44.54428510847966, "learning_rate": 7.794654941422793e-07, "logits/chosen": 9.373472213745117, "logits/rejected": 7.656415939331055, "logps/chosen": -3.138986110687256, "logps/rejected": -2.7520198822021484, "loss": 3.885, "rewards/accuracies": 0.25, "rewards/chosen": -31.389860153198242, "rewards/margins": -3.869661808013916, "rewards/rejected": -27.520198822021484, "step": 1412 }, { "epoch": 0.19240196078431374, "grad_norm": 59.23068469134054, "learning_rate": 7.79405312481418e-07, "logits/chosen": 8.266092300415039, "logits/rejected": 10.065275192260742, "logps/chosen": -2.7595114707946777, "logps/rejected": -2.912015914916992, "loss": 3.9647, "rewards/accuracies": 0.5, "rewards/chosen": -27.595115661621094, "rewards/margins": 1.5250449180603027, "rewards/rejected": -29.120159149169922, "step": 1413 }, { "epoch": 0.19253812636165576, "grad_norm": 45.51119209598674, "learning_rate": 7.793450450907899e-07, "logits/chosen": 6.812744140625, "logits/rejected": 8.779069900512695, "logps/chosen": -2.9503860473632812, "logps/rejected": -3.1422550678253174, "loss": 4.5914, "rewards/accuracies": 0.75, "rewards/chosen": -29.503860473632812, "rewards/margins": 1.918691635131836, "rewards/rejected": -31.42255210876465, "step": 1414 }, { "epoch": 0.19267429193899782, "grad_norm": 50.551817551047776, "learning_rate": 7.792846919840134e-07, "logits/chosen": 7.555563449859619, "logits/rejected": 9.164955139160156, "logps/chosen": -2.7411985397338867, "logps/rejected": -3.1020851135253906, "loss": 4.1653, "rewards/accuracies": 1.0, "rewards/chosen": -27.411983489990234, "rewards/margins": 3.608867645263672, "rewards/rejected": -31.020851135253906, "step": 1415 }, { "epoch": 0.19281045751633988, "grad_norm": 43.24500353421959, "learning_rate": 7.792242531747254e-07, "logits/chosen": 9.10142993927002, "logits/rejected": 9.820465087890625, "logps/chosen": -2.540660858154297, "logps/rejected": -3.0907201766967773, "loss": 4.3242, "rewards/accuracies": 1.0, "rewards/chosen": -25.40660858154297, "rewards/margins": 5.5005950927734375, "rewards/rejected": -30.907203674316406, "step": 1416 }, { "epoch": 0.1929466230936819, "grad_norm": 46.601581526359254, "learning_rate": 7.791637286765827e-07, "logits/chosen": 9.472031593322754, "logits/rejected": 8.414068222045898, "logps/chosen": -2.936103343963623, "logps/rejected": -3.0606439113616943, "loss": 4.4797, "rewards/accuracies": 0.5, "rewards/chosen": -29.361034393310547, "rewards/margins": 1.24540376663208, "rewards/rejected": -30.60643768310547, "step": 1417 }, { "epoch": 0.19308278867102396, "grad_norm": 47.37432762713803, "learning_rate": 7.791031185032613e-07, "logits/chosen": 6.164275169372559, "logits/rejected": 9.404150009155273, "logps/chosen": -2.2528951168060303, "logps/rejected": -2.973322868347168, "loss": 4.513, "rewards/accuracies": 1.0, "rewards/chosen": -22.528949737548828, "rewards/margins": 7.204277515411377, "rewards/rejected": -29.73322868347168, "step": 1418 }, { "epoch": 0.19321895424836602, "grad_norm": 49.44098756069984, "learning_rate": 7.790424226684566e-07, "logits/chosen": 8.997503280639648, "logits/rejected": 9.264274597167969, "logps/chosen": -2.8417553901672363, "logps/rejected": -3.1748342514038086, "loss": 4.8781, "rewards/accuracies": 0.75, "rewards/chosen": -28.417552947998047, "rewards/margins": 3.3307886123657227, "rewards/rejected": -31.748342514038086, "step": 1419 }, { "epoch": 0.19335511982570805, "grad_norm": 44.34697662722245, "learning_rate": 7.789816411858834e-07, "logits/chosen": 8.421128273010254, "logits/rejected": 9.701765060424805, "logps/chosen": -2.864990472793579, "logps/rejected": -2.8928678035736084, "loss": 4.6566, "rewards/accuracies": 0.5, "rewards/chosen": -28.64990234375, "rewards/margins": 0.27877378463745117, "rewards/rejected": -28.92867660522461, "step": 1420 }, { "epoch": 0.1934912854030501, "grad_norm": 50.95676902140057, "learning_rate": 7.789207740692756e-07, "logits/chosen": 10.414976119995117, "logits/rejected": 10.237785339355469, "logps/chosen": -2.9867801666259766, "logps/rejected": -3.162876605987549, "loss": 3.9149, "rewards/accuracies": 0.75, "rewards/chosen": -29.8678035736084, "rewards/margins": 1.7609639167785645, "rewards/rejected": -31.628767013549805, "step": 1421 }, { "epoch": 0.19362745098039216, "grad_norm": 48.72282203929905, "learning_rate": 7.788598213323868e-07, "logits/chosen": 9.797548294067383, "logits/rejected": 9.355550765991211, "logps/chosen": -2.855088710784912, "logps/rejected": -2.7904438972473145, "loss": 4.5341, "rewards/accuracies": 0.5, "rewards/chosen": -28.550886154174805, "rewards/margins": -0.6464476585388184, "rewards/rejected": -27.904438018798828, "step": 1422 }, { "epoch": 0.1937636165577342, "grad_norm": 48.75581261549794, "learning_rate": 7.787987829889894e-07, "logits/chosen": 8.961009979248047, "logits/rejected": 8.876577377319336, "logps/chosen": -2.9767961502075195, "logps/rejected": -3.150703191757202, "loss": 4.1813, "rewards/accuracies": 0.75, "rewards/chosen": -29.767963409423828, "rewards/margins": 1.739067554473877, "rewards/rejected": -31.50703239440918, "step": 1423 }, { "epoch": 0.19389978213507625, "grad_norm": 45.35547670841405, "learning_rate": 7.787376590528761e-07, "logits/chosen": 8.700684547424316, "logits/rejected": 8.2442626953125, "logps/chosen": -2.8773627281188965, "logps/rejected": -2.9830307960510254, "loss": 3.7012, "rewards/accuracies": 0.75, "rewards/chosen": -28.77362823486328, "rewards/margins": 1.0566797256469727, "rewards/rejected": -29.830307006835938, "step": 1424 }, { "epoch": 0.1940359477124183, "grad_norm": 46.97717698486755, "learning_rate": 7.786764495378578e-07, "logits/chosen": 9.959112167358398, "logits/rejected": 9.080175399780273, "logps/chosen": -3.3970131874084473, "logps/rejected": -3.191220760345459, "loss": 4.34, "rewards/accuracies": 0.25, "rewards/chosen": -33.970130920410156, "rewards/margins": -2.057925224304199, "rewards/rejected": -31.91220474243164, "step": 1425 }, { "epoch": 0.19417211328976036, "grad_norm": 51.71489023390766, "learning_rate": 7.786151544577658e-07, "logits/chosen": 7.997973442077637, "logits/rejected": 8.814556121826172, "logps/chosen": -2.39862060546875, "logps/rejected": -2.919034957885742, "loss": 4.0408, "rewards/accuracies": 1.0, "rewards/chosen": -23.986207962036133, "rewards/margins": 5.2041401863098145, "rewards/rejected": -29.190349578857422, "step": 1426 }, { "epoch": 0.1943082788671024, "grad_norm": 43.030763491139595, "learning_rate": 7.785537738264499e-07, "logits/chosen": 9.043115615844727, "logits/rejected": 9.913169860839844, "logps/chosen": -2.8094048500061035, "logps/rejected": -3.3500022888183594, "loss": 3.8035, "rewards/accuracies": 1.0, "rewards/chosen": -28.09404945373535, "rewards/margins": 5.405973434448242, "rewards/rejected": -33.500022888183594, "step": 1427 }, { "epoch": 0.19444444444444445, "grad_norm": 40.44446362460198, "learning_rate": 7.784923076577796e-07, "logits/chosen": 9.05484390258789, "logits/rejected": 10.06836223602295, "logps/chosen": -2.958004951477051, "logps/rejected": -3.2055649757385254, "loss": 3.7897, "rewards/accuracies": 0.75, "rewards/chosen": -29.580047607421875, "rewards/margins": 2.4756016731262207, "rewards/rejected": -32.05564880371094, "step": 1428 }, { "epoch": 0.1945806100217865, "grad_norm": 41.60631410488037, "learning_rate": 7.784307559656438e-07, "logits/chosen": 7.878442764282227, "logits/rejected": 9.904341697692871, "logps/chosen": -2.6093902587890625, "logps/rejected": -3.3775947093963623, "loss": 3.9373, "rewards/accuracies": 1.0, "rewards/chosen": -26.093902587890625, "rewards/margins": 7.68204402923584, "rewards/rejected": -33.77594757080078, "step": 1429 }, { "epoch": 0.19471677559912853, "grad_norm": 59.29624055672391, "learning_rate": 7.783691187639505e-07, "logits/chosen": 9.264883995056152, "logits/rejected": 9.822227478027344, "logps/chosen": -3.062620162963867, "logps/rejected": -2.982074737548828, "loss": 4.4761, "rewards/accuracies": 0.5, "rewards/chosen": -30.626201629638672, "rewards/margins": -0.8054556846618652, "rewards/rejected": -29.82074737548828, "step": 1430 }, { "epoch": 0.1948529411764706, "grad_norm": 48.16572364359254, "learning_rate": 7.783073960666273e-07, "logits/chosen": 8.536442756652832, "logits/rejected": 9.14830207824707, "logps/chosen": -2.68721342086792, "logps/rejected": -2.873544454574585, "loss": 4.6376, "rewards/accuracies": 0.75, "rewards/chosen": -26.872133255004883, "rewards/margins": 1.8633127212524414, "rewards/rejected": -28.73544692993164, "step": 1431 }, { "epoch": 0.19498910675381265, "grad_norm": 46.23381900934634, "learning_rate": 7.782455878876207e-07, "logits/chosen": 6.993161201477051, "logits/rejected": 8.413741111755371, "logps/chosen": -2.7440524101257324, "logps/rejected": -2.9516468048095703, "loss": 4.0178, "rewards/accuracies": 0.75, "rewards/chosen": -27.440521240234375, "rewards/margins": 2.07594633102417, "rewards/rejected": -29.516469955444336, "step": 1432 }, { "epoch": 0.19512527233115468, "grad_norm": 39.336063918064035, "learning_rate": 7.78183694240897e-07, "logits/chosen": 9.581241607666016, "logits/rejected": 8.131770133972168, "logps/chosen": -3.1709938049316406, "logps/rejected": -2.989457130432129, "loss": 4.0183, "rewards/accuracies": 0.5, "rewards/chosen": -31.709938049316406, "rewards/margins": -1.8153648376464844, "rewards/rejected": -29.894573211669922, "step": 1433 }, { "epoch": 0.19526143790849673, "grad_norm": 45.94413141152238, "learning_rate": 7.781217151404414e-07, "logits/chosen": 9.17176342010498, "logits/rejected": 8.933035850524902, "logps/chosen": -2.8165507316589355, "logps/rejected": -2.6400134563446045, "loss": 4.3081, "rewards/accuracies": 0.25, "rewards/chosen": -28.165508270263672, "rewards/margins": -1.7653746604919434, "rewards/rejected": -26.400135040283203, "step": 1434 }, { "epoch": 0.1953976034858388, "grad_norm": 43.69432110393918, "learning_rate": 7.780596506002587e-07, "logits/chosen": 9.745745658874512, "logits/rejected": 9.013357162475586, "logps/chosen": -3.0713653564453125, "logps/rejected": -2.9104180335998535, "loss": 4.019, "rewards/accuracies": 0.5, "rewards/chosen": -30.713651657104492, "rewards/margins": -1.6094717979431152, "rewards/rejected": -29.10418128967285, "step": 1435 }, { "epoch": 0.19553376906318082, "grad_norm": 40.522644038097496, "learning_rate": 7.779975006343729e-07, "logits/chosen": 9.826494216918945, "logits/rejected": 7.93301248550415, "logps/chosen": -3.261568069458008, "logps/rejected": -2.9983692169189453, "loss": 4.2288, "rewards/accuracies": 0.25, "rewards/chosen": -32.615684509277344, "rewards/margins": -2.631990909576416, "rewards/rejected": -29.983692169189453, "step": 1436 }, { "epoch": 0.19566993464052287, "grad_norm": 43.052378640835315, "learning_rate": 7.779352652568272e-07, "logits/chosen": 9.551679611206055, "logits/rejected": 9.08378791809082, "logps/chosen": -3.2459750175476074, "logps/rejected": -3.295665979385376, "loss": 3.8527, "rewards/accuracies": 0.5, "rewards/chosen": -32.459747314453125, "rewards/margins": 0.49690961837768555, "rewards/rejected": -32.95665740966797, "step": 1437 }, { "epoch": 0.19580610021786493, "grad_norm": 45.690385107351645, "learning_rate": 7.778729444816843e-07, "logits/chosen": 9.733203887939453, "logits/rejected": 9.3901948928833, "logps/chosen": -2.7972002029418945, "logps/rejected": -2.853917360305786, "loss": 4.5375, "rewards/accuracies": 0.75, "rewards/chosen": -27.972002029418945, "rewards/margins": 0.5671701431274414, "rewards/rejected": -28.539173126220703, "step": 1438 }, { "epoch": 0.19594226579520696, "grad_norm": 39.87620523223653, "learning_rate": 7.778105383230262e-07, "logits/chosen": 8.288427352905273, "logits/rejected": 8.5509033203125, "logps/chosen": -2.338588237762451, "logps/rejected": -2.941868305206299, "loss": 3.6721, "rewards/accuracies": 1.0, "rewards/chosen": -23.38588523864746, "rewards/margins": 6.032797813415527, "rewards/rejected": -29.418682098388672, "step": 1439 }, { "epoch": 0.19607843137254902, "grad_norm": 47.55649310087179, "learning_rate": 7.777480467949538e-07, "logits/chosen": 8.68994426727295, "logits/rejected": 8.449972152709961, "logps/chosen": -3.1329617500305176, "logps/rejected": -3.173642635345459, "loss": 4.3603, "rewards/accuracies": 0.5, "rewards/chosen": -31.329614639282227, "rewards/margins": 0.40680980682373047, "rewards/rejected": -31.736425399780273, "step": 1440 }, { "epoch": 0.19621459694989107, "grad_norm": 40.4546902191273, "learning_rate": 7.776854699115878e-07, "logits/chosen": 9.111116409301758, "logits/rejected": 9.26386833190918, "logps/chosen": -3.2930614948272705, "logps/rejected": -3.2770516872406006, "loss": 4.179, "rewards/accuracies": 0.25, "rewards/chosen": -32.93061447143555, "rewards/margins": -0.16009855270385742, "rewards/rejected": -32.77051544189453, "step": 1441 }, { "epoch": 0.1963507625272331, "grad_norm": 44.28429564543883, "learning_rate": 7.776228076870678e-07, "logits/chosen": 8.367546081542969, "logits/rejected": 8.709491729736328, "logps/chosen": -2.737830638885498, "logps/rejected": -2.9389190673828125, "loss": 4.4171, "rewards/accuracies": 0.75, "rewards/chosen": -27.378307342529297, "rewards/margins": 2.0108838081359863, "rewards/rejected": -29.389190673828125, "step": 1442 }, { "epoch": 0.19648692810457516, "grad_norm": 42.771777389611394, "learning_rate": 7.775600601355532e-07, "logits/chosen": 8.988128662109375, "logits/rejected": 8.389077186584473, "logps/chosen": -3.0894618034362793, "logps/rejected": -2.8165435791015625, "loss": 3.9168, "rewards/accuracies": 0.5, "rewards/chosen": -30.894615173339844, "rewards/margins": -2.7291817665100098, "rewards/rejected": -28.165435791015625, "step": 1443 }, { "epoch": 0.19662309368191722, "grad_norm": 77.25804454818537, "learning_rate": 7.774972272712217e-07, "logits/chosen": 8.027791976928711, "logits/rejected": 10.037418365478516, "logps/chosen": -2.497880220413208, "logps/rejected": -3.1453304290771484, "loss": 4.5425, "rewards/accuracies": 1.0, "rewards/chosen": -24.978803634643555, "rewards/margins": 6.474502086639404, "rewards/rejected": -31.453304290771484, "step": 1444 }, { "epoch": 0.19675925925925927, "grad_norm": 57.20208984100321, "learning_rate": 7.774343091082716e-07, "logits/chosen": 8.530563354492188, "logits/rejected": 7.857172966003418, "logps/chosen": -2.5923843383789062, "logps/rejected": -2.409501314163208, "loss": 4.6043, "rewards/accuracies": 0.25, "rewards/chosen": -25.923843383789062, "rewards/margins": -1.8288304805755615, "rewards/rejected": -24.095012664794922, "step": 1445 }, { "epoch": 0.1968954248366013, "grad_norm": 70.80688067676418, "learning_rate": 7.773713056609192e-07, "logits/chosen": 7.954108238220215, "logits/rejected": 7.232739448547363, "logps/chosen": -2.4978702068328857, "logps/rejected": -2.4132771492004395, "loss": 4.3674, "rewards/accuracies": 0.5, "rewards/chosen": -24.978702545166016, "rewards/margins": -0.8459315299987793, "rewards/rejected": -24.132770538330078, "step": 1446 }, { "epoch": 0.19703159041394336, "grad_norm": 43.24603201119185, "learning_rate": 7.773082169434011e-07, "logits/chosen": 9.084012985229492, "logits/rejected": 10.052854537963867, "logps/chosen": -3.351374626159668, "logps/rejected": -3.625396251678467, "loss": 3.8958, "rewards/accuracies": 0.75, "rewards/chosen": -33.51374435424805, "rewards/margins": 2.740217685699463, "rewards/rejected": -36.25395965576172, "step": 1447 }, { "epoch": 0.19716775599128541, "grad_norm": 45.952561409811935, "learning_rate": 7.772450429699723e-07, "logits/chosen": 6.113255977630615, "logits/rejected": 8.147967338562012, "logps/chosen": -2.389207124710083, "logps/rejected": -2.738765239715576, "loss": 3.9888, "rewards/accuracies": 0.75, "rewards/chosen": -23.892070770263672, "rewards/margins": 3.4955806732177734, "rewards/rejected": -27.387651443481445, "step": 1448 }, { "epoch": 0.19730392156862744, "grad_norm": 60.318455531326165, "learning_rate": 7.771817837549079e-07, "logits/chosen": 7.02171516418457, "logits/rejected": 7.643853664398193, "logps/chosen": -2.8811988830566406, "logps/rejected": -2.8630127906799316, "loss": 4.2589, "rewards/accuracies": 0.25, "rewards/chosen": -28.811988830566406, "rewards/margins": -0.18185997009277344, "rewards/rejected": -28.630130767822266, "step": 1449 }, { "epoch": 0.1974400871459695, "grad_norm": 46.39365157890924, "learning_rate": 7.771184393125016e-07, "logits/chosen": 8.487655639648438, "logits/rejected": 8.802398681640625, "logps/chosen": -2.5805859565734863, "logps/rejected": -2.7143349647521973, "loss": 4.0543, "rewards/accuracies": 0.75, "rewards/chosen": -25.80586051940918, "rewards/margins": 1.337489128112793, "rewards/rejected": -27.143348693847656, "step": 1450 }, { "epoch": 0.19757625272331156, "grad_norm": 43.85084677734427, "learning_rate": 7.770550096570665e-07, "logits/chosen": 8.25804328918457, "logits/rejected": 10.141490936279297, "logps/chosen": -2.6751809120178223, "logps/rejected": -3.142080783843994, "loss": 4.026, "rewards/accuracies": 1.0, "rewards/chosen": -26.751811981201172, "rewards/margins": 4.668998718261719, "rewards/rejected": -31.420808792114258, "step": 1451 }, { "epoch": 0.1977124183006536, "grad_norm": 42.989356771543, "learning_rate": 7.769914948029355e-07, "logits/chosen": 10.25372314453125, "logits/rejected": 10.622417449951172, "logps/chosen": -3.0612082481384277, "logps/rejected": -3.542187452316284, "loss": 4.1089, "rewards/accuracies": 0.75, "rewards/chosen": -30.612083435058594, "rewards/margins": 4.809792518615723, "rewards/rejected": -35.421875, "step": 1452 }, { "epoch": 0.19784858387799564, "grad_norm": 47.26754687735899, "learning_rate": 7.769278947644598e-07, "logits/chosen": 9.04432487487793, "logits/rejected": 9.647848129272461, "logps/chosen": -2.8638484477996826, "logps/rejected": -3.035745143890381, "loss": 4.1646, "rewards/accuracies": 0.75, "rewards/chosen": -28.638484954833984, "rewards/margins": 1.718968391418457, "rewards/rejected": -30.357452392578125, "step": 1453 }, { "epoch": 0.1979847494553377, "grad_norm": 48.68988744609558, "learning_rate": 7.768642095560105e-07, "logits/chosen": 9.311052322387695, "logits/rejected": 10.937128067016602, "logps/chosen": -2.9038901329040527, "logps/rejected": -3.516470432281494, "loss": 3.9562, "rewards/accuracies": 0.75, "rewards/chosen": -29.038902282714844, "rewards/margins": 6.125802516937256, "rewards/rejected": -35.164703369140625, "step": 1454 }, { "epoch": 0.19812091503267973, "grad_norm": 52.039864553064554, "learning_rate": 7.76800439191978e-07, "logits/chosen": 8.343742370605469, "logits/rejected": 6.983658790588379, "logps/chosen": -2.8168134689331055, "logps/rejected": -2.693687915802002, "loss": 3.7525, "rewards/accuracies": 0.5, "rewards/chosen": -28.168136596679688, "rewards/margins": -1.2312555313110352, "rewards/rejected": -26.936880111694336, "step": 1455 }, { "epoch": 0.19825708061002179, "grad_norm": 44.11655630734675, "learning_rate": 7.767365836867716e-07, "logits/chosen": 10.253247261047363, "logits/rejected": 9.816324234008789, "logps/chosen": -3.3441667556762695, "logps/rejected": -3.210970401763916, "loss": 4.074, "rewards/accuracies": 0.5, "rewards/chosen": -33.44166564941406, "rewards/margins": -1.3319659233093262, "rewards/rejected": -32.10969924926758, "step": 1456 }, { "epoch": 0.19839324618736384, "grad_norm": 50.99912315276199, "learning_rate": 7.7667264305482e-07, "logits/chosen": 9.019248962402344, "logits/rejected": 10.634315490722656, "logps/chosen": -2.91058087348938, "logps/rejected": -3.4656734466552734, "loss": 4.1762, "rewards/accuracies": 1.0, "rewards/chosen": -29.105810165405273, "rewards/margins": 5.5509257316589355, "rewards/rejected": -34.65673828125, "step": 1457 }, { "epoch": 0.19852941176470587, "grad_norm": 45.492263775455754, "learning_rate": 7.766086173105709e-07, "logits/chosen": 10.009572982788086, "logits/rejected": 9.080705642700195, "logps/chosen": -3.1363003253936768, "logps/rejected": -2.918297529220581, "loss": 3.9937, "rewards/accuracies": 0.5, "rewards/chosen": -31.363004684448242, "rewards/margins": -2.180027484893799, "rewards/rejected": -29.18297576904297, "step": 1458 }, { "epoch": 0.19866557734204793, "grad_norm": 47.81175243467854, "learning_rate": 7.765445064684918e-07, "logits/chosen": 9.030842781066895, "logits/rejected": 9.275713920593262, "logps/chosen": -2.9299583435058594, "logps/rejected": -3.2637462615966797, "loss": 4.3362, "rewards/accuracies": 0.75, "rewards/chosen": -29.299583435058594, "rewards/margins": 3.3378777503967285, "rewards/rejected": -32.63745880126953, "step": 1459 }, { "epoch": 0.19880174291938998, "grad_norm": 40.77703609649417, "learning_rate": 7.764803105430689e-07, "logits/chosen": 9.318174362182617, "logits/rejected": 10.547752380371094, "logps/chosen": -3.044933319091797, "logps/rejected": -3.354402780532837, "loss": 4.0165, "rewards/accuracies": 0.75, "rewards/chosen": -30.44933319091797, "rewards/margins": 3.0946946144104004, "rewards/rejected": -33.544029235839844, "step": 1460 }, { "epoch": 0.198937908496732, "grad_norm": 43.334656522159605, "learning_rate": 7.764160295488078e-07, "logits/chosen": 8.93509578704834, "logits/rejected": 9.578092575073242, "logps/chosen": -2.439145803451538, "logps/rejected": -2.3536200523376465, "loss": 3.8556, "rewards/accuracies": 0.5, "rewards/chosen": -24.39145851135254, "rewards/margins": -0.8552570343017578, "rewards/rejected": -23.53620147705078, "step": 1461 }, { "epoch": 0.19907407407407407, "grad_norm": 45.1463940113554, "learning_rate": 7.763516635002333e-07, "logits/chosen": 10.059344291687012, "logits/rejected": 10.981958389282227, "logps/chosen": -3.1329400539398193, "logps/rejected": -3.235844135284424, "loss": 4.4581, "rewards/accuracies": 0.75, "rewards/chosen": -31.32939910888672, "rewards/margins": 1.0290422439575195, "rewards/rejected": -32.35844039916992, "step": 1462 }, { "epoch": 0.19921023965141613, "grad_norm": 44.784076375327615, "learning_rate": 7.762872124118895e-07, "logits/chosen": 10.884218215942383, "logits/rejected": 11.401772499084473, "logps/chosen": -3.4508912563323975, "logps/rejected": -3.297713041305542, "loss": 3.7327, "rewards/accuracies": 0.5, "rewards/chosen": -34.5089111328125, "rewards/margins": -1.531783103942871, "rewards/rejected": -32.97713088989258, "step": 1463 }, { "epoch": 0.19934640522875818, "grad_norm": 43.21508006040941, "learning_rate": 7.762226762983397e-07, "logits/chosen": 9.239043235778809, "logits/rejected": 9.939115524291992, "logps/chosen": -2.8668112754821777, "logps/rejected": -3.226020336151123, "loss": 4.1144, "rewards/accuracies": 0.75, "rewards/chosen": -28.668113708496094, "rewards/margins": 3.5920910835266113, "rewards/rejected": -32.26020050048828, "step": 1464 }, { "epoch": 0.1994825708061002, "grad_norm": 39.725360147632486, "learning_rate": 7.761580551741662e-07, "logits/chosen": 10.668901443481445, "logits/rejected": 10.35605525970459, "logps/chosen": -3.0153207778930664, "logps/rejected": -3.1533613204956055, "loss": 4.2794, "rewards/accuracies": 0.75, "rewards/chosen": -30.15320587158203, "rewards/margins": 1.3804078102111816, "rewards/rejected": -31.533613204956055, "step": 1465 }, { "epoch": 0.19961873638344227, "grad_norm": 46.609045029107236, "learning_rate": 7.760933490539708e-07, "logits/chosen": 7.528223991394043, "logits/rejected": 7.680734634399414, "logps/chosen": -2.875335454940796, "logps/rejected": -2.792775869369507, "loss": 4.591, "rewards/accuracies": 0.25, "rewards/chosen": -28.753353118896484, "rewards/margins": -0.825594425201416, "rewards/rejected": -27.92776107788086, "step": 1466 }, { "epoch": 0.19975490196078433, "grad_norm": 40.14551819231499, "learning_rate": 7.760285579523744e-07, "logits/chosen": 10.192405700683594, "logits/rejected": 10.883947372436523, "logps/chosen": -2.923978328704834, "logps/rejected": -3.218256950378418, "loss": 3.97, "rewards/accuracies": 0.75, "rewards/chosen": -29.239782333374023, "rewards/margins": 2.942786693572998, "rewards/rejected": -32.18256759643555, "step": 1467 }, { "epoch": 0.19989106753812635, "grad_norm": 45.95643975375287, "learning_rate": 7.759636818840171e-07, "logits/chosen": 9.504526138305664, "logits/rejected": 7.696004390716553, "logps/chosen": -2.5552122592926025, "logps/rejected": -2.275801181793213, "loss": 4.6576, "rewards/accuracies": 0.25, "rewards/chosen": -25.552120208740234, "rewards/margins": -2.7941088676452637, "rewards/rejected": -22.758010864257812, "step": 1468 }, { "epoch": 0.2000272331154684, "grad_norm": 45.239857144551095, "learning_rate": 7.758987208635581e-07, "logits/chosen": 10.419322967529297, "logits/rejected": 10.270145416259766, "logps/chosen": -2.9831349849700928, "logps/rejected": -3.156280517578125, "loss": 4.1811, "rewards/accuracies": 0.75, "rewards/chosen": -29.831350326538086, "rewards/margins": 1.731454849243164, "rewards/rejected": -31.56280517578125, "step": 1469 }, { "epoch": 0.20016339869281047, "grad_norm": 41.82838506372563, "learning_rate": 7.758336749056757e-07, "logits/chosen": 8.79752254486084, "logits/rejected": 9.68968391418457, "logps/chosen": -2.624166488647461, "logps/rejected": -3.1322267055511475, "loss": 3.9202, "rewards/accuracies": 1.0, "rewards/chosen": -26.241666793823242, "rewards/margins": 5.080600738525391, "rewards/rejected": -31.322269439697266, "step": 1470 }, { "epoch": 0.2002995642701525, "grad_norm": 45.33861302907204, "learning_rate": 7.75768544025068e-07, "logits/chosen": 9.996610641479492, "logits/rejected": 10.333464622497559, "logps/chosen": -3.2554378509521484, "logps/rejected": -3.5777974128723145, "loss": 3.9567, "rewards/accuracies": 0.75, "rewards/chosen": -32.554378509521484, "rewards/margins": 3.2235970497131348, "rewards/rejected": -35.777976989746094, "step": 1471 }, { "epoch": 0.20043572984749455, "grad_norm": 46.53421934005612, "learning_rate": 7.757033282364517e-07, "logits/chosen": 9.273542404174805, "logits/rejected": 11.05586051940918, "logps/chosen": -3.009902238845825, "logps/rejected": -3.281428813934326, "loss": 4.0333, "rewards/accuracies": 0.75, "rewards/chosen": -30.099023818969727, "rewards/margins": 2.7152652740478516, "rewards/rejected": -32.81428909301758, "step": 1472 }, { "epoch": 0.2005718954248366, "grad_norm": 42.95871592635851, "learning_rate": 7.756380275545627e-07, "logits/chosen": 9.2052640914917, "logits/rejected": 10.910459518432617, "logps/chosen": -2.6476666927337646, "logps/rejected": -2.895676612854004, "loss": 4.21, "rewards/accuracies": 0.75, "rewards/chosen": -26.476667404174805, "rewards/margins": 2.480097770690918, "rewards/rejected": -28.956764221191406, "step": 1473 }, { "epoch": 0.20070806100217864, "grad_norm": 42.51174357384946, "learning_rate": 7.755726419941563e-07, "logits/chosen": 10.721439361572266, "logits/rejected": 11.10346794128418, "logps/chosen": -3.1327004432678223, "logps/rejected": -3.6346869468688965, "loss": 4.0112, "rewards/accuracies": 1.0, "rewards/chosen": -31.327003479003906, "rewards/margins": 5.019866466522217, "rewards/rejected": -36.34687042236328, "step": 1474 }, { "epoch": 0.2008442265795207, "grad_norm": 45.311632297484046, "learning_rate": 7.755071715700069e-07, "logits/chosen": 9.715666770935059, "logits/rejected": 9.806690216064453, "logps/chosen": -3.0859737396240234, "logps/rejected": -2.8672163486480713, "loss": 4.1732, "rewards/accuracies": 0.25, "rewards/chosen": -30.859737396240234, "rewards/margins": -2.1875743865966797, "rewards/rejected": -28.672163009643555, "step": 1475 }, { "epoch": 0.20098039215686275, "grad_norm": 41.09127228836207, "learning_rate": 7.754416162969081e-07, "logits/chosen": 9.042545318603516, "logits/rejected": 9.742539405822754, "logps/chosen": -2.8910083770751953, "logps/rejected": -2.807126045227051, "loss": 4.6364, "rewards/accuracies": 0.25, "rewards/chosen": -28.910085678100586, "rewards/margins": -0.8388237953186035, "rewards/rejected": -28.07126235961914, "step": 1476 }, { "epoch": 0.20111655773420478, "grad_norm": 42.19408866965922, "learning_rate": 7.753759761896727e-07, "logits/chosen": 9.99551010131836, "logits/rejected": 11.406013488769531, "logps/chosen": -3.2796497344970703, "logps/rejected": -3.0559041500091553, "loss": 3.7981, "rewards/accuracies": 0.25, "rewards/chosen": -32.79649353027344, "rewards/margins": -2.237455368041992, "rewards/rejected": -30.559040069580078, "step": 1477 }, { "epoch": 0.20125272331154684, "grad_norm": 39.06322188870667, "learning_rate": 7.753102512631326e-07, "logits/chosen": 10.866802215576172, "logits/rejected": 10.12379264831543, "logps/chosen": -3.2128219604492188, "logps/rejected": -3.215808629989624, "loss": 3.8709, "rewards/accuracies": 0.5, "rewards/chosen": -32.12821960449219, "rewards/margins": 0.029864788055419922, "rewards/rejected": -32.158084869384766, "step": 1478 }, { "epoch": 0.2013888888888889, "grad_norm": 43.64337166489494, "learning_rate": 7.75244441532139e-07, "logits/chosen": 9.569454193115234, "logits/rejected": 10.166397094726562, "logps/chosen": -2.651419162750244, "logps/rejected": -2.6985273361206055, "loss": 4.5647, "rewards/accuracies": 0.5, "rewards/chosen": -26.514190673828125, "rewards/margins": 0.4710826873779297, "rewards/rejected": -26.985275268554688, "step": 1479 }, { "epoch": 0.20152505446623092, "grad_norm": 39.83163139110385, "learning_rate": 7.751785470115619e-07, "logits/chosen": 10.676182746887207, "logits/rejected": 10.966014862060547, "logps/chosen": -3.5466856956481934, "logps/rejected": -3.21321177482605, "loss": 4.0602, "rewards/accuracies": 0.0, "rewards/chosen": -35.46685791015625, "rewards/margins": -3.334743022918701, "rewards/rejected": -32.132118225097656, "step": 1480 }, { "epoch": 0.20166122004357298, "grad_norm": 50.8948809842726, "learning_rate": 7.751125677162908e-07, "logits/chosen": 9.589519500732422, "logits/rejected": 10.808847427368164, "logps/chosen": -2.941997528076172, "logps/rejected": -3.2013497352600098, "loss": 4.348, "rewards/accuracies": 0.75, "rewards/chosen": -29.41997528076172, "rewards/margins": 2.5935211181640625, "rewards/rejected": -32.01349639892578, "step": 1481 }, { "epoch": 0.20179738562091504, "grad_norm": 57.42704199757395, "learning_rate": 7.750465036612343e-07, "logits/chosen": 10.320305824279785, "logits/rejected": 8.796375274658203, "logps/chosen": -3.2508604526519775, "logps/rejected": -3.0811171531677246, "loss": 4.7348, "rewards/accuracies": 0.5, "rewards/chosen": -32.50860595703125, "rewards/margins": -1.69743013381958, "rewards/rejected": -30.811172485351562, "step": 1482 }, { "epoch": 0.2019335511982571, "grad_norm": 50.914932759241154, "learning_rate": 7.749803548613203e-07, "logits/chosen": 10.147950172424316, "logits/rejected": 11.239627838134766, "logps/chosen": -3.09922456741333, "logps/rejected": -3.3327083587646484, "loss": 4.0454, "rewards/accuracies": 1.0, "rewards/chosen": -30.992244720458984, "rewards/margins": 2.334836959838867, "rewards/rejected": -33.327083587646484, "step": 1483 }, { "epoch": 0.20206971677559912, "grad_norm": 42.37069922940558, "learning_rate": 7.749141213314954e-07, "logits/chosen": 8.958866119384766, "logits/rejected": 8.771101951599121, "logps/chosen": -2.5410470962524414, "logps/rejected": -2.690443515777588, "loss": 4.396, "rewards/accuracies": 0.75, "rewards/chosen": -25.410470962524414, "rewards/margins": 1.493964672088623, "rewards/rejected": -26.904434204101562, "step": 1484 }, { "epoch": 0.20220588235294118, "grad_norm": 50.55506844435951, "learning_rate": 7.748478030867257e-07, "logits/chosen": 9.48668384552002, "logits/rejected": 9.271702766418457, "logps/chosen": -3.325000047683716, "logps/rejected": -3.016770362854004, "loss": 3.9134, "rewards/accuracies": 0.5, "rewards/chosen": -33.25, "rewards/margins": -3.082296371459961, "rewards/rejected": -30.16770362854004, "step": 1485 }, { "epoch": 0.20234204793028324, "grad_norm": 40.97428017166095, "learning_rate": 7.747814001419964e-07, "logits/chosen": 10.892715454101562, "logits/rejected": 9.6758451461792, "logps/chosen": -3.4842405319213867, "logps/rejected": -3.508584976196289, "loss": 4.0232, "rewards/accuracies": 0.5, "rewards/chosen": -34.8424072265625, "rewards/margins": 0.24344301223754883, "rewards/rejected": -35.08584976196289, "step": 1486 }, { "epoch": 0.20247821350762527, "grad_norm": 43.75361818968001, "learning_rate": 7.747149125123117e-07, "logits/chosen": 9.894676208496094, "logits/rejected": 9.776031494140625, "logps/chosen": -3.2765920162200928, "logps/rejected": -3.3884918689727783, "loss": 3.8582, "rewards/accuracies": 0.75, "rewards/chosen": -32.76591873168945, "rewards/margins": 1.118997573852539, "rewards/rejected": -33.884918212890625, "step": 1487 }, { "epoch": 0.20261437908496732, "grad_norm": 49.575596715558824, "learning_rate": 7.746483402126952e-07, "logits/chosen": 10.84475326538086, "logits/rejected": 11.007875442504883, "logps/chosen": -3.1815667152404785, "logps/rejected": -2.871211290359497, "loss": 4.7197, "rewards/accuracies": 0.25, "rewards/chosen": -31.81566619873047, "rewards/margins": -3.10355281829834, "rewards/rejected": -28.712114334106445, "step": 1488 }, { "epoch": 0.20275054466230938, "grad_norm": 43.67899039931348, "learning_rate": 7.745816832581893e-07, "logits/chosen": 8.589448928833008, "logits/rejected": 10.385920524597168, "logps/chosen": -3.1402339935302734, "logps/rejected": -3.4253718852996826, "loss": 4.269, "rewards/accuracies": 0.5, "rewards/chosen": -31.402339935302734, "rewards/margins": 2.851377487182617, "rewards/rejected": -34.25371551513672, "step": 1489 }, { "epoch": 0.2028867102396514, "grad_norm": 44.85680125990037, "learning_rate": 7.745149416638558e-07, "logits/chosen": 9.896493911743164, "logits/rejected": 11.421463966369629, "logps/chosen": -2.707122802734375, "logps/rejected": -2.8087105751037598, "loss": 4.3898, "rewards/accuracies": 0.75, "rewards/chosen": -27.07122802734375, "rewards/margins": 1.0158805847167969, "rewards/rejected": -28.087108612060547, "step": 1490 }, { "epoch": 0.20302287581699346, "grad_norm": 52.31040366492728, "learning_rate": 7.744481154447754e-07, "logits/chosen": 10.223047256469727, "logits/rejected": 10.650595664978027, "logps/chosen": -3.3929476737976074, "logps/rejected": -3.5265016555786133, "loss": 3.0323, "rewards/accuracies": 0.75, "rewards/chosen": -33.92947769165039, "rewards/margins": 1.3355374336242676, "rewards/rejected": -35.2650146484375, "step": 1491 }, { "epoch": 0.20315904139433552, "grad_norm": 40.93492679018815, "learning_rate": 7.743812046160481e-07, "logits/chosen": 10.169902801513672, "logits/rejected": 10.067743301391602, "logps/chosen": -3.3834095001220703, "logps/rejected": -3.1778202056884766, "loss": 4.2102, "rewards/accuracies": 0.0, "rewards/chosen": -33.83409118652344, "rewards/margins": -2.0558924674987793, "rewards/rejected": -31.778202056884766, "step": 1492 }, { "epoch": 0.20329520697167755, "grad_norm": 39.50788592785187, "learning_rate": 7.743142091927929e-07, "logits/chosen": 9.313554763793945, "logits/rejected": 11.694564819335938, "logps/chosen": -2.904229164123535, "logps/rejected": -3.504573106765747, "loss": 3.952, "rewards/accuracies": 0.75, "rewards/chosen": -29.04228973388672, "rewards/margins": 6.003439903259277, "rewards/rejected": -35.04573059082031, "step": 1493 }, { "epoch": 0.2034313725490196, "grad_norm": 44.74553805480841, "learning_rate": 7.742471291901481e-07, "logits/chosen": 10.188405990600586, "logits/rejected": 10.448129653930664, "logps/chosen": -2.9557619094848633, "logps/rejected": -2.8343801498413086, "loss": 4.2836, "rewards/accuracies": 0.0, "rewards/chosen": -29.557621002197266, "rewards/margins": -1.2138185501098633, "rewards/rejected": -28.34380340576172, "step": 1494 }, { "epoch": 0.20356753812636166, "grad_norm": 42.30167540964921, "learning_rate": 7.741799646232709e-07, "logits/chosen": 8.402538299560547, "logits/rejected": 10.05960464477539, "logps/chosen": -2.5066637992858887, "logps/rejected": -3.0062105655670166, "loss": 4.0774, "rewards/accuracies": 1.0, "rewards/chosen": -25.066638946533203, "rewards/margins": 4.995466232299805, "rewards/rejected": -30.062103271484375, "step": 1495 }, { "epoch": 0.2037037037037037, "grad_norm": 40.72251641003967, "learning_rate": 7.741127155073377e-07, "logits/chosen": 7.927128791809082, "logits/rejected": 8.741890907287598, "logps/chosen": -2.779224395751953, "logps/rejected": -2.900583028793335, "loss": 4.2144, "rewards/accuracies": 0.25, "rewards/chosen": -27.79224395751953, "rewards/margins": 1.2135863304138184, "rewards/rejected": -29.005830764770508, "step": 1496 }, { "epoch": 0.20383986928104575, "grad_norm": 49.05382957992348, "learning_rate": 7.740453818575439e-07, "logits/chosen": 10.010566711425781, "logits/rejected": 10.793418884277344, "logps/chosen": -2.6009232997894287, "logps/rejected": -2.921552896499634, "loss": 3.979, "rewards/accuracies": 0.75, "rewards/chosen": -26.009233474731445, "rewards/margins": 3.2062954902648926, "rewards/rejected": -29.21552848815918, "step": 1497 }, { "epoch": 0.2039760348583878, "grad_norm": 48.9620127329113, "learning_rate": 7.739779636891041e-07, "logits/chosen": 9.224894523620605, "logits/rejected": 10.303436279296875, "logps/chosen": -2.7092366218566895, "logps/rejected": -3.084982395172119, "loss": 4.361, "rewards/accuracies": 0.75, "rewards/chosen": -27.09236717224121, "rewards/margins": 3.757457733154297, "rewards/rejected": -30.849824905395508, "step": 1498 }, { "epoch": 0.20411220043572983, "grad_norm": 43.03962550124278, "learning_rate": 7.739104610172523e-07, "logits/chosen": 9.10914421081543, "logits/rejected": 8.730331420898438, "logps/chosen": -2.9254584312438965, "logps/rejected": -2.8034632205963135, "loss": 4.0455, "rewards/accuracies": 0.5, "rewards/chosen": -29.254581451416016, "rewards/margins": -1.2199501991271973, "rewards/rejected": -28.03463363647461, "step": 1499 }, { "epoch": 0.2042483660130719, "grad_norm": 44.13581434401064, "learning_rate": 7.738428738572409e-07, "logits/chosen": 9.597407341003418, "logits/rejected": 9.286162376403809, "logps/chosen": -2.8057594299316406, "logps/rejected": -3.08596134185791, "loss": 3.6643, "rewards/accuracies": 0.75, "rewards/chosen": -28.057594299316406, "rewards/margins": 2.8020176887512207, "rewards/rejected": -30.85961151123047, "step": 1500 }, { "epoch": 0.20438453159041395, "grad_norm": 41.82502187224987, "learning_rate": 7.73775202224342e-07, "logits/chosen": 8.586557388305664, "logits/rejected": 8.75683879852295, "logps/chosen": -2.743544578552246, "logps/rejected": -2.8887906074523926, "loss": 4.2211, "rewards/accuracies": 0.75, "rewards/chosen": -27.435447692871094, "rewards/margins": 1.4524588584899902, "rewards/rejected": -28.88790512084961, "step": 1501 }, { "epoch": 0.204520697167756, "grad_norm": 122.73271139131775, "learning_rate": 7.737074461338466e-07, "logits/chosen": 7.397534370422363, "logits/rejected": 10.051858901977539, "logps/chosen": -2.6109704971313477, "logps/rejected": -3.137937068939209, "loss": 4.0356, "rewards/accuracies": 0.75, "rewards/chosen": -26.10970687866211, "rewards/margins": 5.269665718078613, "rewards/rejected": -31.379371643066406, "step": 1502 }, { "epoch": 0.20465686274509803, "grad_norm": 39.8750247797268, "learning_rate": 7.736396056010645e-07, "logits/chosen": 9.419622421264648, "logits/rejected": 9.654056549072266, "logps/chosen": -3.207827568054199, "logps/rejected": -3.094827651977539, "loss": 4.1699, "rewards/accuracies": 0.5, "rewards/chosen": -32.078277587890625, "rewards/margins": -1.1299982070922852, "rewards/rejected": -30.94827651977539, "step": 1503 }, { "epoch": 0.2047930283224401, "grad_norm": 46.46385144490832, "learning_rate": 7.735716806413249e-07, "logits/chosen": 8.994508743286133, "logits/rejected": 9.368321418762207, "logps/chosen": -3.1029410362243652, "logps/rejected": -3.4398412704467773, "loss": 4.0943, "rewards/accuracies": 0.75, "rewards/chosen": -31.029409408569336, "rewards/margins": 3.369002342224121, "rewards/rejected": -34.39841079711914, "step": 1504 }, { "epoch": 0.20492919389978215, "grad_norm": 45.16728448424416, "learning_rate": 7.735036712699763e-07, "logits/chosen": 7.41726016998291, "logits/rejected": 8.287891387939453, "logps/chosen": -2.500386953353882, "logps/rejected": -2.6925277709960938, "loss": 4.7083, "rewards/accuracies": 0.75, "rewards/chosen": -25.00387191772461, "rewards/margins": 1.92140531539917, "rewards/rejected": -26.925275802612305, "step": 1505 }, { "epoch": 0.20506535947712418, "grad_norm": 46.22475819826301, "learning_rate": 7.734355775023856e-07, "logits/chosen": 7.4675984382629395, "logits/rejected": 8.651200294494629, "logps/chosen": -2.76682186126709, "logps/rejected": -3.044633388519287, "loss": 4.2357, "rewards/accuracies": 0.75, "rewards/chosen": -27.6682186126709, "rewards/margins": 2.778115749359131, "rewards/rejected": -30.446334838867188, "step": 1506 }, { "epoch": 0.20520152505446623, "grad_norm": 42.70570503184803, "learning_rate": 7.733673993539394e-07, "logits/chosen": 10.216934204101562, "logits/rejected": 9.96890640258789, "logps/chosen": -2.8829972743988037, "logps/rejected": -2.8136513233184814, "loss": 4.0289, "rewards/accuracies": 0.5, "rewards/chosen": -28.829973220825195, "rewards/margins": -0.6934604644775391, "rewards/rejected": -28.136512756347656, "step": 1507 }, { "epoch": 0.2053376906318083, "grad_norm": 38.96115520266723, "learning_rate": 7.73299136840043e-07, "logits/chosen": 8.760509490966797, "logits/rejected": 9.20695686340332, "logps/chosen": -2.8874409198760986, "logps/rejected": -3.152557849884033, "loss": 3.8471, "rewards/accuracies": 0.75, "rewards/chosen": -28.87441062927246, "rewards/margins": 2.6511688232421875, "rewards/rejected": -31.525577545166016, "step": 1508 }, { "epoch": 0.20547385620915032, "grad_norm": 40.47054649172478, "learning_rate": 7.732307899761209e-07, "logits/chosen": 8.899890899658203, "logits/rejected": 9.814976692199707, "logps/chosen": -2.7566111087799072, "logps/rejected": -2.992161512374878, "loss": 3.9077, "rewards/accuracies": 0.75, "rewards/chosen": -27.566112518310547, "rewards/margins": 2.3555030822753906, "rewards/rejected": -29.921615600585938, "step": 1509 }, { "epoch": 0.20561002178649238, "grad_norm": 42.731567634464724, "learning_rate": 7.731623587776167e-07, "logits/chosen": 9.140726089477539, "logits/rejected": 9.71060562133789, "logps/chosen": -3.029453754425049, "logps/rejected": -3.1287331581115723, "loss": 4.2355, "rewards/accuracies": 0.75, "rewards/chosen": -30.294538497924805, "rewards/margins": 0.9927935600280762, "rewards/rejected": -31.28733253479004, "step": 1510 }, { "epoch": 0.20574618736383443, "grad_norm": 41.36972242618985, "learning_rate": 7.730938432599929e-07, "logits/chosen": 9.15315055847168, "logits/rejected": 7.855367660522461, "logps/chosen": -2.5955810546875, "logps/rejected": -2.4753894805908203, "loss": 4.2815, "rewards/accuracies": 0.5, "rewards/chosen": -25.955810546875, "rewards/margins": -1.2019171714782715, "rewards/rejected": -24.753894805908203, "step": 1511 }, { "epoch": 0.20588235294117646, "grad_norm": 42.70114192689257, "learning_rate": 7.730252434387311e-07, "logits/chosen": 9.488306999206543, "logits/rejected": 9.259184837341309, "logps/chosen": -2.6069133281707764, "logps/rejected": -2.6126668453216553, "loss": 4.1845, "rewards/accuracies": 0.25, "rewards/chosen": -26.069133758544922, "rewards/margins": 0.057535648345947266, "rewards/rejected": -26.12666893005371, "step": 1512 }, { "epoch": 0.20601851851851852, "grad_norm": 44.02091058088088, "learning_rate": 7.729565593293323e-07, "logits/chosen": 9.567697525024414, "logits/rejected": 9.370980262756348, "logps/chosen": -3.1004130840301514, "logps/rejected": -3.303924560546875, "loss": 4.0734, "rewards/accuracies": 0.75, "rewards/chosen": -31.004131317138672, "rewards/margins": 2.0351152420043945, "rewards/rejected": -33.03924560546875, "step": 1513 }, { "epoch": 0.20615468409586057, "grad_norm": 45.05877284559592, "learning_rate": 7.728877909473159e-07, "logits/chosen": 8.95614242553711, "logits/rejected": 9.921453475952148, "logps/chosen": -2.625070095062256, "logps/rejected": -2.8340249061584473, "loss": 4.215, "rewards/accuracies": 0.75, "rewards/chosen": -26.250701904296875, "rewards/margins": 2.0895466804504395, "rewards/rejected": -28.340248107910156, "step": 1514 }, { "epoch": 0.2062908496732026, "grad_norm": 43.65773896952823, "learning_rate": 7.728189383082208e-07, "logits/chosen": 10.288301467895508, "logits/rejected": 9.952174186706543, "logps/chosen": -3.2025644779205322, "logps/rejected": -3.1361827850341797, "loss": 4.3344, "rewards/accuracies": 0.5, "rewards/chosen": -32.02564239501953, "rewards/margins": -0.6638140678405762, "rewards/rejected": -31.36182975769043, "step": 1515 }, { "epoch": 0.20642701525054466, "grad_norm": 48.02436071020197, "learning_rate": 7.727500014276049e-07, "logits/chosen": 9.840444564819336, "logits/rejected": 9.513788223266602, "logps/chosen": -2.35760498046875, "logps/rejected": -2.5773091316223145, "loss": 4.1825, "rewards/accuracies": 0.75, "rewards/chosen": -23.576047897338867, "rewards/margins": 2.1970443725585938, "rewards/rejected": -25.77309226989746, "step": 1516 }, { "epoch": 0.20656318082788672, "grad_norm": 39.606055333870046, "learning_rate": 7.72680980321045e-07, "logits/chosen": 8.691200256347656, "logits/rejected": 10.152860641479492, "logps/chosen": -3.1870124340057373, "logps/rejected": -3.344644546508789, "loss": 4.1197, "rewards/accuracies": 0.75, "rewards/chosen": -31.87012481689453, "rewards/margins": 1.5763211250305176, "rewards/rejected": -33.44644546508789, "step": 1517 }, { "epoch": 0.20669934640522875, "grad_norm": 44.19407768944493, "learning_rate": 7.726118750041369e-07, "logits/chosen": 9.211692810058594, "logits/rejected": 10.384984970092773, "logps/chosen": -2.683102607727051, "logps/rejected": -3.0428199768066406, "loss": 3.763, "rewards/accuracies": 1.0, "rewards/chosen": -26.831024169921875, "rewards/margins": 3.5971760749816895, "rewards/rejected": -30.428199768066406, "step": 1518 }, { "epoch": 0.2068355119825708, "grad_norm": 50.142469302333375, "learning_rate": 7.725426854924956e-07, "logits/chosen": 9.60268783569336, "logits/rejected": 8.72037124633789, "logps/chosen": -3.216475486755371, "logps/rejected": -3.0944762229919434, "loss": 3.8908, "rewards/accuracies": 0.0, "rewards/chosen": -32.164756774902344, "rewards/margins": -1.2199907302856445, "rewards/rejected": -30.94476318359375, "step": 1519 }, { "epoch": 0.20697167755991286, "grad_norm": 45.19816756854954, "learning_rate": 7.72473411801755e-07, "logits/chosen": 9.107534408569336, "logits/rejected": 10.19285774230957, "logps/chosen": -2.847598075866699, "logps/rejected": -3.138075828552246, "loss": 3.8678, "rewards/accuracies": 0.5, "rewards/chosen": -28.47597885131836, "rewards/margins": 2.904778480529785, "rewards/rejected": -31.380756378173828, "step": 1520 }, { "epoch": 0.20710784313725492, "grad_norm": 39.89669816439941, "learning_rate": 7.724040539475683e-07, "logits/chosen": 9.491531372070312, "logits/rejected": 9.675775527954102, "logps/chosen": -3.0406136512756348, "logps/rejected": -3.2230775356292725, "loss": 3.5273, "rewards/accuracies": 0.75, "rewards/chosen": -30.40613555908203, "rewards/margins": 1.8246378898620605, "rewards/rejected": -32.23077392578125, "step": 1521 }, { "epoch": 0.20724400871459694, "grad_norm": 45.62134151380513, "learning_rate": 7.723346119456072e-07, "logits/chosen": 8.607010841369629, "logits/rejected": 9.032543182373047, "logps/chosen": -2.906754493713379, "logps/rejected": -2.995626449584961, "loss": 4.3014, "rewards/accuracies": 0.75, "rewards/chosen": -29.06754493713379, "rewards/margins": 0.8887195587158203, "rewards/rejected": -29.95626449584961, "step": 1522 }, { "epoch": 0.207380174291939, "grad_norm": 44.46616016166908, "learning_rate": 7.722650858115628e-07, "logits/chosen": 10.386598587036133, "logits/rejected": 10.599716186523438, "logps/chosen": -3.499804735183716, "logps/rejected": -3.911957263946533, "loss": 3.6603, "rewards/accuracies": 0.75, "rewards/chosen": -34.998046875, "rewards/margins": 4.121527194976807, "rewards/rejected": -39.11957550048828, "step": 1523 }, { "epoch": 0.20751633986928106, "grad_norm": 55.43123380064811, "learning_rate": 7.72195475561145e-07, "logits/chosen": 9.799623489379883, "logits/rejected": 9.796525955200195, "logps/chosen": -3.3131446838378906, "logps/rejected": -3.3443098068237305, "loss": 4.5962, "rewards/accuracies": 0.5, "rewards/chosen": -33.131446838378906, "rewards/margins": 0.31165218353271484, "rewards/rejected": -33.44309997558594, "step": 1524 }, { "epoch": 0.2076525054466231, "grad_norm": 49.990599277152576, "learning_rate": 7.72125781210083e-07, "logits/chosen": 8.85702133178711, "logits/rejected": 9.503912925720215, "logps/chosen": -2.9897234439849854, "logps/rejected": -3.042562484741211, "loss": 4.1435, "rewards/accuracies": 0.75, "rewards/chosen": -29.897235870361328, "rewards/margins": 0.528388500213623, "rewards/rejected": -30.425622940063477, "step": 1525 }, { "epoch": 0.20778867102396514, "grad_norm": 46.623540005471575, "learning_rate": 7.720560027741246e-07, "logits/chosen": 8.609590530395508, "logits/rejected": 8.7723388671875, "logps/chosen": -3.0728912353515625, "logps/rejected": -3.0562045574188232, "loss": 3.8618, "rewards/accuracies": 0.25, "rewards/chosen": -30.728914260864258, "rewards/margins": -0.1668691635131836, "rewards/rejected": -30.56204605102539, "step": 1526 }, { "epoch": 0.2079248366013072, "grad_norm": 49.75768275909589, "learning_rate": 7.71986140269037e-07, "logits/chosen": 9.268936157226562, "logits/rejected": 10.559523582458496, "logps/chosen": -2.8602659702301025, "logps/rejected": -3.2171220779418945, "loss": 4.2481, "rewards/accuracies": 0.75, "rewards/chosen": -28.602659225463867, "rewards/margins": 3.568561553955078, "rewards/rejected": -32.17121887207031, "step": 1527 }, { "epoch": 0.20806100217864923, "grad_norm": 50.77371153550797, "learning_rate": 7.719161937106062e-07, "logits/chosen": 10.696430206298828, "logits/rejected": 11.823461532592773, "logps/chosen": -3.024768829345703, "logps/rejected": -3.2856173515319824, "loss": 4.1628, "rewards/accuracies": 0.75, "rewards/chosen": -30.247690200805664, "rewards/margins": 2.608482837677002, "rewards/rejected": -32.856170654296875, "step": 1528 }, { "epoch": 0.20819716775599129, "grad_norm": 41.858219198322246, "learning_rate": 7.71846163114637e-07, "logits/chosen": 9.979890823364258, "logits/rejected": 10.180255889892578, "logps/chosen": -3.054157018661499, "logps/rejected": -2.9854788780212402, "loss": 3.5183, "rewards/accuracies": 0.5, "rewards/chosen": -30.54157257080078, "rewards/margins": -0.6867833137512207, "rewards/rejected": -29.85478973388672, "step": 1529 }, { "epoch": 0.20833333333333334, "grad_norm": 47.710011118182805, "learning_rate": 7.717760484969536e-07, "logits/chosen": 10.1435546875, "logits/rejected": 10.103811264038086, "logps/chosen": -3.666451930999756, "logps/rejected": -3.537322759628296, "loss": 4.1227, "rewards/accuracies": 0.75, "rewards/chosen": -36.664520263671875, "rewards/margins": -1.2912921905517578, "rewards/rejected": -35.373226165771484, "step": 1530 }, { "epoch": 0.20846949891067537, "grad_norm": 49.785072059740685, "learning_rate": 7.71705849873399e-07, "logits/chosen": 9.293408393859863, "logits/rejected": 8.836164474487305, "logps/chosen": -2.9147250652313232, "logps/rejected": -2.8333044052124023, "loss": 3.7284, "rewards/accuracies": 0.5, "rewards/chosen": -29.14725112915039, "rewards/margins": -0.814208984375, "rewards/rejected": -28.33304214477539, "step": 1531 }, { "epoch": 0.20860566448801743, "grad_norm": 43.19890080111416, "learning_rate": 7.716355672598349e-07, "logits/chosen": 8.669868469238281, "logits/rejected": 9.978561401367188, "logps/chosen": -3.190791606903076, "logps/rejected": -3.0489630699157715, "loss": 4.2082, "rewards/accuracies": 0.5, "rewards/chosen": -31.907917022705078, "rewards/margins": -1.4182863235473633, "rewards/rejected": -30.4896297454834, "step": 1532 }, { "epoch": 0.20874183006535948, "grad_norm": 45.65757741200186, "learning_rate": 7.715652006721425e-07, "logits/chosen": 7.402101039886475, "logits/rejected": 7.590699195861816, "logps/chosen": -2.894014835357666, "logps/rejected": -2.9818713665008545, "loss": 3.378, "rewards/accuracies": 0.75, "rewards/chosen": -28.940147399902344, "rewards/margins": 0.878565788269043, "rewards/rejected": -29.818714141845703, "step": 1533 }, { "epoch": 0.2088779956427015, "grad_norm": 44.8933872595542, "learning_rate": 7.714947501262216e-07, "logits/chosen": 10.162004470825195, "logits/rejected": 10.799718856811523, "logps/chosen": -3.1611790657043457, "logps/rejected": -3.2777490615844727, "loss": 4.1718, "rewards/accuracies": 0.5, "rewards/chosen": -31.61178970336914, "rewards/margins": 1.1657013893127441, "rewards/rejected": -32.777488708496094, "step": 1534 }, { "epoch": 0.20901416122004357, "grad_norm": 43.527168293503685, "learning_rate": 7.714242156379911e-07, "logits/chosen": 9.880910873413086, "logits/rejected": 10.706607818603516, "logps/chosen": -3.000591516494751, "logps/rejected": -3.130408763885498, "loss": 4.1074, "rewards/accuracies": 0.5, "rewards/chosen": -30.00591468811035, "rewards/margins": 1.2981739044189453, "rewards/rejected": -31.304088592529297, "step": 1535 }, { "epoch": 0.20915032679738563, "grad_norm": 38.985790907068406, "learning_rate": 7.713535972233889e-07, "logits/chosen": 8.93893814086914, "logits/rejected": 9.912113189697266, "logps/chosen": -2.6954760551452637, "logps/rejected": -3.0140233039855957, "loss": 3.5854, "rewards/accuracies": 0.75, "rewards/chosen": -26.954761505126953, "rewards/margins": 3.1854729652404785, "rewards/rejected": -30.140233993530273, "step": 1536 }, { "epoch": 0.20928649237472766, "grad_norm": 55.18875109785145, "learning_rate": 7.712828948983717e-07, "logits/chosen": 10.208080291748047, "logits/rejected": 9.954607009887695, "logps/chosen": -2.9263381958007812, "logps/rejected": -2.8279666900634766, "loss": 4.6055, "rewards/accuracies": 0.5, "rewards/chosen": -29.263381958007812, "rewards/margins": -0.9837126731872559, "rewards/rejected": -28.279666900634766, "step": 1537 }, { "epoch": 0.2094226579520697, "grad_norm": 42.82279669697972, "learning_rate": 7.712121086789154e-07, "logits/chosen": 10.1323823928833, "logits/rejected": 10.037541389465332, "logps/chosen": -2.8613228797912598, "logps/rejected": -3.0761160850524902, "loss": 3.4606, "rewards/accuracies": 0.75, "rewards/chosen": -28.61322784423828, "rewards/margins": 2.147933006286621, "rewards/rejected": -30.76116180419922, "step": 1538 }, { "epoch": 0.20955882352941177, "grad_norm": 50.874025548173705, "learning_rate": 7.711412385810146e-07, "logits/chosen": 7.104107856750488, "logits/rejected": 7.250031471252441, "logps/chosen": -2.613406181335449, "logps/rejected": -2.504574775695801, "loss": 3.6285, "rewards/accuracies": 0.5, "rewards/chosen": -26.134063720703125, "rewards/margins": -1.0883145332336426, "rewards/rejected": -25.04574966430664, "step": 1539 }, { "epoch": 0.20969498910675383, "grad_norm": 47.01404545987808, "learning_rate": 7.710702846206832e-07, "logits/chosen": 10.019678115844727, "logits/rejected": 10.019403457641602, "logps/chosen": -3.1851954460144043, "logps/rejected": -3.3764564990997314, "loss": 4.1109, "rewards/accuracies": 1.0, "rewards/chosen": -31.851953506469727, "rewards/margins": 1.9126100540161133, "rewards/rejected": -33.764564514160156, "step": 1540 }, { "epoch": 0.20983115468409586, "grad_norm": 50.11139415377223, "learning_rate": 7.709992468139536e-07, "logits/chosen": 9.339749336242676, "logits/rejected": 9.07151985168457, "logps/chosen": -2.67063045501709, "logps/rejected": -3.120061159133911, "loss": 3.9357, "rewards/accuracies": 1.0, "rewards/chosen": -26.70630645751953, "rewards/margins": 4.4943060874938965, "rewards/rejected": -31.200613021850586, "step": 1541 }, { "epoch": 0.2099673202614379, "grad_norm": 54.88666364605557, "learning_rate": 7.709281251768774e-07, "logits/chosen": 9.923608779907227, "logits/rejected": 10.986827850341797, "logps/chosen": -2.930598735809326, "logps/rejected": -3.0825276374816895, "loss": 4.0022, "rewards/accuracies": 0.75, "rewards/chosen": -29.305986404418945, "rewards/margins": 1.5192904472351074, "rewards/rejected": -30.82527732849121, "step": 1542 }, { "epoch": 0.21010348583877997, "grad_norm": 52.43011305394723, "learning_rate": 7.708569197255252e-07, "logits/chosen": 9.288869857788086, "logits/rejected": 10.381475448608398, "logps/chosen": -2.8543081283569336, "logps/rejected": -2.859405755996704, "loss": 4.895, "rewards/accuracies": 0.5, "rewards/chosen": -28.543081283569336, "rewards/margins": 0.050977230072021484, "rewards/rejected": -28.594058990478516, "step": 1543 }, { "epoch": 0.210239651416122, "grad_norm": 48.99765716405179, "learning_rate": 7.707856304759865e-07, "logits/chosen": 9.32318115234375, "logits/rejected": 10.160155296325684, "logps/chosen": -2.7280492782592773, "logps/rejected": -3.115129232406616, "loss": 4.2854, "rewards/accuracies": 0.75, "rewards/chosen": -27.28049087524414, "rewards/margins": 3.8708014488220215, "rewards/rejected": -31.151294708251953, "step": 1544 }, { "epoch": 0.21037581699346405, "grad_norm": 56.19501972368359, "learning_rate": 7.707142574443697e-07, "logits/chosen": 10.403949737548828, "logits/rejected": 10.335732460021973, "logps/chosen": -3.097654104232788, "logps/rejected": -3.0853404998779297, "loss": 4.4925, "rewards/accuracies": 0.5, "rewards/chosen": -30.976539611816406, "rewards/margins": -0.12313556671142578, "rewards/rejected": -30.853404998779297, "step": 1545 }, { "epoch": 0.2105119825708061, "grad_norm": 64.83400403000442, "learning_rate": 7.706428006468021e-07, "logits/chosen": 8.5978422164917, "logits/rejected": 8.702027320861816, "logps/chosen": -2.6461000442504883, "logps/rejected": -2.7651305198669434, "loss": 4.2719, "rewards/accuracies": 0.75, "rewards/chosen": -26.461000442504883, "rewards/margins": 1.190305233001709, "rewards/rejected": -27.65130615234375, "step": 1546 }, { "epoch": 0.21064814814814814, "grad_norm": 46.795344625391465, "learning_rate": 7.705712600994297e-07, "logits/chosen": 9.475378036499023, "logits/rejected": 8.401346206665039, "logps/chosen": -2.252368211746216, "logps/rejected": -2.3614819049835205, "loss": 4.1625, "rewards/accuracies": 0.75, "rewards/chosen": -22.523681640625, "rewards/margins": 1.0911383628845215, "rewards/rejected": -23.61482048034668, "step": 1547 }, { "epoch": 0.2107843137254902, "grad_norm": 57.80945901758079, "learning_rate": 7.704996358184182e-07, "logits/chosen": 9.287307739257812, "logits/rejected": 9.809006690979004, "logps/chosen": -2.8657236099243164, "logps/rejected": -3.039936065673828, "loss": 5.2329, "rewards/accuracies": 0.5, "rewards/chosen": -28.657236099243164, "rewards/margins": 1.7421255111694336, "rewards/rejected": -30.39936065673828, "step": 1548 }, { "epoch": 0.21092047930283225, "grad_norm": 47.217317525314485, "learning_rate": 7.704279278199512e-07, "logits/chosen": 8.299505233764648, "logits/rejected": 10.065877914428711, "logps/chosen": -2.6400938034057617, "logps/rejected": -2.818909168243408, "loss": 4.2146, "rewards/accuracies": 0.75, "rewards/chosen": -26.40093994140625, "rewards/margins": 1.788154125213623, "rewards/rejected": -28.18909454345703, "step": 1549 }, { "epoch": 0.21105664488017428, "grad_norm": 46.175273960735645, "learning_rate": 7.703561361202321e-07, "logits/chosen": 9.989957809448242, "logits/rejected": 11.326377868652344, "logps/chosen": -3.4182791709899902, "logps/rejected": -3.6872074604034424, "loss": 4.4441, "rewards/accuracies": 0.75, "rewards/chosen": -34.18278884887695, "rewards/margins": 2.6892833709716797, "rewards/rejected": -36.872074127197266, "step": 1550 }, { "epoch": 0.21119281045751634, "grad_norm": 54.7446693372938, "learning_rate": 7.702842607354826e-07, "logits/chosen": 11.340158462524414, "logits/rejected": 10.414113998413086, "logps/chosen": -3.1707074642181396, "logps/rejected": -3.0914502143859863, "loss": 4.6975, "rewards/accuracies": 0.5, "rewards/chosen": -31.707073211669922, "rewards/margins": -0.7925748825073242, "rewards/rejected": -30.914499282836914, "step": 1551 }, { "epoch": 0.2113289760348584, "grad_norm": 45.35151251254039, "learning_rate": 7.702123016819435e-07, "logits/chosen": 10.465436935424805, "logits/rejected": 10.025437355041504, "logps/chosen": -2.915897846221924, "logps/rejected": -2.9153409004211426, "loss": 4.5222, "rewards/accuracies": 0.75, "rewards/chosen": -29.158981323242188, "rewards/margins": -0.005569934844970703, "rewards/rejected": -29.153409957885742, "step": 1552 }, { "epoch": 0.21146514161220042, "grad_norm": 42.672041655852, "learning_rate": 7.701402589758747e-07, "logits/chosen": 10.687688827514648, "logits/rejected": 11.451541900634766, "logps/chosen": -3.0771260261535645, "logps/rejected": -3.206106662750244, "loss": 4.1412, "rewards/accuracies": 0.75, "rewards/chosen": -30.771259307861328, "rewards/margins": 1.289806842803955, "rewards/rejected": -32.061065673828125, "step": 1553 }, { "epoch": 0.21160130718954248, "grad_norm": 44.65624540363915, "learning_rate": 7.700681326335547e-07, "logits/chosen": 9.617847442626953, "logits/rejected": 10.158291816711426, "logps/chosen": -2.772146701812744, "logps/rejected": -2.991454839706421, "loss": 3.6552, "rewards/accuracies": 0.5, "rewards/chosen": -27.721466064453125, "rewards/margins": 2.193084239959717, "rewards/rejected": -29.914548873901367, "step": 1554 }, { "epoch": 0.21173747276688454, "grad_norm": 46.517478962318904, "learning_rate": 7.699959226712812e-07, "logits/chosen": 9.522117614746094, "logits/rejected": 10.082192420959473, "logps/chosen": -2.601781129837036, "logps/rejected": -2.7989468574523926, "loss": 4.5518, "rewards/accuracies": 0.5, "rewards/chosen": -26.017812728881836, "rewards/margins": 1.9716553688049316, "rewards/rejected": -27.98946762084961, "step": 1555 }, { "epoch": 0.21187363834422657, "grad_norm": 44.91875652828891, "learning_rate": 7.699236291053705e-07, "logits/chosen": 9.093759536743164, "logits/rejected": 9.914636611938477, "logps/chosen": -2.687592029571533, "logps/rejected": -3.044620990753174, "loss": 4.3376, "rewards/accuracies": 0.75, "rewards/chosen": -26.875919342041016, "rewards/margins": 3.5702900886535645, "rewards/rejected": -30.446208953857422, "step": 1556 }, { "epoch": 0.21200980392156862, "grad_norm": 46.542260483561414, "learning_rate": 7.698512519521579e-07, "logits/chosen": 10.887365341186523, "logits/rejected": 11.427061080932617, "logps/chosen": -3.467005968093872, "logps/rejected": -3.42761492729187, "loss": 4.6833, "rewards/accuracies": 0.5, "rewards/chosen": -34.67005920410156, "rewards/margins": -0.3939094543457031, "rewards/rejected": -34.27614974975586, "step": 1557 }, { "epoch": 0.21214596949891068, "grad_norm": 39.03296175730731, "learning_rate": 7.697787912279977e-07, "logits/chosen": 9.505142211914062, "logits/rejected": 10.920951843261719, "logps/chosen": -2.8633413314819336, "logps/rejected": -3.1564557552337646, "loss": 3.976, "rewards/accuracies": 0.5, "rewards/chosen": -28.633411407470703, "rewards/margins": 2.9311447143554688, "rewards/rejected": -31.564556121826172, "step": 1558 }, { "epoch": 0.21228213507625274, "grad_norm": 42.347766642631484, "learning_rate": 7.697062469492632e-07, "logits/chosen": 10.644079208374023, "logits/rejected": 11.594518661499023, "logps/chosen": -2.893411159515381, "logps/rejected": -3.100267171859741, "loss": 3.884, "rewards/accuracies": 0.75, "rewards/chosen": -28.934110641479492, "rewards/margins": 2.0685601234436035, "rewards/rejected": -31.00267219543457, "step": 1559 }, { "epoch": 0.21241830065359477, "grad_norm": 49.63164929275762, "learning_rate": 7.69633619132346e-07, "logits/chosen": 10.40385627746582, "logits/rejected": 10.780734062194824, "logps/chosen": -3.1254465579986572, "logps/rejected": -3.4948177337646484, "loss": 4.2201, "rewards/accuracies": 1.0, "rewards/chosen": -31.254465103149414, "rewards/margins": 3.6937127113342285, "rewards/rejected": -34.948177337646484, "step": 1560 }, { "epoch": 0.21255446623093682, "grad_norm": 42.64742535550734, "learning_rate": 7.695609077936572e-07, "logits/chosen": 9.867100715637207, "logits/rejected": 10.544097900390625, "logps/chosen": -3.0771045684814453, "logps/rejected": -3.422696590423584, "loss": 3.8747, "rewards/accuracies": 0.75, "rewards/chosen": -30.771045684814453, "rewards/margins": 3.4559216499328613, "rewards/rejected": -34.226966857910156, "step": 1561 }, { "epoch": 0.21269063180827888, "grad_norm": 41.99710309754902, "learning_rate": 7.694881129496265e-07, "logits/chosen": 11.731908798217773, "logits/rejected": 11.087465286254883, "logps/chosen": -2.9910712242126465, "logps/rejected": -2.9820828437805176, "loss": 4.0651, "rewards/accuracies": 0.25, "rewards/chosen": -29.910709381103516, "rewards/margins": -0.08988237380981445, "rewards/rejected": -29.82082748413086, "step": 1562 }, { "epoch": 0.2128267973856209, "grad_norm": 49.205671892539634, "learning_rate": 7.694152346167024e-07, "logits/chosen": 11.182632446289062, "logits/rejected": 9.72913932800293, "logps/chosen": -3.0582242012023926, "logps/rejected": -2.881720542907715, "loss": 4.2947, "rewards/accuracies": 0.25, "rewards/chosen": -30.58224105834961, "rewards/margins": -1.7650361061096191, "rewards/rejected": -28.817203521728516, "step": 1563 }, { "epoch": 0.21296296296296297, "grad_norm": 62.30776612613297, "learning_rate": 7.693422728113524e-07, "logits/chosen": 9.421554565429688, "logits/rejected": 9.960573196411133, "logps/chosen": -2.512295722961426, "logps/rejected": -2.9071478843688965, "loss": 3.939, "rewards/accuracies": 1.0, "rewards/chosen": -25.12295913696289, "rewards/margins": 3.948519229888916, "rewards/rejected": -29.07147789001465, "step": 1564 }, { "epoch": 0.21309912854030502, "grad_norm": 56.49958605330683, "learning_rate": 7.69269227550063e-07, "logits/chosen": 10.850687026977539, "logits/rejected": 10.326759338378906, "logps/chosen": -3.5114402770996094, "logps/rejected": -3.46120548248291, "loss": 4.8646, "rewards/accuracies": 0.5, "rewards/chosen": -35.114402770996094, "rewards/margins": -0.5023508071899414, "rewards/rejected": -34.61205291748047, "step": 1565 }, { "epoch": 0.21323529411764705, "grad_norm": 39.2647862576272, "learning_rate": 7.691960988493391e-07, "logits/chosen": 9.933114051818848, "logits/rejected": 11.708734512329102, "logps/chosen": -3.136068820953369, "logps/rejected": -3.633089065551758, "loss": 4.4176, "rewards/accuracies": 1.0, "rewards/chosen": -31.36069107055664, "rewards/margins": 4.970200538635254, "rewards/rejected": -36.33089065551758, "step": 1566 }, { "epoch": 0.2133714596949891, "grad_norm": 56.77641142660399, "learning_rate": 7.691228867257049e-07, "logits/chosen": 11.26307487487793, "logits/rejected": 11.183065414428711, "logps/chosen": -2.729341983795166, "logps/rejected": -2.858442783355713, "loss": 4.1154, "rewards/accuracies": 0.75, "rewards/chosen": -27.293420791625977, "rewards/margins": 1.2910089492797852, "rewards/rejected": -28.584430694580078, "step": 1567 }, { "epoch": 0.21350762527233116, "grad_norm": 45.44505809030883, "learning_rate": 7.690495911957032e-07, "logits/chosen": 10.955680847167969, "logits/rejected": 11.192590713500977, "logps/chosen": -3.049409866333008, "logps/rejected": -3.0171356201171875, "loss": 4.4807, "rewards/accuracies": 0.25, "rewards/chosen": -30.49410057067871, "rewards/margins": -0.3227419853210449, "rewards/rejected": -30.171358108520508, "step": 1568 }, { "epoch": 0.2136437908496732, "grad_norm": 40.97318869890363, "learning_rate": 7.689762122758959e-07, "logits/chosen": 9.944849967956543, "logits/rejected": 10.037748336791992, "logps/chosen": -3.1334738731384277, "logps/rejected": -3.2118327617645264, "loss": 4.0299, "rewards/accuracies": 0.5, "rewards/chosen": -31.334739685058594, "rewards/margins": 0.7835879325866699, "rewards/rejected": -32.118324279785156, "step": 1569 }, { "epoch": 0.21377995642701525, "grad_norm": 49.122568537279804, "learning_rate": 7.689027499828632e-07, "logits/chosen": 12.218103408813477, "logits/rejected": 11.914966583251953, "logps/chosen": -3.486815929412842, "logps/rejected": -3.2854831218719482, "loss": 4.2604, "rewards/accuracies": 0.5, "rewards/chosen": -34.86815643310547, "rewards/margins": -2.0133280754089355, "rewards/rejected": -32.854827880859375, "step": 1570 }, { "epoch": 0.2139161220043573, "grad_norm": 43.39621008096686, "learning_rate": 7.68829204333205e-07, "logits/chosen": 10.746740341186523, "logits/rejected": 11.320868492126465, "logps/chosen": -3.2733211517333984, "logps/rejected": -3.216432571411133, "loss": 3.6209, "rewards/accuracies": 0.5, "rewards/chosen": -32.733211517333984, "rewards/margins": -0.568885326385498, "rewards/rejected": -32.16432571411133, "step": 1571 }, { "epoch": 0.21405228758169934, "grad_norm": 46.69160360497318, "learning_rate": 7.687555753435391e-07, "logits/chosen": 12.013891220092773, "logits/rejected": 12.134092330932617, "logps/chosen": -3.195850133895874, "logps/rejected": -3.338946580886841, "loss": 4.4873, "rewards/accuracies": 0.75, "rewards/chosen": -31.958499908447266, "rewards/margins": 1.4309639930725098, "rewards/rejected": -33.38946533203125, "step": 1572 }, { "epoch": 0.2141884531590414, "grad_norm": 72.465982141839, "learning_rate": 7.686818630305029e-07, "logits/chosen": 10.996406555175781, "logits/rejected": 11.551919937133789, "logps/chosen": -3.032029628753662, "logps/rejected": -3.0741381645202637, "loss": 4.3505, "rewards/accuracies": 0.75, "rewards/chosen": -30.320295333862305, "rewards/margins": 0.42108678817749023, "rewards/rejected": -30.741382598876953, "step": 1573 }, { "epoch": 0.21432461873638345, "grad_norm": 44.88451562713929, "learning_rate": 7.686080674107522e-07, "logits/chosen": 11.601428985595703, "logits/rejected": 11.983808517456055, "logps/chosen": -3.2525815963745117, "logps/rejected": -3.667471408843994, "loss": 4.2678, "rewards/accuracies": 0.75, "rewards/chosen": -32.52581787109375, "rewards/margins": 4.148898124694824, "rewards/rejected": -36.674713134765625, "step": 1574 }, { "epoch": 0.21446078431372548, "grad_norm": 41.916534782026424, "learning_rate": 7.685341885009617e-07, "logits/chosen": 10.154777526855469, "logits/rejected": 11.083196640014648, "logps/chosen": -2.8631885051727295, "logps/rejected": -3.1913676261901855, "loss": 4.563, "rewards/accuracies": 0.75, "rewards/chosen": -28.631885528564453, "rewards/margins": 3.281789779663086, "rewards/rejected": -31.913677215576172, "step": 1575 }, { "epoch": 0.21459694989106753, "grad_norm": 50.888853927721776, "learning_rate": 7.68460226317825e-07, "logits/chosen": 10.162498474121094, "logits/rejected": 10.147138595581055, "logps/chosen": -2.8775248527526855, "logps/rejected": -2.7761716842651367, "loss": 4.2419, "rewards/accuracies": 0.5, "rewards/chosen": -28.775249481201172, "rewards/margins": -1.01353120803833, "rewards/rejected": -27.76171875, "step": 1576 }, { "epoch": 0.2147331154684096, "grad_norm": 41.36558401082972, "learning_rate": 7.683861808780544e-07, "logits/chosen": 10.116613388061523, "logits/rejected": 11.773048400878906, "logps/chosen": -3.1564879417419434, "logps/rejected": -3.3439836502075195, "loss": 4.4661, "rewards/accuracies": 0.75, "rewards/chosen": -31.564876556396484, "rewards/margins": 1.8749589920043945, "rewards/rejected": -33.43983840942383, "step": 1577 }, { "epoch": 0.21486928104575165, "grad_norm": 48.27805386245086, "learning_rate": 7.683120521983813e-07, "logits/chosen": 10.542380332946777, "logits/rejected": 10.814872741699219, "logps/chosen": -2.9005417823791504, "logps/rejected": -2.773895502090454, "loss": 4.6887, "rewards/accuracies": 0.25, "rewards/chosen": -29.00541877746582, "rewards/margins": -1.2664642333984375, "rewards/rejected": -27.738954544067383, "step": 1578 }, { "epoch": 0.21500544662309368, "grad_norm": 45.97565915143697, "learning_rate": 7.682378402955553e-07, "logits/chosen": 12.042425155639648, "logits/rejected": 12.259542465209961, "logps/chosen": -3.0938572883605957, "logps/rejected": -3.1089484691619873, "loss": 4.2698, "rewards/accuracies": 0.5, "rewards/chosen": -30.938570022583008, "rewards/margins": 0.15091371536254883, "rewards/rejected": -31.08948516845703, "step": 1579 }, { "epoch": 0.21514161220043573, "grad_norm": 42.490796413552964, "learning_rate": 7.681635451863455e-07, "logits/chosen": 11.147957801818848, "logits/rejected": 10.917232513427734, "logps/chosen": -2.9385769367218018, "logps/rejected": -3.049510955810547, "loss": 4.1404, "rewards/accuracies": 0.75, "rewards/chosen": -29.38576889038086, "rewards/margins": 1.1093378067016602, "rewards/rejected": -30.495107650756836, "step": 1580 }, { "epoch": 0.2152777777777778, "grad_norm": 44.69501676119365, "learning_rate": 7.680891668875393e-07, "logits/chosen": 10.29680061340332, "logits/rejected": 10.930252075195312, "logps/chosen": -3.2446351051330566, "logps/rejected": -3.1580135822296143, "loss": 4.6818, "rewards/accuracies": 0.25, "rewards/chosen": -32.44635009765625, "rewards/margins": -0.866215705871582, "rewards/rejected": -31.580135345458984, "step": 1581 }, { "epoch": 0.21541394335511982, "grad_norm": 46.26957463443332, "learning_rate": 7.680147054159432e-07, "logits/chosen": 10.913053512573242, "logits/rejected": 11.053434371948242, "logps/chosen": -2.81643009185791, "logps/rejected": -3.1804049015045166, "loss": 3.9573, "rewards/accuracies": 0.75, "rewards/chosen": -28.16429901123047, "rewards/margins": 3.6397480964660645, "rewards/rejected": -31.804048538208008, "step": 1582 }, { "epoch": 0.21555010893246188, "grad_norm": 54.69678742149616, "learning_rate": 7.679401607883825e-07, "logits/chosen": 10.760224342346191, "logits/rejected": 10.904536247253418, "logps/chosen": -2.803906202316284, "logps/rejected": -2.9487643241882324, "loss": 4.3841, "rewards/accuracies": 0.75, "rewards/chosen": -28.0390625, "rewards/margins": 1.4485812187194824, "rewards/rejected": -29.487642288208008, "step": 1583 }, { "epoch": 0.21568627450980393, "grad_norm": 42.887683777796326, "learning_rate": 7.678655330217008e-07, "logits/chosen": 9.899728775024414, "logits/rejected": 9.849885940551758, "logps/chosen": -2.8401660919189453, "logps/rejected": -3.000636577606201, "loss": 4.2984, "rewards/accuracies": 0.75, "rewards/chosen": -28.401662826538086, "rewards/margins": 1.6047039031982422, "rewards/rejected": -30.006364822387695, "step": 1584 }, { "epoch": 0.21582244008714596, "grad_norm": 58.33558059234332, "learning_rate": 7.677908221327614e-07, "logits/chosen": 11.117902755737305, "logits/rejected": 10.486928939819336, "logps/chosen": -3.0306620597839355, "logps/rejected": -3.184180736541748, "loss": 4.4191, "rewards/accuracies": 0.75, "rewards/chosen": -30.306621551513672, "rewards/margins": 1.5351853370666504, "rewards/rejected": -31.841808319091797, "step": 1585 }, { "epoch": 0.21595860566448802, "grad_norm": 39.966535424138016, "learning_rate": 7.677160281384454e-07, "logits/chosen": 10.044441223144531, "logits/rejected": 10.255392074584961, "logps/chosen": -2.8165125846862793, "logps/rejected": -2.962432861328125, "loss": 3.8006, "rewards/accuracies": 0.5, "rewards/chosen": -28.165124893188477, "rewards/margins": 1.459202766418457, "rewards/rejected": -29.62432861328125, "step": 1586 }, { "epoch": 0.21609477124183007, "grad_norm": 59.07423498086564, "learning_rate": 7.676411510556532e-07, "logits/chosen": 9.946072578430176, "logits/rejected": 10.372909545898438, "logps/chosen": -3.118746519088745, "logps/rejected": -3.2652926445007324, "loss": 4.074, "rewards/accuracies": 0.75, "rewards/chosen": -31.18746566772461, "rewards/margins": 1.4654617309570312, "rewards/rejected": -32.65292739868164, "step": 1587 }, { "epoch": 0.2162309368191721, "grad_norm": 44.037375717548464, "learning_rate": 7.675661909013041e-07, "logits/chosen": 9.579008102416992, "logits/rejected": 10.084258079528809, "logps/chosen": -2.9431703090667725, "logps/rejected": -3.2321457862854004, "loss": 4.5195, "rewards/accuracies": 1.0, "rewards/chosen": -29.431703567504883, "rewards/margins": 2.8897528648376465, "rewards/rejected": -32.32145690917969, "step": 1588 }, { "epoch": 0.21636710239651416, "grad_norm": 45.617478793133174, "learning_rate": 7.674911476923358e-07, "logits/chosen": 9.936174392700195, "logits/rejected": 10.439626693725586, "logps/chosen": -2.809185743331909, "logps/rejected": -3.2825965881347656, "loss": 4.0402, "rewards/accuracies": 0.75, "rewards/chosen": -28.09185791015625, "rewards/margins": 4.734107494354248, "rewards/rejected": -32.825965881347656, "step": 1589 }, { "epoch": 0.21650326797385622, "grad_norm": 44.48819184362933, "learning_rate": 7.674160214457049e-07, "logits/chosen": 10.907636642456055, "logits/rejected": 11.478009223937988, "logps/chosen": -2.9144487380981445, "logps/rejected": -3.0407490730285645, "loss": 4.5876, "rewards/accuracies": 0.25, "rewards/chosen": -29.144485473632812, "rewards/margins": 1.263002872467041, "rewards/rejected": -30.407487869262695, "step": 1590 }, { "epoch": 0.21663943355119825, "grad_norm": 71.39617885662256, "learning_rate": 7.673408121783869e-07, "logits/chosen": 10.068742752075195, "logits/rejected": 9.313565254211426, "logps/chosen": -2.6461305618286133, "logps/rejected": -2.6943745613098145, "loss": 4.3658, "rewards/accuracies": 0.25, "rewards/chosen": -26.4613037109375, "rewards/margins": 0.4824404716491699, "rewards/rejected": -26.943744659423828, "step": 1591 }, { "epoch": 0.2167755991285403, "grad_norm": 43.79076589368862, "learning_rate": 7.672655199073759e-07, "logits/chosen": 11.888038635253906, "logits/rejected": 11.476862907409668, "logps/chosen": -3.1273796558380127, "logps/rejected": -3.381855010986328, "loss": 3.9973, "rewards/accuracies": 0.75, "rewards/chosen": -31.27379608154297, "rewards/margins": 2.544753074645996, "rewards/rejected": -33.81855010986328, "step": 1592 }, { "epoch": 0.21691176470588236, "grad_norm": 41.776199167061726, "learning_rate": 7.671901446496848e-07, "logits/chosen": 9.62586498260498, "logits/rejected": 9.880945205688477, "logps/chosen": -2.849433422088623, "logps/rejected": -2.885916233062744, "loss": 4.4446, "rewards/accuracies": 0.5, "rewards/chosen": -28.494333267211914, "rewards/margins": 0.36482810974121094, "rewards/rejected": -28.859161376953125, "step": 1593 }, { "epoch": 0.2170479302832244, "grad_norm": 48.8312552406098, "learning_rate": 7.671146864223454e-07, "logits/chosen": 11.310403823852539, "logits/rejected": 11.839383125305176, "logps/chosen": -3.2462427616119385, "logps/rejected": -3.3439700603485107, "loss": 4.2417, "rewards/accuracies": 0.75, "rewards/chosen": -32.46242904663086, "rewards/margins": 0.977271556854248, "rewards/rejected": -33.439701080322266, "step": 1594 }, { "epoch": 0.21718409586056645, "grad_norm": 41.57612979913696, "learning_rate": 7.67039145242408e-07, "logits/chosen": 11.343948364257812, "logits/rejected": 11.613540649414062, "logps/chosen": -3.2409887313842773, "logps/rejected": -3.4868924617767334, "loss": 3.6765, "rewards/accuracies": 0.75, "rewards/chosen": -32.40988540649414, "rewards/margins": 2.459038257598877, "rewards/rejected": -34.86892318725586, "step": 1595 }, { "epoch": 0.2173202614379085, "grad_norm": 44.17142164520721, "learning_rate": 7.669635211269417e-07, "logits/chosen": 10.55731201171875, "logits/rejected": 10.56271743774414, "logps/chosen": -2.761425256729126, "logps/rejected": -2.9032320976257324, "loss": 3.5915, "rewards/accuracies": 0.75, "rewards/chosen": -27.6142520904541, "rewards/margins": 1.4180665016174316, "rewards/rejected": -29.032318115234375, "step": 1596 }, { "epoch": 0.21745642701525056, "grad_norm": 42.93583784581286, "learning_rate": 7.668878140930344e-07, "logits/chosen": 10.079788208007812, "logits/rejected": 11.572044372558594, "logps/chosen": -3.001664161682129, "logps/rejected": -3.2036006450653076, "loss": 4.6846, "rewards/accuracies": 0.75, "rewards/chosen": -30.016643524169922, "rewards/margins": 2.019364356994629, "rewards/rejected": -32.036006927490234, "step": 1597 }, { "epoch": 0.2175925925925926, "grad_norm": 41.62583207219695, "learning_rate": 7.668120241577929e-07, "logits/chosen": 10.085017204284668, "logits/rejected": 10.31259822845459, "logps/chosen": -3.165400981903076, "logps/rejected": -3.1183652877807617, "loss": 4.1944, "rewards/accuracies": 0.25, "rewards/chosen": -31.654010772705078, "rewards/margins": -0.47035741806030273, "rewards/rejected": -31.183652877807617, "step": 1598 }, { "epoch": 0.21772875816993464, "grad_norm": 44.05251338055131, "learning_rate": 7.667361513383423e-07, "logits/chosen": 10.102235794067383, "logits/rejected": 12.254966735839844, "logps/chosen": -2.7741048336029053, "logps/rejected": -3.37631893157959, "loss": 3.8184, "rewards/accuracies": 1.0, "rewards/chosen": -27.741050720214844, "rewards/margins": 6.022141456604004, "rewards/rejected": -33.76319122314453, "step": 1599 }, { "epoch": 0.2178649237472767, "grad_norm": 47.89186999393182, "learning_rate": 7.666601956518269e-07, "logits/chosen": 9.006420135498047, "logits/rejected": 9.476083755493164, "logps/chosen": -2.6020455360412598, "logps/rejected": -2.581204891204834, "loss": 3.5082, "rewards/accuracies": 0.5, "rewards/chosen": -26.02045440673828, "rewards/margins": -0.208404541015625, "rewards/rejected": -25.812049865722656, "step": 1600 }, { "epoch": 0.21800108932461873, "grad_norm": 51.7484431073147, "learning_rate": 7.665841571154094e-07, "logits/chosen": 9.743616104125977, "logits/rejected": 9.903446197509766, "logps/chosen": -3.0806102752685547, "logps/rejected": -3.323484420776367, "loss": 3.8299, "rewards/accuracies": 0.75, "rewards/chosen": -30.806102752685547, "rewards/margins": 2.428741455078125, "rewards/rejected": -33.23484420776367, "step": 1601 }, { "epoch": 0.2181372549019608, "grad_norm": 100.28742870302676, "learning_rate": 7.665080357462715e-07, "logits/chosen": 9.40369987487793, "logits/rejected": 10.670145034790039, "logps/chosen": -2.9942240715026855, "logps/rejected": -3.165229558944702, "loss": 4.2232, "rewards/accuracies": 0.75, "rewards/chosen": -29.942237854003906, "rewards/margins": 1.710057258605957, "rewards/rejected": -31.652297973632812, "step": 1602 }, { "epoch": 0.21827342047930284, "grad_norm": 43.821930758828394, "learning_rate": 7.664318315616134e-07, "logits/chosen": 9.966240882873535, "logits/rejected": 11.09278678894043, "logps/chosen": -2.8825926780700684, "logps/rejected": -3.2348291873931885, "loss": 4.3619, "rewards/accuracies": 0.75, "rewards/chosen": -28.825927734375, "rewards/margins": 3.5223641395568848, "rewards/rejected": -32.34829330444336, "step": 1603 }, { "epoch": 0.21840958605664487, "grad_norm": 80.99278150915669, "learning_rate": 7.663555445786538e-07, "logits/chosen": 10.232994079589844, "logits/rejected": 10.417881965637207, "logps/chosen": -2.935324192047119, "logps/rejected": -3.3806748390197754, "loss": 4.1506, "rewards/accuracies": 0.75, "rewards/chosen": -29.353240966796875, "rewards/margins": 4.4535064697265625, "rewards/rejected": -33.80674743652344, "step": 1604 }, { "epoch": 0.21854575163398693, "grad_norm": 48.79345792097013, "learning_rate": 7.662791748146307e-07, "logits/chosen": 9.517831802368164, "logits/rejected": 9.823272705078125, "logps/chosen": -3.0484540462493896, "logps/rejected": -3.3394148349761963, "loss": 3.6943, "rewards/accuracies": 0.75, "rewards/chosen": -30.484540939331055, "rewards/margins": 2.90960693359375, "rewards/rejected": -33.39414978027344, "step": 1605 }, { "epoch": 0.21868191721132899, "grad_norm": 42.208788809659204, "learning_rate": 7.662027222868003e-07, "logits/chosen": 8.63067626953125, "logits/rejected": 9.881261825561523, "logps/chosen": -3.163954973220825, "logps/rejected": -3.2367701530456543, "loss": 3.7742, "rewards/accuracies": 0.5, "rewards/chosen": -31.639549255371094, "rewards/margins": 0.7281513214111328, "rewards/rejected": -32.367698669433594, "step": 1606 }, { "epoch": 0.21881808278867101, "grad_norm": 50.430030761333626, "learning_rate": 7.661261870124377e-07, "logits/chosen": 9.955596923828125, "logits/rejected": 10.47617244720459, "logps/chosen": -3.267261505126953, "logps/rejected": -3.347416877746582, "loss": 3.9171, "rewards/accuracies": 0.75, "rewards/chosen": -32.67261505126953, "rewards/margins": 0.8015546798706055, "rewards/rejected": -33.47416687011719, "step": 1607 }, { "epoch": 0.21895424836601307, "grad_norm": 43.448064394391736, "learning_rate": 7.660495690088368e-07, "logits/chosen": 12.16885757446289, "logits/rejected": 11.864875793457031, "logps/chosen": -3.333315372467041, "logps/rejected": -3.5461373329162598, "loss": 4.0377, "rewards/accuracies": 0.5, "rewards/chosen": -33.333152770996094, "rewards/margins": 2.128221035003662, "rewards/rejected": -35.46137237548828, "step": 1608 }, { "epoch": 0.21909041394335513, "grad_norm": 45.405156703591615, "learning_rate": 7.659728682933099e-07, "logits/chosen": 10.175829887390137, "logits/rejected": 10.756729125976562, "logps/chosen": -3.2641758918762207, "logps/rejected": -3.5182058811187744, "loss": 4.2507, "rewards/accuracies": 0.5, "rewards/chosen": -32.64175796508789, "rewards/margins": 2.5403013229370117, "rewards/rejected": -35.18206024169922, "step": 1609 }, { "epoch": 0.21922657952069716, "grad_norm": 42.21761696527975, "learning_rate": 7.658960848831883e-07, "logits/chosen": 9.95063591003418, "logits/rejected": 11.339876174926758, "logps/chosen": -2.6063294410705566, "logps/rejected": -3.1669235229492188, "loss": 3.9172, "rewards/accuracies": 1.0, "rewards/chosen": -26.063297271728516, "rewards/margins": 5.605942249298096, "rewards/rejected": -31.669239044189453, "step": 1610 }, { "epoch": 0.2193627450980392, "grad_norm": 50.83517684955892, "learning_rate": 7.658192187958218e-07, "logits/chosen": 11.301745414733887, "logits/rejected": 12.059618949890137, "logps/chosen": -3.563380241394043, "logps/rejected": -3.6650636196136475, "loss": 4.3873, "rewards/accuracies": 0.75, "rewards/chosen": -35.63380432128906, "rewards/margins": 1.0168342590332031, "rewards/rejected": -36.650634765625, "step": 1611 }, { "epoch": 0.21949891067538127, "grad_norm": 47.05301332031412, "learning_rate": 7.65742270048579e-07, "logits/chosen": 9.645236015319824, "logits/rejected": 10.687830924987793, "logps/chosen": -3.0830109119415283, "logps/rejected": -3.1090266704559326, "loss": 3.851, "rewards/accuracies": 0.5, "rewards/chosen": -30.830108642578125, "rewards/margins": 0.26015663146972656, "rewards/rejected": -31.09026527404785, "step": 1612 }, { "epoch": 0.2196350762527233, "grad_norm": 54.86119032451928, "learning_rate": 7.656652386588468e-07, "logits/chosen": 12.303096771240234, "logits/rejected": 10.661548614501953, "logps/chosen": -3.6667416095733643, "logps/rejected": -3.277395248413086, "loss": 4.8772, "rewards/accuracies": 0.0, "rewards/chosen": -36.667415618896484, "rewards/margins": -3.8934645652770996, "rewards/rejected": -32.773948669433594, "step": 1613 }, { "epoch": 0.21977124183006536, "grad_norm": 42.923428070200266, "learning_rate": 7.655881246440316e-07, "logits/chosen": 10.043821334838867, "logits/rejected": 12.425590515136719, "logps/chosen": -3.3671936988830566, "logps/rejected": -3.9496963024139404, "loss": 4.237, "rewards/accuracies": 1.0, "rewards/chosen": -33.67193603515625, "rewards/margins": 5.8250274658203125, "rewards/rejected": -39.49696350097656, "step": 1614 }, { "epoch": 0.2199074074074074, "grad_norm": 44.98915330930718, "learning_rate": 7.655109280215575e-07, "logits/chosen": 10.823247909545898, "logits/rejected": 10.733755111694336, "logps/chosen": -3.4336376190185547, "logps/rejected": -3.6205637454986572, "loss": 4.1523, "rewards/accuracies": 0.75, "rewards/chosen": -34.33637619018555, "rewards/margins": 1.8692617416381836, "rewards/rejected": -36.20563507080078, "step": 1615 }, { "epoch": 0.22004357298474944, "grad_norm": 50.55885789033671, "learning_rate": 7.654336488088678e-07, "logits/chosen": 11.327067375183105, "logits/rejected": 11.092135429382324, "logps/chosen": -3.123322010040283, "logps/rejected": -3.4360008239746094, "loss": 4.4079, "rewards/accuracies": 0.75, "rewards/chosen": -31.233219146728516, "rewards/margins": 3.1267881393432617, "rewards/rejected": -34.360008239746094, "step": 1616 }, { "epoch": 0.2201797385620915, "grad_norm": 46.371115773336626, "learning_rate": 7.653562870234245e-07, "logits/chosen": 10.287355422973633, "logits/rejected": 10.489201545715332, "logps/chosen": -3.274522066116333, "logps/rejected": -3.3768439292907715, "loss": 4.5095, "rewards/accuracies": 0.75, "rewards/chosen": -32.74522018432617, "rewards/margins": 1.023219108581543, "rewards/rejected": -33.76844024658203, "step": 1617 }, { "epoch": 0.22031590413943355, "grad_norm": 42.20578701039623, "learning_rate": 7.652788426827081e-07, "logits/chosen": 10.494738578796387, "logits/rejected": 11.164095878601074, "logps/chosen": -3.2347142696380615, "logps/rejected": -3.5218088626861572, "loss": 4.1264, "rewards/accuracies": 0.5, "rewards/chosen": -32.347145080566406, "rewards/margins": 2.8709444999694824, "rewards/rejected": -35.21808624267578, "step": 1618 }, { "epoch": 0.2204520697167756, "grad_norm": 47.90128138602086, "learning_rate": 7.652013158042179e-07, "logits/chosen": 11.027521133422852, "logits/rejected": 11.237503051757812, "logps/chosen": -3.565455198287964, "logps/rejected": -3.3765742778778076, "loss": 4.5137, "rewards/accuracies": 0.5, "rewards/chosen": -35.6545524597168, "rewards/margins": -1.8888096809387207, "rewards/rejected": -33.76573944091797, "step": 1619 }, { "epoch": 0.22058823529411764, "grad_norm": 48.31318243978467, "learning_rate": 7.651237064054713e-07, "logits/chosen": 10.482385635375977, "logits/rejected": 10.836200714111328, "logps/chosen": -3.1280112266540527, "logps/rejected": -3.1135072708129883, "loss": 3.46, "rewards/accuracies": 0.5, "rewards/chosen": -31.28011131286621, "rewards/margins": -0.14504003524780273, "rewards/rejected": -31.13507080078125, "step": 1620 }, { "epoch": 0.2207244008714597, "grad_norm": 42.23840908342697, "learning_rate": 7.650460145040053e-07, "logits/chosen": 10.85284423828125, "logits/rejected": 8.885726928710938, "logps/chosen": -3.235384941101074, "logps/rejected": -3.1822197437286377, "loss": 3.9196, "rewards/accuracies": 0.25, "rewards/chosen": -32.35384750366211, "rewards/margins": -0.5316500663757324, "rewards/rejected": -31.82219886779785, "step": 1621 }, { "epoch": 0.22086056644880175, "grad_norm": 40.08982413221266, "learning_rate": 7.649682401173748e-07, "logits/chosen": 10.482494354248047, "logits/rejected": 10.226903915405273, "logps/chosen": -2.9441583156585693, "logps/rejected": -3.114335536956787, "loss": 4.2717, "rewards/accuracies": 0.75, "rewards/chosen": -29.44158363342285, "rewards/margins": 1.7017707824707031, "rewards/rejected": -31.143356323242188, "step": 1622 }, { "epoch": 0.22099673202614378, "grad_norm": 46.86024799613147, "learning_rate": 7.648903832631536e-07, "logits/chosen": 10.731939315795898, "logits/rejected": 10.123628616333008, "logps/chosen": -3.0547218322753906, "logps/rejected": -3.3676302433013916, "loss": 3.9477, "rewards/accuracies": 1.0, "rewards/chosen": -30.547216415405273, "rewards/margins": 3.1290860176086426, "rewards/rejected": -33.67630386352539, "step": 1623 }, { "epoch": 0.22113289760348584, "grad_norm": 40.553112569699074, "learning_rate": 7.64812443958934e-07, "logits/chosen": 10.806344985961914, "logits/rejected": 10.20936393737793, "logps/chosen": -3.0487594604492188, "logps/rejected": -3.1838598251342773, "loss": 4.4435, "rewards/accuracies": 0.75, "rewards/chosen": -30.487594604492188, "rewards/margins": 1.3510031700134277, "rewards/rejected": -31.838598251342773, "step": 1624 }, { "epoch": 0.2212690631808279, "grad_norm": 42.94693866131077, "learning_rate": 7.647344222223273e-07, "logits/chosen": 8.898612022399902, "logits/rejected": 10.523826599121094, "logps/chosen": -2.826646089553833, "logps/rejected": -3.4829821586608887, "loss": 4.1896, "rewards/accuracies": 1.0, "rewards/chosen": -28.266460418701172, "rewards/margins": 6.563360691070557, "rewards/rejected": -34.8298225402832, "step": 1625 }, { "epoch": 0.22140522875816993, "grad_norm": 42.632186002408986, "learning_rate": 7.646563180709627e-07, "logits/chosen": 9.16966724395752, "logits/rejected": 10.09007453918457, "logps/chosen": -3.0818538665771484, "logps/rejected": -3.2952799797058105, "loss": 4.9544, "rewards/accuracies": 0.5, "rewards/chosen": -30.818538665771484, "rewards/margins": 2.1342592239379883, "rewards/rejected": -32.952796936035156, "step": 1626 }, { "epoch": 0.22154139433551198, "grad_norm": 45.806919052616905, "learning_rate": 7.64578131522489e-07, "logits/chosen": 11.264555931091309, "logits/rejected": 11.02249813079834, "logps/chosen": -3.2905008792877197, "logps/rejected": -3.5156688690185547, "loss": 4.0223, "rewards/accuracies": 1.0, "rewards/chosen": -32.90501022338867, "rewards/margins": 2.2516775131225586, "rewards/rejected": -35.15668487548828, "step": 1627 }, { "epoch": 0.22167755991285404, "grad_norm": 53.023442009684565, "learning_rate": 7.644998625945728e-07, "logits/chosen": 10.198068618774414, "logits/rejected": 9.907654762268066, "logps/chosen": -3.306551933288574, "logps/rejected": -3.283109664916992, "loss": 4.5939, "rewards/accuracies": 0.5, "rewards/chosen": -33.065521240234375, "rewards/margins": -0.2344226837158203, "rewards/rejected": -32.83109664916992, "step": 1628 }, { "epoch": 0.22181372549019607, "grad_norm": 38.45798874834746, "learning_rate": 7.644215113048996e-07, "logits/chosen": 11.688542366027832, "logits/rejected": 11.608729362487793, "logps/chosen": -3.187605142593384, "logps/rejected": -3.2740626335144043, "loss": 4.7405, "rewards/accuracies": 0.5, "rewards/chosen": -31.876052856445312, "rewards/margins": 0.8645739555358887, "rewards/rejected": -32.740623474121094, "step": 1629 }, { "epoch": 0.22194989106753812, "grad_norm": 40.42815798040916, "learning_rate": 7.643430776711736e-07, "logits/chosen": 9.835052490234375, "logits/rejected": 9.885414123535156, "logps/chosen": -3.1212925910949707, "logps/rejected": -3.246889352798462, "loss": 4.6539, "rewards/accuracies": 0.75, "rewards/chosen": -31.212923049926758, "rewards/margins": 1.255969524383545, "rewards/rejected": -32.468894958496094, "step": 1630 }, { "epoch": 0.22208605664488018, "grad_norm": 39.19231792490844, "learning_rate": 7.642645617111175e-07, "logits/chosen": 10.327232360839844, "logits/rejected": 10.712189674377441, "logps/chosen": -2.9532530307769775, "logps/rejected": -3.147434711456299, "loss": 4.0427, "rewards/accuracies": 0.5, "rewards/chosen": -29.532529830932617, "rewards/margins": 1.941817283630371, "rewards/rejected": -31.474348068237305, "step": 1631 }, { "epoch": 0.2222222222222222, "grad_norm": 43.69982866927946, "learning_rate": 7.641859634424726e-07, "logits/chosen": 10.637374877929688, "logits/rejected": 11.57960033416748, "logps/chosen": -3.089540958404541, "logps/rejected": -3.2795393466949463, "loss": 3.8607, "rewards/accuracies": 0.75, "rewards/chosen": -30.895408630371094, "rewards/margins": 1.8999848365783691, "rewards/rejected": -32.79539489746094, "step": 1632 }, { "epoch": 0.22235838779956427, "grad_norm": 40.48286808528252, "learning_rate": 7.64107282882999e-07, "logits/chosen": 10.935823440551758, "logits/rejected": 10.681814193725586, "logps/chosen": -3.0336265563964844, "logps/rejected": -3.2264835834503174, "loss": 4.5065, "rewards/accuracies": 0.5, "rewards/chosen": -30.336265563964844, "rewards/margins": 1.9285707473754883, "rewards/rejected": -32.264835357666016, "step": 1633 }, { "epoch": 0.22249455337690632, "grad_norm": 41.62501156237815, "learning_rate": 7.640285200504749e-07, "logits/chosen": 9.574872970581055, "logits/rejected": 9.736640930175781, "logps/chosen": -3.117581844329834, "logps/rejected": -3.1464574337005615, "loss": 4.3231, "rewards/accuracies": 0.5, "rewards/chosen": -31.175819396972656, "rewards/margins": 0.2887554168701172, "rewards/rejected": -31.464574813842773, "step": 1634 }, { "epoch": 0.22263071895424835, "grad_norm": 43.080547843761444, "learning_rate": 7.639496749626978e-07, "logits/chosen": 9.492728233337402, "logits/rejected": 10.138287544250488, "logps/chosen": -3.183295726776123, "logps/rejected": -3.1793556213378906, "loss": 4.1545, "rewards/accuracies": 0.5, "rewards/chosen": -31.832956314086914, "rewards/margins": -0.03940153121948242, "rewards/rejected": -31.793554306030273, "step": 1635 }, { "epoch": 0.2227668845315904, "grad_norm": 33.92914215772635, "learning_rate": 7.638707476374831e-07, "logits/chosen": 9.261457443237305, "logits/rejected": 9.96063232421875, "logps/chosen": -2.7370951175689697, "logps/rejected": -2.7384002208709717, "loss": 3.2366, "rewards/accuracies": 0.75, "rewards/chosen": -27.370952606201172, "rewards/margins": 0.013051033020019531, "rewards/rejected": -27.384002685546875, "step": 1636 }, { "epoch": 0.22290305010893247, "grad_norm": 43.74092201925821, "learning_rate": 7.637917380926652e-07, "logits/chosen": 10.09945011138916, "logits/rejected": 9.976531982421875, "logps/chosen": -2.9625449180603027, "logps/rejected": -3.4534637928009033, "loss": 4.1776, "rewards/accuracies": 1.0, "rewards/chosen": -29.625446319580078, "rewards/margins": 4.909190654754639, "rewards/rejected": -34.534637451171875, "step": 1637 }, { "epoch": 0.22303921568627452, "grad_norm": 39.44520984227425, "learning_rate": 7.637126463460969e-07, "logits/chosen": 9.140487670898438, "logits/rejected": 11.307270050048828, "logps/chosen": -3.021130084991455, "logps/rejected": -3.347202777862549, "loss": 4.7676, "rewards/accuracies": 0.5, "rewards/chosen": -30.211301803588867, "rewards/margins": 3.260725975036621, "rewards/rejected": -33.47202682495117, "step": 1638 }, { "epoch": 0.22317538126361655, "grad_norm": 41.68116516026846, "learning_rate": 7.636334724156497e-07, "logits/chosen": 10.354597091674805, "logits/rejected": 10.705038070678711, "logps/chosen": -2.8062102794647217, "logps/rejected": -2.9360694885253906, "loss": 4.2027, "rewards/accuracies": 0.75, "rewards/chosen": -28.062103271484375, "rewards/margins": 1.2985916137695312, "rewards/rejected": -29.360694885253906, "step": 1639 }, { "epoch": 0.2233115468409586, "grad_norm": 44.656287129574515, "learning_rate": 7.635542163192137e-07, "logits/chosen": 9.482877731323242, "logits/rejected": 9.602519035339355, "logps/chosen": -2.8170926570892334, "logps/rejected": -3.0711112022399902, "loss": 3.8408, "rewards/accuracies": 0.75, "rewards/chosen": -28.17092514038086, "rewards/margins": 2.540187358856201, "rewards/rejected": -30.71111297607422, "step": 1640 }, { "epoch": 0.22344771241830066, "grad_norm": 45.03509073313332, "learning_rate": 7.634748780746973e-07, "logits/chosen": 9.938918113708496, "logits/rejected": 11.141223907470703, "logps/chosen": -3.064537286758423, "logps/rejected": -3.0291342735290527, "loss": 3.7072, "rewards/accuracies": 0.75, "rewards/chosen": -30.64537239074707, "rewards/margins": -0.35403013229370117, "rewards/rejected": -30.291343688964844, "step": 1641 }, { "epoch": 0.2235838779956427, "grad_norm": 41.71940998376934, "learning_rate": 7.633954577000276e-07, "logits/chosen": 9.814565658569336, "logits/rejected": 11.917019844055176, "logps/chosen": -2.743889093399048, "logps/rejected": -3.5130186080932617, "loss": 4.1388, "rewards/accuracies": 1.0, "rewards/chosen": -27.438892364501953, "rewards/margins": 7.691292762756348, "rewards/rejected": -35.130184173583984, "step": 1642 }, { "epoch": 0.22372004357298475, "grad_norm": 39.58663181594028, "learning_rate": 7.633159552131504e-07, "logits/chosen": 10.495260238647461, "logits/rejected": 9.545320510864258, "logps/chosen": -2.62636661529541, "logps/rejected": -2.6914563179016113, "loss": 4.3425, "rewards/accuracies": 0.75, "rewards/chosen": -26.263668060302734, "rewards/margins": 0.6508941650390625, "rewards/rejected": -26.914562225341797, "step": 1643 }, { "epoch": 0.2238562091503268, "grad_norm": 40.182569766345665, "learning_rate": 7.632363706320299e-07, "logits/chosen": 8.648763656616211, "logits/rejected": 10.692028045654297, "logps/chosen": -2.643620729446411, "logps/rejected": -3.0076146125793457, "loss": 3.722, "rewards/accuracies": 1.0, "rewards/chosen": -26.436206817626953, "rewards/margins": 3.639939308166504, "rewards/rejected": -30.07614517211914, "step": 1644 }, { "epoch": 0.22399237472766884, "grad_norm": 38.73317204575719, "learning_rate": 7.631567039746491e-07, "logits/chosen": 10.494891166687012, "logits/rejected": 11.610410690307617, "logps/chosen": -2.8887152671813965, "logps/rejected": -3.122001886367798, "loss": 4.0858, "rewards/accuracies": 1.0, "rewards/chosen": -28.88715362548828, "rewards/margins": 2.332864284515381, "rewards/rejected": -31.22001838684082, "step": 1645 }, { "epoch": 0.2241285403050109, "grad_norm": 42.48887346443057, "learning_rate": 7.63076955259009e-07, "logits/chosen": 10.62775993347168, "logits/rejected": 11.237966537475586, "logps/chosen": -2.9406778812408447, "logps/rejected": -2.8103816509246826, "loss": 4.6295, "rewards/accuracies": 0.5, "rewards/chosen": -29.406776428222656, "rewards/margins": -1.3029603958129883, "rewards/rejected": -28.103816986083984, "step": 1646 }, { "epoch": 0.22426470588235295, "grad_norm": 45.13829028772432, "learning_rate": 7.629971245031296e-07, "logits/chosen": 9.664993286132812, "logits/rejected": 10.545351028442383, "logps/chosen": -2.8444199562072754, "logps/rejected": -2.9716644287109375, "loss": 4.4705, "rewards/accuracies": 0.75, "rewards/chosen": -28.444198608398438, "rewards/margins": 1.272444725036621, "rewards/rejected": -29.716644287109375, "step": 1647 }, { "epoch": 0.22440087145969498, "grad_norm": 42.954314304097565, "learning_rate": 7.629172117250494e-07, "logits/chosen": 11.359594345092773, "logits/rejected": 10.731892585754395, "logps/chosen": -3.0086684226989746, "logps/rejected": -3.013793468475342, "loss": 4.0326, "rewards/accuracies": 0.5, "rewards/chosen": -30.086685180664062, "rewards/margins": 0.05124950408935547, "rewards/rejected": -30.1379337310791, "step": 1648 }, { "epoch": 0.22453703703703703, "grad_norm": 40.065388042479114, "learning_rate": 7.628372169428253e-07, "logits/chosen": 10.779003143310547, "logits/rejected": 11.295446395874023, "logps/chosen": -3.1363255977630615, "logps/rejected": -3.1231768131256104, "loss": 4.3943, "rewards/accuracies": 0.5, "rewards/chosen": -31.36325454711914, "rewards/margins": -0.13148736953735352, "rewards/rejected": -31.231765747070312, "step": 1649 }, { "epoch": 0.2246732026143791, "grad_norm": 43.556338902608395, "learning_rate": 7.627571401745328e-07, "logits/chosen": 10.815631866455078, "logits/rejected": 11.13023567199707, "logps/chosen": -3.009124279022217, "logps/rejected": -2.9546942710876465, "loss": 4.1122, "rewards/accuracies": 0.5, "rewards/chosen": -30.091243743896484, "rewards/margins": -0.5442996025085449, "rewards/rejected": -29.54694366455078, "step": 1650 }, { "epoch": 0.22480936819172112, "grad_norm": 37.99871318049183, "learning_rate": 7.626769814382658e-07, "logits/chosen": 8.757647514343262, "logits/rejected": 10.43747329711914, "logps/chosen": -2.5676183700561523, "logps/rejected": -2.8995795249938965, "loss": 4.0881, "rewards/accuracies": 0.75, "rewards/chosen": -25.67618179321289, "rewards/margins": 3.319612503051758, "rewards/rejected": -28.99579620361328, "step": 1651 }, { "epoch": 0.22494553376906318, "grad_norm": 41.6281922654279, "learning_rate": 7.62596740752137e-07, "logits/chosen": 11.561363220214844, "logits/rejected": 11.893850326538086, "logps/chosen": -3.1026525497436523, "logps/rejected": -3.4526095390319824, "loss": 4.3823, "rewards/accuracies": 0.75, "rewards/chosen": -31.026525497436523, "rewards/margins": 3.499567985534668, "rewards/rejected": -34.526092529296875, "step": 1652 }, { "epoch": 0.22508169934640523, "grad_norm": 42.861410773680944, "learning_rate": 7.625164181342775e-07, "logits/chosen": 11.163588523864746, "logits/rejected": 11.440683364868164, "logps/chosen": -3.2456817626953125, "logps/rejected": -3.2632012367248535, "loss": 4.5411, "rewards/accuracies": 0.5, "rewards/chosen": -32.456817626953125, "rewards/margins": 0.17519283294677734, "rewards/rejected": -32.63201141357422, "step": 1653 }, { "epoch": 0.22521786492374726, "grad_norm": 45.30664731965813, "learning_rate": 7.624360136028366e-07, "logits/chosen": 9.348146438598633, "logits/rejected": 9.73584270477295, "logps/chosen": -2.9108269214630127, "logps/rejected": -2.895287275314331, "loss": 3.7791, "rewards/accuracies": 0.5, "rewards/chosen": -29.10826873779297, "rewards/margins": -0.1553964614868164, "rewards/rejected": -28.95287322998047, "step": 1654 }, { "epoch": 0.22535403050108932, "grad_norm": 48.395580903995885, "learning_rate": 7.623555271759825e-07, "logits/chosen": 10.107051849365234, "logits/rejected": 10.888927459716797, "logps/chosen": -2.6013050079345703, "logps/rejected": -2.6841344833374023, "loss": 4.2472, "rewards/accuracies": 0.5, "rewards/chosen": -26.013050079345703, "rewards/margins": 0.8282933235168457, "rewards/rejected": -26.84134292602539, "step": 1655 }, { "epoch": 0.22549019607843138, "grad_norm": 47.144579487457385, "learning_rate": 7.622749588719018e-07, "logits/chosen": 10.279720306396484, "logits/rejected": 10.823524475097656, "logps/chosen": -2.673274517059326, "logps/rejected": -2.9221887588500977, "loss": 4.0433, "rewards/accuracies": 1.0, "rewards/chosen": -26.732746124267578, "rewards/margins": 2.489142417907715, "rewards/rejected": -29.221887588500977, "step": 1656 }, { "epoch": 0.22562636165577343, "grad_norm": 42.69151492108011, "learning_rate": 7.621943087087995e-07, "logits/chosen": 11.681303024291992, "logits/rejected": 10.661335945129395, "logps/chosen": -3.1148579120635986, "logps/rejected": -2.8654959201812744, "loss": 3.9249, "rewards/accuracies": 0.0, "rewards/chosen": -31.148578643798828, "rewards/margins": -2.4936208724975586, "rewards/rejected": -28.65496063232422, "step": 1657 }, { "epoch": 0.22576252723311546, "grad_norm": 40.4179326108933, "learning_rate": 7.621135767048993e-07, "logits/chosen": 10.029093742370605, "logits/rejected": 10.71600341796875, "logps/chosen": -2.845021963119507, "logps/rejected": -2.9214131832122803, "loss": 4.4438, "rewards/accuracies": 0.5, "rewards/chosen": -28.450220108032227, "rewards/margins": 0.763911247253418, "rewards/rejected": -29.214130401611328, "step": 1658 }, { "epoch": 0.22589869281045752, "grad_norm": 46.63653195208202, "learning_rate": 7.620327628784432e-07, "logits/chosen": 11.197123527526855, "logits/rejected": 11.001283645629883, "logps/chosen": -3.420504570007324, "logps/rejected": -3.1716556549072266, "loss": 3.9005, "rewards/accuracies": 0.25, "rewards/chosen": -34.205047607421875, "rewards/margins": -2.4884896278381348, "rewards/rejected": -31.716556549072266, "step": 1659 }, { "epoch": 0.22603485838779958, "grad_norm": 47.97169159120734, "learning_rate": 7.619518672476916e-07, "logits/chosen": 11.042069435119629, "logits/rejected": 9.114731788635254, "logps/chosen": -2.9710841178894043, "logps/rejected": -2.606074333190918, "loss": 4.4911, "rewards/accuracies": 0.25, "rewards/chosen": -29.710840225219727, "rewards/margins": -3.650096893310547, "rewards/rejected": -26.060745239257812, "step": 1660 }, { "epoch": 0.2261710239651416, "grad_norm": 41.10045861221667, "learning_rate": 7.618708898309238e-07, "logits/chosen": 10.767000198364258, "logits/rejected": 10.325174331665039, "logps/chosen": -2.593057155609131, "logps/rejected": -2.6793313026428223, "loss": 3.9623, "rewards/accuracies": 0.5, "rewards/chosen": -25.930572509765625, "rewards/margins": 0.8627405166625977, "rewards/rejected": -26.793312072753906, "step": 1661 }, { "epoch": 0.22630718954248366, "grad_norm": 127.77171935745464, "learning_rate": 7.617898306464371e-07, "logits/chosen": 9.44491958618164, "logits/rejected": 11.935613632202148, "logps/chosen": -2.7354464530944824, "logps/rejected": -3.1201069355010986, "loss": 4.3287, "rewards/accuracies": 0.75, "rewards/chosen": -27.35446548461914, "rewards/margins": 3.8466005325317383, "rewards/rejected": -31.201068878173828, "step": 1662 }, { "epoch": 0.22644335511982572, "grad_norm": 56.02611381511405, "learning_rate": 7.617086897125476e-07, "logits/chosen": 11.012081146240234, "logits/rejected": 11.54173469543457, "logps/chosen": -3.1414055824279785, "logps/rejected": -3.5389397144317627, "loss": 4.3858, "rewards/accuracies": 0.75, "rewards/chosen": -31.4140567779541, "rewards/margins": 3.9753403663635254, "rewards/rejected": -35.38939666748047, "step": 1663 }, { "epoch": 0.22657952069716775, "grad_norm": 45.06880003813621, "learning_rate": 7.616274670475897e-07, "logits/chosen": 11.425146102905273, "logits/rejected": 11.379518508911133, "logps/chosen": -3.195451259613037, "logps/rejected": -3.2734527587890625, "loss": 4.7623, "rewards/accuracies": 0.75, "rewards/chosen": -31.954513549804688, "rewards/margins": 0.7800154685974121, "rewards/rejected": -32.734527587890625, "step": 1664 }, { "epoch": 0.2267156862745098, "grad_norm": 40.043032545082994, "learning_rate": 7.615461626699164e-07, "logits/chosen": 10.07541275024414, "logits/rejected": 10.875508308410645, "logps/chosen": -2.6381571292877197, "logps/rejected": -2.9523096084594727, "loss": 4.2175, "rewards/accuracies": 1.0, "rewards/chosen": -26.38157081604004, "rewards/margins": 3.1415247917175293, "rewards/rejected": -29.523094177246094, "step": 1665 }, { "epoch": 0.22685185185185186, "grad_norm": 37.35476718804492, "learning_rate": 7.614647765978991e-07, "logits/chosen": 9.866602897644043, "logits/rejected": 10.36777114868164, "logps/chosen": -3.0662384033203125, "logps/rejected": -3.12886643409729, "loss": 4.231, "rewards/accuracies": 0.75, "rewards/chosen": -30.662384033203125, "rewards/margins": 0.6262803077697754, "rewards/rejected": -31.288663864135742, "step": 1666 }, { "epoch": 0.2269880174291939, "grad_norm": 45.16652774649799, "learning_rate": 7.613833088499278e-07, "logits/chosen": 10.73572826385498, "logits/rejected": 11.696893692016602, "logps/chosen": -2.9911434650421143, "logps/rejected": -3.3539929389953613, "loss": 3.9726, "rewards/accuracies": 1.0, "rewards/chosen": -29.911436080932617, "rewards/margins": 3.6284937858581543, "rewards/rejected": -33.53993225097656, "step": 1667 }, { "epoch": 0.22712418300653595, "grad_norm": 40.47427541376753, "learning_rate": 7.613017594444104e-07, "logits/chosen": 10.314510345458984, "logits/rejected": 11.285955429077148, "logps/chosen": -3.1439783573150635, "logps/rejected": -3.3560328483581543, "loss": 4.1654, "rewards/accuracies": 1.0, "rewards/chosen": -31.43978500366211, "rewards/margins": 2.120542526245117, "rewards/rejected": -33.560325622558594, "step": 1668 }, { "epoch": 0.227260348583878, "grad_norm": 44.711512627337356, "learning_rate": 7.61220128399774e-07, "logits/chosen": 10.691610336303711, "logits/rejected": 10.631407737731934, "logps/chosen": -3.1397666931152344, "logps/rejected": -3.2494518756866455, "loss": 4.1299, "rewards/accuracies": 0.5, "rewards/chosen": -31.397666931152344, "rewards/margins": 1.0968523025512695, "rewards/rejected": -32.4945182800293, "step": 1669 }, { "epoch": 0.22739651416122003, "grad_norm": 46.413095907611414, "learning_rate": 7.611384157344638e-07, "logits/chosen": 10.54730224609375, "logits/rejected": 10.480978012084961, "logps/chosen": -2.9907379150390625, "logps/rejected": -2.8700296878814697, "loss": 3.9903, "rewards/accuracies": 0.25, "rewards/chosen": -29.907379150390625, "rewards/margins": -1.2070798873901367, "rewards/rejected": -28.70029640197754, "step": 1670 }, { "epoch": 0.2275326797385621, "grad_norm": 38.30912262152213, "learning_rate": 7.610566214669432e-07, "logits/chosen": 9.30904769897461, "logits/rejected": 10.949422836303711, "logps/chosen": -2.9373362064361572, "logps/rejected": -3.2700376510620117, "loss": 3.9626, "rewards/accuracies": 0.5, "rewards/chosen": -29.373361587524414, "rewards/margins": 3.3270163536071777, "rewards/rejected": -32.70037841796875, "step": 1671 }, { "epoch": 0.22766884531590414, "grad_norm": 36.081832727804745, "learning_rate": 7.609747456156946e-07, "logits/chosen": 11.482694625854492, "logits/rejected": 10.902587890625, "logps/chosen": -3.1019251346588135, "logps/rejected": -3.2134222984313965, "loss": 4.1278, "rewards/accuracies": 0.5, "rewards/chosen": -31.019250869750977, "rewards/margins": 1.1149725914001465, "rewards/rejected": -32.13422393798828, "step": 1672 }, { "epoch": 0.22780501089324617, "grad_norm": 40.70308056649073, "learning_rate": 7.608927881992182e-07, "logits/chosen": 10.765778541564941, "logits/rejected": 10.053537368774414, "logps/chosen": -3.0666565895080566, "logps/rejected": -2.940049648284912, "loss": 4.0394, "rewards/accuracies": 0.25, "rewards/chosen": -30.666563034057617, "rewards/margins": -1.2660694122314453, "rewards/rejected": -29.400493621826172, "step": 1673 }, { "epoch": 0.22794117647058823, "grad_norm": 39.20997233989684, "learning_rate": 7.608107492360333e-07, "logits/chosen": 10.621841430664062, "logits/rejected": 10.218563079833984, "logps/chosen": -2.9839909076690674, "logps/rejected": -3.162595510482788, "loss": 4.3103, "rewards/accuracies": 0.75, "rewards/chosen": -29.839908599853516, "rewards/margins": 1.786046028137207, "rewards/rejected": -31.625953674316406, "step": 1674 }, { "epoch": 0.2280773420479303, "grad_norm": 46.57737731395865, "learning_rate": 7.60728628744677e-07, "logits/chosen": 9.765665054321289, "logits/rejected": 12.060486793518066, "logps/chosen": -3.0089964866638184, "logps/rejected": -3.7642157077789307, "loss": 4.538, "rewards/accuracies": 1.0, "rewards/chosen": -30.089962005615234, "rewards/margins": 7.552196025848389, "rewards/rejected": -37.64215850830078, "step": 1675 }, { "epoch": 0.22821350762527234, "grad_norm": 50.847850342900394, "learning_rate": 7.606464267437052e-07, "logits/chosen": 8.863077163696289, "logits/rejected": 10.451902389526367, "logps/chosen": -2.770291328430176, "logps/rejected": -3.0144333839416504, "loss": 3.7188, "rewards/accuracies": 0.75, "rewards/chosen": -27.702913284301758, "rewards/margins": 2.4414210319519043, "rewards/rejected": -30.14433479309082, "step": 1676 }, { "epoch": 0.22834967320261437, "grad_norm": 43.282819541338185, "learning_rate": 7.605641432516923e-07, "logits/chosen": 11.219411849975586, "logits/rejected": 11.241082191467285, "logps/chosen": -3.1345229148864746, "logps/rejected": -3.2932815551757812, "loss": 4.2696, "rewards/accuracies": 0.75, "rewards/chosen": -31.34522819519043, "rewards/margins": 1.5875859260559082, "rewards/rejected": -32.93281555175781, "step": 1677 }, { "epoch": 0.22848583877995643, "grad_norm": 41.017665349451846, "learning_rate": 7.604817782872307e-07, "logits/chosen": 11.071002006530762, "logits/rejected": 11.602151870727539, "logps/chosen": -3.2270102500915527, "logps/rejected": -3.3254294395446777, "loss": 4.5045, "rewards/accuracies": 0.75, "rewards/chosen": -32.270103454589844, "rewards/margins": 0.9841880798339844, "rewards/rejected": -33.25429153442383, "step": 1678 }, { "epoch": 0.2286220043572985, "grad_norm": 42.76373985364085, "learning_rate": 7.603993318689315e-07, "logits/chosen": 10.434930801391602, "logits/rejected": 11.657386779785156, "logps/chosen": -3.2676358222961426, "logps/rejected": -3.340768337249756, "loss": 4.3001, "rewards/accuracies": 0.5, "rewards/chosen": -32.67635726928711, "rewards/margins": 0.7313251495361328, "rewards/rejected": -33.407684326171875, "step": 1679 }, { "epoch": 0.22875816993464052, "grad_norm": 40.30854237087969, "learning_rate": 7.603168040154242e-07, "logits/chosen": 12.075400352478027, "logits/rejected": 10.36459732055664, "logps/chosen": -3.324509620666504, "logps/rejected": -3.0805821418762207, "loss": 4.125, "rewards/accuracies": 0.25, "rewards/chosen": -33.245094299316406, "rewards/margins": -2.439274787902832, "rewards/rejected": -30.80582046508789, "step": 1680 }, { "epoch": 0.22889433551198257, "grad_norm": 43.932984198365276, "learning_rate": 7.602341947453566e-07, "logits/chosen": 10.721611976623535, "logits/rejected": 10.486390113830566, "logps/chosen": -2.853795051574707, "logps/rejected": -2.998821496963501, "loss": 4.5365, "rewards/accuracies": 0.5, "rewards/chosen": -28.537952423095703, "rewards/margins": 1.450261116027832, "rewards/rejected": -29.98821449279785, "step": 1681 }, { "epoch": 0.22903050108932463, "grad_norm": 42.43810548122513, "learning_rate": 7.60151504077395e-07, "logits/chosen": 12.234806060791016, "logits/rejected": 10.04534912109375, "logps/chosen": -3.1957530975341797, "logps/rejected": -2.907771110534668, "loss": 3.455, "rewards/accuracies": 0.25, "rewards/chosen": -31.95753288269043, "rewards/margins": -2.879821300506592, "rewards/rejected": -29.07771110534668, "step": 1682 }, { "epoch": 0.22916666666666666, "grad_norm": 39.21616020020058, "learning_rate": 7.60068732030224e-07, "logits/chosen": 10.518815994262695, "logits/rejected": 11.611924171447754, "logps/chosen": -3.002340316772461, "logps/rejected": -3.167842149734497, "loss": 4.42, "rewards/accuracies": 0.5, "rewards/chosen": -30.023405075073242, "rewards/margins": 1.655015468597412, "rewards/rejected": -31.678421020507812, "step": 1683 }, { "epoch": 0.22930283224400871, "grad_norm": 38.80074790926921, "learning_rate": 7.599858786225466e-07, "logits/chosen": 10.421335220336914, "logits/rejected": 11.519657135009766, "logps/chosen": -2.993281364440918, "logps/rejected": -3.441329002380371, "loss": 3.609, "rewards/accuracies": 1.0, "rewards/chosen": -29.93281364440918, "rewards/margins": 4.480474472045898, "rewards/rejected": -34.41328811645508, "step": 1684 }, { "epoch": 0.22943899782135077, "grad_norm": 39.69676851769496, "learning_rate": 7.599029438730843e-07, "logits/chosen": 9.753073692321777, "logits/rejected": 10.629011154174805, "logps/chosen": -2.771127223968506, "logps/rejected": -3.030427932739258, "loss": 3.9753, "rewards/accuracies": 0.5, "rewards/chosen": -27.711271286010742, "rewards/margins": 2.5930089950561523, "rewards/rejected": -30.304279327392578, "step": 1685 }, { "epoch": 0.2295751633986928, "grad_norm": 51.53631502949917, "learning_rate": 7.598199278005769e-07, "logits/chosen": 10.3518705368042, "logits/rejected": 10.324962615966797, "logps/chosen": -2.954235076904297, "logps/rejected": -2.8765153884887695, "loss": 4.1686, "rewards/accuracies": 0.5, "rewards/chosen": -29.54235076904297, "rewards/margins": -0.777198314666748, "rewards/rejected": -28.765151977539062, "step": 1686 }, { "epoch": 0.22971132897603486, "grad_norm": 38.59637479623644, "learning_rate": 7.597368304237823e-07, "logits/chosen": 10.786913871765137, "logits/rejected": 11.379411697387695, "logps/chosen": -3.184955358505249, "logps/rejected": -3.363079071044922, "loss": 3.7711, "rewards/accuracies": 1.0, "rewards/chosen": -31.849552154541016, "rewards/margins": 1.781238079071045, "rewards/rejected": -33.63079071044922, "step": 1687 }, { "epoch": 0.2298474945533769, "grad_norm": 47.331931031431, "learning_rate": 7.596536517614774e-07, "logits/chosen": 10.669641494750977, "logits/rejected": 9.088300704956055, "logps/chosen": -3.0948290824890137, "logps/rejected": -2.8163459300994873, "loss": 3.8097, "rewards/accuracies": 0.25, "rewards/chosen": -30.94828987121582, "rewards/margins": -2.7848315238952637, "rewards/rejected": -28.16345977783203, "step": 1688 }, { "epoch": 0.22998366013071894, "grad_norm": 44.548697639828625, "learning_rate": 7.59570391832457e-07, "logits/chosen": 9.693063735961914, "logits/rejected": 10.276634216308594, "logps/chosen": -2.6645264625549316, "logps/rejected": -3.0518558025360107, "loss": 4.1021, "rewards/accuracies": 1.0, "rewards/chosen": -26.645265579223633, "rewards/margins": 3.8732943534851074, "rewards/rejected": -30.5185604095459, "step": 1689 }, { "epoch": 0.230119825708061, "grad_norm": 50.58915920179338, "learning_rate": 7.594870506555343e-07, "logits/chosen": 10.99593448638916, "logits/rejected": 11.415027618408203, "logps/chosen": -3.086207389831543, "logps/rejected": -3.2518815994262695, "loss": 3.8693, "rewards/accuracies": 0.75, "rewards/chosen": -30.862071990966797, "rewards/margins": 1.6567435264587402, "rewards/rejected": -32.51881790161133, "step": 1690 }, { "epoch": 0.23025599128540306, "grad_norm": 40.89737833495333, "learning_rate": 7.594036282495409e-07, "logits/chosen": 10.306074142456055, "logits/rejected": 9.821822166442871, "logps/chosen": -2.9225287437438965, "logps/rejected": -2.8921358585357666, "loss": 3.8376, "rewards/accuracies": 0.5, "rewards/chosen": -29.22528839111328, "rewards/margins": -0.3039283752441406, "rewards/rejected": -28.92136001586914, "step": 1691 }, { "epoch": 0.23039215686274508, "grad_norm": 43.06101971183564, "learning_rate": 7.593201246333269e-07, "logits/chosen": 10.615398406982422, "logits/rejected": 11.538758277893066, "logps/chosen": -2.9561357498168945, "logps/rejected": -3.373074531555176, "loss": 3.8803, "rewards/accuracies": 1.0, "rewards/chosen": -29.561355590820312, "rewards/margins": 4.169389247894287, "rewards/rejected": -33.730743408203125, "step": 1692 }, { "epoch": 0.23052832244008714, "grad_norm": 44.82340534413868, "learning_rate": 7.592365398257605e-07, "logits/chosen": 9.92831039428711, "logits/rejected": 11.495080947875977, "logps/chosen": -2.654182195663452, "logps/rejected": -2.866476535797119, "loss": 3.9792, "rewards/accuracies": 0.75, "rewards/chosen": -26.54182243347168, "rewards/margins": 2.122943878173828, "rewards/rejected": -28.664766311645508, "step": 1693 }, { "epoch": 0.2306644880174292, "grad_norm": 48.015312807055835, "learning_rate": 7.591528738457284e-07, "logits/chosen": 11.306159973144531, "logits/rejected": 10.872743606567383, "logps/chosen": -3.094783306121826, "logps/rejected": -3.178067207336426, "loss": 4.3654, "rewards/accuracies": 0.25, "rewards/chosen": -30.947834014892578, "rewards/margins": 0.8328375816345215, "rewards/rejected": -31.78067398071289, "step": 1694 }, { "epoch": 0.23080065359477125, "grad_norm": 42.41235322444405, "learning_rate": 7.59069126712136e-07, "logits/chosen": 10.710954666137695, "logits/rejected": 10.284696578979492, "logps/chosen": -3.0653023719787598, "logps/rejected": -3.056319236755371, "loss": 4.0269, "rewards/accuracies": 0.5, "rewards/chosen": -30.65302085876465, "rewards/margins": -0.0898275375366211, "rewards/rejected": -30.563194274902344, "step": 1695 }, { "epoch": 0.23093681917211328, "grad_norm": 42.610830599132676, "learning_rate": 7.589852984439059e-07, "logits/chosen": 10.458246231079102, "logits/rejected": 10.928068161010742, "logps/chosen": -2.80136775970459, "logps/rejected": -3.163193702697754, "loss": 3.9018, "rewards/accuracies": 0.75, "rewards/chosen": -28.013675689697266, "rewards/margins": 3.618259906768799, "rewards/rejected": -31.63193702697754, "step": 1696 }, { "epoch": 0.23107298474945534, "grad_norm": 46.050336476461176, "learning_rate": 7.589013890599804e-07, "logits/chosen": 9.979471206665039, "logits/rejected": 10.986335754394531, "logps/chosen": -2.7313010692596436, "logps/rejected": -2.8455429077148438, "loss": 3.8652, "rewards/accuracies": 0.5, "rewards/chosen": -27.313011169433594, "rewards/margins": 1.1424202919006348, "rewards/rejected": -28.45543098449707, "step": 1697 }, { "epoch": 0.2312091503267974, "grad_norm": 42.23188183352252, "learning_rate": 7.588173985793193e-07, "logits/chosen": 8.529629707336426, "logits/rejected": 9.455301284790039, "logps/chosen": -2.7642927169799805, "logps/rejected": -2.8240585327148438, "loss": 3.4894, "rewards/accuracies": 0.5, "rewards/chosen": -27.642925262451172, "rewards/margins": 0.5976581573486328, "rewards/rejected": -28.240585327148438, "step": 1698 }, { "epoch": 0.23134531590413943, "grad_norm": 44.490867474352285, "learning_rate": 7.587333270209011e-07, "logits/chosen": 10.395883560180664, "logits/rejected": 11.68626594543457, "logps/chosen": -2.7980165481567383, "logps/rejected": -3.255852222442627, "loss": 4.053, "rewards/accuracies": 0.75, "rewards/chosen": -27.980165481567383, "rewards/margins": 4.5783562660217285, "rewards/rejected": -32.55852508544922, "step": 1699 }, { "epoch": 0.23148148148148148, "grad_norm": 34.39742731999264, "learning_rate": 7.586491744037222e-07, "logits/chosen": 9.268434524536133, "logits/rejected": 10.16076374053955, "logps/chosen": -2.680814504623413, "logps/rejected": -2.6850740909576416, "loss": 3.7094, "rewards/accuracies": 0.75, "rewards/chosen": -26.80814552307129, "rewards/margins": 0.04259681701660156, "rewards/rejected": -26.85074234008789, "step": 1700 }, { "epoch": 0.23161764705882354, "grad_norm": 45.22799062013803, "learning_rate": 7.585649407467977e-07, "logits/chosen": 11.399559020996094, "logits/rejected": 10.825592041015625, "logps/chosen": -3.2339890003204346, "logps/rejected": -3.4843101501464844, "loss": 4.5843, "rewards/accuracies": 0.75, "rewards/chosen": -32.33988952636719, "rewards/margins": 2.5032100677490234, "rewards/rejected": -34.843101501464844, "step": 1701 }, { "epoch": 0.23175381263616557, "grad_norm": 44.7026427529898, "learning_rate": 7.58480626069161e-07, "logits/chosen": 10.299077987670898, "logits/rejected": 11.232648849487305, "logps/chosen": -2.9496421813964844, "logps/rejected": -3.0049233436584473, "loss": 3.9509, "rewards/accuracies": 0.5, "rewards/chosen": -29.496421813964844, "rewards/margins": 0.5528106689453125, "rewards/rejected": -30.049232482910156, "step": 1702 }, { "epoch": 0.23188997821350762, "grad_norm": 43.44516170674066, "learning_rate": 7.583962303898636e-07, "logits/chosen": 10.113777160644531, "logits/rejected": 9.49751091003418, "logps/chosen": -3.0245044231414795, "logps/rejected": -2.8348946571350098, "loss": 3.8051, "rewards/accuracies": 0.25, "rewards/chosen": -30.245044708251953, "rewards/margins": -1.8960952758789062, "rewards/rejected": -28.348949432373047, "step": 1703 }, { "epoch": 0.23202614379084968, "grad_norm": 45.185435341128574, "learning_rate": 7.583117537279755e-07, "logits/chosen": 11.25482177734375, "logits/rejected": 9.434343338012695, "logps/chosen": -3.059791326522827, "logps/rejected": -2.821939468383789, "loss": 4.2907, "rewards/accuracies": 0.0, "rewards/chosen": -30.59791374206543, "rewards/margins": -2.3785176277160645, "rewards/rejected": -28.21939468383789, "step": 1704 }, { "epoch": 0.2321623093681917, "grad_norm": 37.81635274906993, "learning_rate": 7.582271961025846e-07, "logits/chosen": 9.9263277053833, "logits/rejected": 10.426673889160156, "logps/chosen": -2.8653926849365234, "logps/rejected": -3.3173272609710693, "loss": 3.7576, "rewards/accuracies": 0.75, "rewards/chosen": -28.653926849365234, "rewards/margins": 4.519345283508301, "rewards/rejected": -33.17327117919922, "step": 1705 }, { "epoch": 0.23229847494553377, "grad_norm": 48.85060475365886, "learning_rate": 7.581425575327976e-07, "logits/chosen": 10.551164627075195, "logits/rejected": 10.081336975097656, "logps/chosen": -3.1926183700561523, "logps/rejected": -3.3500072956085205, "loss": 3.9398, "rewards/accuracies": 0.75, "rewards/chosen": -31.92618179321289, "rewards/margins": 1.573890209197998, "rewards/rejected": -33.50007247924805, "step": 1706 }, { "epoch": 0.23243464052287582, "grad_norm": 51.711844152211484, "learning_rate": 7.580578380377394e-07, "logits/chosen": 11.045194625854492, "logits/rejected": 10.554948806762695, "logps/chosen": -3.0824978351593018, "logps/rejected": -2.879647731781006, "loss": 4.6861, "rewards/accuracies": 0.25, "rewards/chosen": -30.824979782104492, "rewards/margins": -2.028501033782959, "rewards/rejected": -28.796478271484375, "step": 1707 }, { "epoch": 0.23257080610021785, "grad_norm": 41.18396680320587, "learning_rate": 7.57973037636553e-07, "logits/chosen": 10.355936050415039, "logits/rejected": 10.279481887817383, "logps/chosen": -2.978408098220825, "logps/rejected": -3.1480796337127686, "loss": 4.1144, "rewards/accuracies": 0.5, "rewards/chosen": -29.784082412719727, "rewards/margins": 1.6967134475708008, "rewards/rejected": -31.480796813964844, "step": 1708 }, { "epoch": 0.2327069716775599, "grad_norm": 43.52874964923498, "learning_rate": 7.578881563483997e-07, "logits/chosen": 9.708698272705078, "logits/rejected": 9.881668090820312, "logps/chosen": -2.6728668212890625, "logps/rejected": -2.9287109375, "loss": 4.1974, "rewards/accuracies": 0.75, "rewards/chosen": -26.728668212890625, "rewards/margins": 2.5584397315979004, "rewards/rejected": -29.287107467651367, "step": 1709 }, { "epoch": 0.23284313725490197, "grad_norm": 44.805428448250844, "learning_rate": 7.57803194192459e-07, "logits/chosen": 10.944175720214844, "logits/rejected": 11.3148193359375, "logps/chosen": -3.017622709274292, "logps/rejected": -3.290374279022217, "loss": 4.2272, "rewards/accuracies": 0.75, "rewards/chosen": -30.176227569580078, "rewards/margins": 2.7275161743164062, "rewards/rejected": -32.903743743896484, "step": 1710 }, { "epoch": 0.232979302832244, "grad_norm": 53.26715678676126, "learning_rate": 7.577181511879291e-07, "logits/chosen": 10.891016006469727, "logits/rejected": 9.394635200500488, "logps/chosen": -3.02433443069458, "logps/rejected": -3.1185646057128906, "loss": 3.7838, "rewards/accuracies": 0.5, "rewards/chosen": -30.243345260620117, "rewards/margins": 0.9423012733459473, "rewards/rejected": -31.185646057128906, "step": 1711 }, { "epoch": 0.23311546840958605, "grad_norm": 44.40316471584128, "learning_rate": 7.57633027354026e-07, "logits/chosen": 10.040407180786133, "logits/rejected": 10.682109832763672, "logps/chosen": -2.6662325859069824, "logps/rejected": -3.0256409645080566, "loss": 4.3643, "rewards/accuracies": 0.5, "rewards/chosen": -26.66232681274414, "rewards/margins": 3.5940823554992676, "rewards/rejected": -30.25640869140625, "step": 1712 }, { "epoch": 0.2332516339869281, "grad_norm": 48.7798346739942, "learning_rate": 7.575478227099841e-07, "logits/chosen": 10.433241844177246, "logits/rejected": 11.38161849975586, "logps/chosen": -3.0046157836914062, "logps/rejected": -3.5452330112457275, "loss": 4.4996, "rewards/accuracies": 1.0, "rewards/chosen": -30.04615592956543, "rewards/margins": 5.406174659729004, "rewards/rejected": -35.45233154296875, "step": 1713 }, { "epoch": 0.23338779956427017, "grad_norm": 40.58905947949557, "learning_rate": 7.574625372750562e-07, "logits/chosen": 9.949766159057617, "logits/rejected": 10.681320190429688, "logps/chosen": -3.120069980621338, "logps/rejected": -3.330817461013794, "loss": 4.1932, "rewards/accuracies": 0.75, "rewards/chosen": -31.200698852539062, "rewards/margins": 2.107475757598877, "rewards/rejected": -33.30817413330078, "step": 1714 }, { "epoch": 0.2335239651416122, "grad_norm": 43.85591723438706, "learning_rate": 7.57377171068513e-07, "logits/chosen": 10.88385009765625, "logits/rejected": 10.756595611572266, "logps/chosen": -3.3809359073638916, "logps/rejected": -3.0997228622436523, "loss": 4.2442, "rewards/accuracies": 0.75, "rewards/chosen": -33.809356689453125, "rewards/margins": -2.8121304512023926, "rewards/rejected": -30.997228622436523, "step": 1715 }, { "epoch": 0.23366013071895425, "grad_norm": 49.1500410939536, "learning_rate": 7.572917241096441e-07, "logits/chosen": 9.927345275878906, "logits/rejected": 10.50364875793457, "logps/chosen": -2.918112277984619, "logps/rejected": -3.011495351791382, "loss": 4.1518, "rewards/accuracies": 0.5, "rewards/chosen": -29.181121826171875, "rewards/margins": 0.9338312149047852, "rewards/rejected": -30.114952087402344, "step": 1716 }, { "epoch": 0.2337962962962963, "grad_norm": 56.38974487500167, "learning_rate": 7.572061964177566e-07, "logits/chosen": 9.66633415222168, "logits/rejected": 10.538005828857422, "logps/chosen": -3.0303964614868164, "logps/rejected": -3.345376968383789, "loss": 4.2159, "rewards/accuracies": 0.75, "rewards/chosen": -30.303964614868164, "rewards/margins": 3.1498055458068848, "rewards/rejected": -33.45376968383789, "step": 1717 }, { "epoch": 0.23393246187363834, "grad_norm": 43.143150012783956, "learning_rate": 7.571205880121764e-07, "logits/chosen": 8.224855422973633, "logits/rejected": 9.756725311279297, "logps/chosen": -2.079315185546875, "logps/rejected": -2.6297454833984375, "loss": 3.2429, "rewards/accuracies": 1.0, "rewards/chosen": -20.793149948120117, "rewards/margins": 5.504302501678467, "rewards/rejected": -26.297454833984375, "step": 1718 }, { "epoch": 0.2340686274509804, "grad_norm": 44.84983362146797, "learning_rate": 7.570348989122473e-07, "logits/chosen": 10.72033405303955, "logits/rejected": 10.8931303024292, "logps/chosen": -2.772843837738037, "logps/rejected": -3.123692035675049, "loss": 4.4269, "rewards/accuracies": 0.75, "rewards/chosen": -27.728439331054688, "rewards/margins": 3.508481502532959, "rewards/rejected": -31.236921310424805, "step": 1719 }, { "epoch": 0.23420479302832245, "grad_norm": 43.439173121675765, "learning_rate": 7.569491291373316e-07, "logits/chosen": 10.495807647705078, "logits/rejected": 10.370546340942383, "logps/chosen": -3.313781261444092, "logps/rejected": -3.3099536895751953, "loss": 4.4821, "rewards/accuracies": 0.5, "rewards/chosen": -33.13780975341797, "rewards/margins": -0.03827476501464844, "rewards/rejected": -33.09954071044922, "step": 1720 }, { "epoch": 0.23434095860566448, "grad_norm": 44.00480357693605, "learning_rate": 7.568632787068095e-07, "logits/chosen": 9.702056884765625, "logits/rejected": 9.31378173828125, "logps/chosen": -2.8088104724884033, "logps/rejected": -2.6734588146209717, "loss": 4.0712, "rewards/accuracies": 0.5, "rewards/chosen": -28.088106155395508, "rewards/margins": -1.3535170555114746, "rewards/rejected": -26.734588623046875, "step": 1721 }, { "epoch": 0.23447712418300654, "grad_norm": 42.4272663822021, "learning_rate": 7.567773476400797e-07, "logits/chosen": 10.105098724365234, "logits/rejected": 9.715797424316406, "logps/chosen": -2.9288620948791504, "logps/rejected": -2.810567617416382, "loss": 4.4117, "rewards/accuracies": 0.25, "rewards/chosen": -29.28862190246582, "rewards/margins": -1.1829462051391602, "rewards/rejected": -28.105674743652344, "step": 1722 }, { "epoch": 0.2346132897603486, "grad_norm": 52.50329472436987, "learning_rate": 7.566913359565591e-07, "logits/chosen": 9.684749603271484, "logits/rejected": 10.758697509765625, "logps/chosen": -3.05869460105896, "logps/rejected": -3.525545597076416, "loss": 4.1092, "rewards/accuracies": 1.0, "rewards/chosen": -30.586944580078125, "rewards/margins": 4.668511867523193, "rewards/rejected": -35.255455017089844, "step": 1723 }, { "epoch": 0.23474945533769062, "grad_norm": 40.01603610861026, "learning_rate": 7.566052436756827e-07, "logits/chosen": 8.97941780090332, "logits/rejected": 10.733766555786133, "logps/chosen": -2.7377400398254395, "logps/rejected": -3.1972084045410156, "loss": 3.4652, "rewards/accuracies": 1.0, "rewards/chosen": -27.377399444580078, "rewards/margins": 4.594685077667236, "rewards/rejected": -31.97208595275879, "step": 1724 }, { "epoch": 0.23488562091503268, "grad_norm": 40.23913469074768, "learning_rate": 7.565190708169037e-07, "logits/chosen": 10.499181747436523, "logits/rejected": 10.638439178466797, "logps/chosen": -3.20206618309021, "logps/rejected": -3.380045175552368, "loss": 3.8883, "rewards/accuracies": 0.75, "rewards/chosen": -32.020660400390625, "rewards/margins": 1.779789924621582, "rewards/rejected": -33.800453186035156, "step": 1725 }, { "epoch": 0.23502178649237473, "grad_norm": 43.70519282730638, "learning_rate": 7.564328173996937e-07, "logits/chosen": 9.020669937133789, "logits/rejected": 10.374549865722656, "logps/chosen": -2.685472249984741, "logps/rejected": -3.0797696113586426, "loss": 3.8522, "rewards/accuracies": 0.75, "rewards/chosen": -26.854721069335938, "rewards/margins": 3.94297456741333, "rewards/rejected": -30.79769515991211, "step": 1726 }, { "epoch": 0.23515795206971676, "grad_norm": 43.78909730183663, "learning_rate": 7.563464834435424e-07, "logits/chosen": 10.547178268432617, "logits/rejected": 11.559447288513184, "logps/chosen": -3.1054184436798096, "logps/rejected": -3.765824794769287, "loss": 3.5136, "rewards/accuracies": 0.75, "rewards/chosen": -31.054183959960938, "rewards/margins": 6.604063510894775, "rewards/rejected": -37.65824890136719, "step": 1727 }, { "epoch": 0.23529411764705882, "grad_norm": 46.43668151970729, "learning_rate": 7.562600689679573e-07, "logits/chosen": 11.76265811920166, "logits/rejected": 11.023716926574707, "logps/chosen": -3.2053539752960205, "logps/rejected": -3.2811787128448486, "loss": 4.1088, "rewards/accuracies": 0.75, "rewards/chosen": -32.05353927612305, "rewards/margins": 0.758246898651123, "rewards/rejected": -32.81178665161133, "step": 1728 }, { "epoch": 0.23543028322440088, "grad_norm": 114.3961435023741, "learning_rate": 7.561735739924649e-07, "logits/chosen": 10.984734535217285, "logits/rejected": 11.509784698486328, "logps/chosen": -3.132838249206543, "logps/rejected": -3.0086488723754883, "loss": 3.7616, "rewards/accuracies": 0.0, "rewards/chosen": -31.32838249206543, "rewards/margins": -1.2418932914733887, "rewards/rejected": -30.086488723754883, "step": 1729 }, { "epoch": 0.2355664488017429, "grad_norm": 38.603920572801435, "learning_rate": 7.560869985366094e-07, "logits/chosen": 9.876811027526855, "logits/rejected": 10.119213104248047, "logps/chosen": -2.8216447830200195, "logps/rejected": -3.0583345890045166, "loss": 3.8198, "rewards/accuracies": 0.75, "rewards/chosen": -28.216447830200195, "rewards/margins": 2.3668980598449707, "rewards/rejected": -30.58334732055664, "step": 1730 }, { "epoch": 0.23570261437908496, "grad_norm": 38.01652528651212, "learning_rate": 7.560003426199531e-07, "logits/chosen": 10.693593978881836, "logits/rejected": 11.85940170288086, "logps/chosen": -3.054110527038574, "logps/rejected": -3.2229888439178467, "loss": 3.8343, "rewards/accuracies": 0.75, "rewards/chosen": -30.541107177734375, "rewards/margins": 1.68878173828125, "rewards/rejected": -32.22988510131836, "step": 1731 }, { "epoch": 0.23583877995642702, "grad_norm": 46.083034916969154, "learning_rate": 7.559136062620766e-07, "logits/chosen": 10.51490592956543, "logits/rejected": 10.922300338745117, "logps/chosen": -2.708972930908203, "logps/rejected": -2.734671115875244, "loss": 4.1309, "rewards/accuracies": 0.75, "rewards/chosen": -27.08972930908203, "rewards/margins": 0.2569847106933594, "rewards/rejected": -27.34671401977539, "step": 1732 }, { "epoch": 0.23597494553376908, "grad_norm": 61.74801642180943, "learning_rate": 7.558267894825787e-07, "logits/chosen": 11.421781539916992, "logits/rejected": 12.460062980651855, "logps/chosen": -3.0443172454833984, "logps/rejected": -3.3160057067871094, "loss": 4.204, "rewards/accuracies": 0.75, "rewards/chosen": -30.443172454833984, "rewards/margins": 2.716884136199951, "rewards/rejected": -33.160057067871094, "step": 1733 }, { "epoch": 0.2361111111111111, "grad_norm": 44.77501870455404, "learning_rate": 7.557398923010764e-07, "logits/chosen": 9.26866626739502, "logits/rejected": 9.854788780212402, "logps/chosen": -2.7056546211242676, "logps/rejected": -2.597944974899292, "loss": 4.2373, "rewards/accuracies": 0.5, "rewards/chosen": -27.056547164916992, "rewards/margins": -1.0770978927612305, "rewards/rejected": -25.979450225830078, "step": 1734 }, { "epoch": 0.23624727668845316, "grad_norm": 45.573178009597925, "learning_rate": 7.55652914737205e-07, "logits/chosen": 11.636743545532227, "logits/rejected": 11.02578353881836, "logps/chosen": -2.6592135429382324, "logps/rejected": -2.936825752258301, "loss": 4.123, "rewards/accuracies": 0.75, "rewards/chosen": -26.59213638305664, "rewards/margins": 2.7761197090148926, "rewards/rejected": -29.368255615234375, "step": 1735 }, { "epoch": 0.23638344226579522, "grad_norm": 41.704612158781345, "learning_rate": 7.555658568106176e-07, "logits/chosen": 10.799644470214844, "logits/rejected": 10.344343185424805, "logps/chosen": -3.1578800678253174, "logps/rejected": -3.0526795387268066, "loss": 4.3137, "rewards/accuracies": 0.5, "rewards/chosen": -31.578800201416016, "rewards/margins": -1.0520052909851074, "rewards/rejected": -30.526796340942383, "step": 1736 }, { "epoch": 0.23651960784313725, "grad_norm": 43.165934169773024, "learning_rate": 7.554787185409857e-07, "logits/chosen": 10.547334671020508, "logits/rejected": 11.671196937561035, "logps/chosen": -3.09220027923584, "logps/rejected": -3.173177480697632, "loss": 4.2181, "rewards/accuracies": 0.75, "rewards/chosen": -30.92200469970703, "rewards/margins": 0.8097696304321289, "rewards/rejected": -31.731773376464844, "step": 1737 }, { "epoch": 0.2366557734204793, "grad_norm": 46.383894272790364, "learning_rate": 7.553914999479989e-07, "logits/chosen": 10.494606018066406, "logits/rejected": 11.829402923583984, "logps/chosen": -3.089344024658203, "logps/rejected": -3.2453179359436035, "loss": 4.1231, "rewards/accuracies": 0.75, "rewards/chosen": -30.8934383392334, "rewards/margins": 1.5597405433654785, "rewards/rejected": -32.45317840576172, "step": 1738 }, { "epoch": 0.23679193899782136, "grad_norm": 38.586726443439744, "learning_rate": 7.55304201051365e-07, "logits/chosen": 11.147008895874023, "logits/rejected": 11.221253395080566, "logps/chosen": -3.228139877319336, "logps/rejected": -3.2465198040008545, "loss": 3.962, "rewards/accuracies": 0.5, "rewards/chosen": -32.28139877319336, "rewards/margins": 0.18379878997802734, "rewards/rejected": -32.46519470214844, "step": 1739 }, { "epoch": 0.2369281045751634, "grad_norm": 46.41802248516028, "learning_rate": 7.552168218708099e-07, "logits/chosen": 10.564022064208984, "logits/rejected": 11.327170372009277, "logps/chosen": -3.382516384124756, "logps/rejected": -3.391047954559326, "loss": 4.567, "rewards/accuracies": 0.5, "rewards/chosen": -33.825164794921875, "rewards/margins": 0.08531713485717773, "rewards/rejected": -33.91048049926758, "step": 1740 }, { "epoch": 0.23706427015250545, "grad_norm": 43.884036841540286, "learning_rate": 7.551293624260778e-07, "logits/chosen": 9.773164749145508, "logits/rejected": 11.162262916564941, "logps/chosen": -3.101351737976074, "logps/rejected": -3.318570137023926, "loss": 3.9718, "rewards/accuracies": 1.0, "rewards/chosen": -31.013519287109375, "rewards/margins": 2.172182559967041, "rewards/rejected": -33.18570327758789, "step": 1741 }, { "epoch": 0.2372004357298475, "grad_norm": 44.66440444864739, "learning_rate": 7.550418227369305e-07, "logits/chosen": 10.617265701293945, "logits/rejected": 10.155790328979492, "logps/chosen": -3.222607135772705, "logps/rejected": -3.1016716957092285, "loss": 4.1811, "rewards/accuracies": 0.0, "rewards/chosen": -32.22607421875, "rewards/margins": -1.2093558311462402, "rewards/rejected": -31.01671600341797, "step": 1742 }, { "epoch": 0.23733660130718953, "grad_norm": 41.32639848993877, "learning_rate": 7.549542028231487e-07, "logits/chosen": 9.29456901550293, "logits/rejected": 11.814521789550781, "logps/chosen": -3.0595195293426514, "logps/rejected": -3.501286029815674, "loss": 4.0969, "rewards/accuracies": 1.0, "rewards/chosen": -30.595195770263672, "rewards/margins": 4.417664051055908, "rewards/rejected": -35.01285934448242, "step": 1743 }, { "epoch": 0.2374727668845316, "grad_norm": 48.245350730735304, "learning_rate": 7.548665027045306e-07, "logits/chosen": 10.44044303894043, "logits/rejected": 10.331338882446289, "logps/chosen": -2.938457489013672, "logps/rejected": -2.8195509910583496, "loss": 3.5429, "rewards/accuracies": 0.25, "rewards/chosen": -29.38457489013672, "rewards/margins": -1.1890673637390137, "rewards/rejected": -28.195507049560547, "step": 1744 }, { "epoch": 0.23760893246187365, "grad_norm": 40.79665119605802, "learning_rate": 7.547787224008929e-07, "logits/chosen": 10.505813598632812, "logits/rejected": 11.06561279296875, "logps/chosen": -3.0152106285095215, "logps/rejected": -3.453975200653076, "loss": 4.4855, "rewards/accuracies": 1.0, "rewards/chosen": -30.152109146118164, "rewards/margins": 4.387642860412598, "rewards/rejected": -34.53975296020508, "step": 1745 }, { "epoch": 0.23774509803921567, "grad_norm": 41.63852192407333, "learning_rate": 7.546908619320702e-07, "logits/chosen": 10.640856742858887, "logits/rejected": 9.19881534576416, "logps/chosen": -3.0175437927246094, "logps/rejected": -2.829268455505371, "loss": 4.0547, "rewards/accuracies": 0.5, "rewards/chosen": -30.175437927246094, "rewards/margins": -1.8827557563781738, "rewards/rejected": -28.29268455505371, "step": 1746 }, { "epoch": 0.23788126361655773, "grad_norm": 41.64038609956122, "learning_rate": 7.546029213179153e-07, "logits/chosen": 11.167765617370605, "logits/rejected": 10.64405345916748, "logps/chosen": -2.9590227603912354, "logps/rejected": -2.9454588890075684, "loss": 4.3156, "rewards/accuracies": 0.5, "rewards/chosen": -29.590227127075195, "rewards/margins": -0.13563919067382812, "rewards/rejected": -29.454587936401367, "step": 1747 }, { "epoch": 0.2380174291938998, "grad_norm": 42.972061844598535, "learning_rate": 7.545149005782993e-07, "logits/chosen": 11.332239151000977, "logits/rejected": 11.546117782592773, "logps/chosen": -3.0437850952148438, "logps/rejected": -3.478013515472412, "loss": 4.3069, "rewards/accuracies": 1.0, "rewards/chosen": -30.437850952148438, "rewards/margins": 4.342282772064209, "rewards/rejected": -34.78013610839844, "step": 1748 }, { "epoch": 0.23815359477124182, "grad_norm": 41.665217992452604, "learning_rate": 7.54426799733111e-07, "logits/chosen": 9.654683113098145, "logits/rejected": 11.557144165039062, "logps/chosen": -2.9766769409179688, "logps/rejected": -3.5068159103393555, "loss": 3.8208, "rewards/accuracies": 0.75, "rewards/chosen": -29.766769409179688, "rewards/margins": 5.301389694213867, "rewards/rejected": -35.06816101074219, "step": 1749 }, { "epoch": 0.23828976034858387, "grad_norm": 43.24249819207394, "learning_rate": 7.543386188022575e-07, "logits/chosen": 11.005176544189453, "logits/rejected": 10.568658828735352, "logps/chosen": -3.168877601623535, "logps/rejected": -3.235931396484375, "loss": 4.197, "rewards/accuracies": 0.75, "rewards/chosen": -31.68877410888672, "rewards/margins": 0.6705389022827148, "rewards/rejected": -32.35931396484375, "step": 1750 }, { "epoch": 0.23842592592592593, "grad_norm": 44.61229718551139, "learning_rate": 7.542503578056642e-07, "logits/chosen": 11.386222839355469, "logits/rejected": 10.748327255249023, "logps/chosen": -3.370497226715088, "logps/rejected": -3.5051817893981934, "loss": 4.4856, "rewards/accuracies": 0.25, "rewards/chosen": -33.70497131347656, "rewards/margins": 1.3468456268310547, "rewards/rejected": -35.05181884765625, "step": 1751 }, { "epoch": 0.238562091503268, "grad_norm": 46.37058893905022, "learning_rate": 7.541620167632743e-07, "logits/chosen": 10.208513259887695, "logits/rejected": 11.253963470458984, "logps/chosen": -3.1162192821502686, "logps/rejected": -3.5214309692382812, "loss": 3.571, "rewards/accuracies": 1.0, "rewards/chosen": -31.162193298339844, "rewards/margins": 4.052116394042969, "rewards/rejected": -35.21430969238281, "step": 1752 }, { "epoch": 0.23869825708061002, "grad_norm": 43.07071434095412, "learning_rate": 7.540735956950491e-07, "logits/chosen": 9.998425483703613, "logits/rejected": 10.967387199401855, "logps/chosen": -3.28467059135437, "logps/rejected": -3.445401668548584, "loss": 4.4351, "rewards/accuracies": 0.75, "rewards/chosen": -32.84670639038086, "rewards/margins": 1.6073131561279297, "rewards/rejected": -34.454017639160156, "step": 1753 }, { "epoch": 0.23883442265795207, "grad_norm": 48.75905836190905, "learning_rate": 7.539850946209683e-07, "logits/chosen": 10.79897689819336, "logits/rejected": 11.553997039794922, "logps/chosen": -3.254655361175537, "logps/rejected": -3.1602087020874023, "loss": 4.639, "rewards/accuracies": 0.5, "rewards/chosen": -32.54655456542969, "rewards/margins": -0.944465160369873, "rewards/rejected": -31.602088928222656, "step": 1754 }, { "epoch": 0.23897058823529413, "grad_norm": 40.368520202419106, "learning_rate": 7.538965135610291e-07, "logits/chosen": 10.421670913696289, "logits/rejected": 10.776912689208984, "logps/chosen": -2.960219144821167, "logps/rejected": -2.9796395301818848, "loss": 4.0233, "rewards/accuracies": 0.75, "rewards/chosen": -29.602191925048828, "rewards/margins": 0.1942000389099121, "rewards/rejected": -29.7963924407959, "step": 1755 }, { "epoch": 0.23910675381263616, "grad_norm": 46.6810733803893, "learning_rate": 7.538078525352474e-07, "logits/chosen": 9.545297622680664, "logits/rejected": 9.743261337280273, "logps/chosen": -3.03581166267395, "logps/rejected": -3.016805410385132, "loss": 4.7754, "rewards/accuracies": 0.75, "rewards/chosen": -30.358116149902344, "rewards/margins": -0.1900620460510254, "rewards/rejected": -30.168054580688477, "step": 1756 }, { "epoch": 0.23924291938997821, "grad_norm": 43.407494449318634, "learning_rate": 7.537191115636569e-07, "logits/chosen": 11.682714462280273, "logits/rejected": 11.939666748046875, "logps/chosen": -3.1165366172790527, "logps/rejected": -3.183441638946533, "loss": 4.246, "rewards/accuracies": 0.25, "rewards/chosen": -31.165367126464844, "rewards/margins": 0.6690492630004883, "rewards/rejected": -31.834415435791016, "step": 1757 }, { "epoch": 0.23937908496732027, "grad_norm": 43.33075563442763, "learning_rate": 7.536302906663092e-07, "logits/chosen": 11.184164047241211, "logits/rejected": 10.83563232421875, "logps/chosen": -3.1407439708709717, "logps/rejected": -3.508632183074951, "loss": 3.878, "rewards/accuracies": 0.75, "rewards/chosen": -31.407438278198242, "rewards/margins": 3.6788811683654785, "rewards/rejected": -35.08631896972656, "step": 1758 }, { "epoch": 0.2395152505446623, "grad_norm": 48.006642815785916, "learning_rate": 7.535413898632741e-07, "logits/chosen": 11.206489562988281, "logits/rejected": 10.864616394042969, "logps/chosen": -3.5275673866271973, "logps/rejected": -3.104224920272827, "loss": 4.5543, "rewards/accuracies": 0.25, "rewards/chosen": -35.275672912597656, "rewards/margins": -4.233423233032227, "rewards/rejected": -31.042251586914062, "step": 1759 }, { "epoch": 0.23965141612200436, "grad_norm": 43.52544319345563, "learning_rate": 7.534524091746396e-07, "logits/chosen": 9.878890991210938, "logits/rejected": 9.710407257080078, "logps/chosen": -3.160651445388794, "logps/rejected": -3.158726692199707, "loss": 4.2892, "rewards/accuracies": 0.5, "rewards/chosen": -31.60651397705078, "rewards/margins": -0.01924610137939453, "rewards/rejected": -31.58726692199707, "step": 1760 }, { "epoch": 0.2397875816993464, "grad_norm": 51.902155021200635, "learning_rate": 7.533633486205117e-07, "logits/chosen": 11.685169219970703, "logits/rejected": 10.636343002319336, "logps/chosen": -2.9314117431640625, "logps/rejected": -3.052851915359497, "loss": 4.1629, "rewards/accuracies": 0.5, "rewards/chosen": -29.314117431640625, "rewards/margins": 1.2144012451171875, "rewards/rejected": -30.528518676757812, "step": 1761 }, { "epoch": 0.23992374727668844, "grad_norm": 42.54386392152411, "learning_rate": 7.532742082210142e-07, "logits/chosen": 11.60477066040039, "logits/rejected": 11.271934509277344, "logps/chosen": -2.907015800476074, "logps/rejected": -3.347604513168335, "loss": 4.3674, "rewards/accuracies": 0.75, "rewards/chosen": -29.070158004760742, "rewards/margins": 4.405887603759766, "rewards/rejected": -33.476043701171875, "step": 1762 }, { "epoch": 0.2400599128540305, "grad_norm": 72.78195234277382, "learning_rate": 7.531849879962891e-07, "logits/chosen": 11.946495056152344, "logits/rejected": 10.476045608520508, "logps/chosen": -3.2818892002105713, "logps/rejected": -3.1695010662078857, "loss": 4.3624, "rewards/accuracies": 0.5, "rewards/chosen": -32.81889343261719, "rewards/margins": -1.123880386352539, "rewards/rejected": -31.69501304626465, "step": 1763 }, { "epoch": 0.24019607843137256, "grad_norm": 38.080037577859365, "learning_rate": 7.530956879664964e-07, "logits/chosen": 10.861932754516602, "logits/rejected": 10.530431747436523, "logps/chosen": -3.3671488761901855, "logps/rejected": -3.3092968463897705, "loss": 3.8247, "rewards/accuracies": 0.5, "rewards/chosen": -33.67149353027344, "rewards/margins": -0.5785226821899414, "rewards/rejected": -33.09296798706055, "step": 1764 }, { "epoch": 0.24033224400871459, "grad_norm": 40.22240071453223, "learning_rate": 7.530063081518145e-07, "logits/chosen": 10.809215545654297, "logits/rejected": 11.51223087310791, "logps/chosen": -2.903024196624756, "logps/rejected": -3.2118921279907227, "loss": 4.0035, "rewards/accuracies": 0.75, "rewards/chosen": -29.030242919921875, "rewards/margins": 3.088677406311035, "rewards/rejected": -32.118919372558594, "step": 1765 }, { "epoch": 0.24046840958605664, "grad_norm": 39.648026086597746, "learning_rate": 7.529168485724392e-07, "logits/chosen": 10.340538024902344, "logits/rejected": 10.65417194366455, "logps/chosen": -3.22552490234375, "logps/rejected": -3.4245824813842773, "loss": 3.9651, "rewards/accuracies": 0.75, "rewards/chosen": -32.2552490234375, "rewards/margins": 1.9905734062194824, "rewards/rejected": -34.245826721191406, "step": 1766 }, { "epoch": 0.2406045751633987, "grad_norm": 44.72152824399886, "learning_rate": 7.528273092485847e-07, "logits/chosen": 10.772461891174316, "logits/rejected": 10.828646659851074, "logps/chosen": -3.308044910430908, "logps/rejected": -3.3796801567077637, "loss": 3.9543, "rewards/accuracies": 0.5, "rewards/chosen": -33.08045196533203, "rewards/margins": 0.7163519859313965, "rewards/rejected": -33.79680252075195, "step": 1767 }, { "epoch": 0.24074074074074073, "grad_norm": 41.73022247146001, "learning_rate": 7.527376902004832e-07, "logits/chosen": 11.212379455566406, "logits/rejected": 11.594470977783203, "logps/chosen": -3.315460681915283, "logps/rejected": -3.622429847717285, "loss": 3.9653, "rewards/accuracies": 0.75, "rewards/chosen": -33.15460968017578, "rewards/margins": 3.0696916580200195, "rewards/rejected": -36.22429656982422, "step": 1768 }, { "epoch": 0.24087690631808278, "grad_norm": 48.00726380714294, "learning_rate": 7.526479914483849e-07, "logits/chosen": 10.366935729980469, "logits/rejected": 11.928447723388672, "logps/chosen": -3.1086158752441406, "logps/rejected": -3.7093021869659424, "loss": 4.3441, "rewards/accuracies": 1.0, "rewards/chosen": -31.086156845092773, "rewards/margins": 6.006864547729492, "rewards/rejected": -37.093021392822266, "step": 1769 }, { "epoch": 0.24101307189542484, "grad_norm": 40.99430721493553, "learning_rate": 7.525582130125577e-07, "logits/chosen": 11.329452514648438, "logits/rejected": 11.51236343383789, "logps/chosen": -3.312621593475342, "logps/rejected": -3.432023525238037, "loss": 4.2149, "rewards/accuracies": 1.0, "rewards/chosen": -33.12621307373047, "rewards/margins": 1.194018840789795, "rewards/rejected": -34.32023239135742, "step": 1770 }, { "epoch": 0.2411492374727669, "grad_norm": 45.3229863388703, "learning_rate": 7.524683549132883e-07, "logits/chosen": 10.260906219482422, "logits/rejected": 10.987838745117188, "logps/chosen": -3.0528059005737305, "logps/rejected": -3.3389899730682373, "loss": 4.6525, "rewards/accuracies": 0.75, "rewards/chosen": -30.528059005737305, "rewards/margins": 2.8618383407592773, "rewards/rejected": -33.389896392822266, "step": 1771 }, { "epoch": 0.24128540305010893, "grad_norm": 43.17671091552605, "learning_rate": 7.523784171708804e-07, "logits/chosen": 12.035484313964844, "logits/rejected": 12.458333015441895, "logps/chosen": -3.6964402198791504, "logps/rejected": -3.5613925457000732, "loss": 4.1483, "rewards/accuracies": 0.5, "rewards/chosen": -36.96440124511719, "rewards/margins": -1.350478172302246, "rewards/rejected": -35.613922119140625, "step": 1772 }, { "epoch": 0.24142156862745098, "grad_norm": 42.04744104036419, "learning_rate": 7.522883998056564e-07, "logits/chosen": 10.675586700439453, "logits/rejected": 11.032492637634277, "logps/chosen": -3.5189425945281982, "logps/rejected": -3.5158016681671143, "loss": 3.9867, "rewards/accuracies": 0.5, "rewards/chosen": -35.18942642211914, "rewards/margins": -0.031407833099365234, "rewards/rejected": -35.15802001953125, "step": 1773 }, { "epoch": 0.24155773420479304, "grad_norm": 41.529892084415266, "learning_rate": 7.521983028379564e-07, "logits/chosen": 10.556413650512695, "logits/rejected": 11.012550354003906, "logps/chosen": -2.9151558876037598, "logps/rejected": -3.3906736373901367, "loss": 3.6733, "rewards/accuracies": 1.0, "rewards/chosen": -29.15155792236328, "rewards/margins": 4.755178451538086, "rewards/rejected": -33.906734466552734, "step": 1774 }, { "epoch": 0.24169389978213507, "grad_norm": 53.700163018739495, "learning_rate": 7.521081262881385e-07, "logits/chosen": 10.779214859008789, "logits/rejected": 10.644599914550781, "logps/chosen": -3.1317105293273926, "logps/rejected": -3.305525302886963, "loss": 4.2373, "rewards/accuracies": 0.5, "rewards/chosen": -31.317106246948242, "rewards/margins": 1.7381443977355957, "rewards/rejected": -33.05525207519531, "step": 1775 }, { "epoch": 0.24183006535947713, "grad_norm": 39.219150051053255, "learning_rate": 7.520178701765789e-07, "logits/chosen": 10.057071685791016, "logits/rejected": 10.487640380859375, "logps/chosen": -3.086913824081421, "logps/rejected": -3.412623882293701, "loss": 4.5426, "rewards/accuracies": 1.0, "rewards/chosen": -30.869136810302734, "rewards/margins": 3.2571005821228027, "rewards/rejected": -34.12623977661133, "step": 1776 }, { "epoch": 0.24196623093681918, "grad_norm": 42.37253644635821, "learning_rate": 7.51927534523672e-07, "logits/chosen": 11.20046615600586, "logits/rejected": 11.881314277648926, "logps/chosen": -3.2754359245300293, "logps/rejected": -3.5540781021118164, "loss": 3.7029, "rewards/accuracies": 0.75, "rewards/chosen": -32.754356384277344, "rewards/margins": 2.786421298980713, "rewards/rejected": -35.54077911376953, "step": 1777 }, { "epoch": 0.2421023965141612, "grad_norm": 40.83688721421586, "learning_rate": 7.518371193498294e-07, "logits/chosen": 11.827091217041016, "logits/rejected": 11.431325912475586, "logps/chosen": -3.7128334045410156, "logps/rejected": -3.463857650756836, "loss": 4.1765, "rewards/accuracies": 0.25, "rewards/chosen": -37.128334045410156, "rewards/margins": -2.4897618293762207, "rewards/rejected": -34.638572692871094, "step": 1778 }, { "epoch": 0.24223856209150327, "grad_norm": 42.66121334635163, "learning_rate": 7.517466246754813e-07, "logits/chosen": 11.166706085205078, "logits/rejected": 12.806842803955078, "logps/chosen": -3.166020631790161, "logps/rejected": -3.551065444946289, "loss": 3.5854, "rewards/accuracies": 1.0, "rewards/chosen": -31.660205841064453, "rewards/margins": 3.8504481315612793, "rewards/rejected": -35.51065444946289, "step": 1779 }, { "epoch": 0.24237472766884532, "grad_norm": 38.36508298092096, "learning_rate": 7.516560505210758e-07, "logits/chosen": 10.845928192138672, "logits/rejected": 12.099262237548828, "logps/chosen": -3.2375833988189697, "logps/rejected": -3.424149751663208, "loss": 3.7149, "rewards/accuracies": 0.75, "rewards/chosen": -32.37583541870117, "rewards/margins": 1.8656620979309082, "rewards/rejected": -34.24150085449219, "step": 1780 }, { "epoch": 0.24251089324618735, "grad_norm": 45.98050351797808, "learning_rate": 7.51565396907079e-07, "logits/chosen": 11.251201629638672, "logits/rejected": 10.889180183410645, "logps/chosen": -3.292722225189209, "logps/rejected": -3.337775230407715, "loss": 4.0906, "rewards/accuracies": 0.5, "rewards/chosen": -32.927223205566406, "rewards/margins": 0.4505305290222168, "rewards/rejected": -33.37775421142578, "step": 1781 }, { "epoch": 0.2426470588235294, "grad_norm": 46.505578482531284, "learning_rate": 7.514746638539747e-07, "logits/chosen": 11.310782432556152, "logits/rejected": 11.632166862487793, "logps/chosen": -3.061856985092163, "logps/rejected": -3.4851436614990234, "loss": 3.8439, "rewards/accuracies": 1.0, "rewards/chosen": -30.618568420410156, "rewards/margins": 4.232867240905762, "rewards/rejected": -34.851436614990234, "step": 1782 }, { "epoch": 0.24278322440087147, "grad_norm": 44.84730283565922, "learning_rate": 7.513838513822646e-07, "logits/chosen": 12.268837928771973, "logits/rejected": 11.877185821533203, "logps/chosen": -3.473012924194336, "logps/rejected": -3.3013272285461426, "loss": 4.1913, "rewards/accuracies": 0.5, "rewards/chosen": -34.73012924194336, "rewards/margins": -1.7168569564819336, "rewards/rejected": -33.01327133178711, "step": 1783 }, { "epoch": 0.2429193899782135, "grad_norm": 43.00314713483261, "learning_rate": 7.512929595124689e-07, "logits/chosen": 11.656518936157227, "logits/rejected": 12.09176254272461, "logps/chosen": -3.169559955596924, "logps/rejected": -3.544830560684204, "loss": 4.3756, "rewards/accuracies": 1.0, "rewards/chosen": -31.695602416992188, "rewards/margins": 3.7527055740356445, "rewards/rejected": -35.44830322265625, "step": 1784 }, { "epoch": 0.24305555555555555, "grad_norm": 42.811404166811556, "learning_rate": 7.512019882651251e-07, "logits/chosen": 11.222017288208008, "logits/rejected": 11.007676124572754, "logps/chosen": -3.257061719894409, "logps/rejected": -3.2809765338897705, "loss": 4.1967, "rewards/accuracies": 0.5, "rewards/chosen": -32.57061767578125, "rewards/margins": 0.23914718627929688, "rewards/rejected": -32.80976486206055, "step": 1785 }, { "epoch": 0.2431917211328976, "grad_norm": 50.96878877634305, "learning_rate": 7.511109376607891e-07, "logits/chosen": 10.294034004211426, "logits/rejected": 11.694679260253906, "logps/chosen": -3.0344185829162598, "logps/rejected": -3.2790379524230957, "loss": 4.2915, "rewards/accuracies": 0.75, "rewards/chosen": -30.344186782836914, "rewards/margins": 2.4461936950683594, "rewards/rejected": -32.790382385253906, "step": 1786 }, { "epoch": 0.24332788671023964, "grad_norm": 40.728532281058534, "learning_rate": 7.510198077200343e-07, "logits/chosen": 10.622997283935547, "logits/rejected": 12.165262222290039, "logps/chosen": -2.8842127323150635, "logps/rejected": -3.476219654083252, "loss": 4.113, "rewards/accuracies": 1.0, "rewards/chosen": -28.84212875366211, "rewards/margins": 5.920068264007568, "rewards/rejected": -34.7621955871582, "step": 1787 }, { "epoch": 0.2434640522875817, "grad_norm": 44.58686906707119, "learning_rate": 7.509285984634523e-07, "logits/chosen": 11.395267486572266, "logits/rejected": 11.683099746704102, "logps/chosen": -3.487581253051758, "logps/rejected": -3.459287643432617, "loss": 4.462, "rewards/accuracies": 0.25, "rewards/chosen": -34.87581253051758, "rewards/margins": -0.28293704986572266, "rewards/rejected": -34.592872619628906, "step": 1788 }, { "epoch": 0.24360021786492375, "grad_norm": 40.58723649723624, "learning_rate": 7.508373099116529e-07, "logits/chosen": 12.02337646484375, "logits/rejected": 11.868021011352539, "logps/chosen": -3.418168067932129, "logps/rejected": -3.1199069023132324, "loss": 4.0836, "rewards/accuracies": 0.5, "rewards/chosen": -34.181678771972656, "rewards/margins": -2.982609748840332, "rewards/rejected": -31.199071884155273, "step": 1789 }, { "epoch": 0.2437363834422658, "grad_norm": 40.35484639776061, "learning_rate": 7.507459420852631e-07, "logits/chosen": 11.731149673461914, "logits/rejected": 12.090150833129883, "logps/chosen": -3.5178301334381104, "logps/rejected": -3.683605194091797, "loss": 3.7876, "rewards/accuracies": 0.75, "rewards/chosen": -35.17830276489258, "rewards/margins": 1.657750129699707, "rewards/rejected": -36.83605194091797, "step": 1790 }, { "epoch": 0.24387254901960784, "grad_norm": 41.81196603544186, "learning_rate": 7.506544950049285e-07, "logits/chosen": 10.058509826660156, "logits/rejected": 11.469587326049805, "logps/chosen": -2.8889288902282715, "logps/rejected": -3.2847535610198975, "loss": 4.1824, "rewards/accuracies": 0.75, "rewards/chosen": -28.88928985595703, "rewards/margins": 3.9582481384277344, "rewards/rejected": -32.847537994384766, "step": 1791 }, { "epoch": 0.2440087145969499, "grad_norm": 45.59730038917976, "learning_rate": 7.505629686913121e-07, "logits/chosen": 11.871353149414062, "logits/rejected": 11.272422790527344, "logps/chosen": -3.3003697395324707, "logps/rejected": -3.1313600540161133, "loss": 4.4791, "rewards/accuracies": 0.0, "rewards/chosen": -33.00369644165039, "rewards/margins": -1.6900968551635742, "rewards/rejected": -31.313600540161133, "step": 1792 }, { "epoch": 0.24414488017429195, "grad_norm": 52.64308475644801, "learning_rate": 7.504713631650952e-07, "logits/chosen": 10.110706329345703, "logits/rejected": 11.029510498046875, "logps/chosen": -3.0764899253845215, "logps/rejected": -3.223057746887207, "loss": 4.1643, "rewards/accuracies": 0.75, "rewards/chosen": -30.764896392822266, "rewards/margins": 1.4656810760498047, "rewards/rejected": -32.2305793762207, "step": 1793 }, { "epoch": 0.24428104575163398, "grad_norm": 38.61403322975515, "learning_rate": 7.503796784469769e-07, "logits/chosen": 10.949043273925781, "logits/rejected": 12.025591850280762, "logps/chosen": -2.943848133087158, "logps/rejected": -3.3316152095794678, "loss": 3.6107, "rewards/accuracies": 0.75, "rewards/chosen": -29.4384822845459, "rewards/margins": 3.8776707649230957, "rewards/rejected": -33.31615447998047, "step": 1794 }, { "epoch": 0.24441721132897604, "grad_norm": 67.27059250668897, "learning_rate": 7.502879145576737e-07, "logits/chosen": 9.49643611907959, "logits/rejected": 11.100922584533691, "logps/chosen": -2.736267566680908, "logps/rejected": -3.6194489002227783, "loss": 3.8443, "rewards/accuracies": 1.0, "rewards/chosen": -27.362674713134766, "rewards/margins": 8.831811904907227, "rewards/rejected": -36.194488525390625, "step": 1795 }, { "epoch": 0.2445533769063181, "grad_norm": 42.848881625163486, "learning_rate": 7.501960715179208e-07, "logits/chosen": 11.342182159423828, "logits/rejected": 12.011916160583496, "logps/chosen": -3.27264404296875, "logps/rejected": -3.5611824989318848, "loss": 4.0483, "rewards/accuracies": 0.75, "rewards/chosen": -32.7264404296875, "rewards/margins": 2.885385036468506, "rewards/rejected": -35.61182403564453, "step": 1796 }, { "epoch": 0.24468954248366012, "grad_norm": 37.713645137748586, "learning_rate": 7.50104149348471e-07, "logits/chosen": 11.179903030395508, "logits/rejected": 12.155465126037598, "logps/chosen": -3.189222812652588, "logps/rejected": -3.3397529125213623, "loss": 3.6434, "rewards/accuracies": 0.75, "rewards/chosen": -31.892227172851562, "rewards/margins": 1.5053033828735352, "rewards/rejected": -33.39752960205078, "step": 1797 }, { "epoch": 0.24482570806100218, "grad_norm": 42.98226867507533, "learning_rate": 7.500121480700943e-07, "logits/chosen": 10.734039306640625, "logits/rejected": 11.636612892150879, "logps/chosen": -3.393911361694336, "logps/rejected": -3.5133848190307617, "loss": 4.0729, "rewards/accuracies": 0.75, "rewards/chosen": -33.939117431640625, "rewards/margins": 1.1947336196899414, "rewards/rejected": -35.13385009765625, "step": 1798 }, { "epoch": 0.24496187363834424, "grad_norm": 41.203646323402275, "learning_rate": 7.499200677035798e-07, "logits/chosen": 11.239189147949219, "logits/rejected": 11.488492965698242, "logps/chosen": -3.383319139480591, "logps/rejected": -3.557936191558838, "loss": 4.5022, "rewards/accuracies": 0.75, "rewards/chosen": -33.83319091796875, "rewards/margins": 1.746168613433838, "rewards/rejected": -35.57936096191406, "step": 1799 }, { "epoch": 0.24509803921568626, "grad_norm": 38.289048724858425, "learning_rate": 7.498279082697335e-07, "logits/chosen": 11.347497940063477, "logits/rejected": 10.483489990234375, "logps/chosen": -3.490436553955078, "logps/rejected": -3.6541576385498047, "loss": 4.0162, "rewards/accuracies": 0.5, "rewards/chosen": -34.90436553955078, "rewards/margins": 1.6372122764587402, "rewards/rejected": -36.54158020019531, "step": 1800 }, { "epoch": 0.24523420479302832, "grad_norm": 38.19073751743113, "learning_rate": 7.497356697893795e-07, "logits/chosen": 11.76392936706543, "logits/rejected": 12.307897567749023, "logps/chosen": -3.2457950115203857, "logps/rejected": -3.1203744411468506, "loss": 3.3995, "rewards/accuracies": 0.25, "rewards/chosen": -32.457950592041016, "rewards/margins": -1.2542061805725098, "rewards/rejected": -31.203744888305664, "step": 1801 }, { "epoch": 0.24537037037037038, "grad_norm": 44.98079366468212, "learning_rate": 7.496433522833602e-07, "logits/chosen": 11.40532112121582, "logits/rejected": 11.236037254333496, "logps/chosen": -3.025808334350586, "logps/rejected": -3.3491289615631104, "loss": 4.402, "rewards/accuracies": 1.0, "rewards/chosen": -30.258085250854492, "rewards/margins": 3.233205795288086, "rewards/rejected": -33.49129104614258, "step": 1802 }, { "epoch": 0.2455065359477124, "grad_norm": 38.402469407009335, "learning_rate": 7.49550955772535e-07, "logits/chosen": 11.402997970581055, "logits/rejected": 10.712882995605469, "logps/chosen": -3.1795482635498047, "logps/rejected": -3.2934679985046387, "loss": 4.1994, "rewards/accuracies": 0.5, "rewards/chosen": -31.795482635498047, "rewards/margins": 1.1391983032226562, "rewards/rejected": -32.93468475341797, "step": 1803 }, { "epoch": 0.24564270152505446, "grad_norm": 44.60244588825335, "learning_rate": 7.494584802777821e-07, "logits/chosen": 11.122444152832031, "logits/rejected": 11.494237899780273, "logps/chosen": -2.793707847595215, "logps/rejected": -2.9180736541748047, "loss": 4.2957, "rewards/accuracies": 0.5, "rewards/chosen": -27.937076568603516, "rewards/margins": 1.2436599731445312, "rewards/rejected": -29.180736541748047, "step": 1804 }, { "epoch": 0.24577886710239652, "grad_norm": 41.25155666924745, "learning_rate": 7.493659258199969e-07, "logits/chosen": 10.916686058044434, "logits/rejected": 11.861747741699219, "logps/chosen": -3.1392273902893066, "logps/rejected": -3.4279251098632812, "loss": 4.0096, "rewards/accuracies": 0.5, "rewards/chosen": -31.39227294921875, "rewards/margins": 2.886977195739746, "rewards/rejected": -34.27925109863281, "step": 1805 }, { "epoch": 0.24591503267973855, "grad_norm": 40.837280653181786, "learning_rate": 7.492732924200927e-07, "logits/chosen": 12.120750427246094, "logits/rejected": 11.339387893676758, "logps/chosen": -3.0070252418518066, "logps/rejected": -3.7576656341552734, "loss": 3.6585, "rewards/accuracies": 0.5, "rewards/chosen": -30.070255279541016, "rewards/margins": 7.506402015686035, "rewards/rejected": -37.57665252685547, "step": 1806 }, { "epoch": 0.2460511982570806, "grad_norm": 45.496272885561126, "learning_rate": 7.491805800990011e-07, "logits/chosen": 11.23698616027832, "logits/rejected": 11.794516563415527, "logps/chosen": -2.9052908420562744, "logps/rejected": -3.2362399101257324, "loss": 4.3336, "rewards/accuracies": 0.75, "rewards/chosen": -29.05290985107422, "rewards/margins": 3.309488296508789, "rewards/rejected": -32.362396240234375, "step": 1807 }, { "epoch": 0.24618736383442266, "grad_norm": 48.21717825963109, "learning_rate": 7.490877888776712e-07, "logits/chosen": 11.194572448730469, "logits/rejected": 11.180242538452148, "logps/chosen": -3.075193405151367, "logps/rejected": -3.604159355163574, "loss": 4.4196, "rewards/accuracies": 0.75, "rewards/chosen": -30.751934051513672, "rewards/margins": 5.2896599769592285, "rewards/rejected": -36.041595458984375, "step": 1808 }, { "epoch": 0.24632352941176472, "grad_norm": 41.29830536872395, "learning_rate": 7.489949187770695e-07, "logits/chosen": 11.208166122436523, "logits/rejected": 10.878366470336914, "logps/chosen": -3.184464693069458, "logps/rejected": -3.0216829776763916, "loss": 3.5979, "rewards/accuracies": 0.25, "rewards/chosen": -31.844646453857422, "rewards/margins": -1.6278190612792969, "rewards/rejected": -30.216827392578125, "step": 1809 }, { "epoch": 0.24645969498910675, "grad_norm": 40.647087846594474, "learning_rate": 7.489019698181813e-07, "logits/chosen": 10.926816940307617, "logits/rejected": 10.87136459350586, "logps/chosen": -2.8203201293945312, "logps/rejected": -2.834437370300293, "loss": 3.9324, "rewards/accuracies": 0.75, "rewards/chosen": -28.203201293945312, "rewards/margins": 0.1411728858947754, "rewards/rejected": -28.34437370300293, "step": 1810 }, { "epoch": 0.2465958605664488, "grad_norm": 78.4946567873568, "learning_rate": 7.48808942022009e-07, "logits/chosen": 11.932140350341797, "logits/rejected": 12.719474792480469, "logps/chosen": -3.335984230041504, "logps/rejected": -3.646550416946411, "loss": 3.9743, "rewards/accuracies": 0.75, "rewards/chosen": -33.359840393066406, "rewards/margins": 3.1056642532348633, "rewards/rejected": -36.46550369262695, "step": 1811 }, { "epoch": 0.24673202614379086, "grad_norm": 45.242205947413346, "learning_rate": 7.487158354095729e-07, "logits/chosen": 10.617462158203125, "logits/rejected": 10.88599967956543, "logps/chosen": -3.146834135055542, "logps/rejected": -3.215343952178955, "loss": 4.5153, "rewards/accuracies": 0.75, "rewards/chosen": -31.468339920043945, "rewards/margins": 0.6851005554199219, "rewards/rejected": -32.1534423828125, "step": 1812 }, { "epoch": 0.2468681917211329, "grad_norm": 45.79315876689066, "learning_rate": 7.486226500019112e-07, "logits/chosen": 10.374195098876953, "logits/rejected": 11.470439910888672, "logps/chosen": -2.9751577377319336, "logps/rejected": -3.4344089031219482, "loss": 4.3744, "rewards/accuracies": 0.75, "rewards/chosen": -29.751577377319336, "rewards/margins": 4.59251070022583, "rewards/rejected": -34.344085693359375, "step": 1813 }, { "epoch": 0.24700435729847495, "grad_norm": 44.584248555460185, "learning_rate": 7.485293858200801e-07, "logits/chosen": 11.523561477661133, "logits/rejected": 11.459885597229004, "logps/chosen": -3.1340456008911133, "logps/rejected": -3.1530370712280273, "loss": 4.2754, "rewards/accuracies": 0.25, "rewards/chosen": -31.340457916259766, "rewards/margins": 0.18991518020629883, "rewards/rejected": -31.530372619628906, "step": 1814 }, { "epoch": 0.247140522875817, "grad_norm": 43.52783391921809, "learning_rate": 7.484360428851532e-07, "logits/chosen": 10.528692245483398, "logits/rejected": 10.647981643676758, "logps/chosen": -2.803995370864868, "logps/rejected": -3.0291738510131836, "loss": 4.2338, "rewards/accuracies": 0.75, "rewards/chosen": -28.039955139160156, "rewards/margins": 2.251784324645996, "rewards/rejected": -30.291738510131836, "step": 1815 }, { "epoch": 0.24727668845315903, "grad_norm": 39.45905495280708, "learning_rate": 7.483426212182223e-07, "logits/chosen": 10.051349639892578, "logits/rejected": 12.263505935668945, "logps/chosen": -3.05056095123291, "logps/rejected": -3.474191188812256, "loss": 3.4739, "rewards/accuracies": 0.75, "rewards/chosen": -30.50560760498047, "rewards/margins": 4.236303806304932, "rewards/rejected": -34.741912841796875, "step": 1816 }, { "epoch": 0.2474128540305011, "grad_norm": 42.990277591161785, "learning_rate": 7.482491208403967e-07, "logits/chosen": 11.232955932617188, "logits/rejected": 10.97962760925293, "logps/chosen": -3.5362071990966797, "logps/rejected": -3.6433634757995605, "loss": 4.5504, "rewards/accuracies": 0.75, "rewards/chosen": -35.3620719909668, "rewards/margins": 1.071563720703125, "rewards/rejected": -36.43363571166992, "step": 1817 }, { "epoch": 0.24754901960784315, "grad_norm": 42.17018774758674, "learning_rate": 7.481555417728035e-07, "logits/chosen": 11.89334487915039, "logits/rejected": 11.69241714477539, "logps/chosen": -3.259678363800049, "logps/rejected": -3.644967555999756, "loss": 4.0047, "rewards/accuracies": 1.0, "rewards/chosen": -32.59678649902344, "rewards/margins": 3.852890968322754, "rewards/rejected": -36.449676513671875, "step": 1818 }, { "epoch": 0.24768518518518517, "grad_norm": 40.52100220043416, "learning_rate": 7.480618840365879e-07, "logits/chosen": 10.958877563476562, "logits/rejected": 11.397058486938477, "logps/chosen": -3.3662312030792236, "logps/rejected": -3.323101043701172, "loss": 4.4767, "rewards/accuracies": 0.25, "rewards/chosen": -33.66231155395508, "rewards/margins": -0.431302547454834, "rewards/rejected": -33.23101043701172, "step": 1819 }, { "epoch": 0.24782135076252723, "grad_norm": 43.860013686090184, "learning_rate": 7.479681476529123e-07, "logits/chosen": 10.559288024902344, "logits/rejected": 11.244205474853516, "logps/chosen": -2.5041391849517822, "logps/rejected": -2.860440731048584, "loss": 3.378, "rewards/accuracies": 0.75, "rewards/chosen": -25.041393280029297, "rewards/margins": 3.563014507293701, "rewards/rejected": -28.604408264160156, "step": 1820 }, { "epoch": 0.2479575163398693, "grad_norm": 44.610775730276494, "learning_rate": 7.478743326429576e-07, "logits/chosen": 10.915871620178223, "logits/rejected": 9.954790115356445, "logps/chosen": -3.5510356426239014, "logps/rejected": -3.187378168106079, "loss": 3.8417, "rewards/accuracies": 0.0, "rewards/chosen": -35.510353088378906, "rewards/margins": -3.6365742683410645, "rewards/rejected": -31.873781204223633, "step": 1821 }, { "epoch": 0.24809368191721132, "grad_norm": 44.18001689186221, "learning_rate": 7.477804390279217e-07, "logits/chosen": 11.194910049438477, "logits/rejected": 11.751337051391602, "logps/chosen": -3.5802268981933594, "logps/rejected": -3.622215747833252, "loss": 3.8316, "rewards/accuracies": 0.25, "rewards/chosen": -35.802268981933594, "rewards/margins": 0.4198880195617676, "rewards/rejected": -36.2221565246582, "step": 1822 }, { "epoch": 0.24822984749455337, "grad_norm": 58.63525625158882, "learning_rate": 7.47686466829021e-07, "logits/chosen": 11.864641189575195, "logits/rejected": 11.316974639892578, "logps/chosen": -3.2196712493896484, "logps/rejected": -3.2815818786621094, "loss": 5.0467, "rewards/accuracies": 0.25, "rewards/chosen": -32.196712493896484, "rewards/margins": 0.6191072463989258, "rewards/rejected": -32.815818786621094, "step": 1823 }, { "epoch": 0.24836601307189543, "grad_norm": 50.64325304532634, "learning_rate": 7.47592416067489e-07, "logits/chosen": 10.474879264831543, "logits/rejected": 10.59619140625, "logps/chosen": -3.2005324363708496, "logps/rejected": -3.1242516040802, "loss": 4.0033, "rewards/accuracies": 0.5, "rewards/chosen": -32.00532531738281, "rewards/margins": -0.7628097534179688, "rewards/rejected": -31.242515563964844, "step": 1824 }, { "epoch": 0.24850217864923746, "grad_norm": 50.6727792692456, "learning_rate": 7.474982867645774e-07, "logits/chosen": 10.742690086364746, "logits/rejected": 10.760225296020508, "logps/chosen": -2.960318088531494, "logps/rejected": -3.2718732357025146, "loss": 3.8128, "rewards/accuracies": 0.75, "rewards/chosen": -29.603179931640625, "rewards/margins": 3.115553379058838, "rewards/rejected": -32.71873474121094, "step": 1825 }, { "epoch": 0.24863834422657952, "grad_norm": 50.56340856364662, "learning_rate": 7.474040789415554e-07, "logits/chosen": 11.746210098266602, "logits/rejected": 11.44821834564209, "logps/chosen": -3.195469379425049, "logps/rejected": -3.2035932540893555, "loss": 4.0527, "rewards/accuracies": 0.5, "rewards/chosen": -31.954692840576172, "rewards/margins": 0.08123779296875, "rewards/rejected": -32.03593063354492, "step": 1826 }, { "epoch": 0.24877450980392157, "grad_norm": 44.46783278765142, "learning_rate": 7.473097926197102e-07, "logits/chosen": 9.129486083984375, "logits/rejected": 11.453760147094727, "logps/chosen": -3.09806227684021, "logps/rejected": -3.630876064300537, "loss": 4.0616, "rewards/accuracies": 1.0, "rewards/chosen": -30.980623245239258, "rewards/margins": 5.328137397766113, "rewards/rejected": -36.30876159667969, "step": 1827 }, { "epoch": 0.24891067538126363, "grad_norm": 45.42485630223, "learning_rate": 7.472154278203463e-07, "logits/chosen": 10.180963516235352, "logits/rejected": 10.864875793457031, "logps/chosen": -2.809995651245117, "logps/rejected": -3.527498245239258, "loss": 3.8658, "rewards/accuracies": 1.0, "rewards/chosen": -28.099956512451172, "rewards/margins": 7.175024032592773, "rewards/rejected": -35.27497863769531, "step": 1828 }, { "epoch": 0.24904684095860566, "grad_norm": 44.45844419163703, "learning_rate": 7.471209845647865e-07, "logits/chosen": 9.87983512878418, "logits/rejected": 10.3491849899292, "logps/chosen": -2.5307812690734863, "logps/rejected": -2.8183369636535645, "loss": 3.238, "rewards/accuracies": 0.75, "rewards/chosen": -25.307811737060547, "rewards/margins": 2.8755574226379395, "rewards/rejected": -28.183368682861328, "step": 1829 }, { "epoch": 0.24918300653594772, "grad_norm": 45.19274977299418, "learning_rate": 7.470264628743709e-07, "logits/chosen": 10.958763122558594, "logits/rejected": 11.235336303710938, "logps/chosen": -2.8811683654785156, "logps/rejected": -3.079345703125, "loss": 4.4268, "rewards/accuracies": 0.75, "rewards/chosen": -28.81168556213379, "rewards/margins": 1.981773853302002, "rewards/rejected": -30.79345703125, "step": 1830 }, { "epoch": 0.24931917211328977, "grad_norm": 57.7201410454507, "learning_rate": 7.469318627704573e-07, "logits/chosen": 9.703084945678711, "logits/rejected": 11.81503677368164, "logps/chosen": -2.8088483810424805, "logps/rejected": -3.3563475608825684, "loss": 3.6529, "rewards/accuracies": 0.75, "rewards/chosen": -28.088485717773438, "rewards/margins": 5.4749908447265625, "rewards/rejected": -33.5634765625, "step": 1831 }, { "epoch": 0.2494553376906318, "grad_norm": 41.11380578474489, "learning_rate": 7.468371842744218e-07, "logits/chosen": 11.063441276550293, "logits/rejected": 10.992152214050293, "logps/chosen": -2.9563493728637695, "logps/rejected": -3.250638484954834, "loss": 3.6922, "rewards/accuracies": 0.75, "rewards/chosen": -29.563493728637695, "rewards/margins": 2.9428930282592773, "rewards/rejected": -32.506385803222656, "step": 1832 }, { "epoch": 0.24959150326797386, "grad_norm": 41.73627892554979, "learning_rate": 7.467424274076574e-07, "logits/chosen": 9.528297424316406, "logits/rejected": 9.861739158630371, "logps/chosen": -2.9365644454956055, "logps/rejected": -3.274897575378418, "loss": 3.7766, "rewards/accuracies": 0.75, "rewards/chosen": -29.365646362304688, "rewards/margins": 3.3833303451538086, "rewards/rejected": -32.74897766113281, "step": 1833 }, { "epoch": 0.24972766884531591, "grad_norm": 45.42040116858669, "learning_rate": 7.466475921915753e-07, "logits/chosen": 9.46429443359375, "logits/rejected": 9.507148742675781, "logps/chosen": -2.829540729522705, "logps/rejected": -2.982492208480835, "loss": 3.7602, "rewards/accuracies": 0.75, "rewards/chosen": -28.295406341552734, "rewards/margins": 1.5295162200927734, "rewards/rejected": -29.824920654296875, "step": 1834 }, { "epoch": 0.24986383442265794, "grad_norm": 45.838477521452056, "learning_rate": 7.465526786476044e-07, "logits/chosen": 9.593233108520508, "logits/rejected": 10.659894943237305, "logps/chosen": -3.0374457836151123, "logps/rejected": -3.5250635147094727, "loss": 4.2552, "rewards/accuracies": 1.0, "rewards/chosen": -30.37445831298828, "rewards/margins": 4.87617826461792, "rewards/rejected": -35.250633239746094, "step": 1835 }, { "epoch": 0.25, "grad_norm": 41.7578601248821, "learning_rate": 7.464576867971911e-07, "logits/chosen": 10.867700576782227, "logits/rejected": 11.39498233795166, "logps/chosen": -3.4246666431427, "logps/rejected": -3.3294219970703125, "loss": 3.9807, "rewards/accuracies": 0.25, "rewards/chosen": -34.246665954589844, "rewards/margins": -0.9524445533752441, "rewards/rejected": -33.294219970703125, "step": 1836 }, { "epoch": 0.25013616557734203, "grad_norm": 41.94129448008892, "learning_rate": 7.463626166617996e-07, "logits/chosen": 10.496920585632324, "logits/rejected": 11.408843994140625, "logps/chosen": -2.8408021926879883, "logps/rejected": -3.2360072135925293, "loss": 3.6602, "rewards/accuracies": 0.5, "rewards/chosen": -28.408023834228516, "rewards/margins": 3.9520506858825684, "rewards/rejected": -32.36007308959961, "step": 1837 }, { "epoch": 0.2502723311546841, "grad_norm": 51.535974567329, "learning_rate": 7.462674682629119e-07, "logits/chosen": 10.072620391845703, "logits/rejected": 11.523749351501465, "logps/chosen": -2.877671241760254, "logps/rejected": -3.3589913845062256, "loss": 4.1797, "rewards/accuracies": 1.0, "rewards/chosen": -28.77671241760254, "rewards/margins": 4.813200950622559, "rewards/rejected": -33.58991241455078, "step": 1838 }, { "epoch": 0.25040849673202614, "grad_norm": 45.52344109025172, "learning_rate": 7.461722416220273e-07, "logits/chosen": 11.193941116333008, "logits/rejected": 10.772687911987305, "logps/chosen": -3.042065143585205, "logps/rejected": -3.0001955032348633, "loss": 4.4774, "rewards/accuracies": 0.5, "rewards/chosen": -30.420650482177734, "rewards/margins": -0.4186978340148926, "rewards/rejected": -30.001955032348633, "step": 1839 }, { "epoch": 0.25054466230936817, "grad_norm": 51.8454872300225, "learning_rate": 7.460769367606632e-07, "logits/chosen": 9.205709457397461, "logits/rejected": 11.279658317565918, "logps/chosen": -2.4233245849609375, "logps/rejected": -2.919146776199341, "loss": 3.9626, "rewards/accuracies": 0.75, "rewards/chosen": -24.233245849609375, "rewards/margins": 4.958222389221191, "rewards/rejected": -29.19146728515625, "step": 1840 }, { "epoch": 0.25068082788671026, "grad_norm": 97.13772175007963, "learning_rate": 7.459815537003548e-07, "logits/chosen": 10.261220932006836, "logits/rejected": 11.043708801269531, "logps/chosen": -3.2051937580108643, "logps/rejected": -3.645906925201416, "loss": 4.4276, "rewards/accuracies": 1.0, "rewards/chosen": -32.051937103271484, "rewards/margins": 4.407131671905518, "rewards/rejected": -36.459068298339844, "step": 1841 }, { "epoch": 0.2508169934640523, "grad_norm": 59.06448741138018, "learning_rate": 7.458860924626541e-07, "logits/chosen": 10.469205856323242, "logits/rejected": 10.647359848022461, "logps/chosen": -3.2528634071350098, "logps/rejected": -3.421938419342041, "loss": 4.5566, "rewards/accuracies": 0.75, "rewards/chosen": -32.52863311767578, "rewards/margins": 1.690751075744629, "rewards/rejected": -34.219383239746094, "step": 1842 }, { "epoch": 0.2509531590413943, "grad_norm": 47.09053852120964, "learning_rate": 7.457905530691319e-07, "logits/chosen": 10.354785919189453, "logits/rejected": 11.720205307006836, "logps/chosen": -3.4287750720977783, "logps/rejected": -3.7128946781158447, "loss": 3.9991, "rewards/accuracies": 0.75, "rewards/chosen": -34.287750244140625, "rewards/margins": 2.8411965370178223, "rewards/rejected": -37.12894821166992, "step": 1843 }, { "epoch": 0.2510893246187364, "grad_norm": 45.937014766400885, "learning_rate": 7.456949355413759e-07, "logits/chosen": 11.997598648071289, "logits/rejected": 12.140399932861328, "logps/chosen": -3.3944854736328125, "logps/rejected": -3.522061347961426, "loss": 4.0307, "rewards/accuracies": 0.75, "rewards/chosen": -33.944854736328125, "rewards/margins": 1.2757596969604492, "rewards/rejected": -35.220611572265625, "step": 1844 }, { "epoch": 0.2512254901960784, "grad_norm": 42.120898156512, "learning_rate": 7.455992399009917e-07, "logits/chosen": 10.983596801757812, "logits/rejected": 10.322748184204102, "logps/chosen": -3.3345651626586914, "logps/rejected": -3.291161298751831, "loss": 4.4593, "rewards/accuracies": 0.5, "rewards/chosen": -33.34564971923828, "rewards/margins": -0.4340395927429199, "rewards/rejected": -32.9116096496582, "step": 1845 }, { "epoch": 0.25136165577342046, "grad_norm": 41.495664455428475, "learning_rate": 7.455034661696023e-07, "logits/chosen": 10.75949478149414, "logits/rejected": 11.548370361328125, "logps/chosen": -3.5414350032806396, "logps/rejected": -3.468031406402588, "loss": 3.9611, "rewards/accuracies": 0.5, "rewards/chosen": -35.41435241699219, "rewards/margins": -0.734036922454834, "rewards/rejected": -34.68031311035156, "step": 1846 }, { "epoch": 0.25149782135076254, "grad_norm": 47.475883826516196, "learning_rate": 7.454076143688489e-07, "logits/chosen": 10.881038665771484, "logits/rejected": 10.475961685180664, "logps/chosen": -3.516660690307617, "logps/rejected": -3.3692736625671387, "loss": 4.2993, "rewards/accuracies": 0.25, "rewards/chosen": -35.16660690307617, "rewards/margins": -1.4738712310791016, "rewards/rejected": -33.6927375793457, "step": 1847 }, { "epoch": 0.25163398692810457, "grad_norm": 48.691205559655536, "learning_rate": 7.453116845203898e-07, "logits/chosen": 12.012901306152344, "logits/rejected": 11.63891887664795, "logps/chosen": -3.7594549655914307, "logps/rejected": -3.7370386123657227, "loss": 4.1444, "rewards/accuracies": 0.25, "rewards/chosen": -37.59455108642578, "rewards/margins": -0.22416400909423828, "rewards/rejected": -37.370384216308594, "step": 1848 }, { "epoch": 0.2517701525054466, "grad_norm": 43.92632722192314, "learning_rate": 7.452156766459013e-07, "logits/chosen": 9.726093292236328, "logits/rejected": 10.54836654663086, "logps/chosen": -3.290809392929077, "logps/rejected": -3.6981141567230225, "loss": 4.3802, "rewards/accuracies": 1.0, "rewards/chosen": -32.90809631347656, "rewards/margins": 4.073047161102295, "rewards/rejected": -36.98114013671875, "step": 1849 }, { "epoch": 0.2519063180827887, "grad_norm": 48.030839103164546, "learning_rate": 7.451195907670769e-07, "logits/chosen": 11.534768104553223, "logits/rejected": 10.290410041809082, "logps/chosen": -3.5864317417144775, "logps/rejected": -3.3840878009796143, "loss": 4.4313, "rewards/accuracies": 0.5, "rewards/chosen": -35.86431884765625, "rewards/margins": -2.023441791534424, "rewards/rejected": -33.84087371826172, "step": 1850 }, { "epoch": 0.2520424836601307, "grad_norm": 46.71965024591784, "learning_rate": 7.450234269056284e-07, "logits/chosen": 10.894220352172852, "logits/rejected": 11.114397048950195, "logps/chosen": -3.334218978881836, "logps/rejected": -3.676969528198242, "loss": 4.0184, "rewards/accuracies": 1.0, "rewards/chosen": -33.34218978881836, "rewards/margins": 3.4275054931640625, "rewards/rejected": -36.769691467285156, "step": 1851 }, { "epoch": 0.25217864923747274, "grad_norm": 49.536553368425594, "learning_rate": 7.449271850832845e-07, "logits/chosen": 10.089509963989258, "logits/rejected": 10.648088455200195, "logps/chosen": -2.9998302459716797, "logps/rejected": -3.2648632526397705, "loss": 4.1152, "rewards/accuracies": 0.75, "rewards/chosen": -29.99830436706543, "rewards/margins": 2.650330066680908, "rewards/rejected": -32.64863586425781, "step": 1852 }, { "epoch": 0.2523148148148148, "grad_norm": 46.739816595533306, "learning_rate": 7.448308653217919e-07, "logits/chosen": 9.826395034790039, "logits/rejected": 11.127756118774414, "logps/chosen": -3.0855460166931152, "logps/rejected": -3.3103885650634766, "loss": 4.5106, "rewards/accuracies": 0.75, "rewards/chosen": -30.85546112060547, "rewards/margins": 2.2484264373779297, "rewards/rejected": -33.10388946533203, "step": 1853 }, { "epoch": 0.25245098039215685, "grad_norm": 52.63232402321272, "learning_rate": 7.447344676429149e-07, "logits/chosen": 10.524055480957031, "logits/rejected": 11.374723434448242, "logps/chosen": -3.457684278488159, "logps/rejected": -3.359111785888672, "loss": 4.6975, "rewards/accuracies": 0.25, "rewards/chosen": -34.57684326171875, "rewards/margins": -0.9857254028320312, "rewards/rejected": -33.59111785888672, "step": 1854 }, { "epoch": 0.2525871459694989, "grad_norm": 43.02970716786774, "learning_rate": 7.446379920684354e-07, "logits/chosen": 9.773387908935547, "logits/rejected": 10.944501876831055, "logps/chosen": -2.895115375518799, "logps/rejected": -3.400662899017334, "loss": 3.7123, "rewards/accuracies": 1.0, "rewards/chosen": -28.951154708862305, "rewards/margins": 5.05547571182251, "rewards/rejected": -34.006629943847656, "step": 1855 }, { "epoch": 0.25272331154684097, "grad_norm": 48.21045250290908, "learning_rate": 7.445414386201527e-07, "logits/chosen": 9.362489700317383, "logits/rejected": 11.241830825805664, "logps/chosen": -3.1108665466308594, "logps/rejected": -3.291721820831299, "loss": 4.4891, "rewards/accuracies": 0.5, "rewards/chosen": -31.108665466308594, "rewards/margins": 1.8085527420043945, "rewards/rejected": -32.91721725463867, "step": 1856 }, { "epoch": 0.252859477124183, "grad_norm": 45.497958156359616, "learning_rate": 7.44444807319884e-07, "logits/chosen": 10.318010330200195, "logits/rejected": 10.585862159729004, "logps/chosen": -3.0940041542053223, "logps/rejected": -3.1909689903259277, "loss": 4.1129, "rewards/accuracies": 0.5, "rewards/chosen": -30.940040588378906, "rewards/margins": 0.9696478843688965, "rewards/rejected": -31.909687042236328, "step": 1857 }, { "epoch": 0.2529956427015251, "grad_norm": 49.73097694379481, "learning_rate": 7.443480981894637e-07, "logits/chosen": 11.520971298217773, "logits/rejected": 10.65789794921875, "logps/chosen": -3.2938153743743896, "logps/rejected": -3.3591246604919434, "loss": 4.2222, "rewards/accuracies": 0.75, "rewards/chosen": -32.93815612792969, "rewards/margins": 0.6530919075012207, "rewards/rejected": -33.59124755859375, "step": 1858 }, { "epoch": 0.2531318082788671, "grad_norm": 45.14823864152612, "learning_rate": 7.442513112507445e-07, "logits/chosen": 10.131548881530762, "logits/rejected": 11.89011287689209, "logps/chosen": -3.2548091411590576, "logps/rejected": -3.246645450592041, "loss": 3.9556, "rewards/accuracies": 0.5, "rewards/chosen": -32.548091888427734, "rewards/margins": -0.08163738250732422, "rewards/rejected": -32.466453552246094, "step": 1859 }, { "epoch": 0.25326797385620914, "grad_norm": 49.67209242709432, "learning_rate": 7.441544465255956e-07, "logits/chosen": 11.01941967010498, "logits/rejected": 10.736672401428223, "logps/chosen": -3.0432841777801514, "logps/rejected": -3.159777879714966, "loss": 3.8431, "rewards/accuracies": 0.75, "rewards/chosen": -30.432842254638672, "rewards/margins": 1.1649370193481445, "rewards/rejected": -31.597780227661133, "step": 1860 }, { "epoch": 0.2534041394335512, "grad_norm": 43.7471839097304, "learning_rate": 7.44057504035905e-07, "logits/chosen": 10.305191993713379, "logits/rejected": 9.999126434326172, "logps/chosen": -3.1956825256347656, "logps/rejected": -3.2296206951141357, "loss": 4.5936, "rewards/accuracies": 0.5, "rewards/chosen": -31.95682716369629, "rewards/margins": 0.33938026428222656, "rewards/rejected": -32.296207427978516, "step": 1861 }, { "epoch": 0.25354030501089325, "grad_norm": 46.84227961900398, "learning_rate": 7.439604838035771e-07, "logits/chosen": 8.980025291442871, "logits/rejected": 9.601190567016602, "logps/chosen": -2.7873618602752686, "logps/rejected": -3.096656084060669, "loss": 4.4256, "rewards/accuracies": 0.75, "rewards/chosen": -27.87361717224121, "rewards/margins": 3.092942237854004, "rewards/rejected": -30.96656036376953, "step": 1862 }, { "epoch": 0.2536764705882353, "grad_norm": 39.50771328169241, "learning_rate": 7.438633858505348e-07, "logits/chosen": 10.4456148147583, "logits/rejected": 11.286053657531738, "logps/chosen": -2.973677396774292, "logps/rejected": -3.330150604248047, "loss": 3.6599, "rewards/accuracies": 0.75, "rewards/chosen": -29.736774444580078, "rewards/margins": 3.5647315979003906, "rewards/rejected": -33.30150604248047, "step": 1863 }, { "epoch": 0.25381263616557737, "grad_norm": 49.76381688925091, "learning_rate": 7.437662101987181e-07, "logits/chosen": 8.457077980041504, "logits/rejected": 9.393598556518555, "logps/chosen": -2.9690194129943848, "logps/rejected": -3.3772823810577393, "loss": 4.6404, "rewards/accuracies": 1.0, "rewards/chosen": -29.69019317626953, "rewards/margins": 4.082632541656494, "rewards/rejected": -33.772823333740234, "step": 1864 }, { "epoch": 0.2539488017429194, "grad_norm": 48.96617296684822, "learning_rate": 7.436689568700845e-07, "logits/chosen": 10.357841491699219, "logits/rejected": 11.884169578552246, "logps/chosen": -3.109795570373535, "logps/rejected": -3.6586263179779053, "loss": 3.8377, "rewards/accuracies": 1.0, "rewards/chosen": -31.09795379638672, "rewards/margins": 5.488308429718018, "rewards/rejected": -36.586265563964844, "step": 1865 }, { "epoch": 0.2540849673202614, "grad_norm": 44.192588616671515, "learning_rate": 7.435716258866093e-07, "logits/chosen": 11.519071578979492, "logits/rejected": 10.758563995361328, "logps/chosen": -3.5568490028381348, "logps/rejected": -3.5246074199676514, "loss": 4.2099, "rewards/accuracies": 0.5, "rewards/chosen": -35.56848907470703, "rewards/margins": -0.3224153518676758, "rewards/rejected": -35.24607467651367, "step": 1866 }, { "epoch": 0.2542211328976035, "grad_norm": 39.31466680281769, "learning_rate": 7.434742172702854e-07, "logits/chosen": 9.698490142822266, "logits/rejected": 11.301961898803711, "logps/chosen": -3.632863998413086, "logps/rejected": -3.6990766525268555, "loss": 4.1961, "rewards/accuracies": 0.75, "rewards/chosen": -36.32863998413086, "rewards/margins": 0.6621236801147461, "rewards/rejected": -36.99076843261719, "step": 1867 }, { "epoch": 0.25435729847494554, "grad_norm": 46.97588563643719, "learning_rate": 7.433767310431228e-07, "logits/chosen": 10.471126556396484, "logits/rejected": 10.836179733276367, "logps/chosen": -2.9163060188293457, "logps/rejected": -3.283661127090454, "loss": 3.9284, "rewards/accuracies": 0.75, "rewards/chosen": -29.163063049316406, "rewards/margins": 3.673549175262451, "rewards/rejected": -32.83660888671875, "step": 1868 }, { "epoch": 0.25449346405228757, "grad_norm": 47.42137095134382, "learning_rate": 7.432791672271495e-07, "logits/chosen": 10.865997314453125, "logits/rejected": 10.837446212768555, "logps/chosen": -3.226957321166992, "logps/rejected": -3.130338668823242, "loss": 4.3067, "rewards/accuracies": 0.5, "rewards/chosen": -32.269569396972656, "rewards/margins": -0.9661855697631836, "rewards/rejected": -31.303386688232422, "step": 1869 }, { "epoch": 0.25462962962962965, "grad_norm": 41.152272429845226, "learning_rate": 7.431815258444107e-07, "logits/chosen": 9.256330490112305, "logits/rejected": 10.774629592895508, "logps/chosen": -2.582803249359131, "logps/rejected": -3.115966796875, "loss": 3.664, "rewards/accuracies": 1.0, "rewards/chosen": -25.828033447265625, "rewards/margins": 5.331635475158691, "rewards/rejected": -31.15966796875, "step": 1870 }, { "epoch": 0.2547657952069717, "grad_norm": 50.52322132313479, "learning_rate": 7.430838069169695e-07, "logits/chosen": 10.098700523376465, "logits/rejected": 11.171964645385742, "logps/chosen": -2.8984317779541016, "logps/rejected": -3.2122154235839844, "loss": 3.9558, "rewards/accuracies": 1.0, "rewards/chosen": -28.984317779541016, "rewards/margins": 3.1378374099731445, "rewards/rejected": -32.122154235839844, "step": 1871 }, { "epoch": 0.2549019607843137, "grad_norm": 42.455229229582656, "learning_rate": 7.42986010466906e-07, "logits/chosen": 10.34404182434082, "logits/rejected": 10.829983711242676, "logps/chosen": -3.150979995727539, "logps/rejected": -3.320309638977051, "loss": 3.3075, "rewards/accuracies": 0.75, "rewards/chosen": -31.50979995727539, "rewards/margins": 1.6932978630065918, "rewards/rejected": -33.203094482421875, "step": 1872 }, { "epoch": 0.2550381263616558, "grad_norm": 46.424245553002734, "learning_rate": 7.428881365163183e-07, "logits/chosen": 8.82518482208252, "logits/rejected": 10.09974479675293, "logps/chosen": -3.0085222721099854, "logps/rejected": -3.203704357147217, "loss": 3.5272, "rewards/accuracies": 0.75, "rewards/chosen": -30.085224151611328, "rewards/margins": 1.9518203735351562, "rewards/rejected": -32.037044525146484, "step": 1873 }, { "epoch": 0.2551742919389978, "grad_norm": 51.5577924034908, "learning_rate": 7.427901850873219e-07, "logits/chosen": 9.361685752868652, "logits/rejected": 11.098331451416016, "logps/chosen": -2.8768038749694824, "logps/rejected": -3.280409812927246, "loss": 3.9214, "rewards/accuracies": 0.75, "rewards/chosen": -28.76803970336914, "rewards/margins": 4.036057949066162, "rewards/rejected": -32.80409622192383, "step": 1874 }, { "epoch": 0.25531045751633985, "grad_norm": 130.1290544782773, "learning_rate": 7.426921562020497e-07, "logits/chosen": 10.600032806396484, "logits/rejected": 10.718918800354004, "logps/chosen": -2.692725896835327, "logps/rejected": -3.0869693756103516, "loss": 5.9647, "rewards/accuracies": 0.75, "rewards/chosen": -26.927257537841797, "rewards/margins": 3.9424362182617188, "rewards/rejected": -30.869693756103516, "step": 1875 }, { "epoch": 0.25544662309368193, "grad_norm": 40.93318496765094, "learning_rate": 7.42594049882652e-07, "logits/chosen": 11.420249938964844, "logits/rejected": 11.480515480041504, "logps/chosen": -3.2129597663879395, "logps/rejected": -3.101294994354248, "loss": 3.9835, "rewards/accuracies": 0.5, "rewards/chosen": -32.129600524902344, "rewards/margins": -1.1166510581970215, "rewards/rejected": -31.01294708251953, "step": 1876 }, { "epoch": 0.25558278867102396, "grad_norm": 55.84115346608271, "learning_rate": 7.424958661512968e-07, "logits/chosen": 9.997581481933594, "logits/rejected": 11.544149398803711, "logps/chosen": -2.6606273651123047, "logps/rejected": -3.2823216915130615, "loss": 4.4707, "rewards/accuracies": 1.0, "rewards/chosen": -26.606273651123047, "rewards/margins": 6.216944217681885, "rewards/rejected": -32.823219299316406, "step": 1877 }, { "epoch": 0.255718954248366, "grad_norm": 45.73843742869195, "learning_rate": 7.423976050301696e-07, "logits/chosen": 11.301568984985352, "logits/rejected": 11.574542999267578, "logps/chosen": -3.6273984909057617, "logps/rejected": -3.7720417976379395, "loss": 3.8053, "rewards/accuracies": 0.5, "rewards/chosen": -36.273983001708984, "rewards/margins": 1.4464354515075684, "rewards/rejected": -37.72041702270508, "step": 1878 }, { "epoch": 0.2558551198257081, "grad_norm": 41.45699750734228, "learning_rate": 7.422992665414732e-07, "logits/chosen": 9.989646911621094, "logits/rejected": 11.533361434936523, "logps/chosen": -2.7165048122406006, "logps/rejected": -2.9795405864715576, "loss": 3.9902, "rewards/accuracies": 0.75, "rewards/chosen": -27.16504669189453, "rewards/margins": 2.630357265472412, "rewards/rejected": -29.795406341552734, "step": 1879 }, { "epoch": 0.2559912854030501, "grad_norm": 43.30202767365732, "learning_rate": 7.422008507074281e-07, "logits/chosen": 11.374940872192383, "logits/rejected": 11.360275268554688, "logps/chosen": -3.2467825412750244, "logps/rejected": -3.1959757804870605, "loss": 4.2817, "rewards/accuracies": 0.5, "rewards/chosen": -32.46782684326172, "rewards/margins": -0.5080666542053223, "rewards/rejected": -31.95975685119629, "step": 1880 }, { "epoch": 0.25612745098039214, "grad_norm": 46.92519004718002, "learning_rate": 7.42102357550272e-07, "logits/chosen": 11.375726699829102, "logits/rejected": 10.244794845581055, "logps/chosen": -3.2165164947509766, "logps/rejected": -3.048323631286621, "loss": 4.3772, "rewards/accuracies": 0.25, "rewards/chosen": -32.165164947509766, "rewards/margins": -1.6819286346435547, "rewards/rejected": -30.48323631286621, "step": 1881 }, { "epoch": 0.2562636165577342, "grad_norm": 42.837694008813635, "learning_rate": 7.420037870922605e-07, "logits/chosen": 10.280059814453125, "logits/rejected": 11.35774040222168, "logps/chosen": -2.990893840789795, "logps/rejected": -3.1128101348876953, "loss": 4.0219, "rewards/accuracies": 0.75, "rewards/chosen": -29.908939361572266, "rewards/margins": 1.2191600799560547, "rewards/rejected": -31.12809944152832, "step": 1882 }, { "epoch": 0.25639978213507625, "grad_norm": 43.610412610994594, "learning_rate": 7.419051393556663e-07, "logits/chosen": 10.161006927490234, "logits/rejected": 11.147485733032227, "logps/chosen": -3.0931851863861084, "logps/rejected": -3.312276840209961, "loss": 4.0472, "rewards/accuracies": 0.5, "rewards/chosen": -30.93185043334961, "rewards/margins": 2.1909189224243164, "rewards/rejected": -33.122772216796875, "step": 1883 }, { "epoch": 0.2565359477124183, "grad_norm": 44.52997229552045, "learning_rate": 7.418064143627796e-07, "logits/chosen": 10.633783340454102, "logits/rejected": 11.047900199890137, "logps/chosen": -2.995486259460449, "logps/rejected": -3.2721691131591797, "loss": 3.9572, "rewards/accuracies": 1.0, "rewards/chosen": -29.954864501953125, "rewards/margins": 2.7668256759643555, "rewards/rejected": -32.7216911315918, "step": 1884 }, { "epoch": 0.25667211328976036, "grad_norm": 50.95568677121986, "learning_rate": 7.417076121359081e-07, "logits/chosen": 11.224218368530273, "logits/rejected": 12.100414276123047, "logps/chosen": -3.4752016067504883, "logps/rejected": -3.3584885597229004, "loss": 4.6355, "rewards/accuracies": 0.5, "rewards/chosen": -34.752017974853516, "rewards/margins": -1.1671319007873535, "rewards/rejected": -33.58488464355469, "step": 1885 }, { "epoch": 0.2568082788671024, "grad_norm": 44.97701322940007, "learning_rate": 7.416087326973771e-07, "logits/chosen": 11.71926498413086, "logits/rejected": 12.015233993530273, "logps/chosen": -3.4984679222106934, "logps/rejected": -3.9219632148742676, "loss": 4.4492, "rewards/accuracies": 1.0, "rewards/chosen": -34.98468017578125, "rewards/margins": 4.234951019287109, "rewards/rejected": -39.21963119506836, "step": 1886 }, { "epoch": 0.2569444444444444, "grad_norm": 43.36338712099027, "learning_rate": 7.415097760695292e-07, "logits/chosen": 9.870473861694336, "logits/rejected": 10.675151824951172, "logps/chosen": -3.132819652557373, "logps/rejected": -3.5796046257019043, "loss": 3.5709, "rewards/accuracies": 0.25, "rewards/chosen": -31.328197479248047, "rewards/margins": 4.467851161956787, "rewards/rejected": -35.796051025390625, "step": 1887 }, { "epoch": 0.2570806100217865, "grad_norm": 40.78301517814631, "learning_rate": 7.414107422747245e-07, "logits/chosen": 9.500000953674316, "logits/rejected": 11.280536651611328, "logps/chosen": -3.103407144546509, "logps/rejected": -3.6780946254730225, "loss": 3.8364, "rewards/accuracies": 1.0, "rewards/chosen": -31.03407096862793, "rewards/margins": 5.746873378753662, "rewards/rejected": -36.78094482421875, "step": 1888 }, { "epoch": 0.25721677559912853, "grad_norm": 43.366327874881826, "learning_rate": 7.413116313353404e-07, "logits/chosen": 11.18326187133789, "logits/rejected": 11.76833724975586, "logps/chosen": -3.1525518894195557, "logps/rejected": -3.500143051147461, "loss": 4.4383, "rewards/accuracies": 1.0, "rewards/chosen": -31.5255184173584, "rewards/margins": 3.475912570953369, "rewards/rejected": -35.00143051147461, "step": 1889 }, { "epoch": 0.25735294117647056, "grad_norm": 41.94765019112343, "learning_rate": 7.412124432737719e-07, "logits/chosen": 10.966299057006836, "logits/rejected": 11.980335235595703, "logps/chosen": -3.4392824172973633, "logps/rejected": -3.5522677898406982, "loss": 4.239, "rewards/accuracies": 1.0, "rewards/chosen": -34.392826080322266, "rewards/margins": 1.1298503875732422, "rewards/rejected": -35.522674560546875, "step": 1890 }, { "epoch": 0.25748910675381265, "grad_norm": 41.46863300067039, "learning_rate": 7.411131781124313e-07, "logits/chosen": 10.772804260253906, "logits/rejected": 11.56441879272461, "logps/chosen": -3.266876697540283, "logps/rejected": -3.625901937484741, "loss": 4.244, "rewards/accuracies": 0.5, "rewards/chosen": -32.668766021728516, "rewards/margins": 3.590251922607422, "rewards/rejected": -36.25901794433594, "step": 1891 }, { "epoch": 0.2576252723311547, "grad_norm": 42.408070535676046, "learning_rate": 7.410138358737485e-07, "logits/chosen": 11.468881607055664, "logits/rejected": 10.461647033691406, "logps/chosen": -3.437687635421753, "logps/rejected": -3.2235145568847656, "loss": 4.643, "rewards/accuracies": 0.0, "rewards/chosen": -34.37687683105469, "rewards/margins": -2.141728401184082, "rewards/rejected": -32.23514938354492, "step": 1892 }, { "epoch": 0.2577614379084967, "grad_norm": 47.711971318741874, "learning_rate": 7.409144165801706e-07, "logits/chosen": 10.996413230895996, "logits/rejected": 12.158422470092773, "logps/chosen": -3.0189735889434814, "logps/rejected": -3.4091005325317383, "loss": 3.9518, "rewards/accuracies": 0.75, "rewards/chosen": -30.18973731994629, "rewards/margins": 3.90126895904541, "rewards/rejected": -34.09100341796875, "step": 1893 }, { "epoch": 0.2578976034858388, "grad_norm": 41.531569253708064, "learning_rate": 7.408149202541622e-07, "logits/chosen": 11.0115966796875, "logits/rejected": 10.583379745483398, "logps/chosen": -3.571857213973999, "logps/rejected": -3.477932929992676, "loss": 3.8932, "rewards/accuracies": 0.25, "rewards/chosen": -35.718570709228516, "rewards/margins": -0.9392428398132324, "rewards/rejected": -34.779327392578125, "step": 1894 }, { "epoch": 0.2580337690631808, "grad_norm": 41.77898437836639, "learning_rate": 7.407153469182054e-07, "logits/chosen": 11.44415283203125, "logits/rejected": 12.322565078735352, "logps/chosen": -3.0076212882995605, "logps/rejected": -3.189598321914673, "loss": 3.768, "rewards/accuracies": 0.75, "rewards/chosen": -30.076213836669922, "rewards/margins": 1.8197693824768066, "rewards/rejected": -31.89598274230957, "step": 1895 }, { "epoch": 0.2581699346405229, "grad_norm": 47.34666486087812, "learning_rate": 7.406156965947996e-07, "logits/chosen": 10.354137420654297, "logits/rejected": 11.206109046936035, "logps/chosen": -3.428583860397339, "logps/rejected": -3.0068440437316895, "loss": 4.3359, "rewards/accuracies": 0.25, "rewards/chosen": -34.28583526611328, "rewards/margins": -4.2173991203308105, "rewards/rejected": -30.068439483642578, "step": 1896 }, { "epoch": 0.25830610021786493, "grad_norm": 44.590326708450405, "learning_rate": 7.405159693064617e-07, "logits/chosen": 10.415945053100586, "logits/rejected": 10.568880081176758, "logps/chosen": -3.1816697120666504, "logps/rejected": -3.4512062072753906, "loss": 3.9093, "rewards/accuracies": 0.5, "rewards/chosen": -31.81669807434082, "rewards/margins": 2.695363998413086, "rewards/rejected": -34.512062072753906, "step": 1897 }, { "epoch": 0.25844226579520696, "grad_norm": 41.73021314525678, "learning_rate": 7.404161650757256e-07, "logits/chosen": 9.746925354003906, "logits/rejected": 10.69199275970459, "logps/chosen": -2.8606929779052734, "logps/rejected": -3.190673828125, "loss": 4.0926, "rewards/accuracies": 0.75, "rewards/chosen": -28.606929779052734, "rewards/margins": 3.2998080253601074, "rewards/rejected": -31.90673828125, "step": 1898 }, { "epoch": 0.25857843137254904, "grad_norm": 39.390113380929385, "learning_rate": 7.403162839251433e-07, "logits/chosen": 10.328797340393066, "logits/rejected": 9.359493255615234, "logps/chosen": -3.322237491607666, "logps/rejected": -3.4114327430725098, "loss": 4.0547, "rewards/accuracies": 0.75, "rewards/chosen": -33.222373962402344, "rewards/margins": 0.8919510841369629, "rewards/rejected": -34.11432647705078, "step": 1899 }, { "epoch": 0.2587145969498911, "grad_norm": 49.99835588395357, "learning_rate": 7.402163258772834e-07, "logits/chosen": 11.148311614990234, "logits/rejected": 11.447904586791992, "logps/chosen": -3.1640758514404297, "logps/rejected": -3.538086175918579, "loss": 4.4098, "rewards/accuracies": 1.0, "rewards/chosen": -31.640756607055664, "rewards/margins": 3.740105628967285, "rewards/rejected": -35.380863189697266, "step": 1900 }, { "epoch": 0.2588507625272331, "grad_norm": 39.78974962926837, "learning_rate": 7.401162909547324e-07, "logits/chosen": 9.733465194702148, "logits/rejected": 10.928139686584473, "logps/chosen": -3.124742031097412, "logps/rejected": -4.041587829589844, "loss": 4.4321, "rewards/accuracies": 1.0, "rewards/chosen": -31.247421264648438, "rewards/margins": 9.16845417022705, "rewards/rejected": -40.41587448120117, "step": 1901 }, { "epoch": 0.2589869281045752, "grad_norm": 42.14710605342385, "learning_rate": 7.400161791800942e-07, "logits/chosen": 10.717540740966797, "logits/rejected": 10.051481246948242, "logps/chosen": -3.585089683532715, "logps/rejected": -3.4638116359710693, "loss": 4.1783, "rewards/accuracies": 0.25, "rewards/chosen": -35.85089874267578, "rewards/margins": -1.212780475616455, "rewards/rejected": -34.63811492919922, "step": 1902 }, { "epoch": 0.2591230936819172, "grad_norm": 42.92034734740435, "learning_rate": 7.399159905759895e-07, "logits/chosen": 12.061437606811523, "logits/rejected": 11.892739295959473, "logps/chosen": -3.7963101863861084, "logps/rejected": -4.038656711578369, "loss": 4.4927, "rewards/accuracies": 1.0, "rewards/chosen": -37.963104248046875, "rewards/margins": 2.423464775085449, "rewards/rejected": -40.386566162109375, "step": 1903 }, { "epoch": 0.25925925925925924, "grad_norm": 43.281353582635454, "learning_rate": 7.398157251650571e-07, "logits/chosen": 10.826711654663086, "logits/rejected": 11.339479446411133, "logps/chosen": -2.9349091053009033, "logps/rejected": -3.2072839736938477, "loss": 4.2029, "rewards/accuracies": 0.5, "rewards/chosen": -29.349090576171875, "rewards/margins": 2.7237491607666016, "rewards/rejected": -32.072837829589844, "step": 1904 }, { "epoch": 0.25939542483660133, "grad_norm": 41.671755233441985, "learning_rate": 7.397153829699526e-07, "logits/chosen": 11.384782791137695, "logits/rejected": 11.423739433288574, "logps/chosen": -3.2073676586151123, "logps/rejected": -3.4062509536743164, "loss": 4.0619, "rewards/accuracies": 0.75, "rewards/chosen": -32.07367706298828, "rewards/margins": 1.9888324737548828, "rewards/rejected": -34.06250762939453, "step": 1905 }, { "epoch": 0.25953159041394336, "grad_norm": 40.255632479262474, "learning_rate": 7.396149640133492e-07, "logits/chosen": 11.114269256591797, "logits/rejected": 11.497520446777344, "logps/chosen": -3.0723400115966797, "logps/rejected": -3.489351749420166, "loss": 4.0199, "rewards/accuracies": 1.0, "rewards/chosen": -30.723400115966797, "rewards/margins": 4.170118808746338, "rewards/rejected": -34.89352035522461, "step": 1906 }, { "epoch": 0.2596677559912854, "grad_norm": 43.19334850638961, "learning_rate": 7.395144683179375e-07, "logits/chosen": 10.644603729248047, "logits/rejected": 11.461169242858887, "logps/chosen": -3.471329689025879, "logps/rejected": -3.733816146850586, "loss": 4.2837, "rewards/accuracies": 0.75, "rewards/chosen": -34.713294982910156, "rewards/margins": 2.6248655319213867, "rewards/rejected": -37.33816146850586, "step": 1907 }, { "epoch": 0.25980392156862747, "grad_norm": 39.882745677550645, "learning_rate": 7.394138959064251e-07, "logits/chosen": 12.067195892333984, "logits/rejected": 11.145037651062012, "logps/chosen": -3.2312564849853516, "logps/rejected": -3.5881638526916504, "loss": 4.1168, "rewards/accuracies": 0.75, "rewards/chosen": -32.312564849853516, "rewards/margins": 3.5690736770629883, "rewards/rejected": -35.88163757324219, "step": 1908 }, { "epoch": 0.2599400871459695, "grad_norm": 46.09788338834427, "learning_rate": 7.393132468015374e-07, "logits/chosen": 10.449044227600098, "logits/rejected": 12.053888320922852, "logps/chosen": -3.1178231239318848, "logps/rejected": -3.759847402572632, "loss": 4.0764, "rewards/accuracies": 1.0, "rewards/chosen": -31.178232192993164, "rewards/margins": 6.420242786407471, "rewards/rejected": -37.598472595214844, "step": 1909 }, { "epoch": 0.26007625272331153, "grad_norm": 43.25817922403918, "learning_rate": 7.392125210260167e-07, "logits/chosen": 11.751480102539062, "logits/rejected": 11.658409118652344, "logps/chosen": -3.310455322265625, "logps/rejected": -3.3586504459381104, "loss": 4.3708, "rewards/accuracies": 0.5, "rewards/chosen": -33.10455322265625, "rewards/margins": 0.4819493293762207, "rewards/rejected": -33.58650207519531, "step": 1910 }, { "epoch": 0.2602124183006536, "grad_norm": 43.83417454694217, "learning_rate": 7.391117186026229e-07, "logits/chosen": 12.172324180603027, "logits/rejected": 11.791522979736328, "logps/chosen": -3.6511569023132324, "logps/rejected": -3.5525877475738525, "loss": 3.8797, "rewards/accuracies": 0.25, "rewards/chosen": -36.51156997680664, "rewards/margins": -0.9856910705566406, "rewards/rejected": -35.52587890625, "step": 1911 }, { "epoch": 0.26034858387799564, "grad_norm": 48.15194353914132, "learning_rate": 7.390108395541333e-07, "logits/chosen": 11.095311164855957, "logits/rejected": 11.149942398071289, "logps/chosen": -3.4615323543548584, "logps/rejected": -3.5296144485473633, "loss": 4.3597, "rewards/accuracies": 0.5, "rewards/chosen": -34.615325927734375, "rewards/margins": 0.680819034576416, "rewards/rejected": -35.296142578125, "step": 1912 }, { "epoch": 0.26048474945533767, "grad_norm": 44.08237987234142, "learning_rate": 7.38909883903342e-07, "logits/chosen": 10.962881088256836, "logits/rejected": 10.126334190368652, "logps/chosen": -3.288520097732544, "logps/rejected": -3.256730318069458, "loss": 4.288, "rewards/accuracies": 0.5, "rewards/chosen": -32.88520050048828, "rewards/margins": -0.31789731979370117, "rewards/rejected": -32.56730270385742, "step": 1913 }, { "epoch": 0.26062091503267976, "grad_norm": 57.41458303693515, "learning_rate": 7.388088516730611e-07, "logits/chosen": 10.758360862731934, "logits/rejected": 11.934501647949219, "logps/chosen": -3.360098123550415, "logps/rejected": -3.7383334636688232, "loss": 3.8591, "rewards/accuracies": 0.75, "rewards/chosen": -33.600982666015625, "rewards/margins": 3.7823524475097656, "rewards/rejected": -37.38333511352539, "step": 1914 }, { "epoch": 0.2607570806100218, "grad_norm": 46.849941352578924, "learning_rate": 7.387077428861194e-07, "logits/chosen": 10.449636459350586, "logits/rejected": 12.472415924072266, "logps/chosen": -3.441692352294922, "logps/rejected": -3.7251272201538086, "loss": 3.8525, "rewards/accuracies": 1.0, "rewards/chosen": -34.41692352294922, "rewards/margins": 2.834348678588867, "rewards/rejected": -37.25127410888672, "step": 1915 }, { "epoch": 0.2608932461873638, "grad_norm": 50.518730329282945, "learning_rate": 7.386065575653637e-07, "logits/chosen": 11.542362213134766, "logits/rejected": 10.695821762084961, "logps/chosen": -3.5683228969573975, "logps/rejected": -3.4674980640411377, "loss": 3.707, "rewards/accuracies": 0.5, "rewards/chosen": -35.6832275390625, "rewards/margins": -1.008249282836914, "rewards/rejected": -34.67498016357422, "step": 1916 }, { "epoch": 0.2610294117647059, "grad_norm": 41.28747610658732, "learning_rate": 7.385052957336571e-07, "logits/chosen": 10.552292823791504, "logits/rejected": 11.231054306030273, "logps/chosen": -3.0124263763427734, "logps/rejected": -3.266913652420044, "loss": 4.0437, "rewards/accuracies": 1.0, "rewards/chosen": -30.124265670776367, "rewards/margins": 2.5448713302612305, "rewards/rejected": -32.66913604736328, "step": 1917 }, { "epoch": 0.2611655773420479, "grad_norm": 50.512497271908416, "learning_rate": 7.38403957413881e-07, "logits/chosen": 10.748170852661133, "logits/rejected": 11.2970609664917, "logps/chosen": -3.365273952484131, "logps/rejected": -3.481126546859741, "loss": 4.0193, "rewards/accuracies": 0.5, "rewards/chosen": -33.652740478515625, "rewards/margins": 1.158524990081787, "rewards/rejected": -34.8112678527832, "step": 1918 }, { "epoch": 0.26130174291938996, "grad_norm": 79.3475788373819, "learning_rate": 7.383025426289333e-07, "logits/chosen": 10.481672286987305, "logits/rejected": 11.45295524597168, "logps/chosen": -3.260089635848999, "logps/rejected": -3.392517566680908, "loss": 3.7342, "rewards/accuracies": 0.5, "rewards/chosen": -32.60089874267578, "rewards/margins": 1.3242783546447754, "rewards/rejected": -33.92517852783203, "step": 1919 }, { "epoch": 0.26143790849673204, "grad_norm": 39.31558517345677, "learning_rate": 7.382010514017297e-07, "logits/chosen": 10.91908073425293, "logits/rejected": 11.414831161499023, "logps/chosen": -3.3602254390716553, "logps/rejected": -3.4797616004943848, "loss": 3.4308, "rewards/accuracies": 0.5, "rewards/chosen": -33.602256774902344, "rewards/margins": 1.1953601837158203, "rewards/rejected": -34.79761505126953, "step": 1920 }, { "epoch": 0.26157407407407407, "grad_norm": 42.29887331266019, "learning_rate": 7.38099483755203e-07, "logits/chosen": 9.77513313293457, "logits/rejected": 11.083758354187012, "logps/chosen": -3.191680908203125, "logps/rejected": -3.589873790740967, "loss": 3.8782, "rewards/accuracies": 0.75, "rewards/chosen": -31.91680908203125, "rewards/margins": 3.9819283485412598, "rewards/rejected": -35.898738861083984, "step": 1921 }, { "epoch": 0.2617102396514161, "grad_norm": 45.657195901269766, "learning_rate": 7.379978397123031e-07, "logits/chosen": 11.411643981933594, "logits/rejected": 11.06993579864502, "logps/chosen": -3.0733842849731445, "logps/rejected": -3.3916306495666504, "loss": 4.135, "rewards/accuracies": 0.75, "rewards/chosen": -30.733840942382812, "rewards/margins": 3.182464122772217, "rewards/rejected": -33.91630554199219, "step": 1922 }, { "epoch": 0.2618464052287582, "grad_norm": 59.602627684859876, "learning_rate": 7.378961192959975e-07, "logits/chosen": 10.031135559082031, "logits/rejected": 10.262187957763672, "logps/chosen": -3.2309365272521973, "logps/rejected": -3.1941328048706055, "loss": 4.6739, "rewards/accuracies": 0.5, "rewards/chosen": -32.309364318847656, "rewards/margins": -0.36803722381591797, "rewards/rejected": -31.941329956054688, "step": 1923 }, { "epoch": 0.2619825708061002, "grad_norm": 49.47037682979862, "learning_rate": 7.377943225292707e-07, "logits/chosen": 10.791187286376953, "logits/rejected": 10.38812255859375, "logps/chosen": -3.0182504653930664, "logps/rejected": -2.869501829147339, "loss": 4.3346, "rewards/accuracies": 0.25, "rewards/chosen": -30.182504653930664, "rewards/margins": -1.487485408782959, "rewards/rejected": -28.695018768310547, "step": 1924 }, { "epoch": 0.26211873638344224, "grad_norm": 41.20318037311506, "learning_rate": 7.376924494351243e-07, "logits/chosen": 11.21685791015625, "logits/rejected": 10.40408706665039, "logps/chosen": -3.7474989891052246, "logps/rejected": -3.2932517528533936, "loss": 3.9708, "rewards/accuracies": 0.0, "rewards/chosen": -37.47499084472656, "rewards/margins": -4.542474269866943, "rewards/rejected": -32.932518005371094, "step": 1925 }, { "epoch": 0.2622549019607843, "grad_norm": 47.953802308087006, "learning_rate": 7.375905000365777e-07, "logits/chosen": 10.397927284240723, "logits/rejected": 11.736417770385742, "logps/chosen": -2.8819193840026855, "logps/rejected": -3.33274507522583, "loss": 3.5913, "rewards/accuracies": 1.0, "rewards/chosen": -28.819196701049805, "rewards/margins": 4.508256912231445, "rewards/rejected": -33.32745361328125, "step": 1926 }, { "epoch": 0.26239106753812635, "grad_norm": 65.74575553674171, "learning_rate": 7.37488474356667e-07, "logits/chosen": 11.121989250183105, "logits/rejected": 10.800054550170898, "logps/chosen": -3.0992348194122314, "logps/rejected": -3.0546298027038574, "loss": 4.2326, "rewards/accuracies": 0.5, "rewards/chosen": -30.992347717285156, "rewards/margins": -0.44605112075805664, "rewards/rejected": -30.546295166015625, "step": 1927 }, { "epoch": 0.2625272331154684, "grad_norm": 42.51370096211766, "learning_rate": 7.373863724184457e-07, "logits/chosen": 10.091492652893066, "logits/rejected": 10.814645767211914, "logps/chosen": -3.0629115104675293, "logps/rejected": -3.265735149383545, "loss": 4.3971, "rewards/accuracies": 0.5, "rewards/chosen": -30.62911605834961, "rewards/margins": 2.0282373428344727, "rewards/rejected": -32.65735626220703, "step": 1928 }, { "epoch": 0.26266339869281047, "grad_norm": 47.652175127782066, "learning_rate": 7.37284194244985e-07, "logits/chosen": 12.348164558410645, "logits/rejected": 12.383289337158203, "logps/chosen": -3.7135424613952637, "logps/rejected": -3.932204008102417, "loss": 4.357, "rewards/accuracies": 0.75, "rewards/chosen": -37.13542556762695, "rewards/margins": 2.186614990234375, "rewards/rejected": -39.32204055786133, "step": 1929 }, { "epoch": 0.2627995642701525, "grad_norm": 39.390927380257835, "learning_rate": 7.371819398593723e-07, "logits/chosen": 11.432425498962402, "logits/rejected": 11.696893692016602, "logps/chosen": -2.889641284942627, "logps/rejected": -3.334806442260742, "loss": 3.5925, "rewards/accuracies": 0.75, "rewards/chosen": -28.896411895751953, "rewards/margins": 4.451653003692627, "rewards/rejected": -33.34806823730469, "step": 1930 }, { "epoch": 0.2629357298474945, "grad_norm": 50.36411544684795, "learning_rate": 7.370796092847132e-07, "logits/chosen": 10.637269020080566, "logits/rejected": 11.77490520477295, "logps/chosen": -3.244816303253174, "logps/rejected": -3.757547616958618, "loss": 4.0203, "rewards/accuracies": 0.75, "rewards/chosen": -32.44816589355469, "rewards/margins": 5.127310276031494, "rewards/rejected": -37.575477600097656, "step": 1931 }, { "epoch": 0.2630718954248366, "grad_norm": 40.65076950036561, "learning_rate": 7.369772025441301e-07, "logits/chosen": 10.791311264038086, "logits/rejected": 10.512789726257324, "logps/chosen": -2.730264663696289, "logps/rejected": -3.103733539581299, "loss": 4.1219, "rewards/accuracies": 0.75, "rewards/chosen": -27.30264663696289, "rewards/margins": 3.7346887588500977, "rewards/rejected": -31.037336349487305, "step": 1932 }, { "epoch": 0.26320806100217864, "grad_norm": 47.49613158433412, "learning_rate": 7.368747196607626e-07, "logits/chosen": 11.529467582702637, "logits/rejected": 11.966669082641602, "logps/chosen": -3.4695191383361816, "logps/rejected": -3.4863715171813965, "loss": 4.1012, "rewards/accuracies": 0.5, "rewards/chosen": -34.6951904296875, "rewards/margins": 0.16852426528930664, "rewards/rejected": -34.86371612548828, "step": 1933 }, { "epoch": 0.2633442265795207, "grad_norm": 42.12359407273417, "learning_rate": 7.367721606577676e-07, "logits/chosen": 9.723258018493652, "logits/rejected": 10.64831829071045, "logps/chosen": -2.876579761505127, "logps/rejected": -3.069397449493408, "loss": 4.0071, "rewards/accuracies": 0.5, "rewards/chosen": -28.765796661376953, "rewards/margins": 1.9281768798828125, "rewards/rejected": -30.693973541259766, "step": 1934 }, { "epoch": 0.26348039215686275, "grad_norm": 47.43679761046471, "learning_rate": 7.36669525558319e-07, "logits/chosen": 10.699777603149414, "logits/rejected": 10.626397132873535, "logps/chosen": -3.308260679244995, "logps/rejected": -3.1192970275878906, "loss": 3.7784, "rewards/accuracies": 0.25, "rewards/chosen": -33.08260726928711, "rewards/margins": -1.889638900756836, "rewards/rejected": -31.192970275878906, "step": 1935 }, { "epoch": 0.2636165577342048, "grad_norm": 50.31671379022343, "learning_rate": 7.365668143856082e-07, "logits/chosen": 10.224912643432617, "logits/rejected": 10.627740859985352, "logps/chosen": -2.9707720279693604, "logps/rejected": -3.5640764236450195, "loss": 4.0197, "rewards/accuracies": 0.75, "rewards/chosen": -29.707721710205078, "rewards/margins": 5.933043956756592, "rewards/rejected": -35.64076232910156, "step": 1936 }, { "epoch": 0.26375272331154687, "grad_norm": 40.906898814731235, "learning_rate": 7.364640271628437e-07, "logits/chosen": 9.940099716186523, "logits/rejected": 10.243020057678223, "logps/chosen": -2.955732822418213, "logps/rejected": -3.101503372192383, "loss": 4.2898, "rewards/accuracies": 0.75, "rewards/chosen": -29.557327270507812, "rewards/margins": 1.4577054977416992, "rewards/rejected": -31.015033721923828, "step": 1937 }, { "epoch": 0.2638888888888889, "grad_norm": 38.18149989876727, "learning_rate": 7.363611639132509e-07, "logits/chosen": 10.938648223876953, "logits/rejected": 11.03270149230957, "logps/chosen": -2.999992609024048, "logps/rejected": -3.3050217628479004, "loss": 3.872, "rewards/accuracies": 0.75, "rewards/chosen": -29.99992561340332, "rewards/margins": 3.050291061401367, "rewards/rejected": -33.05021667480469, "step": 1938 }, { "epoch": 0.2640250544662309, "grad_norm": 43.372574306247714, "learning_rate": 7.362582246600728e-07, "logits/chosen": 11.151269912719727, "logits/rejected": 11.186691284179688, "logps/chosen": -3.2148685455322266, "logps/rejected": -3.204688310623169, "loss": 3.7792, "rewards/accuracies": 0.5, "rewards/chosen": -32.148685455322266, "rewards/margins": -0.10180091857910156, "rewards/rejected": -32.04688262939453, "step": 1939 }, { "epoch": 0.264161220043573, "grad_norm": 37.48213469854135, "learning_rate": 7.361552094265693e-07, "logits/chosen": 9.75839614868164, "logits/rejected": 10.277175903320312, "logps/chosen": -3.171039342880249, "logps/rejected": -3.044469118118286, "loss": 4.0513, "rewards/accuracies": 0.5, "rewards/chosen": -31.71039581298828, "rewards/margins": -1.2657032012939453, "rewards/rejected": -30.444690704345703, "step": 1940 }, { "epoch": 0.26429738562091504, "grad_norm": 43.85164972834509, "learning_rate": 7.360521182360175e-07, "logits/chosen": 11.120224952697754, "logits/rejected": 11.065742492675781, "logps/chosen": -3.1965413093566895, "logps/rejected": -2.989211082458496, "loss": 4.5607, "rewards/accuracies": 0.25, "rewards/chosen": -31.965415954589844, "rewards/margins": -2.07330322265625, "rewards/rejected": -29.89211082458496, "step": 1941 }, { "epoch": 0.26443355119825707, "grad_norm": 41.615989914010385, "learning_rate": 7.359489511117117e-07, "logits/chosen": 9.7338285446167, "logits/rejected": 10.777101516723633, "logps/chosen": -3.0477170944213867, "logps/rejected": -3.216432809829712, "loss": 3.7824, "rewards/accuracies": 0.75, "rewards/chosen": -30.4771728515625, "rewards/margins": 1.6871566772460938, "rewards/rejected": -32.164329528808594, "step": 1942 }, { "epoch": 0.26456971677559915, "grad_norm": 43.77249883738246, "learning_rate": 7.358457080769634e-07, "logits/chosen": 10.546295166015625, "logits/rejected": 11.248600959777832, "logps/chosen": -3.3284173011779785, "logps/rejected": -3.494819164276123, "loss": 3.4237, "rewards/accuracies": 0.5, "rewards/chosen": -33.28417205810547, "rewards/margins": 1.6640172004699707, "rewards/rejected": -34.94818878173828, "step": 1943 }, { "epoch": 0.2647058823529412, "grad_norm": 41.67223327275648, "learning_rate": 7.357423891551014e-07, "logits/chosen": 10.722221374511719, "logits/rejected": 11.682809829711914, "logps/chosen": -3.141258478164673, "logps/rejected": -3.6027143001556396, "loss": 4.2372, "rewards/accuracies": 0.5, "rewards/chosen": -31.41258430480957, "rewards/margins": 4.614559173583984, "rewards/rejected": -36.02714157104492, "step": 1944 }, { "epoch": 0.2648420479302832, "grad_norm": 38.621783337787285, "learning_rate": 7.356389943694711e-07, "logits/chosen": 11.584735870361328, "logits/rejected": 11.742534637451172, "logps/chosen": -3.0430104732513428, "logps/rejected": -3.196777820587158, "loss": 3.7161, "rewards/accuracies": 0.75, "rewards/chosen": -30.430103302001953, "rewards/margins": 1.5376715660095215, "rewards/rejected": -31.967777252197266, "step": 1945 }, { "epoch": 0.2649782135076253, "grad_norm": 41.063845726581064, "learning_rate": 7.355355237434357e-07, "logits/chosen": 10.25638198852539, "logits/rejected": 10.73379135131836, "logps/chosen": -3.0563535690307617, "logps/rejected": -3.046020746231079, "loss": 4.0035, "rewards/accuracies": 0.25, "rewards/chosen": -30.563535690307617, "rewards/margins": -0.10332822799682617, "rewards/rejected": -30.460206985473633, "step": 1946 }, { "epoch": 0.2651143790849673, "grad_norm": 50.87077428682207, "learning_rate": 7.354319773003752e-07, "logits/chosen": 10.954300880432129, "logits/rejected": 10.753691673278809, "logps/chosen": -3.3007001876831055, "logps/rejected": -3.2416539192199707, "loss": 4.3796, "rewards/accuracies": 0.5, "rewards/chosen": -33.00700378417969, "rewards/margins": -0.5904626846313477, "rewards/rejected": -32.416542053222656, "step": 1947 }, { "epoch": 0.26525054466230935, "grad_norm": 46.707370236380086, "learning_rate": 7.353283550636866e-07, "logits/chosen": 10.465070724487305, "logits/rejected": 11.11390495300293, "logps/chosen": -2.7994070053100586, "logps/rejected": -3.2825169563293457, "loss": 4.8286, "rewards/accuracies": 0.75, "rewards/chosen": -27.994068145751953, "rewards/margins": 4.831099987030029, "rewards/rejected": -32.82516860961914, "step": 1948 }, { "epoch": 0.26538671023965144, "grad_norm": 34.7007277219765, "learning_rate": 7.352246570567844e-07, "logits/chosen": 10.890164375305176, "logits/rejected": 11.858293533325195, "logps/chosen": -3.042649984359741, "logps/rejected": -3.455538749694824, "loss": 3.4142, "rewards/accuracies": 1.0, "rewards/chosen": -30.426498413085938, "rewards/margins": 4.128890037536621, "rewards/rejected": -34.555389404296875, "step": 1949 }, { "epoch": 0.26552287581699346, "grad_norm": 38.247037755592274, "learning_rate": 7.351208833031001e-07, "logits/chosen": 10.271339416503906, "logits/rejected": 10.186382293701172, "logps/chosen": -3.2795486450195312, "logps/rejected": -3.329472303390503, "loss": 3.9037, "rewards/accuracies": 0.75, "rewards/chosen": -32.79548645019531, "rewards/margins": 0.4992365837097168, "rewards/rejected": -33.29472351074219, "step": 1950 }, { "epoch": 0.2656590413943355, "grad_norm": 37.559133816531464, "learning_rate": 7.350170338260817e-07, "logits/chosen": 10.768985748291016, "logits/rejected": 10.597286224365234, "logps/chosen": -3.1299853324890137, "logps/rejected": -3.245513677597046, "loss": 3.8528, "rewards/accuracies": 0.25, "rewards/chosen": -31.299854278564453, "rewards/margins": 1.1552820205688477, "rewards/rejected": -32.455135345458984, "step": 1951 }, { "epoch": 0.2657952069716776, "grad_norm": 39.1633280638947, "learning_rate": 7.349131086491954e-07, "logits/chosen": 10.056303024291992, "logits/rejected": 10.833038330078125, "logps/chosen": -3.261533498764038, "logps/rejected": -3.742305278778076, "loss": 3.8073, "rewards/accuracies": 0.75, "rewards/chosen": -32.615333557128906, "rewards/margins": 4.807720184326172, "rewards/rejected": -37.423057556152344, "step": 1952 }, { "epoch": 0.2659313725490196, "grad_norm": 44.98302981552326, "learning_rate": 7.348091077959239e-07, "logits/chosen": 11.096495628356934, "logits/rejected": 10.412912368774414, "logps/chosen": -3.5636420249938965, "logps/rejected": -3.4254400730133057, "loss": 3.7953, "rewards/accuracies": 0.25, "rewards/chosen": -35.63642120361328, "rewards/margins": -1.3820176124572754, "rewards/rejected": -34.25440216064453, "step": 1953 }, { "epoch": 0.26606753812636164, "grad_norm": 43.20338255148738, "learning_rate": 7.347050312897669e-07, "logits/chosen": 10.44157600402832, "logits/rejected": 11.846715927124023, "logps/chosen": -3.2591614723205566, "logps/rejected": -3.549104928970337, "loss": 4.4519, "rewards/accuracies": 0.75, "rewards/chosen": -32.59161376953125, "rewards/margins": 2.899435520172119, "rewards/rejected": -35.491050720214844, "step": 1954 }, { "epoch": 0.2662037037037037, "grad_norm": 43.43974087694097, "learning_rate": 7.346008791542412e-07, "logits/chosen": 9.964396476745605, "logits/rejected": 11.247949600219727, "logps/chosen": -3.1572980880737305, "logps/rejected": -3.6209664344787598, "loss": 4.1953, "rewards/accuracies": 1.0, "rewards/chosen": -31.572978973388672, "rewards/margins": 4.636682987213135, "rewards/rejected": -36.20966339111328, "step": 1955 }, { "epoch": 0.26633986928104575, "grad_norm": 44.78607670439572, "learning_rate": 7.344966514128813e-07, "logits/chosen": 10.602190971374512, "logits/rejected": 9.894803047180176, "logps/chosen": -3.2937064170837402, "logps/rejected": -3.0664920806884766, "loss": 4.0637, "rewards/accuracies": 0.25, "rewards/chosen": -32.93706512451172, "rewards/margins": -2.272143840789795, "rewards/rejected": -30.664918899536133, "step": 1956 }, { "epoch": 0.2664760348583878, "grad_norm": 41.082096638725254, "learning_rate": 7.343923480892378e-07, "logits/chosen": 10.013287544250488, "logits/rejected": 10.254140853881836, "logps/chosen": -3.026778221130371, "logps/rejected": -2.956954002380371, "loss": 4.543, "rewards/accuracies": 0.25, "rewards/chosen": -30.26778221130371, "rewards/margins": -0.6982412338256836, "rewards/rejected": -29.569541931152344, "step": 1957 }, { "epoch": 0.26661220043572986, "grad_norm": 36.98155477185099, "learning_rate": 7.342879692068793e-07, "logits/chosen": 9.492193222045898, "logits/rejected": 11.287012100219727, "logps/chosen": -3.251889944076538, "logps/rejected": -3.675137519836426, "loss": 3.6723, "rewards/accuracies": 0.75, "rewards/chosen": -32.518898010253906, "rewards/margins": 4.232473850250244, "rewards/rejected": -36.751373291015625, "step": 1958 }, { "epoch": 0.2667483660130719, "grad_norm": 51.14729112669271, "learning_rate": 7.341835147893908e-07, "logits/chosen": 10.372011184692383, "logits/rejected": 10.412449836730957, "logps/chosen": -3.274508476257324, "logps/rejected": -3.3852005004882812, "loss": 4.3573, "rewards/accuracies": 1.0, "rewards/chosen": -32.745086669921875, "rewards/margins": 1.1069202423095703, "rewards/rejected": -33.85200500488281, "step": 1959 }, { "epoch": 0.2668845315904139, "grad_norm": 42.09144260864885, "learning_rate": 7.340789848603748e-07, "logits/chosen": 10.37115478515625, "logits/rejected": 11.192606925964355, "logps/chosen": -3.025813579559326, "logps/rejected": -3.274139165878296, "loss": 3.9953, "rewards/accuracies": 0.75, "rewards/chosen": -30.258136749267578, "rewards/margins": 2.4832544326782227, "rewards/rejected": -32.74139404296875, "step": 1960 }, { "epoch": 0.267020697167756, "grad_norm": 43.46531858056656, "learning_rate": 7.339743794434506e-07, "logits/chosen": 10.049535751342773, "logits/rejected": 10.809403419494629, "logps/chosen": -3.265913486480713, "logps/rejected": -3.458897352218628, "loss": 3.9657, "rewards/accuracies": 0.75, "rewards/chosen": -32.65913391113281, "rewards/margins": 1.9298367500305176, "rewards/rejected": -34.58897399902344, "step": 1961 }, { "epoch": 0.26715686274509803, "grad_norm": 43.02968539204671, "learning_rate": 7.338696985622547e-07, "logits/chosen": 10.372154235839844, "logits/rejected": 10.335376739501953, "logps/chosen": -3.6877808570861816, "logps/rejected": -3.5942509174346924, "loss": 4.5462, "rewards/accuracies": 0.25, "rewards/chosen": -36.8778076171875, "rewards/margins": -0.9352998733520508, "rewards/rejected": -35.942508697509766, "step": 1962 }, { "epoch": 0.26729302832244006, "grad_norm": 38.158936575339624, "learning_rate": 7.337649422404406e-07, "logits/chosen": 9.985786437988281, "logits/rejected": 9.440921783447266, "logps/chosen": -2.940579891204834, "logps/rejected": -2.8997437953948975, "loss": 3.9659, "rewards/accuracies": 0.5, "rewards/chosen": -29.405799865722656, "rewards/margins": -0.40836286544799805, "rewards/rejected": -28.9974365234375, "step": 1963 }, { "epoch": 0.26742919389978215, "grad_norm": 38.46184792493987, "learning_rate": 7.33660110501679e-07, "logits/chosen": 9.95692253112793, "logits/rejected": 10.780373573303223, "logps/chosen": -2.9605960845947266, "logps/rejected": -3.389394760131836, "loss": 3.9352, "rewards/accuracies": 0.75, "rewards/chosen": -29.605958938598633, "rewards/margins": 4.287988662719727, "rewards/rejected": -33.89394760131836, "step": 1964 }, { "epoch": 0.2675653594771242, "grad_norm": 41.01978962823603, "learning_rate": 7.335552033696572e-07, "logits/chosen": 9.80801773071289, "logits/rejected": 10.03887939453125, "logps/chosen": -2.897951602935791, "logps/rejected": -3.1166865825653076, "loss": 4.0347, "rewards/accuracies": 0.5, "rewards/chosen": -28.979516983032227, "rewards/margins": 2.187350273132324, "rewards/rejected": -31.166866302490234, "step": 1965 }, { "epoch": 0.2677015250544662, "grad_norm": 41.19228101193046, "learning_rate": 7.334502208680801e-07, "logits/chosen": 9.944029808044434, "logits/rejected": 10.352792739868164, "logps/chosen": -3.4022669792175293, "logps/rejected": -3.567188262939453, "loss": 4.5208, "rewards/accuracies": 0.5, "rewards/chosen": -34.022666931152344, "rewards/margins": 1.6492114067077637, "rewards/rejected": -35.67188262939453, "step": 1966 }, { "epoch": 0.2678376906318083, "grad_norm": 43.67743399593118, "learning_rate": 7.333451630206692e-07, "logits/chosen": 10.123884201049805, "logits/rejected": 10.311910629272461, "logps/chosen": -3.207327127456665, "logps/rejected": -3.154877185821533, "loss": 3.4013, "rewards/accuracies": 0.5, "rewards/chosen": -32.073272705078125, "rewards/margins": -0.524500846862793, "rewards/rejected": -31.548770904541016, "step": 1967 }, { "epoch": 0.2679738562091503, "grad_norm": 42.80775529792834, "learning_rate": 7.332400298511633e-07, "logits/chosen": 9.393898010253906, "logits/rejected": 10.388338088989258, "logps/chosen": -2.940988063812256, "logps/rejected": -3.1172518730163574, "loss": 3.805, "rewards/accuracies": 0.75, "rewards/chosen": -29.409879684448242, "rewards/margins": 1.762639045715332, "rewards/rejected": -31.17251968383789, "step": 1968 }, { "epoch": 0.26811002178649235, "grad_norm": 42.29471958994095, "learning_rate": 7.33134821383318e-07, "logits/chosen": 9.999351501464844, "logits/rejected": 10.184792518615723, "logps/chosen": -3.1466727256774902, "logps/rejected": -3.0927529335021973, "loss": 4.1349, "rewards/accuracies": 0.25, "rewards/chosen": -31.46672821044922, "rewards/margins": -0.5391983985900879, "rewards/rejected": -30.927528381347656, "step": 1969 }, { "epoch": 0.26824618736383443, "grad_norm": 40.02009664819819, "learning_rate": 7.330295376409061e-07, "logits/chosen": 9.891084671020508, "logits/rejected": 10.21920108795166, "logps/chosen": -2.7133586406707764, "logps/rejected": -3.0122644901275635, "loss": 4.0181, "rewards/accuracies": 0.75, "rewards/chosen": -27.133586883544922, "rewards/margins": 2.9890589714050293, "rewards/rejected": -30.122644424438477, "step": 1970 }, { "epoch": 0.26838235294117646, "grad_norm": 41.765960477940574, "learning_rate": 7.329241786477175e-07, "logits/chosen": 9.635854721069336, "logits/rejected": 10.248570442199707, "logps/chosen": -3.104513645172119, "logps/rejected": -3.236632823944092, "loss": 4.2485, "rewards/accuracies": 0.5, "rewards/chosen": -31.045135498046875, "rewards/margins": 1.321192741394043, "rewards/rejected": -32.36632537841797, "step": 1971 }, { "epoch": 0.26851851851851855, "grad_norm": 42.16353137146665, "learning_rate": 7.328187444275586e-07, "logits/chosen": 10.547447204589844, "logits/rejected": 10.413936614990234, "logps/chosen": -2.854233503341675, "logps/rejected": -2.9968297481536865, "loss": 3.9225, "rewards/accuracies": 0.75, "rewards/chosen": -28.542335510253906, "rewards/margins": 1.4259629249572754, "rewards/rejected": -29.96829605102539, "step": 1972 }, { "epoch": 0.2686546840958606, "grad_norm": 40.906873099177, "learning_rate": 7.327132350042533e-07, "logits/chosen": 10.017626762390137, "logits/rejected": 10.13817024230957, "logps/chosen": -2.821767807006836, "logps/rejected": -3.1142501831054688, "loss": 4.0892, "rewards/accuracies": 0.75, "rewards/chosen": -28.21767807006836, "rewards/margins": 2.9248247146606445, "rewards/rejected": -31.142501831054688, "step": 1973 }, { "epoch": 0.2687908496732026, "grad_norm": 44.124107170926095, "learning_rate": 7.326076504016424e-07, "logits/chosen": 10.600744247436523, "logits/rejected": 10.145637512207031, "logps/chosen": -3.467470645904541, "logps/rejected": -3.312404155731201, "loss": 4.334, "rewards/accuracies": 0.5, "rewards/chosen": -34.674705505371094, "rewards/margins": -1.5506653785705566, "rewards/rejected": -33.12403869628906, "step": 1974 }, { "epoch": 0.2689270152505447, "grad_norm": 40.901229292635534, "learning_rate": 7.325019906435834e-07, "logits/chosen": 10.026725769042969, "logits/rejected": 9.416793823242188, "logps/chosen": -2.792693614959717, "logps/rejected": -3.1521518230438232, "loss": 3.621, "rewards/accuracies": 1.0, "rewards/chosen": -27.92693519592285, "rewards/margins": 3.5945849418640137, "rewards/rejected": -31.52151870727539, "step": 1975 }, { "epoch": 0.2690631808278867, "grad_norm": 46.02377344450744, "learning_rate": 7.323962557539512e-07, "logits/chosen": 8.80463981628418, "logits/rejected": 7.651312828063965, "logps/chosen": -2.84019136428833, "logps/rejected": -2.6687774658203125, "loss": 4.3367, "rewards/accuracies": 0.5, "rewards/chosen": -28.401914596557617, "rewards/margins": -1.7141413688659668, "rewards/rejected": -26.687774658203125, "step": 1976 }, { "epoch": 0.26919934640522875, "grad_norm": 47.85216449590051, "learning_rate": 7.322904457566373e-07, "logits/chosen": 10.047639846801758, "logits/rejected": 10.698040008544922, "logps/chosen": -3.2090654373168945, "logps/rejected": -3.3748326301574707, "loss": 4.8521, "rewards/accuracies": 0.75, "rewards/chosen": -32.09065246582031, "rewards/margins": 1.6576743125915527, "rewards/rejected": -33.74832534790039, "step": 1977 }, { "epoch": 0.26933551198257083, "grad_norm": 53.02472775014781, "learning_rate": 7.321845606755506e-07, "logits/chosen": 10.5790433883667, "logits/rejected": 10.442056655883789, "logps/chosen": -3.0924363136291504, "logps/rejected": -2.9189035892486572, "loss": 3.7517, "rewards/accuracies": 0.25, "rewards/chosen": -30.92436408996582, "rewards/margins": -1.7353267669677734, "rewards/rejected": -29.189037322998047, "step": 1978 }, { "epoch": 0.26947167755991286, "grad_norm": 47.707322754258634, "learning_rate": 7.320786005346164e-07, "logits/chosen": 10.06892204284668, "logits/rejected": 10.566646575927734, "logps/chosen": -2.744755268096924, "logps/rejected": -3.127293586730957, "loss": 4.4313, "rewards/accuracies": 0.5, "rewards/chosen": -27.44754981994629, "rewards/margins": 3.8253865242004395, "rewards/rejected": -31.272937774658203, "step": 1979 }, { "epoch": 0.2696078431372549, "grad_norm": 41.97831548112182, "learning_rate": 7.319725653577776e-07, "logits/chosen": 10.910099029541016, "logits/rejected": 10.800838470458984, "logps/chosen": -3.4657413959503174, "logps/rejected": -2.988886833190918, "loss": 4.2965, "rewards/accuracies": 0.25, "rewards/chosen": -34.657413482666016, "rewards/margins": -4.7685441970825195, "rewards/rejected": -29.888870239257812, "step": 1980 }, { "epoch": 0.269744008714597, "grad_norm": 41.97241255247379, "learning_rate": 7.318664551689935e-07, "logits/chosen": 9.43173599243164, "logits/rejected": 9.008756637573242, "logps/chosen": -3.149214744567871, "logps/rejected": -3.1362791061401367, "loss": 3.7492, "rewards/accuracies": 0.5, "rewards/chosen": -31.492149353027344, "rewards/margins": -0.12935876846313477, "rewards/rejected": -31.362789154052734, "step": 1981 }, { "epoch": 0.269880174291939, "grad_norm": 42.84278716570164, "learning_rate": 7.317602699922404e-07, "logits/chosen": 10.618846893310547, "logits/rejected": 9.749662399291992, "logps/chosen": -3.6707544326782227, "logps/rejected": -3.212108850479126, "loss": 4.1979, "rewards/accuracies": 0.0, "rewards/chosen": -36.707542419433594, "rewards/margins": -4.586457252502441, "rewards/rejected": -32.12108612060547, "step": 1982 }, { "epoch": 0.27001633986928103, "grad_norm": 43.558978038864915, "learning_rate": 7.316540098515122e-07, "logits/chosen": 10.410881042480469, "logits/rejected": 11.085712432861328, "logps/chosen": -3.195931911468506, "logps/rejected": -3.3598289489746094, "loss": 3.6946, "rewards/accuracies": 0.75, "rewards/chosen": -31.959320068359375, "rewards/margins": 1.6389689445495605, "rewards/rejected": -33.598289489746094, "step": 1983 }, { "epoch": 0.2701525054466231, "grad_norm": 41.5029670452536, "learning_rate": 7.315476747708189e-07, "logits/chosen": 10.514701843261719, "logits/rejected": 9.665594100952148, "logps/chosen": -3.3144001960754395, "logps/rejected": -2.940643787384033, "loss": 4.4704, "rewards/accuracies": 0.25, "rewards/chosen": -33.144004821777344, "rewards/margins": -3.7375640869140625, "rewards/rejected": -29.406436920166016, "step": 1984 }, { "epoch": 0.27028867102396514, "grad_norm": 42.74109658047353, "learning_rate": 7.314412647741879e-07, "logits/chosen": 9.5400390625, "logits/rejected": 10.488121032714844, "logps/chosen": -3.158114433288574, "logps/rejected": -3.3209304809570312, "loss": 4.0939, "rewards/accuracies": 0.5, "rewards/chosen": -31.58114242553711, "rewards/margins": 1.6281628608703613, "rewards/rejected": -33.20930480957031, "step": 1985 }, { "epoch": 0.2704248366013072, "grad_norm": 53.61162314978346, "learning_rate": 7.313347798856632e-07, "logits/chosen": 9.810964584350586, "logits/rejected": 9.343202590942383, "logps/chosen": -3.06648850440979, "logps/rejected": -3.3682308197021484, "loss": 4.2991, "rewards/accuracies": 0.75, "rewards/chosen": -30.664884567260742, "rewards/margins": 3.017423152923584, "rewards/rejected": -33.682308197021484, "step": 1986 }, { "epoch": 0.27056100217864926, "grad_norm": 36.235881141184905, "learning_rate": 7.312282201293063e-07, "logits/chosen": 9.853435516357422, "logits/rejected": 10.52936840057373, "logps/chosen": -2.8395605087280273, "logps/rejected": -3.3459951877593994, "loss": 3.5795, "rewards/accuracies": 1.0, "rewards/chosen": -28.39560317993164, "rewards/margins": 5.064347743988037, "rewards/rejected": -33.45995330810547, "step": 1987 }, { "epoch": 0.2706971677559913, "grad_norm": 39.57804610509507, "learning_rate": 7.311215855291952e-07, "logits/chosen": 10.01500129699707, "logits/rejected": 9.848960876464844, "logps/chosen": -3.2995517253875732, "logps/rejected": -3.5093560218811035, "loss": 3.3344, "rewards/accuracies": 0.75, "rewards/chosen": -32.99551773071289, "rewards/margins": 2.0980429649353027, "rewards/rejected": -35.09355926513672, "step": 1988 }, { "epoch": 0.2708333333333333, "grad_norm": 42.569307638889434, "learning_rate": 7.310148761094246e-07, "logits/chosen": 10.143556594848633, "logits/rejected": 9.783519744873047, "logps/chosen": -3.1694388389587402, "logps/rejected": -3.2506957054138184, "loss": 4.2954, "rewards/accuracies": 0.75, "rewards/chosen": -31.69438934326172, "rewards/margins": 0.8125691413879395, "rewards/rejected": -32.5069580078125, "step": 1989 }, { "epoch": 0.2709694989106754, "grad_norm": 40.875820588285734, "learning_rate": 7.309080918941068e-07, "logits/chosen": 9.625185012817383, "logits/rejected": 10.340717315673828, "logps/chosen": -3.5030574798583984, "logps/rejected": -3.4265642166137695, "loss": 4.2894, "rewards/accuracies": 0.5, "rewards/chosen": -35.030574798583984, "rewards/margins": -0.7649321556091309, "rewards/rejected": -34.26564025878906, "step": 1990 }, { "epoch": 0.27110566448801743, "grad_norm": 57.90210849370428, "learning_rate": 7.308012329073701e-07, "logits/chosen": 9.319253921508789, "logits/rejected": 10.094758987426758, "logps/chosen": -3.033557653427124, "logps/rejected": -3.2773966789245605, "loss": 3.9943, "rewards/accuracies": 0.75, "rewards/chosen": -30.335575103759766, "rewards/margins": 2.438389778137207, "rewards/rejected": -32.77396774291992, "step": 1991 }, { "epoch": 0.27124183006535946, "grad_norm": 44.83085045217553, "learning_rate": 7.306942991733605e-07, "logits/chosen": 10.353662490844727, "logits/rejected": 10.547569274902344, "logps/chosen": -3.3918616771698, "logps/rejected": -3.575209379196167, "loss": 4.4926, "rewards/accuracies": 0.5, "rewards/chosen": -33.918617248535156, "rewards/margins": 1.833475112915039, "rewards/rejected": -35.75209426879883, "step": 1992 }, { "epoch": 0.27137799564270154, "grad_norm": 40.50023653907384, "learning_rate": 7.305872907162405e-07, "logits/chosen": 9.724394798278809, "logits/rejected": 10.98190689086914, "logps/chosen": -3.345672845840454, "logps/rejected": -3.693354845046997, "loss": 4.1612, "rewards/accuracies": 1.0, "rewards/chosen": -33.456729888916016, "rewards/margins": 3.476820468902588, "rewards/rejected": -36.93354797363281, "step": 1993 }, { "epoch": 0.27151416122004357, "grad_norm": 45.62147889975053, "learning_rate": 7.304802075601893e-07, "logits/chosen": 9.62606430053711, "logits/rejected": 10.181051254272461, "logps/chosen": -3.176384925842285, "logps/rejected": -3.503230333328247, "loss": 3.9026, "rewards/accuracies": 0.75, "rewards/chosen": -31.76384735107422, "rewards/margins": 3.268455982208252, "rewards/rejected": -35.03230285644531, "step": 1994 }, { "epoch": 0.2716503267973856, "grad_norm": 49.34156127908102, "learning_rate": 7.303730497294035e-07, "logits/chosen": 10.756474494934082, "logits/rejected": 10.595354080200195, "logps/chosen": -3.181816577911377, "logps/rejected": -3.1570794582366943, "loss": 3.8481, "rewards/accuracies": 0.5, "rewards/chosen": -31.818164825439453, "rewards/margins": -0.24737119674682617, "rewards/rejected": -31.57079315185547, "step": 1995 }, { "epoch": 0.2717864923747277, "grad_norm": 47.372169679420224, "learning_rate": 7.302658172480963e-07, "logits/chosen": 10.552287101745605, "logits/rejected": 10.441638946533203, "logps/chosen": -3.153855323791504, "logps/rejected": -3.675349235534668, "loss": 4.4635, "rewards/accuracies": 1.0, "rewards/chosen": -31.538551330566406, "rewards/margins": 5.214939594268799, "rewards/rejected": -36.75349044799805, "step": 1996 }, { "epoch": 0.2719226579520697, "grad_norm": 44.800069424007916, "learning_rate": 7.301585101404976e-07, "logits/chosen": 9.657844543457031, "logits/rejected": 11.270419120788574, "logps/chosen": -3.1465752124786377, "logps/rejected": -3.567129135131836, "loss": 3.7298, "rewards/accuracies": 1.0, "rewards/chosen": -31.46575164794922, "rewards/margins": 4.205540657043457, "rewards/rejected": -35.67129135131836, "step": 1997 }, { "epoch": 0.27205882352941174, "grad_norm": 38.3998270123879, "learning_rate": 7.300511284308545e-07, "logits/chosen": 10.640653610229492, "logits/rejected": 10.181758880615234, "logps/chosen": -3.5720889568328857, "logps/rejected": -3.320509433746338, "loss": 3.6801, "rewards/accuracies": 0.25, "rewards/chosen": -35.720890045166016, "rewards/margins": -2.5157947540283203, "rewards/rejected": -33.20509338378906, "step": 1998 }, { "epoch": 0.2721949891067538, "grad_norm": 46.682657763122926, "learning_rate": 7.299436721434305e-07, "logits/chosen": 9.994207382202148, "logits/rejected": 9.958724021911621, "logps/chosen": -3.2730519771575928, "logps/rejected": -3.4148778915405273, "loss": 3.9468, "rewards/accuracies": 0.75, "rewards/chosen": -32.73051834106445, "rewards/margins": 1.4182629585266113, "rewards/rejected": -34.148780822753906, "step": 1999 }, { "epoch": 0.27233115468409586, "grad_norm": 45.85139065773554, "learning_rate": 7.298361413025068e-07, "logits/chosen": 10.021673202514648, "logits/rejected": 9.847940444946289, "logps/chosen": -3.09476375579834, "logps/rejected": -3.352832794189453, "loss": 3.7473, "rewards/accuracies": 0.75, "rewards/chosen": -30.947635650634766, "rewards/margins": 2.580691337585449, "rewards/rejected": -33.52832794189453, "step": 2000 }, { "epoch": 0.2724673202614379, "grad_norm": 39.83416747208019, "learning_rate": 7.297285359323802e-07, "logits/chosen": 9.596455574035645, "logits/rejected": 10.099464416503906, "logps/chosen": -2.9542179107666016, "logps/rejected": -3.210066318511963, "loss": 3.6283, "rewards/accuracies": 0.75, "rewards/chosen": -29.542179107666016, "rewards/margins": 2.5584845542907715, "rewards/rejected": -32.10066223144531, "step": 2001 }, { "epoch": 0.27260348583877997, "grad_norm": 50.52452349632165, "learning_rate": 7.296208560573654e-07, "logits/chosen": 10.431659698486328, "logits/rejected": 10.21342658996582, "logps/chosen": -3.4793825149536133, "logps/rejected": -3.4201819896698, "loss": 3.7601, "rewards/accuracies": 0.5, "rewards/chosen": -34.7938232421875, "rewards/margins": -0.5920038223266602, "rewards/rejected": -34.201820373535156, "step": 2002 }, { "epoch": 0.272739651416122, "grad_norm": 41.78227093712794, "learning_rate": 7.295131017017936e-07, "logits/chosen": 9.384137153625488, "logits/rejected": 11.109783172607422, "logps/chosen": -2.8871817588806152, "logps/rejected": -3.2562050819396973, "loss": 3.4745, "rewards/accuracies": 0.75, "rewards/chosen": -28.87181854248047, "rewards/margins": 3.690230369567871, "rewards/rejected": -32.562049865722656, "step": 2003 }, { "epoch": 0.272875816993464, "grad_norm": 43.12066445305443, "learning_rate": 7.294052728900126e-07, "logits/chosen": 9.780315399169922, "logits/rejected": 8.994919776916504, "logps/chosen": -3.3728890419006348, "logps/rejected": -3.0195231437683105, "loss": 3.6502, "rewards/accuracies": 0.25, "rewards/chosen": -33.72888946533203, "rewards/margins": -3.533660411834717, "rewards/rejected": -30.195232391357422, "step": 2004 }, { "epoch": 0.2730119825708061, "grad_norm": 45.74974116533449, "learning_rate": 7.292973696463875e-07, "logits/chosen": 9.443635940551758, "logits/rejected": 9.208436965942383, "logps/chosen": -2.7407658100128174, "logps/rejected": -2.726888656616211, "loss": 4.4727, "rewards/accuracies": 0.25, "rewards/chosen": -27.407657623291016, "rewards/margins": -0.13877296447753906, "rewards/rejected": -27.26888656616211, "step": 2005 }, { "epoch": 0.27314814814814814, "grad_norm": 55.445755542388845, "learning_rate": 7.291893919952995e-07, "logits/chosen": 9.138827323913574, "logits/rejected": 10.167364120483398, "logps/chosen": -2.7411270141601562, "logps/rejected": -3.284613609313965, "loss": 4.5916, "rewards/accuracies": 1.0, "rewards/chosen": -27.411270141601562, "rewards/margins": 5.434866428375244, "rewards/rejected": -32.846134185791016, "step": 2006 }, { "epoch": 0.27328431372549017, "grad_norm": 42.92315368762588, "learning_rate": 7.290813399611475e-07, "logits/chosen": 8.91054916381836, "logits/rejected": 9.453010559082031, "logps/chosen": -3.07243013381958, "logps/rejected": -3.0113744735717773, "loss": 4.1607, "rewards/accuracies": 0.5, "rewards/chosen": -30.724300384521484, "rewards/margins": -0.6105542182922363, "rewards/rejected": -30.113746643066406, "step": 2007 }, { "epoch": 0.27342047930283225, "grad_norm": 47.622157835041406, "learning_rate": 7.289732135683462e-07, "logits/chosen": 9.505614280700684, "logits/rejected": 10.12661361694336, "logps/chosen": -3.172787666320801, "logps/rejected": -3.2572457790374756, "loss": 4.639, "rewards/accuracies": 0.5, "rewards/chosen": -31.72787857055664, "rewards/margins": 0.8445792198181152, "rewards/rejected": -32.57245635986328, "step": 2008 }, { "epoch": 0.2735566448801743, "grad_norm": 41.132817427537944, "learning_rate": 7.288650128413282e-07, "logits/chosen": 9.128654479980469, "logits/rejected": 9.442363739013672, "logps/chosen": -3.0355076789855957, "logps/rejected": -3.1576149463653564, "loss": 4.3894, "rewards/accuracies": 0.75, "rewards/chosen": -30.35507583618164, "rewards/margins": 1.2210736274719238, "rewards/rejected": -31.576148986816406, "step": 2009 }, { "epoch": 0.27369281045751637, "grad_norm": 41.76110862077498, "learning_rate": 7.287567378045421e-07, "logits/chosen": 9.333142280578613, "logits/rejected": 8.890130996704102, "logps/chosen": -3.1826422214508057, "logps/rejected": -3.1868581771850586, "loss": 3.7783, "rewards/accuracies": 0.5, "rewards/chosen": -31.82642364501953, "rewards/margins": 0.04215860366821289, "rewards/rejected": -31.86858367919922, "step": 2010 }, { "epoch": 0.2738289760348584, "grad_norm": 40.54413921753005, "learning_rate": 7.286483884824534e-07, "logits/chosen": 8.798290252685547, "logits/rejected": 8.768571853637695, "logps/chosen": -2.9602131843566895, "logps/rejected": -2.9777190685272217, "loss": 4.006, "rewards/accuracies": 0.5, "rewards/chosen": -29.602130889892578, "rewards/margins": 0.17505979537963867, "rewards/rejected": -29.777191162109375, "step": 2011 }, { "epoch": 0.2739651416122004, "grad_norm": 108.9123598164272, "learning_rate": 7.285399648995449e-07, "logits/chosen": 9.889120101928711, "logits/rejected": 10.547415733337402, "logps/chosen": -3.1997594833374023, "logps/rejected": -3.3309736251831055, "loss": 5.8754, "rewards/accuracies": 0.5, "rewards/chosen": -31.99759292602539, "rewards/margins": 1.312143325805664, "rewards/rejected": -33.30973815917969, "step": 2012 }, { "epoch": 0.2741013071895425, "grad_norm": 43.85815458683306, "learning_rate": 7.284314670803156e-07, "logits/chosen": 9.352710723876953, "logits/rejected": 8.940980911254883, "logps/chosen": -2.962510108947754, "logps/rejected": -3.0065760612487793, "loss": 4.2354, "rewards/accuracies": 0.75, "rewards/chosen": -29.625099182128906, "rewards/margins": 0.4406599998474121, "rewards/rejected": -30.065759658813477, "step": 2013 }, { "epoch": 0.27423747276688454, "grad_norm": 43.9568877795987, "learning_rate": 7.283228950492812e-07, "logits/chosen": 10.29849624633789, "logits/rejected": 9.278623580932617, "logps/chosen": -3.2450668811798096, "logps/rejected": -3.049617052078247, "loss": 4.0751, "rewards/accuracies": 0.25, "rewards/chosen": -32.45066833496094, "rewards/margins": -1.954498291015625, "rewards/rejected": -30.496170043945312, "step": 2014 }, { "epoch": 0.27437363834422657, "grad_norm": 43.36546190526251, "learning_rate": 7.28214248830975e-07, "logits/chosen": 9.026542663574219, "logits/rejected": 9.845423698425293, "logps/chosen": -3.0337114334106445, "logps/rejected": -3.1838254928588867, "loss": 4.5398, "rewards/accuracies": 0.75, "rewards/chosen": -30.337114334106445, "rewards/margins": 1.5011415481567383, "rewards/rejected": -31.8382568359375, "step": 2015 }, { "epoch": 0.27450980392156865, "grad_norm": 39.44153287869997, "learning_rate": 7.28105528449946e-07, "logits/chosen": 10.067562103271484, "logits/rejected": 10.419604301452637, "logps/chosen": -2.936244010925293, "logps/rejected": -3.5217602252960205, "loss": 3.0338, "rewards/accuracies": 0.75, "rewards/chosen": -29.362438201904297, "rewards/margins": 5.855162143707275, "rewards/rejected": -35.21760177612305, "step": 2016 }, { "epoch": 0.2746459694989107, "grad_norm": 44.166197741295775, "learning_rate": 7.279967339307608e-07, "logits/chosen": 10.153070449829102, "logits/rejected": 10.688526153564453, "logps/chosen": -3.1774892807006836, "logps/rejected": -3.528541326522827, "loss": 3.7591, "rewards/accuracies": 0.75, "rewards/chosen": -31.77489471435547, "rewards/margins": 3.5105209350585938, "rewards/rejected": -35.28541564941406, "step": 2017 }, { "epoch": 0.2747821350762527, "grad_norm": 39.475507124322924, "learning_rate": 7.278878652980024e-07, "logits/chosen": 8.487432479858398, "logits/rejected": 9.453228950500488, "logps/chosen": -2.6727280616760254, "logps/rejected": -3.2610983848571777, "loss": 3.8267, "rewards/accuracies": 0.75, "rewards/chosen": -26.72728157043457, "rewards/margins": 5.883702754974365, "rewards/rejected": -32.610984802246094, "step": 2018 }, { "epoch": 0.2749183006535948, "grad_norm": 40.25987288676104, "learning_rate": 7.277789225762704e-07, "logits/chosen": 9.330527305603027, "logits/rejected": 11.014518737792969, "logps/chosen": -2.7719473838806152, "logps/rejected": -3.2359619140625, "loss": 4.1805, "rewards/accuracies": 0.75, "rewards/chosen": -27.71947479248047, "rewards/margins": 4.640145301818848, "rewards/rejected": -32.359619140625, "step": 2019 }, { "epoch": 0.2750544662309368, "grad_norm": 42.83946053019282, "learning_rate": 7.276699057901815e-07, "logits/chosen": 10.596317291259766, "logits/rejected": 9.884830474853516, "logps/chosen": -3.392362594604492, "logps/rejected": -3.4253177642822266, "loss": 3.6566, "rewards/accuracies": 0.5, "rewards/chosen": -33.92362594604492, "rewards/margins": 0.32954931259155273, "rewards/rejected": -34.253173828125, "step": 2020 }, { "epoch": 0.27519063180827885, "grad_norm": 43.931226786051596, "learning_rate": 7.275608149643687e-07, "logits/chosen": 9.62728500366211, "logits/rejected": 10.02025318145752, "logps/chosen": -3.001437187194824, "logps/rejected": -3.0108284950256348, "loss": 4.0335, "rewards/accuracies": 0.5, "rewards/chosen": -30.014373779296875, "rewards/margins": 0.09391212463378906, "rewards/rejected": -30.10828399658203, "step": 2021 }, { "epoch": 0.27532679738562094, "grad_norm": 48.655092460163466, "learning_rate": 7.274516501234822e-07, "logits/chosen": 9.756704330444336, "logits/rejected": 10.151304244995117, "logps/chosen": -3.067408323287964, "logps/rejected": -3.2243850231170654, "loss": 4.4667, "rewards/accuracies": 0.5, "rewards/chosen": -30.674083709716797, "rewards/margins": 1.5697646141052246, "rewards/rejected": -32.24384689331055, "step": 2022 }, { "epoch": 0.27546296296296297, "grad_norm": 39.597419013868034, "learning_rate": 7.273424112921887e-07, "logits/chosen": 9.598923683166504, "logits/rejected": 10.096939086914062, "logps/chosen": -3.2184996604919434, "logps/rejected": -3.473783016204834, "loss": 3.6387, "rewards/accuracies": 1.0, "rewards/chosen": -32.18499755859375, "rewards/margins": 2.552835464477539, "rewards/rejected": -34.737831115722656, "step": 2023 }, { "epoch": 0.275599128540305, "grad_norm": 40.72920958236727, "learning_rate": 7.272330984951714e-07, "logits/chosen": 9.882648468017578, "logits/rejected": 9.892444610595703, "logps/chosen": -3.136953830718994, "logps/rejected": -3.3359484672546387, "loss": 3.8552, "rewards/accuracies": 0.75, "rewards/chosen": -31.369537353515625, "rewards/margins": 1.9899468421936035, "rewards/rejected": -33.35948181152344, "step": 2024 }, { "epoch": 0.2757352941176471, "grad_norm": 53.9959894771226, "learning_rate": 7.271237117571306e-07, "logits/chosen": 9.806317329406738, "logits/rejected": 11.406789779663086, "logps/chosen": -3.317953586578369, "logps/rejected": -3.8449771404266357, "loss": 3.877, "rewards/accuracies": 1.0, "rewards/chosen": -33.179534912109375, "rewards/margins": 5.270237922668457, "rewards/rejected": -38.449771881103516, "step": 2025 }, { "epoch": 0.2758714596949891, "grad_norm": 40.37227282266723, "learning_rate": 7.27014251102783e-07, "logits/chosen": 10.843324661254883, "logits/rejected": 11.134241104125977, "logps/chosen": -3.4940309524536133, "logps/rejected": -3.889085292816162, "loss": 4.0636, "rewards/accuracies": 0.75, "rewards/chosen": -34.9403076171875, "rewards/margins": 3.950542449951172, "rewards/rejected": -38.89085388183594, "step": 2026 }, { "epoch": 0.27600762527233114, "grad_norm": 46.95831076243495, "learning_rate": 7.269047165568623e-07, "logits/chosen": 11.133188247680664, "logits/rejected": 10.752498626708984, "logps/chosen": -3.1652369499206543, "logps/rejected": -3.1863291263580322, "loss": 4.2039, "rewards/accuracies": 0.25, "rewards/chosen": -31.652368545532227, "rewards/margins": 0.2109236717224121, "rewards/rejected": -31.863290786743164, "step": 2027 }, { "epoch": 0.2761437908496732, "grad_norm": 43.26982663066297, "learning_rate": 7.267951081441188e-07, "logits/chosen": 10.198108673095703, "logits/rejected": 10.212335586547852, "logps/chosen": -3.3900094032287598, "logps/rejected": -3.5685956478118896, "loss": 4.4873, "rewards/accuracies": 0.75, "rewards/chosen": -33.90009307861328, "rewards/margins": 1.7858614921569824, "rewards/rejected": -35.68595504760742, "step": 2028 }, { "epoch": 0.27627995642701525, "grad_norm": 44.72921075521773, "learning_rate": 7.266854258893191e-07, "logits/chosen": 10.013330459594727, "logits/rejected": 9.920618057250977, "logps/chosen": -2.985029935836792, "logps/rejected": -3.07511568069458, "loss": 3.9704, "rewards/accuracies": 0.75, "rewards/chosen": -29.850299835205078, "rewards/margins": 0.900855541229248, "rewards/rejected": -30.751155853271484, "step": 2029 }, { "epoch": 0.2764161220043573, "grad_norm": 42.615353862296416, "learning_rate": 7.26575669817247e-07, "logits/chosen": 10.708846092224121, "logits/rejected": 9.886032104492188, "logps/chosen": -3.3533897399902344, "logps/rejected": -3.2181601524353027, "loss": 4.0206, "rewards/accuracies": 0.5, "rewards/chosen": -33.533897399902344, "rewards/margins": -1.3522944450378418, "rewards/rejected": -32.181602478027344, "step": 2030 }, { "epoch": 0.27655228758169936, "grad_norm": 52.54123294013655, "learning_rate": 7.264658399527031e-07, "logits/chosen": 10.606027603149414, "logits/rejected": 10.80879020690918, "logps/chosen": -3.4310078620910645, "logps/rejected": -3.2521018981933594, "loss": 4.2826, "rewards/accuracies": 0.0, "rewards/chosen": -34.31007766723633, "rewards/margins": -1.7890605926513672, "rewards/rejected": -32.521018981933594, "step": 2031 }, { "epoch": 0.2766884531590414, "grad_norm": 44.72010057667563, "learning_rate": 7.263559363205038e-07, "logits/chosen": 10.007386207580566, "logits/rejected": 10.14357852935791, "logps/chosen": -2.7042219638824463, "logps/rejected": -2.8308346271514893, "loss": 3.866, "rewards/accuracies": 0.75, "rewards/chosen": -27.042217254638672, "rewards/margins": 1.266127109527588, "rewards/rejected": -28.308345794677734, "step": 2032 }, { "epoch": 0.2768246187363834, "grad_norm": 43.35877614078326, "learning_rate": 7.26245958945483e-07, "logits/chosen": 11.07912826538086, "logits/rejected": 11.249441146850586, "logps/chosen": -3.5325207710266113, "logps/rejected": -3.573923349380493, "loss": 4.1982, "rewards/accuracies": 0.75, "rewards/chosen": -35.3252067565918, "rewards/margins": 0.41402769088745117, "rewards/rejected": -35.73923110961914, "step": 2033 }, { "epoch": 0.2769607843137255, "grad_norm": 44.223291482559844, "learning_rate": 7.261359078524912e-07, "logits/chosen": 10.903989791870117, "logits/rejected": 11.222539901733398, "logps/chosen": -3.3834495544433594, "logps/rejected": -3.637399196624756, "loss": 3.9528, "rewards/accuracies": 1.0, "rewards/chosen": -33.834495544433594, "rewards/margins": 2.539496898651123, "rewards/rejected": -36.373992919921875, "step": 2034 }, { "epoch": 0.27709694989106753, "grad_norm": 38.84555506632122, "learning_rate": 7.260257830663949e-07, "logits/chosen": 10.563400268554688, "logits/rejected": 11.647253036499023, "logps/chosen": -3.069068431854248, "logps/rejected": -3.6733505725860596, "loss": 4.071, "rewards/accuracies": 0.75, "rewards/chosen": -30.690683364868164, "rewards/margins": 6.04282283782959, "rewards/rejected": -36.73350524902344, "step": 2035 }, { "epoch": 0.27723311546840956, "grad_norm": 41.68099239806937, "learning_rate": 7.259155846120781e-07, "logits/chosen": 10.531320571899414, "logits/rejected": 11.344558715820312, "logps/chosen": -3.065237283706665, "logps/rejected": -3.2924506664276123, "loss": 3.9197, "rewards/accuracies": 0.75, "rewards/chosen": -30.652374267578125, "rewards/margins": 2.2721333503723145, "rewards/rejected": -32.92450714111328, "step": 2036 }, { "epoch": 0.27736928104575165, "grad_norm": 43.827814416341255, "learning_rate": 7.258053125144409e-07, "logits/chosen": 11.33468246459961, "logits/rejected": 11.329795837402344, "logps/chosen": -3.1353676319122314, "logps/rejected": -3.351585865020752, "loss": 3.6984, "rewards/accuracies": 1.0, "rewards/chosen": -31.353675842285156, "rewards/margins": 2.162181854248047, "rewards/rejected": -33.5158576965332, "step": 2037 }, { "epoch": 0.2775054466230937, "grad_norm": 44.18499904221448, "learning_rate": 7.256949667984003e-07, "logits/chosen": 10.647211074829102, "logits/rejected": 10.673377990722656, "logps/chosen": -3.359405517578125, "logps/rejected": -3.451932907104492, "loss": 3.9797, "rewards/accuracies": 0.5, "rewards/chosen": -33.59405517578125, "rewards/margins": 0.9252748489379883, "rewards/rejected": -34.51933288574219, "step": 2038 }, { "epoch": 0.2776416122004357, "grad_norm": 46.09152487272223, "learning_rate": 7.255845474888895e-07, "logits/chosen": 11.285388946533203, "logits/rejected": 10.565520286560059, "logps/chosen": -3.7757177352905273, "logps/rejected": -3.8049440383911133, "loss": 4.1203, "rewards/accuracies": 0.5, "rewards/chosen": -37.757179260253906, "rewards/margins": 0.29226255416870117, "rewards/rejected": -38.0494384765625, "step": 2039 }, { "epoch": 0.2777777777777778, "grad_norm": 42.87735720473763, "learning_rate": 7.254740546108591e-07, "logits/chosen": 10.082635879516602, "logits/rejected": 10.568843841552734, "logps/chosen": -3.069612979888916, "logps/rejected": -3.2404725551605225, "loss": 3.7765, "rewards/accuracies": 0.75, "rewards/chosen": -30.696128845214844, "rewards/margins": 1.708597183227539, "rewards/rejected": -32.40472412109375, "step": 2040 }, { "epoch": 0.2779139433551198, "grad_norm": 43.65810330455431, "learning_rate": 7.253634881892755e-07, "logits/chosen": 9.532072067260742, "logits/rejected": 10.428838729858398, "logps/chosen": -2.930309295654297, "logps/rejected": -3.2453527450561523, "loss": 4.0317, "rewards/accuracies": 1.0, "rewards/chosen": -29.303089141845703, "rewards/margins": 3.150437355041504, "rewards/rejected": -32.453529357910156, "step": 2041 }, { "epoch": 0.27805010893246185, "grad_norm": 40.648481372746886, "learning_rate": 7.252528482491224e-07, "logits/chosen": 11.042488098144531, "logits/rejected": 11.430194854736328, "logps/chosen": -3.3751068115234375, "logps/rejected": -3.521754503250122, "loss": 3.3774, "rewards/accuracies": 0.75, "rewards/chosen": -33.751068115234375, "rewards/margins": 1.4664793014526367, "rewards/rejected": -35.21754455566406, "step": 2042 }, { "epoch": 0.27818627450980393, "grad_norm": 43.03298904416847, "learning_rate": 7.251421348153996e-07, "logits/chosen": 10.185897827148438, "logits/rejected": 11.39175796508789, "logps/chosen": -3.2424440383911133, "logps/rejected": -3.665432929992676, "loss": 3.3439, "rewards/accuracies": 1.0, "rewards/chosen": -32.4244384765625, "rewards/margins": 4.229889392852783, "rewards/rejected": -36.65433120727539, "step": 2043 }, { "epoch": 0.27832244008714596, "grad_norm": 40.795214504018176, "learning_rate": 7.250313479131238e-07, "logits/chosen": 10.797327041625977, "logits/rejected": 10.993000030517578, "logps/chosen": -3.4697728157043457, "logps/rejected": -3.6637330055236816, "loss": 3.8642, "rewards/accuracies": 0.75, "rewards/chosen": -34.69772720336914, "rewards/margins": 1.9395990371704102, "rewards/rejected": -36.6373291015625, "step": 2044 }, { "epoch": 0.278458605664488, "grad_norm": 48.092208742106685, "learning_rate": 7.249204875673282e-07, "logits/chosen": 10.237092971801758, "logits/rejected": 10.696450233459473, "logps/chosen": -3.1441545486450195, "logps/rejected": -3.2451565265655518, "loss": 4.0832, "rewards/accuracies": 0.5, "rewards/chosen": -31.441547393798828, "rewards/margins": 1.0100183486938477, "rewards/rejected": -32.45156478881836, "step": 2045 }, { "epoch": 0.2785947712418301, "grad_norm": 47.72794509530352, "learning_rate": 7.248095538030626e-07, "logits/chosen": 11.654447555541992, "logits/rejected": 11.14938735961914, "logps/chosen": -3.3125650882720947, "logps/rejected": -3.516573190689087, "loss": 4.5033, "rewards/accuracies": 0.5, "rewards/chosen": -33.125648498535156, "rewards/margins": 2.0400829315185547, "rewards/rejected": -35.165733337402344, "step": 2046 }, { "epoch": 0.2787309368191721, "grad_norm": 43.31836190060848, "learning_rate": 7.246985466453934e-07, "logits/chosen": 10.918994903564453, "logits/rejected": 10.6776704788208, "logps/chosen": -3.8264307975769043, "logps/rejected": -3.6622416973114014, "loss": 3.9974, "rewards/accuracies": 0.25, "rewards/chosen": -38.264305114746094, "rewards/margins": -1.6418890953063965, "rewards/rejected": -36.62241744995117, "step": 2047 }, { "epoch": 0.2788671023965142, "grad_norm": 64.13160138579072, "learning_rate": 7.245874661194037e-07, "logits/chosen": 9.913036346435547, "logits/rejected": 11.095425605773926, "logps/chosen": -3.0639243125915527, "logps/rejected": -3.0694520473480225, "loss": 3.71, "rewards/accuracies": 0.25, "rewards/chosen": -30.63924217224121, "rewards/margins": 0.055278778076171875, "rewards/rejected": -30.694520950317383, "step": 2048 }, { "epoch": 0.2790032679738562, "grad_norm": 44.70539125661694, "learning_rate": 7.244763122501928e-07, "logits/chosen": 10.263595581054688, "logits/rejected": 11.14377212524414, "logps/chosen": -3.0248541831970215, "logps/rejected": -3.4215331077575684, "loss": 3.9224, "rewards/accuracies": 0.75, "rewards/chosen": -30.2485408782959, "rewards/margins": 3.9667911529541016, "rewards/rejected": -34.21533203125, "step": 2049 }, { "epoch": 0.27913943355119825, "grad_norm": 39.65162881534594, "learning_rate": 7.243650850628771e-07, "logits/chosen": 11.040898323059082, "logits/rejected": 11.072880744934082, "logps/chosen": -3.488393783569336, "logps/rejected": -3.320924758911133, "loss": 3.8698, "rewards/accuracies": 0.5, "rewards/chosen": -34.883934020996094, "rewards/margins": -1.6746916770935059, "rewards/rejected": -33.20924377441406, "step": 2050 }, { "epoch": 0.27927559912854033, "grad_norm": 41.35697757926682, "learning_rate": 7.242537845825891e-07, "logits/chosen": 11.360201835632324, "logits/rejected": 10.955767631530762, "logps/chosen": -3.775747060775757, "logps/rejected": -3.4014408588409424, "loss": 3.9838, "rewards/accuracies": 0.0, "rewards/chosen": -37.757469177246094, "rewards/margins": -3.7430615425109863, "rewards/rejected": -34.014408111572266, "step": 2051 }, { "epoch": 0.27941176470588236, "grad_norm": 42.34035904154086, "learning_rate": 7.241424108344784e-07, "logits/chosen": 10.938819885253906, "logits/rejected": 11.510564804077148, "logps/chosen": -3.7927322387695312, "logps/rejected": -3.9044029712677, "loss": 4.2046, "rewards/accuracies": 0.5, "rewards/chosen": -37.92732238769531, "rewards/margins": 1.1167078018188477, "rewards/rejected": -39.044029235839844, "step": 2052 }, { "epoch": 0.2795479302832244, "grad_norm": 43.831990850874924, "learning_rate": 7.240309638437104e-07, "logits/chosen": 11.25056266784668, "logits/rejected": 11.709178924560547, "logps/chosen": -3.6821649074554443, "logps/rejected": -3.9168150424957275, "loss": 4.4921, "rewards/accuracies": 0.25, "rewards/chosen": -36.82164764404297, "rewards/margins": 2.3465027809143066, "rewards/rejected": -39.16815185546875, "step": 2053 }, { "epoch": 0.2796840958605665, "grad_norm": 63.006848304818114, "learning_rate": 7.239194436354677e-07, "logits/chosen": 10.791830062866211, "logits/rejected": 10.642576217651367, "logps/chosen": -3.4422426223754883, "logps/rejected": -3.5224661827087402, "loss": 4.4123, "rewards/accuracies": 0.75, "rewards/chosen": -34.42242431640625, "rewards/margins": 0.8022360801696777, "rewards/rejected": -35.22466278076172, "step": 2054 }, { "epoch": 0.2798202614379085, "grad_norm": 46.85779112453062, "learning_rate": 7.238078502349491e-07, "logits/chosen": 10.619102478027344, "logits/rejected": 10.90992546081543, "logps/chosen": -3.5404610633850098, "logps/rejected": -3.6300036907196045, "loss": 4.2863, "rewards/accuracies": 0.5, "rewards/chosen": -35.40461349487305, "rewards/margins": 0.8954238891601562, "rewards/rejected": -36.30003356933594, "step": 2055 }, { "epoch": 0.27995642701525053, "grad_norm": 38.21149547858686, "learning_rate": 7.236961836673701e-07, "logits/chosen": 12.160784721374512, "logits/rejected": 12.376384735107422, "logps/chosen": -3.46551251411438, "logps/rejected": -3.256913185119629, "loss": 3.9344, "rewards/accuracies": 0.5, "rewards/chosen": -34.65512466430664, "rewards/margins": -2.0859928131103516, "rewards/rejected": -32.56913375854492, "step": 2056 }, { "epoch": 0.2800925925925926, "grad_norm": 36.8738825913263, "learning_rate": 7.23584443957963e-07, "logits/chosen": 11.309317588806152, "logits/rejected": 11.756097793579102, "logps/chosen": -3.606645107269287, "logps/rejected": -3.6639397144317627, "loss": 3.9222, "rewards/accuracies": 0.75, "rewards/chosen": -36.06645202636719, "rewards/margins": 0.5729451179504395, "rewards/rejected": -36.63939666748047, "step": 2057 }, { "epoch": 0.28022875816993464, "grad_norm": 45.78217432287904, "learning_rate": 7.234726311319757e-07, "logits/chosen": 10.859420776367188, "logits/rejected": 11.622247695922852, "logps/chosen": -3.471132755279541, "logps/rejected": -3.6204347610473633, "loss": 4.6063, "rewards/accuracies": 0.5, "rewards/chosen": -34.711326599121094, "rewards/margins": 1.4930219650268555, "rewards/rejected": -36.204345703125, "step": 2058 }, { "epoch": 0.2803649237472767, "grad_norm": 40.030104909830094, "learning_rate": 7.233607452146737e-07, "logits/chosen": 11.535882949829102, "logits/rejected": 11.377315521240234, "logps/chosen": -3.2947959899902344, "logps/rejected": -3.389193296432495, "loss": 4.2838, "rewards/accuracies": 0.75, "rewards/chosen": -32.947959899902344, "rewards/margins": 0.9439711570739746, "rewards/rejected": -33.89193344116211, "step": 2059 }, { "epoch": 0.28050108932461876, "grad_norm": 40.454803387375854, "learning_rate": 7.232487862313382e-07, "logits/chosen": 10.908284187316895, "logits/rejected": 11.737518310546875, "logps/chosen": -3.4614951610565186, "logps/rejected": -3.751293420791626, "loss": 4.1347, "rewards/accuracies": 0.75, "rewards/chosen": -34.614952087402344, "rewards/margins": 2.897982120513916, "rewards/rejected": -37.51293182373047, "step": 2060 }, { "epoch": 0.2806372549019608, "grad_norm": 42.23527891366299, "learning_rate": 7.231367542072677e-07, "logits/chosen": 11.410090446472168, "logits/rejected": 12.0116605758667, "logps/chosen": -3.5251569747924805, "logps/rejected": -3.7813339233398438, "loss": 3.7033, "rewards/accuracies": 0.5, "rewards/chosen": -35.25157165527344, "rewards/margins": 2.5617685317993164, "rewards/rejected": -37.81333923339844, "step": 2061 }, { "epoch": 0.2807734204793028, "grad_norm": 38.47141117999346, "learning_rate": 7.230246491677762e-07, "logits/chosen": 11.27967357635498, "logits/rejected": 11.569711685180664, "logps/chosen": -3.1758649349212646, "logps/rejected": -3.5062146186828613, "loss": 3.8283, "rewards/accuracies": 0.5, "rewards/chosen": -31.758649826049805, "rewards/margins": 3.3034963607788086, "rewards/rejected": -35.06214904785156, "step": 2062 }, { "epoch": 0.2809095860566449, "grad_norm": 41.74986404019126, "learning_rate": 7.229124711381952e-07, "logits/chosen": 11.507131576538086, "logits/rejected": 11.546791076660156, "logps/chosen": -3.7352683544158936, "logps/rejected": -3.8507232666015625, "loss": 3.8947, "rewards/accuracies": 0.75, "rewards/chosen": -37.352684020996094, "rewards/margins": 1.1545495986938477, "rewards/rejected": -38.507232666015625, "step": 2063 }, { "epoch": 0.28104575163398693, "grad_norm": 42.69888362168256, "learning_rate": 7.228002201438723e-07, "logits/chosen": 10.485315322875977, "logits/rejected": 10.822240829467773, "logps/chosen": -3.4331459999084473, "logps/rejected": -3.550240993499756, "loss": 4.0251, "rewards/accuracies": 0.5, "rewards/chosen": -34.331459045410156, "rewards/margins": 1.1709489822387695, "rewards/rejected": -35.502410888671875, "step": 2064 }, { "epoch": 0.28118191721132896, "grad_norm": 41.22916406854048, "learning_rate": 7.226878962101712e-07, "logits/chosen": 12.140020370483398, "logits/rejected": 12.021973609924316, "logps/chosen": -3.651979446411133, "logps/rejected": -3.8021738529205322, "loss": 4.2456, "rewards/accuracies": 1.0, "rewards/chosen": -36.51979446411133, "rewards/margins": 1.5019440650939941, "rewards/rejected": -38.02173614501953, "step": 2065 }, { "epoch": 0.28131808278867104, "grad_norm": 43.128441224957456, "learning_rate": 7.225754993624727e-07, "logits/chosen": 10.92729377746582, "logits/rejected": 11.257198333740234, "logps/chosen": -3.1881017684936523, "logps/rejected": -3.439635992050171, "loss": 3.9232, "rewards/accuracies": 0.75, "rewards/chosen": -31.881017684936523, "rewards/margins": 2.5153441429138184, "rewards/rejected": -34.3963623046875, "step": 2066 }, { "epoch": 0.28145424836601307, "grad_norm": 41.81216671383537, "learning_rate": 7.224630296261736e-07, "logits/chosen": 10.18753433227539, "logits/rejected": 10.58694839477539, "logps/chosen": -3.4881980419158936, "logps/rejected": -3.4878311157226562, "loss": 4.4011, "rewards/accuracies": 0.25, "rewards/chosen": -34.881980895996094, "rewards/margins": -0.0036683082580566406, "rewards/rejected": -34.87831115722656, "step": 2067 }, { "epoch": 0.2815904139433551, "grad_norm": 39.760917513067746, "learning_rate": 7.223504870266875e-07, "logits/chosen": 11.478717803955078, "logits/rejected": 10.935314178466797, "logps/chosen": -3.398975372314453, "logps/rejected": -3.103243827819824, "loss": 4.4325, "rewards/accuracies": 0.0, "rewards/chosen": -33.98975372314453, "rewards/margins": -2.9573159217834473, "rewards/rejected": -31.032438278198242, "step": 2068 }, { "epoch": 0.2817265795206972, "grad_norm": 36.11653067629148, "learning_rate": 7.222378715894442e-07, "logits/chosen": 11.318563461303711, "logits/rejected": 11.335214614868164, "logps/chosen": -3.2800981998443604, "logps/rejected": -3.446213483810425, "loss": 3.844, "rewards/accuracies": 0.5, "rewards/chosen": -32.80098342895508, "rewards/margins": 1.6611523628234863, "rewards/rejected": -34.462135314941406, "step": 2069 }, { "epoch": 0.2818627450980392, "grad_norm": 43.73520962359488, "learning_rate": 7.221251833398902e-07, "logits/chosen": 10.158218383789062, "logits/rejected": 10.153473854064941, "logps/chosen": -3.171928644180298, "logps/rejected": -3.1954360008239746, "loss": 4.0562, "rewards/accuracies": 0.5, "rewards/chosen": -31.71928596496582, "rewards/margins": 0.2350764274597168, "rewards/rejected": -31.954362869262695, "step": 2070 }, { "epoch": 0.28199891067538124, "grad_norm": 49.77702576902295, "learning_rate": 7.220124223034883e-07, "logits/chosen": 10.531118392944336, "logits/rejected": 10.467278480529785, "logps/chosen": -3.4100613594055176, "logps/rejected": -3.3228750228881836, "loss": 4.0385, "rewards/accuracies": 0.25, "rewards/chosen": -34.100616455078125, "rewards/margins": -0.8718662261962891, "rewards/rejected": -33.22874450683594, "step": 2071 }, { "epoch": 0.2821350762527233, "grad_norm": 42.56255382471536, "learning_rate": 7.218995885057179e-07, "logits/chosen": 11.340509414672852, "logits/rejected": 11.544965744018555, "logps/chosen": -3.3726274967193604, "logps/rejected": -3.6362013816833496, "loss": 4.483, "rewards/accuracies": 0.5, "rewards/chosen": -33.72627639770508, "rewards/margins": 2.635740280151367, "rewards/rejected": -36.36201477050781, "step": 2072 }, { "epoch": 0.28227124183006536, "grad_norm": 47.702992560008184, "learning_rate": 7.217866819720745e-07, "logits/chosen": 10.977622985839844, "logits/rejected": 11.728052139282227, "logps/chosen": -3.3248956203460693, "logps/rejected": -3.384880304336548, "loss": 4.1796, "rewards/accuracies": 0.75, "rewards/chosen": -33.24895477294922, "rewards/margins": 0.5998477935791016, "rewards/rejected": -33.84880447387695, "step": 2073 }, { "epoch": 0.2824074074074074, "grad_norm": 41.72364171518313, "learning_rate": 7.216737027280704e-07, "logits/chosen": 11.509664535522461, "logits/rejected": 11.68753433227539, "logps/chosen": -3.5565450191497803, "logps/rejected": -3.2268686294555664, "loss": 4.2106, "rewards/accuracies": 0.0, "rewards/chosen": -35.565452575683594, "rewards/margins": -3.2967653274536133, "rewards/rejected": -32.26868438720703, "step": 2074 }, { "epoch": 0.28254357298474947, "grad_norm": 43.243242239445536, "learning_rate": 7.215606507992342e-07, "logits/chosen": 11.61751651763916, "logits/rejected": 11.850065231323242, "logps/chosen": -3.4514713287353516, "logps/rejected": -3.525350570678711, "loss": 4.6176, "rewards/accuracies": 0.75, "rewards/chosen": -34.51470947265625, "rewards/margins": 0.7387943267822266, "rewards/rejected": -35.25350570678711, "step": 2075 }, { "epoch": 0.2826797385620915, "grad_norm": 35.59473522971418, "learning_rate": 7.214475262111109e-07, "logits/chosen": 11.027950286865234, "logits/rejected": 11.485614776611328, "logps/chosen": -3.2616934776306152, "logps/rejected": -3.4573216438293457, "loss": 4.1878, "rewards/accuracies": 0.5, "rewards/chosen": -32.61693572998047, "rewards/margins": 1.9562835693359375, "rewards/rejected": -34.573219299316406, "step": 2076 }, { "epoch": 0.2828159041394335, "grad_norm": 42.15509285921336, "learning_rate": 7.21334328989262e-07, "logits/chosen": 10.157630920410156, "logits/rejected": 10.895065307617188, "logps/chosen": -3.0982320308685303, "logps/rejected": -3.3522417545318604, "loss": 4.3594, "rewards/accuracies": 0.75, "rewards/chosen": -30.982318878173828, "rewards/margins": 2.540097236633301, "rewards/rejected": -33.52241516113281, "step": 2077 }, { "epoch": 0.2829520697167756, "grad_norm": 38.684733352950204, "learning_rate": 7.212210591592653e-07, "logits/chosen": 11.015716552734375, "logits/rejected": 11.75433349609375, "logps/chosen": -3.150559186935425, "logps/rejected": -3.403111457824707, "loss": 3.7411, "rewards/accuracies": 0.5, "rewards/chosen": -31.505592346191406, "rewards/margins": 2.525524139404297, "rewards/rejected": -34.03111267089844, "step": 2078 }, { "epoch": 0.28308823529411764, "grad_norm": 84.1125318767281, "learning_rate": 7.21107716746715e-07, "logits/chosen": 9.980108261108398, "logits/rejected": 10.961986541748047, "logps/chosen": -2.6217188835144043, "logps/rejected": -2.9578607082366943, "loss": 3.714, "rewards/accuracies": 0.75, "rewards/chosen": -26.21718978881836, "rewards/margins": 3.361417770385742, "rewards/rejected": -29.5786075592041, "step": 2079 }, { "epoch": 0.28322440087145967, "grad_norm": 45.41070543128515, "learning_rate": 7.209943017772218e-07, "logits/chosen": 11.486701965332031, "logits/rejected": 11.784791946411133, "logps/chosen": -3.4684886932373047, "logps/rejected": -3.6421618461608887, "loss": 4.343, "rewards/accuracies": 0.5, "rewards/chosen": -34.68488311767578, "rewards/margins": 1.7367339134216309, "rewards/rejected": -36.42162322998047, "step": 2080 }, { "epoch": 0.28336056644880175, "grad_norm": 40.22866665759287, "learning_rate": 7.208808142764128e-07, "logits/chosen": 9.949079513549805, "logits/rejected": 10.173770904541016, "logps/chosen": -2.757162570953369, "logps/rejected": -3.088344097137451, "loss": 4.3074, "rewards/accuracies": 0.75, "rewards/chosen": -27.571624755859375, "rewards/margins": 3.3118138313293457, "rewards/rejected": -30.883438110351562, "step": 2081 }, { "epoch": 0.2834967320261438, "grad_norm": 37.01275620186824, "learning_rate": 7.207672542699314e-07, "logits/chosen": 10.69143295288086, "logits/rejected": 10.80512809753418, "logps/chosen": -2.914109706878662, "logps/rejected": -3.2736971378326416, "loss": 3.9794, "rewards/accuracies": 0.75, "rewards/chosen": -29.141096115112305, "rewards/margins": 3.595874786376953, "rewards/rejected": -32.736968994140625, "step": 2082 }, { "epoch": 0.2836328976034858, "grad_norm": 40.214881075608794, "learning_rate": 7.206536217834372e-07, "logits/chosen": 11.35748291015625, "logits/rejected": 11.109901428222656, "logps/chosen": -3.356778621673584, "logps/rejected": -3.428896903991699, "loss": 4.2631, "rewards/accuracies": 0.5, "rewards/chosen": -33.56778335571289, "rewards/margins": 0.7211847305297852, "rewards/rejected": -34.288970947265625, "step": 2083 }, { "epoch": 0.2837690631808279, "grad_norm": 39.61788934517965, "learning_rate": 7.205399168426069e-07, "logits/chosen": 11.35329818725586, "logits/rejected": 10.604606628417969, "logps/chosen": -3.292066812515259, "logps/rejected": -3.1343843936920166, "loss": 4.4104, "rewards/accuracies": 0.25, "rewards/chosen": -32.92066955566406, "rewards/margins": -1.576826572418213, "rewards/rejected": -31.343843460083008, "step": 2084 }, { "epoch": 0.2839052287581699, "grad_norm": 41.09121045132113, "learning_rate": 7.204261394731326e-07, "logits/chosen": 10.447331428527832, "logits/rejected": 11.129108428955078, "logps/chosen": -3.092533588409424, "logps/rejected": -3.213778495788574, "loss": 4.525, "rewards/accuracies": 0.75, "rewards/chosen": -30.925338745117188, "rewards/margins": 1.2124457359313965, "rewards/rejected": -32.13778305053711, "step": 2085 }, { "epoch": 0.284041394335512, "grad_norm": 39.87013145440644, "learning_rate": 7.203122897007234e-07, "logits/chosen": 8.959901809692383, "logits/rejected": 10.661760330200195, "logps/chosen": -2.936089277267456, "logps/rejected": -3.240321636199951, "loss": 3.4136, "rewards/accuracies": 0.75, "rewards/chosen": -29.36089324951172, "rewards/margins": 3.0423240661621094, "rewards/rejected": -32.40321350097656, "step": 2086 }, { "epoch": 0.28417755991285404, "grad_norm": 41.7644547190528, "learning_rate": 7.201983675511046e-07, "logits/chosen": 9.583335876464844, "logits/rejected": 10.465907096862793, "logps/chosen": -2.915891408920288, "logps/rejected": -3.333545207977295, "loss": 4.3685, "rewards/accuracies": 1.0, "rewards/chosen": -29.158912658691406, "rewards/margins": 4.1765360832214355, "rewards/rejected": -33.33544921875, "step": 2087 }, { "epoch": 0.28431372549019607, "grad_norm": 38.98811784877334, "learning_rate": 7.20084373050018e-07, "logits/chosen": 10.950359344482422, "logits/rejected": 11.049334526062012, "logps/chosen": -3.091425895690918, "logps/rejected": -3.306710720062256, "loss": 4.4892, "rewards/accuracies": 0.75, "rewards/chosen": -30.914260864257812, "rewards/margins": 2.1528449058532715, "rewards/rejected": -33.06710433959961, "step": 2088 }, { "epoch": 0.28444989106753815, "grad_norm": 38.00164718094527, "learning_rate": 7.199703062232214e-07, "logits/chosen": 11.127623558044434, "logits/rejected": 11.479201316833496, "logps/chosen": -3.266111373901367, "logps/rejected": -3.4982175827026367, "loss": 3.7317, "rewards/accuracies": 0.75, "rewards/chosen": -32.66111373901367, "rewards/margins": 2.3210606575012207, "rewards/rejected": -34.982173919677734, "step": 2089 }, { "epoch": 0.2845860566448802, "grad_norm": 44.01253281487247, "learning_rate": 7.198561670964892e-07, "logits/chosen": 10.158564567565918, "logits/rejected": 10.047782897949219, "logps/chosen": -3.154510021209717, "logps/rejected": -3.174996852874756, "loss": 3.9554, "rewards/accuracies": 0.5, "rewards/chosen": -31.545101165771484, "rewards/margins": 0.20486783981323242, "rewards/rejected": -31.749967575073242, "step": 2090 }, { "epoch": 0.2847222222222222, "grad_norm": 37.8614274925566, "learning_rate": 7.19741955695612e-07, "logits/chosen": 9.263012886047363, "logits/rejected": 11.136116027832031, "logps/chosen": -3.1601452827453613, "logps/rejected": -3.584956645965576, "loss": 4.2156, "rewards/accuracies": 0.75, "rewards/chosen": -31.601451873779297, "rewards/margins": 4.24811315536499, "rewards/rejected": -35.84956359863281, "step": 2091 }, { "epoch": 0.2848583877995643, "grad_norm": 41.37135497889003, "learning_rate": 7.19627672046397e-07, "logits/chosen": 10.22445011138916, "logits/rejected": 10.743232727050781, "logps/chosen": -3.3629395961761475, "logps/rejected": -3.4945836067199707, "loss": 3.6726, "rewards/accuracies": 0.75, "rewards/chosen": -33.62939453125, "rewards/margins": 1.3164420127868652, "rewards/rejected": -34.945838928222656, "step": 2092 }, { "epoch": 0.2849945533769063, "grad_norm": 55.46139488799118, "learning_rate": 7.195133161746675e-07, "logits/chosen": 11.121204376220703, "logits/rejected": 11.313770294189453, "logps/chosen": -3.0754148960113525, "logps/rejected": -3.3104257583618164, "loss": 3.6578, "rewards/accuracies": 0.5, "rewards/chosen": -30.754146575927734, "rewards/margins": 2.3501100540161133, "rewards/rejected": -33.1042594909668, "step": 2093 }, { "epoch": 0.28513071895424835, "grad_norm": 42.04777927279191, "learning_rate": 7.19398888106263e-07, "logits/chosen": 10.838562965393066, "logits/rejected": 10.615730285644531, "logps/chosen": -2.9619991779327393, "logps/rejected": -2.692878484725952, "loss": 4.4133, "rewards/accuracies": 0.25, "rewards/chosen": -29.619991302490234, "rewards/margins": -2.691206455230713, "rewards/rejected": -26.92878532409668, "step": 2094 }, { "epoch": 0.28526688453159044, "grad_norm": 38.71363714302691, "learning_rate": 7.192843878670396e-07, "logits/chosen": 9.932821273803711, "logits/rejected": 10.287668228149414, "logps/chosen": -3.0690650939941406, "logps/rejected": -3.3321266174316406, "loss": 3.9968, "rewards/accuracies": 0.75, "rewards/chosen": -30.690650939941406, "rewards/margins": 2.6306142807006836, "rewards/rejected": -33.321266174316406, "step": 2095 }, { "epoch": 0.28540305010893247, "grad_norm": 40.3485751662438, "learning_rate": 7.191698154828694e-07, "logits/chosen": 10.858542442321777, "logits/rejected": 10.747417449951172, "logps/chosen": -3.129497528076172, "logps/rejected": -3.2045812606811523, "loss": 4.4869, "rewards/accuracies": 0.75, "rewards/chosen": -31.29497528076172, "rewards/margins": 0.7508344650268555, "rewards/rejected": -32.04581069946289, "step": 2096 }, { "epoch": 0.2855392156862745, "grad_norm": 37.02660215524474, "learning_rate": 7.190551709796413e-07, "logits/chosen": 11.277715682983398, "logits/rejected": 11.34707260131836, "logps/chosen": -3.0359466075897217, "logps/rejected": -3.2978858947753906, "loss": 3.8583, "rewards/accuracies": 0.75, "rewards/chosen": -30.359466552734375, "rewards/margins": 2.6193947792053223, "rewards/rejected": -32.978858947753906, "step": 2097 }, { "epoch": 0.2856753812636166, "grad_norm": 38.286305775000606, "learning_rate": 7.189404543832598e-07, "logits/chosen": 10.76581859588623, "logits/rejected": 11.180744171142578, "logps/chosen": -3.030181646347046, "logps/rejected": -3.5013532638549805, "loss": 3.9758, "rewards/accuracies": 1.0, "rewards/chosen": -30.301816940307617, "rewards/margins": 4.711716175079346, "rewards/rejected": -35.01353454589844, "step": 2098 }, { "epoch": 0.2858115468409586, "grad_norm": 37.198231110039785, "learning_rate": 7.188256657196463e-07, "logits/chosen": 10.747861862182617, "logits/rejected": 11.038681030273438, "logps/chosen": -3.3747196197509766, "logps/rejected": -3.6243395805358887, "loss": 4.0509, "rewards/accuracies": 1.0, "rewards/chosen": -33.747196197509766, "rewards/margins": 2.496199131011963, "rewards/rejected": -36.2433967590332, "step": 2099 }, { "epoch": 0.28594771241830064, "grad_norm": 39.095120708300726, "learning_rate": 7.187108050147382e-07, "logits/chosen": 10.834617614746094, "logits/rejected": 11.38058090209961, "logps/chosen": -3.6775412559509277, "logps/rejected": -3.651283025741577, "loss": 4.6058, "rewards/accuracies": 0.25, "rewards/chosen": -36.775413513183594, "rewards/margins": -0.26258230209350586, "rewards/rejected": -36.5128288269043, "step": 2100 }, { "epoch": 0.2860838779956427, "grad_norm": 43.01045759407692, "learning_rate": 7.185958722944893e-07, "logits/chosen": 11.19924545288086, "logits/rejected": 11.593623161315918, "logps/chosen": -3.715104579925537, "logps/rejected": -3.886352062225342, "loss": 4.2792, "rewards/accuracies": 0.75, "rewards/chosen": -37.15104675292969, "rewards/margins": 1.7124738693237305, "rewards/rejected": -38.86351776123047, "step": 2101 }, { "epoch": 0.28622004357298475, "grad_norm": 37.51390411675116, "learning_rate": 7.184808675848693e-07, "logits/chosen": 10.592370986938477, "logits/rejected": 11.150842666625977, "logps/chosen": -3.3166329860687256, "logps/rejected": -3.6063878536224365, "loss": 3.9894, "rewards/accuracies": 0.75, "rewards/chosen": -33.16632843017578, "rewards/margins": 2.897547721862793, "rewards/rejected": -36.063880920410156, "step": 2102 }, { "epoch": 0.2863562091503268, "grad_norm": 42.05424430051542, "learning_rate": 7.183657909118648e-07, "logits/chosen": 10.795780181884766, "logits/rejected": 11.429191589355469, "logps/chosen": -3.013545036315918, "logps/rejected": -3.254915714263916, "loss": 3.617, "rewards/accuracies": 0.75, "rewards/chosen": -30.13545036315918, "rewards/margins": 2.4137043952941895, "rewards/rejected": -32.549156188964844, "step": 2103 }, { "epoch": 0.28649237472766886, "grad_norm": 42.876225143935656, "learning_rate": 7.182506423014784e-07, "logits/chosen": 11.13083267211914, "logits/rejected": 10.966611862182617, "logps/chosen": -3.106478691101074, "logps/rejected": -3.2434492111206055, "loss": 3.7588, "rewards/accuracies": 0.75, "rewards/chosen": -31.064786911010742, "rewards/margins": 1.369706153869629, "rewards/rejected": -32.43449401855469, "step": 2104 }, { "epoch": 0.2866285403050109, "grad_norm": 39.052518977818195, "learning_rate": 7.181354217797285e-07, "logits/chosen": 10.553728103637695, "logits/rejected": 10.495622634887695, "logps/chosen": -2.947722911834717, "logps/rejected": -3.1726205348968506, "loss": 3.8119, "rewards/accuracies": 0.75, "rewards/chosen": -29.477230072021484, "rewards/margins": 2.2489757537841797, "rewards/rejected": -31.726205825805664, "step": 2105 }, { "epoch": 0.2867647058823529, "grad_norm": 43.042811061077735, "learning_rate": 7.180201293726503e-07, "logits/chosen": 11.576311111450195, "logits/rejected": 11.397565841674805, "logps/chosen": -3.277323007583618, "logps/rejected": -3.668376922607422, "loss": 4.4409, "rewards/accuracies": 0.75, "rewards/chosen": -32.773231506347656, "rewards/margins": 3.9105420112609863, "rewards/rejected": -36.68376922607422, "step": 2106 }, { "epoch": 0.286900871459695, "grad_norm": 43.84220110338753, "learning_rate": 7.179047651062951e-07, "logits/chosen": 11.108848571777344, "logits/rejected": 10.345226287841797, "logps/chosen": -3.220594882965088, "logps/rejected": -2.934520721435547, "loss": 4.8051, "rewards/accuracies": 0.25, "rewards/chosen": -32.20594787597656, "rewards/margins": -2.8607425689697266, "rewards/rejected": -29.34520721435547, "step": 2107 }, { "epoch": 0.28703703703703703, "grad_norm": 39.575724041253594, "learning_rate": 7.177893290067304e-07, "logits/chosen": 9.904850959777832, "logits/rejected": 11.60328483581543, "logps/chosen": -3.130096435546875, "logps/rejected": -3.774843692779541, "loss": 3.7164, "rewards/accuracies": 1.0, "rewards/chosen": -31.30096435546875, "rewards/margins": 6.447470188140869, "rewards/rejected": -37.748435974121094, "step": 2108 }, { "epoch": 0.28717320261437906, "grad_norm": 41.81857353148091, "learning_rate": 7.176738211000399e-07, "logits/chosen": 10.810476303100586, "logits/rejected": 11.538655281066895, "logps/chosen": -3.3731889724731445, "logps/rejected": -3.6921839714050293, "loss": 4.0595, "rewards/accuracies": 0.75, "rewards/chosen": -33.73188781738281, "rewards/margins": 3.1899514198303223, "rewards/rejected": -36.921836853027344, "step": 2109 }, { "epoch": 0.28730936819172115, "grad_norm": 44.88909751923984, "learning_rate": 7.175582414123237e-07, "logits/chosen": 10.671016693115234, "logits/rejected": 11.269364356994629, "logps/chosen": -3.3841960430145264, "logps/rejected": -3.4845924377441406, "loss": 3.6448, "rewards/accuracies": 0.75, "rewards/chosen": -33.84196090698242, "rewards/margins": 1.0039634704589844, "rewards/rejected": -34.845924377441406, "step": 2110 }, { "epoch": 0.2874455337690632, "grad_norm": 43.72005647638514, "learning_rate": 7.174425899696978e-07, "logits/chosen": 9.853521347045898, "logits/rejected": 10.72364616394043, "logps/chosen": -3.186257839202881, "logps/rejected": -3.1585867404937744, "loss": 4.3799, "rewards/accuracies": 0.5, "rewards/chosen": -31.862579345703125, "rewards/margins": -0.27670955657958984, "rewards/rejected": -31.58586883544922, "step": 2111 }, { "epoch": 0.2875816993464052, "grad_norm": 36.33771232680472, "learning_rate": 7.173268667982947e-07, "logits/chosen": 11.256717681884766, "logits/rejected": 11.35436725616455, "logps/chosen": -3.2701869010925293, "logps/rejected": -3.3465042114257812, "loss": 4.2027, "rewards/accuracies": 0.5, "rewards/chosen": -32.70186996459961, "rewards/margins": 0.7631740570068359, "rewards/rejected": -33.46504211425781, "step": 2112 }, { "epoch": 0.2877178649237473, "grad_norm": 34.88552455457718, "learning_rate": 7.172110719242631e-07, "logits/chosen": 10.4834623336792, "logits/rejected": 10.49162483215332, "logps/chosen": -3.4865150451660156, "logps/rejected": -3.245612859725952, "loss": 3.8853, "rewards/accuracies": 0.25, "rewards/chosen": -34.865150451660156, "rewards/margins": -2.4090213775634766, "rewards/rejected": -32.45613098144531, "step": 2113 }, { "epoch": 0.2878540305010893, "grad_norm": 37.42311233811965, "learning_rate": 7.170952053737676e-07, "logits/chosen": 10.863659858703613, "logits/rejected": 11.256707191467285, "logps/chosen": -3.341517686843872, "logps/rejected": -3.3507561683654785, "loss": 4.2599, "rewards/accuracies": 0.75, "rewards/chosen": -33.41517639160156, "rewards/margins": 0.09238481521606445, "rewards/rejected": -33.50756072998047, "step": 2114 }, { "epoch": 0.28799019607843135, "grad_norm": 40.06323586487434, "learning_rate": 7.169792671729894e-07, "logits/chosen": 11.059139251708984, "logits/rejected": 10.796707153320312, "logps/chosen": -3.4654593467712402, "logps/rejected": -3.7308225631713867, "loss": 3.9918, "rewards/accuracies": 0.75, "rewards/chosen": -34.65459060668945, "rewards/margins": 2.653635025024414, "rewards/rejected": -37.3082275390625, "step": 2115 }, { "epoch": 0.28812636165577343, "grad_norm": 43.41115565001603, "learning_rate": 7.168632573481255e-07, "logits/chosen": 10.965394973754883, "logits/rejected": 11.521050453186035, "logps/chosen": -3.5955538749694824, "logps/rejected": -3.5366439819335938, "loss": 3.8614, "rewards/accuracies": 0.25, "rewards/chosen": -35.95553970336914, "rewards/margins": -0.5890989303588867, "rewards/rejected": -35.36643981933594, "step": 2116 }, { "epoch": 0.28826252723311546, "grad_norm": 40.65218185652989, "learning_rate": 7.167471759253894e-07, "logits/chosen": 11.555259704589844, "logits/rejected": 11.03438949584961, "logps/chosen": -3.436495304107666, "logps/rejected": -3.5457072257995605, "loss": 4.4339, "rewards/accuracies": 0.75, "rewards/chosen": -34.364952087402344, "rewards/margins": 1.0921196937561035, "rewards/rejected": -35.45707321166992, "step": 2117 }, { "epoch": 0.2883986928104575, "grad_norm": 53.12158516334508, "learning_rate": 7.166310229310107e-07, "logits/chosen": 10.051264762878418, "logits/rejected": 10.316778182983398, "logps/chosen": -3.0578503608703613, "logps/rejected": -3.233869791030884, "loss": 3.632, "rewards/accuracies": 0.75, "rewards/chosen": -30.578502655029297, "rewards/margins": 1.7601962089538574, "rewards/rejected": -32.33869934082031, "step": 2118 }, { "epoch": 0.2885348583877996, "grad_norm": 44.86785064828463, "learning_rate": 7.16514798391235e-07, "logits/chosen": 12.137922286987305, "logits/rejected": 12.739681243896484, "logps/chosen": -3.6855382919311523, "logps/rejected": -3.851141929626465, "loss": 4.0745, "rewards/accuracies": 0.5, "rewards/chosen": -36.855384826660156, "rewards/margins": 1.6560325622558594, "rewards/rejected": -38.511417388916016, "step": 2119 }, { "epoch": 0.2886710239651416, "grad_norm": 46.586526081441285, "learning_rate": 7.163985023323244e-07, "logits/chosen": 11.060290336608887, "logits/rejected": 11.392194747924805, "logps/chosen": -3.683396339416504, "logps/rejected": -3.800532341003418, "loss": 4.493, "rewards/accuracies": 0.5, "rewards/chosen": -36.83396530151367, "rewards/margins": 1.1713600158691406, "rewards/rejected": -38.00532531738281, "step": 2120 }, { "epoch": 0.28880718954248363, "grad_norm": 41.61993246579155, "learning_rate": 7.162821347805567e-07, "logits/chosen": 11.651653289794922, "logits/rejected": 12.357900619506836, "logps/chosen": -3.629120349884033, "logps/rejected": -3.7103123664855957, "loss": 3.953, "rewards/accuracies": 0.25, "rewards/chosen": -36.291202545166016, "rewards/margins": 0.8119215965270996, "rewards/rejected": -37.10312271118164, "step": 2121 }, { "epoch": 0.2889433551198257, "grad_norm": 37.533367286818695, "learning_rate": 7.161656957622263e-07, "logits/chosen": 10.842105865478516, "logits/rejected": 10.511799812316895, "logps/chosen": -3.2834692001342773, "logps/rejected": -3.2715277671813965, "loss": 3.9242, "rewards/accuracies": 0.25, "rewards/chosen": -32.834693908691406, "rewards/margins": -0.119415283203125, "rewards/rejected": -32.71527862548828, "step": 2122 }, { "epoch": 0.28907952069716775, "grad_norm": 40.027105487872404, "learning_rate": 7.160491853036434e-07, "logits/chosen": 10.629539489746094, "logits/rejected": 11.305721282958984, "logps/chosen": -3.172055959701538, "logps/rejected": -3.407134532928467, "loss": 4.5258, "rewards/accuracies": 1.0, "rewards/chosen": -31.720558166503906, "rewards/margins": 2.3507866859436035, "rewards/rejected": -34.071346282958984, "step": 2123 }, { "epoch": 0.28921568627450983, "grad_norm": 37.48421485928862, "learning_rate": 7.159326034311347e-07, "logits/chosen": 11.255231857299805, "logits/rejected": 12.516913414001465, "logps/chosen": -3.39675235748291, "logps/rejected": -3.58113431930542, "loss": 3.7196, "rewards/accuracies": 0.75, "rewards/chosen": -33.967525482177734, "rewards/margins": 1.8438215255737305, "rewards/rejected": -35.811344146728516, "step": 2124 }, { "epoch": 0.28935185185185186, "grad_norm": 38.75570548562448, "learning_rate": 7.158159501710426e-07, "logits/chosen": 10.588151931762695, "logits/rejected": 12.108465194702148, "logps/chosen": -3.407858371734619, "logps/rejected": -3.6765003204345703, "loss": 4.0243, "rewards/accuracies": 0.75, "rewards/chosen": -34.078582763671875, "rewards/margins": 2.6864185333251953, "rewards/rejected": -36.76500701904297, "step": 2125 }, { "epoch": 0.2894880174291939, "grad_norm": 42.54501848985375, "learning_rate": 7.156992255497261e-07, "logits/chosen": 11.197874069213867, "logits/rejected": 11.66586971282959, "logps/chosen": -3.4167630672454834, "logps/rejected": -3.6866583824157715, "loss": 4.3746, "rewards/accuracies": 0.75, "rewards/chosen": -34.167633056640625, "rewards/margins": 2.6989521980285645, "rewards/rejected": -36.86658477783203, "step": 2126 }, { "epoch": 0.289624183006536, "grad_norm": 34.7666845834194, "learning_rate": 7.155824295935599e-07, "logits/chosen": 10.979301452636719, "logits/rejected": 12.318748474121094, "logps/chosen": -3.426755666732788, "logps/rejected": -3.9204440116882324, "loss": 4.0174, "rewards/accuracies": 1.0, "rewards/chosen": -34.267555236816406, "rewards/margins": 4.936886787414551, "rewards/rejected": -39.204444885253906, "step": 2127 }, { "epoch": 0.289760348583878, "grad_norm": 38.103916610363974, "learning_rate": 7.154655623289353e-07, "logits/chosen": 11.209142684936523, "logits/rejected": 12.566801071166992, "logps/chosen": -3.3568575382232666, "logps/rejected": -3.786526679992676, "loss": 3.8023, "rewards/accuracies": 1.0, "rewards/chosen": -33.56857681274414, "rewards/margins": 4.29669189453125, "rewards/rejected": -37.865264892578125, "step": 2128 }, { "epoch": 0.28989651416122003, "grad_norm": 39.78091547804518, "learning_rate": 7.15348623782259e-07, "logits/chosen": 11.646772384643555, "logits/rejected": 11.691157341003418, "logps/chosen": -3.489837169647217, "logps/rejected": -3.6171748638153076, "loss": 3.9907, "rewards/accuracies": 0.25, "rewards/chosen": -34.898372650146484, "rewards/margins": 1.2733774185180664, "rewards/rejected": -36.171749114990234, "step": 2129 }, { "epoch": 0.2900326797385621, "grad_norm": 39.529197891940605, "learning_rate": 7.152316139799545e-07, "logits/chosen": 12.510444641113281, "logits/rejected": 11.8594970703125, "logps/chosen": -3.958559036254883, "logps/rejected": -4.186831474304199, "loss": 3.6415, "rewards/accuracies": 0.75, "rewards/chosen": -39.585594177246094, "rewards/margins": 2.282726287841797, "rewards/rejected": -41.868316650390625, "step": 2130 }, { "epoch": 0.29016884531590414, "grad_norm": 41.44292709840084, "learning_rate": 7.151145329484612e-07, "logits/chosen": 10.509756088256836, "logits/rejected": 10.342765808105469, "logps/chosen": -3.2015726566314697, "logps/rejected": -3.108278512954712, "loss": 4.4116, "rewards/accuracies": 0.5, "rewards/chosen": -32.015724182128906, "rewards/margins": -0.9329404830932617, "rewards/rejected": -31.082786560058594, "step": 2131 }, { "epoch": 0.2903050108932462, "grad_norm": 39.96398136407553, "learning_rate": 7.149973807142343e-07, "logits/chosen": 10.804698944091797, "logits/rejected": 11.817280769348145, "logps/chosen": -3.0852410793304443, "logps/rejected": -3.3209996223449707, "loss": 4.0104, "rewards/accuracies": 1.0, "rewards/chosen": -30.85240936279297, "rewards/margins": 2.3575868606567383, "rewards/rejected": -33.209999084472656, "step": 2132 }, { "epoch": 0.29044117647058826, "grad_norm": 43.976798317979515, "learning_rate": 7.148801573037454e-07, "logits/chosen": 11.412076950073242, "logits/rejected": 11.215566635131836, "logps/chosen": -3.0404205322265625, "logps/rejected": -3.263152599334717, "loss": 4.5227, "rewards/accuracies": 0.75, "rewards/chosen": -30.404205322265625, "rewards/margins": 2.227320671081543, "rewards/rejected": -32.631526947021484, "step": 2133 }, { "epoch": 0.2905773420479303, "grad_norm": 43.33770085010112, "learning_rate": 7.147628627434823e-07, "logits/chosen": 11.152210235595703, "logits/rejected": 11.855325698852539, "logps/chosen": -3.4123172760009766, "logps/rejected": -3.47163462638855, "loss": 4.1306, "rewards/accuracies": 0.75, "rewards/chosen": -34.123172760009766, "rewards/margins": 0.593174934387207, "rewards/rejected": -34.716346740722656, "step": 2134 }, { "epoch": 0.2907135076252723, "grad_norm": 41.852758517882855, "learning_rate": 7.146454970599484e-07, "logits/chosen": 10.502395629882812, "logits/rejected": 11.547521591186523, "logps/chosen": -3.1709375381469727, "logps/rejected": -3.597296714782715, "loss": 4.2139, "rewards/accuracies": 1.0, "rewards/chosen": -31.709375381469727, "rewards/margins": 4.2635908126831055, "rewards/rejected": -35.97296905517578, "step": 2135 }, { "epoch": 0.2908496732026144, "grad_norm": 42.12485104099577, "learning_rate": 7.145280602796636e-07, "logits/chosen": 10.431926727294922, "logits/rejected": 10.6827392578125, "logps/chosen": -3.1882805824279785, "logps/rejected": -3.2409183979034424, "loss": 3.6231, "rewards/accuracies": 0.5, "rewards/chosen": -31.882808685302734, "rewards/margins": 0.5263752937316895, "rewards/rejected": -32.409183502197266, "step": 2136 }, { "epoch": 0.29098583877995643, "grad_norm": 42.235401908960284, "learning_rate": 7.144105524291637e-07, "logits/chosen": 11.212801933288574, "logits/rejected": 11.169610977172852, "logps/chosen": -3.4478564262390137, "logps/rejected": -3.6206564903259277, "loss": 3.6411, "rewards/accuracies": 0.75, "rewards/chosen": -34.47856140136719, "rewards/margins": 1.7279996871948242, "rewards/rejected": -36.206565856933594, "step": 2137 }, { "epoch": 0.29112200435729846, "grad_norm": 39.69455782388026, "learning_rate": 7.142929735350005e-07, "logits/chosen": 10.630279541015625, "logits/rejected": 11.134230613708496, "logps/chosen": -3.750199794769287, "logps/rejected": -3.597062826156616, "loss": 4.4399, "rewards/accuracies": 0.5, "rewards/chosen": -37.50199508666992, "rewards/margins": -1.531367301940918, "rewards/rejected": -35.97062683105469, "step": 2138 }, { "epoch": 0.29125816993464054, "grad_norm": 46.580769896126284, "learning_rate": 7.141753236237419e-07, "logits/chosen": 11.82204532623291, "logits/rejected": 11.63368034362793, "logps/chosen": -3.4186184406280518, "logps/rejected": -3.31742000579834, "loss": 4.1691, "rewards/accuracies": 0.25, "rewards/chosen": -34.18618392944336, "rewards/margins": -1.0119857788085938, "rewards/rejected": -33.17420196533203, "step": 2139 }, { "epoch": 0.29139433551198257, "grad_norm": 45.92177573809445, "learning_rate": 7.140576027219719e-07, "logits/chosen": 10.629310607910156, "logits/rejected": 10.879437446594238, "logps/chosen": -3.4075756072998047, "logps/rejected": -3.6376049518585205, "loss": 4.0219, "rewards/accuracies": 1.0, "rewards/chosen": -34.07575225830078, "rewards/margins": 2.3002939224243164, "rewards/rejected": -36.37604904174805, "step": 2140 }, { "epoch": 0.2915305010893246, "grad_norm": 46.032901029460824, "learning_rate": 7.139398108562906e-07, "logits/chosen": 10.752117156982422, "logits/rejected": 11.720320701599121, "logps/chosen": -3.487278938293457, "logps/rejected": -3.517709732055664, "loss": 4.3287, "rewards/accuracies": 0.75, "rewards/chosen": -34.87278747558594, "rewards/margins": 0.30431032180786133, "rewards/rejected": -35.177101135253906, "step": 2141 }, { "epoch": 0.2916666666666667, "grad_norm": 47.819435154558036, "learning_rate": 7.13821948053314e-07, "logits/chosen": 12.232488632202148, "logits/rejected": 11.229104995727539, "logps/chosen": -3.56123948097229, "logps/rejected": -3.3971052169799805, "loss": 4.4449, "rewards/accuracies": 0.25, "rewards/chosen": -35.612396240234375, "rewards/margins": -1.6413416862487793, "rewards/rejected": -33.97105407714844, "step": 2142 }, { "epoch": 0.2918028322440087, "grad_norm": 44.751747030545374, "learning_rate": 7.137040143396742e-07, "logits/chosen": 11.044319152832031, "logits/rejected": 11.601269721984863, "logps/chosen": -3.1226413249969482, "logps/rejected": -3.3558950424194336, "loss": 3.9806, "rewards/accuracies": 1.0, "rewards/chosen": -31.22641372680664, "rewards/margins": 2.3325376510620117, "rewards/rejected": -33.5589485168457, "step": 2143 }, { "epoch": 0.29193899782135074, "grad_norm": 49.884200877077205, "learning_rate": 7.135860097420192e-07, "logits/chosen": 11.692065238952637, "logits/rejected": 11.525842666625977, "logps/chosen": -3.3366665840148926, "logps/rejected": -3.7541213035583496, "loss": 4.6708, "rewards/accuracies": 0.5, "rewards/chosen": -33.36666488647461, "rewards/margins": 4.1745476722717285, "rewards/rejected": -37.54121398925781, "step": 2144 }, { "epoch": 0.2920751633986928, "grad_norm": 51.33320969375151, "learning_rate": 7.134679342870133e-07, "logits/chosen": 10.782598495483398, "logits/rejected": 9.98433780670166, "logps/chosen": -3.179088592529297, "logps/rejected": -3.058027505874634, "loss": 4.462, "rewards/accuracies": 0.5, "rewards/chosen": -31.79088592529297, "rewards/margins": -1.210611343383789, "rewards/rejected": -30.580272674560547, "step": 2145 }, { "epoch": 0.29221132897603486, "grad_norm": 39.6767522289209, "learning_rate": 7.133497880013363e-07, "logits/chosen": 11.521333694458008, "logits/rejected": 12.022737503051758, "logps/chosen": -3.475071668624878, "logps/rejected": -3.708827257156372, "loss": 4.0981, "rewards/accuracies": 0.75, "rewards/chosen": -34.75071716308594, "rewards/margins": 2.337557792663574, "rewards/rejected": -37.08827209472656, "step": 2146 }, { "epoch": 0.2923474945533769, "grad_norm": 45.923224043463705, "learning_rate": 7.132315709116845e-07, "logits/chosen": 12.025634765625, "logits/rejected": 11.416915893554688, "logps/chosen": -3.56461763381958, "logps/rejected": -3.684192657470703, "loss": 4.5175, "rewards/accuracies": 0.75, "rewards/chosen": -35.64617156982422, "rewards/margins": 1.195749282836914, "rewards/rejected": -36.84192657470703, "step": 2147 }, { "epoch": 0.29248366013071897, "grad_norm": 46.82960453086646, "learning_rate": 7.131132830447703e-07, "logits/chosen": 11.043560028076172, "logits/rejected": 11.983026504516602, "logps/chosen": -3.3752007484436035, "logps/rejected": -3.854867458343506, "loss": 3.5834, "rewards/accuracies": 1.0, "rewards/chosen": -33.75200653076172, "rewards/margins": 4.796667098999023, "rewards/rejected": -38.548675537109375, "step": 2148 }, { "epoch": 0.292619825708061, "grad_norm": 45.22359178568344, "learning_rate": 7.129949244273212e-07, "logits/chosen": 12.255117416381836, "logits/rejected": 11.492868423461914, "logps/chosen": -3.7011053562164307, "logps/rejected": -3.3237969875335693, "loss": 4.2641, "rewards/accuracies": 0.25, "rewards/chosen": -37.01105499267578, "rewards/margins": -3.773083209991455, "rewards/rejected": -33.23796844482422, "step": 2149 }, { "epoch": 0.292755991285403, "grad_norm": 42.7198618845879, "learning_rate": 7.128764950860819e-07, "logits/chosen": 12.386180877685547, "logits/rejected": 12.249672889709473, "logps/chosen": -3.5387542247772217, "logps/rejected": -3.7469730377197266, "loss": 4.0713, "rewards/accuracies": 0.75, "rewards/chosen": -35.387542724609375, "rewards/margins": 2.08219051361084, "rewards/rejected": -37.469730377197266, "step": 2150 }, { "epoch": 0.2928921568627451, "grad_norm": 42.354747081313924, "learning_rate": 7.127579950478123e-07, "logits/chosen": 11.227087020874023, "logits/rejected": 11.153234481811523, "logps/chosen": -3.2605817317962646, "logps/rejected": -3.3823771476745605, "loss": 3.7643, "rewards/accuracies": 0.25, "rewards/chosen": -32.60581970214844, "rewards/margins": 1.2179532051086426, "rewards/rejected": -33.82377243041992, "step": 2151 }, { "epoch": 0.29302832244008714, "grad_norm": 46.6890649958995, "learning_rate": 7.126394243392885e-07, "logits/chosen": 11.998618125915527, "logits/rejected": 12.493343353271484, "logps/chosen": -3.3604249954223633, "logps/rejected": -3.3525609970092773, "loss": 4.4353, "rewards/accuracies": 0.5, "rewards/chosen": -33.604251861572266, "rewards/margins": -0.07863903045654297, "rewards/rejected": -33.525611877441406, "step": 2152 }, { "epoch": 0.29316448801742917, "grad_norm": 39.98459197407984, "learning_rate": 7.125207829873023e-07, "logits/chosen": 12.44512939453125, "logits/rejected": 11.93747329711914, "logps/chosen": -3.3391313552856445, "logps/rejected": -3.771796226501465, "loss": 3.8288, "rewards/accuracies": 0.75, "rewards/chosen": -33.39131164550781, "rewards/margins": 4.326651096343994, "rewards/rejected": -37.71796417236328, "step": 2153 }, { "epoch": 0.29330065359477125, "grad_norm": 46.713578861392826, "learning_rate": 7.12402071018662e-07, "logits/chosen": 11.923574447631836, "logits/rejected": 12.328412055969238, "logps/chosen": -3.9217278957366943, "logps/rejected": -3.9810757637023926, "loss": 4.1246, "rewards/accuracies": 0.5, "rewards/chosen": -39.21727752685547, "rewards/margins": 0.593477725982666, "rewards/rejected": -39.81075668334961, "step": 2154 }, { "epoch": 0.2934368191721133, "grad_norm": 48.82663802027573, "learning_rate": 7.122832884601914e-07, "logits/chosen": 12.027244567871094, "logits/rejected": 12.431875228881836, "logps/chosen": -3.745208263397217, "logps/rejected": -4.011331081390381, "loss": 3.4921, "rewards/accuracies": 1.0, "rewards/chosen": -37.452083587646484, "rewards/margins": 2.661227226257324, "rewards/rejected": -40.113311767578125, "step": 2155 }, { "epoch": 0.2935729847494553, "grad_norm": 41.81561911885673, "learning_rate": 7.121644353387303e-07, "logits/chosen": 11.251729965209961, "logits/rejected": 11.472887992858887, "logps/chosen": -3.24086332321167, "logps/rejected": -3.5704097747802734, "loss": 3.2569, "rewards/accuracies": 0.75, "rewards/chosen": -32.408634185791016, "rewards/margins": 3.295464038848877, "rewards/rejected": -35.7041015625, "step": 2156 }, { "epoch": 0.2937091503267974, "grad_norm": 42.284013577738534, "learning_rate": 7.120455116811347e-07, "logits/chosen": 12.46154499053955, "logits/rejected": 11.8634033203125, "logps/chosen": -3.4460065364837646, "logps/rejected": -3.4776875972747803, "loss": 4.1338, "rewards/accuracies": 0.5, "rewards/chosen": -34.46006393432617, "rewards/margins": 0.31681203842163086, "rewards/rejected": -34.776878356933594, "step": 2157 }, { "epoch": 0.2938453159041394, "grad_norm": 42.743764757238466, "learning_rate": 7.119265175142764e-07, "logits/chosen": 10.301908493041992, "logits/rejected": 12.188713073730469, "logps/chosen": -2.9344615936279297, "logps/rejected": -3.5740671157836914, "loss": 3.7191, "rewards/accuracies": 0.75, "rewards/chosen": -29.344614028930664, "rewards/margins": 6.396058082580566, "rewards/rejected": -35.74066925048828, "step": 2158 }, { "epoch": 0.29398148148148145, "grad_norm": 41.36335259058478, "learning_rate": 7.11807452865043e-07, "logits/chosen": 11.479089736938477, "logits/rejected": 11.890027046203613, "logps/chosen": -3.5986485481262207, "logps/rejected": -3.885104179382324, "loss": 3.9964, "rewards/accuracies": 0.75, "rewards/chosen": -35.98648452758789, "rewards/margins": 2.8645567893981934, "rewards/rejected": -38.851043701171875, "step": 2159 }, { "epoch": 0.29411764705882354, "grad_norm": 44.48281329048136, "learning_rate": 7.116883177603383e-07, "logits/chosen": 9.900421142578125, "logits/rejected": 11.944005966186523, "logps/chosen": -3.109159469604492, "logps/rejected": -3.5148720741271973, "loss": 4.478, "rewards/accuracies": 1.0, "rewards/chosen": -31.091594696044922, "rewards/margins": 4.057126045227051, "rewards/rejected": -35.148719787597656, "step": 2160 }, { "epoch": 0.29425381263616557, "grad_norm": 45.911563308701155, "learning_rate": 7.115691122270817e-07, "logits/chosen": 11.271615028381348, "logits/rejected": 11.729894638061523, "logps/chosen": -3.2845191955566406, "logps/rejected": -3.347543239593506, "loss": 4.0447, "rewards/accuracies": 0.5, "rewards/chosen": -32.845191955566406, "rewards/margins": 0.6302433013916016, "rewards/rejected": -33.475433349609375, "step": 2161 }, { "epoch": 0.29438997821350765, "grad_norm": 42.75581708449369, "learning_rate": 7.114498362922086e-07, "logits/chosen": 11.015718460083008, "logits/rejected": 11.42719841003418, "logps/chosen": -3.2601165771484375, "logps/rejected": -3.429530620574951, "loss": 4.3277, "rewards/accuracies": 1.0, "rewards/chosen": -32.601165771484375, "rewards/margins": 1.694140911102295, "rewards/rejected": -34.29530334472656, "step": 2162 }, { "epoch": 0.2945261437908497, "grad_norm": 44.00163897571475, "learning_rate": 7.113304899826707e-07, "logits/chosen": 11.036127090454102, "logits/rejected": 11.851408004760742, "logps/chosen": -3.5539627075195312, "logps/rejected": -3.8404476642608643, "loss": 4.5447, "rewards/accuracies": 0.5, "rewards/chosen": -35.53962707519531, "rewards/margins": 2.864849090576172, "rewards/rejected": -38.404476165771484, "step": 2163 }, { "epoch": 0.2946623093681917, "grad_norm": 40.74562440319361, "learning_rate": 7.11211073325435e-07, "logits/chosen": 11.694814682006836, "logits/rejected": 11.624214172363281, "logps/chosen": -3.533114433288574, "logps/rejected": -3.6859841346740723, "loss": 3.7666, "rewards/accuracies": 0.75, "rewards/chosen": -35.331146240234375, "rewards/margins": 1.528696060180664, "rewards/rejected": -36.859840393066406, "step": 2164 }, { "epoch": 0.2947984749455338, "grad_norm": 44.24417159838607, "learning_rate": 7.110915863474849e-07, "logits/chosen": 11.704452514648438, "logits/rejected": 11.803810119628906, "logps/chosen": -3.646902084350586, "logps/rejected": -3.605518102645874, "loss": 4.1885, "rewards/accuracies": 0.5, "rewards/chosen": -36.469024658203125, "rewards/margins": -0.41384124755859375, "rewards/rejected": -36.05518341064453, "step": 2165 }, { "epoch": 0.2949346405228758, "grad_norm": 41.014895952009596, "learning_rate": 7.109720290758192e-07, "logits/chosen": 11.439916610717773, "logits/rejected": 11.785527229309082, "logps/chosen": -3.2535946369171143, "logps/rejected": -3.5191397666931152, "loss": 3.5027, "rewards/accuracies": 1.0, "rewards/chosen": -32.53594970703125, "rewards/margins": 2.655449867248535, "rewards/rejected": -35.19139862060547, "step": 2166 }, { "epoch": 0.29507080610021785, "grad_norm": 41.295386202006966, "learning_rate": 7.108524015374531e-07, "logits/chosen": 12.445409774780273, "logits/rejected": 12.896276473999023, "logps/chosen": -3.5751757621765137, "logps/rejected": -3.55326247215271, "loss": 4.0829, "rewards/accuracies": 0.5, "rewards/chosen": -35.75175857543945, "rewards/margins": -0.2191333770751953, "rewards/rejected": -35.532623291015625, "step": 2167 }, { "epoch": 0.29520697167755994, "grad_norm": 43.14413908390561, "learning_rate": 7.107327037594173e-07, "logits/chosen": 11.82746696472168, "logits/rejected": 11.803665161132812, "logps/chosen": -3.7267391681671143, "logps/rejected": -3.5550191402435303, "loss": 4.0595, "rewards/accuracies": 0.25, "rewards/chosen": -37.267391204833984, "rewards/margins": -1.7172021865844727, "rewards/rejected": -35.55018997192383, "step": 2168 }, { "epoch": 0.29534313725490197, "grad_norm": 44.123308692482375, "learning_rate": 7.106129357687586e-07, "logits/chosen": 12.28730583190918, "logits/rejected": 12.07876205444336, "logps/chosen": -3.7633001804351807, "logps/rejected": -3.9334702491760254, "loss": 3.899, "rewards/accuracies": 0.5, "rewards/chosen": -37.63300323486328, "rewards/margins": 1.701700210571289, "rewards/rejected": -39.33470153808594, "step": 2169 }, { "epoch": 0.295479302832244, "grad_norm": 46.63867880056127, "learning_rate": 7.104930975925395e-07, "logits/chosen": 12.298587799072266, "logits/rejected": 12.455591201782227, "logps/chosen": -3.615576982498169, "logps/rejected": -3.7559990882873535, "loss": 3.8598, "rewards/accuracies": 0.75, "rewards/chosen": -36.15576934814453, "rewards/margins": 1.404221534729004, "rewards/rejected": -37.55998992919922, "step": 2170 }, { "epoch": 0.2956154684095861, "grad_norm": 41.096455090274155, "learning_rate": 7.103731892578384e-07, "logits/chosen": 10.718184471130371, "logits/rejected": 11.942587852478027, "logps/chosen": -2.6629343032836914, "logps/rejected": -3.2454142570495605, "loss": 4.1, "rewards/accuracies": 1.0, "rewards/chosen": -26.629344940185547, "rewards/margins": 5.824799060821533, "rewards/rejected": -32.45414352416992, "step": 2171 }, { "epoch": 0.2957516339869281, "grad_norm": 41.23411783784559, "learning_rate": 7.102532107917496e-07, "logits/chosen": 10.698221206665039, "logits/rejected": 10.572818756103516, "logps/chosen": -3.084409713745117, "logps/rejected": -3.3233683109283447, "loss": 3.5562, "rewards/accuracies": 0.75, "rewards/chosen": -30.844099044799805, "rewards/margins": 2.389582633972168, "rewards/rejected": -33.233680725097656, "step": 2172 }, { "epoch": 0.29588779956427014, "grad_norm": 39.410277856841944, "learning_rate": 7.101331622213833e-07, "logits/chosen": 10.967972755432129, "logits/rejected": 13.468862533569336, "logps/chosen": -3.045510768890381, "logps/rejected": -3.6193652153015137, "loss": 3.6798, "rewards/accuracies": 1.0, "rewards/chosen": -30.455106735229492, "rewards/margins": 5.738546848297119, "rewards/rejected": -36.19365310668945, "step": 2173 }, { "epoch": 0.2960239651416122, "grad_norm": 51.417203887856616, "learning_rate": 7.100130435738654e-07, "logits/chosen": 11.82148265838623, "logits/rejected": 12.19273853302002, "logps/chosen": -3.3932223320007324, "logps/rejected": -3.4394736289978027, "loss": 4.011, "rewards/accuracies": 0.5, "rewards/chosen": -33.932220458984375, "rewards/margins": 0.4625129699707031, "rewards/rejected": -34.394737243652344, "step": 2174 }, { "epoch": 0.29616013071895425, "grad_norm": 44.20580663111177, "learning_rate": 7.098928548763377e-07, "logits/chosen": 11.228134155273438, "logits/rejected": 12.004265785217285, "logps/chosen": -3.0242135524749756, "logps/rejected": -3.4256553649902344, "loss": 4.3859, "rewards/accuracies": 1.0, "rewards/chosen": -30.242137908935547, "rewards/margins": 4.0144147872924805, "rewards/rejected": -34.256553649902344, "step": 2175 }, { "epoch": 0.2962962962962963, "grad_norm": 41.803319589710206, "learning_rate": 7.097725961559579e-07, "logits/chosen": 12.946300506591797, "logits/rejected": 12.86607837677002, "logps/chosen": -3.5633344650268555, "logps/rejected": -3.469564199447632, "loss": 4.6276, "rewards/accuracies": 0.25, "rewards/chosen": -35.63334655761719, "rewards/margins": -0.9377036094665527, "rewards/rejected": -34.695640563964844, "step": 2176 }, { "epoch": 0.29643246187363836, "grad_norm": 42.37737592790246, "learning_rate": 7.096522674398993e-07, "logits/chosen": 11.44052505493164, "logits/rejected": 11.361312866210938, "logps/chosen": -3.1112053394317627, "logps/rejected": -3.2438650131225586, "loss": 4.259, "rewards/accuracies": 0.75, "rewards/chosen": -31.11205291748047, "rewards/margins": 1.3265957832336426, "rewards/rejected": -32.43865203857422, "step": 2177 }, { "epoch": 0.2965686274509804, "grad_norm": 41.68188479994079, "learning_rate": 7.095318687553513e-07, "logits/chosen": 12.368444442749023, "logits/rejected": 12.243179321289062, "logps/chosen": -3.450457811355591, "logps/rejected": -3.290320873260498, "loss": 4.2357, "rewards/accuracies": 0.25, "rewards/chosen": -34.50457763671875, "rewards/margins": -1.6013708114624023, "rewards/rejected": -32.90320587158203, "step": 2178 }, { "epoch": 0.2967047930283224, "grad_norm": 37.618098895241914, "learning_rate": 7.094114001295188e-07, "logits/chosen": 11.47496509552002, "logits/rejected": 12.437177658081055, "logps/chosen": -3.028184652328491, "logps/rejected": -3.905759334564209, "loss": 3.4225, "rewards/accuracies": 1.0, "rewards/chosen": -30.281845092773438, "rewards/margins": 8.77574634552002, "rewards/rejected": -39.057594299316406, "step": 2179 }, { "epoch": 0.2968409586056645, "grad_norm": 53.70743293370266, "learning_rate": 7.092908615896231e-07, "logits/chosen": 11.435022354125977, "logits/rejected": 11.931463241577148, "logps/chosen": -2.8018009662628174, "logps/rejected": -3.4148459434509277, "loss": 3.75, "rewards/accuracies": 0.75, "rewards/chosen": -28.018009185791016, "rewards/margins": 6.130448341369629, "rewards/rejected": -34.148460388183594, "step": 2180 }, { "epoch": 0.29697712418300654, "grad_norm": 49.76336021354967, "learning_rate": 7.091702531629003e-07, "logits/chosen": 10.444080352783203, "logits/rejected": 12.262200355529785, "logps/chosen": -3.179570198059082, "logps/rejected": -3.433405876159668, "loss": 4.0869, "rewards/accuracies": 1.0, "rewards/chosen": -31.795698165893555, "rewards/margins": 2.538360118865967, "rewards/rejected": -34.33406066894531, "step": 2181 }, { "epoch": 0.29711328976034856, "grad_norm": 46.459042845916976, "learning_rate": 7.090495748766035e-07, "logits/chosen": 11.314746856689453, "logits/rejected": 11.738883972167969, "logps/chosen": -3.1014297008514404, "logps/rejected": -3.347993850708008, "loss": 4.3444, "rewards/accuracies": 0.75, "rewards/chosen": -31.014297485351562, "rewards/margins": 2.4656424522399902, "rewards/rejected": -33.479942321777344, "step": 2182 }, { "epoch": 0.29724945533769065, "grad_norm": 43.11383330162366, "learning_rate": 7.089288267580004e-07, "logits/chosen": 10.91168212890625, "logits/rejected": 11.41135025024414, "logps/chosen": -3.3476874828338623, "logps/rejected": -3.481055736541748, "loss": 4.2954, "rewards/accuracies": 0.5, "rewards/chosen": -33.47687530517578, "rewards/margins": 1.3336806297302246, "rewards/rejected": -34.81055450439453, "step": 2183 }, { "epoch": 0.2973856209150327, "grad_norm": 40.65326733754102, "learning_rate": 7.088080088343753e-07, "logits/chosen": 11.929584503173828, "logits/rejected": 11.888385772705078, "logps/chosen": -3.2116127014160156, "logps/rejected": -3.530557632446289, "loss": 3.9114, "rewards/accuracies": 1.0, "rewards/chosen": -32.116127014160156, "rewards/margins": 3.1894493103027344, "rewards/rejected": -35.305572509765625, "step": 2184 }, { "epoch": 0.2975217864923747, "grad_norm": 44.24113897904616, "learning_rate": 7.08687121133028e-07, "logits/chosen": 11.188358306884766, "logits/rejected": 11.724082946777344, "logps/chosen": -3.347250461578369, "logps/rejected": -3.480469226837158, "loss": 4.3926, "rewards/accuracies": 0.75, "rewards/chosen": -33.472503662109375, "rewards/margins": 1.3321895599365234, "rewards/rejected": -34.804691314697266, "step": 2185 }, { "epoch": 0.2976579520697168, "grad_norm": 43.02467473643245, "learning_rate": 7.08566163681274e-07, "logits/chosen": 12.84941291809082, "logits/rejected": 12.688312530517578, "logps/chosen": -3.2676315307617188, "logps/rejected": -3.528930425643921, "loss": 3.685, "rewards/accuracies": 1.0, "rewards/chosen": -32.67631530761719, "rewards/margins": 2.6129889488220215, "rewards/rejected": -35.289306640625, "step": 2186 }, { "epoch": 0.2977941176470588, "grad_norm": 51.66386690861404, "learning_rate": 7.084451365064447e-07, "logits/chosen": 10.955644607543945, "logits/rejected": 11.510534286499023, "logps/chosen": -3.403900384902954, "logps/rejected": -3.4097988605499268, "loss": 4.5922, "rewards/accuracies": 0.75, "rewards/chosen": -34.03900146484375, "rewards/margins": 0.058985233306884766, "rewards/rejected": -34.097984313964844, "step": 2187 }, { "epoch": 0.29793028322440085, "grad_norm": 49.66660441525105, "learning_rate": 7.083240396358872e-07, "logits/chosen": 11.211545944213867, "logits/rejected": 10.511404991149902, "logps/chosen": -3.2707321643829346, "logps/rejected": -3.3646960258483887, "loss": 4.2426, "rewards/accuracies": 0.75, "rewards/chosen": -32.70732116699219, "rewards/margins": 0.9396405220031738, "rewards/rejected": -33.64696502685547, "step": 2188 }, { "epoch": 0.29806644880174293, "grad_norm": 43.175552868056435, "learning_rate": 7.082028730969643e-07, "logits/chosen": 11.453487396240234, "logits/rejected": 11.0365629196167, "logps/chosen": -3.012645721435547, "logps/rejected": -3.0957517623901367, "loss": 4.1871, "rewards/accuracies": 0.5, "rewards/chosen": -30.12645721435547, "rewards/margins": 0.8310604095458984, "rewards/rejected": -30.957517623901367, "step": 2189 }, { "epoch": 0.29820261437908496, "grad_norm": 41.188469839336314, "learning_rate": 7.080816369170545e-07, "logits/chosen": 11.926647186279297, "logits/rejected": 12.460783958435059, "logps/chosen": -3.5243489742279053, "logps/rejected": -3.6065869331359863, "loss": 4.5067, "rewards/accuracies": 0.75, "rewards/chosen": -35.24348831176758, "rewards/margins": 0.8223814964294434, "rewards/rejected": -36.06587219238281, "step": 2190 }, { "epoch": 0.298338779956427, "grad_norm": 42.636192710306936, "learning_rate": 7.079603311235524e-07, "logits/chosen": 11.295583724975586, "logits/rejected": 11.450926780700684, "logps/chosen": -3.3148088455200195, "logps/rejected": -3.5747079849243164, "loss": 4.1079, "rewards/accuracies": 0.5, "rewards/chosen": -33.14809036254883, "rewards/margins": 2.5989904403686523, "rewards/rejected": -35.7470817565918, "step": 2191 }, { "epoch": 0.2984749455337691, "grad_norm": 39.5158432982622, "learning_rate": 7.078389557438677e-07, "logits/chosen": 11.167699813842773, "logits/rejected": 11.933305740356445, "logps/chosen": -3.6721763610839844, "logps/rejected": -3.6629157066345215, "loss": 3.8494, "rewards/accuracies": 0.5, "rewards/chosen": -36.721763610839844, "rewards/margins": -0.09260416030883789, "rewards/rejected": -36.62915802001953, "step": 2192 }, { "epoch": 0.2986111111111111, "grad_norm": 47.98223319558976, "learning_rate": 7.077175108054265e-07, "logits/chosen": 12.525609970092773, "logits/rejected": 12.639261245727539, "logps/chosen": -3.6089680194854736, "logps/rejected": -3.518517017364502, "loss": 3.8186, "rewards/accuracies": 0.5, "rewards/chosen": -36.08967971801758, "rewards/margins": -0.9045124053955078, "rewards/rejected": -35.18516540527344, "step": 2193 }, { "epoch": 0.29874727668845313, "grad_norm": 40.54977832159377, "learning_rate": 7.075959963356699e-07, "logits/chosen": 11.999431610107422, "logits/rejected": 11.858842849731445, "logps/chosen": -3.5311942100524902, "logps/rejected": -3.8343117237091064, "loss": 3.6327, "rewards/accuracies": 0.75, "rewards/chosen": -35.31194305419922, "rewards/margins": 3.0311756134033203, "rewards/rejected": -38.343116760253906, "step": 2194 }, { "epoch": 0.2988834422657952, "grad_norm": 43.21765873732154, "learning_rate": 7.074744123620554e-07, "logits/chosen": 11.677298545837402, "logits/rejected": 12.714021682739258, "logps/chosen": -3.3191065788269043, "logps/rejected": -3.8129522800445557, "loss": 4.3569, "rewards/accuracies": 1.0, "rewards/chosen": -33.19106674194336, "rewards/margins": 4.938457489013672, "rewards/rejected": -38.12952423095703, "step": 2195 }, { "epoch": 0.29901960784313725, "grad_norm": 46.05948532605176, "learning_rate": 7.073527589120559e-07, "logits/chosen": 11.280040740966797, "logits/rejected": 12.080421447753906, "logps/chosen": -3.402548313140869, "logps/rejected": -3.8914685249328613, "loss": 4.0265, "rewards/accuracies": 1.0, "rewards/chosen": -34.025482177734375, "rewards/margins": 4.889200687408447, "rewards/rejected": -38.91468048095703, "step": 2196 }, { "epoch": 0.2991557734204793, "grad_norm": 44.00948049516043, "learning_rate": 7.072310360131598e-07, "logits/chosen": 11.798778533935547, "logits/rejected": 11.499767303466797, "logps/chosen": -3.4325618743896484, "logps/rejected": -3.895385503768921, "loss": 4.2318, "rewards/accuracies": 0.75, "rewards/chosen": -34.325618743896484, "rewards/margins": 4.628234386444092, "rewards/rejected": -38.953853607177734, "step": 2197 }, { "epoch": 0.29929193899782136, "grad_norm": 40.06223235973965, "learning_rate": 7.071092436928715e-07, "logits/chosen": 12.412121772766113, "logits/rejected": 12.20956039428711, "logps/chosen": -3.4616427421569824, "logps/rejected": -3.4017891883850098, "loss": 3.9426, "rewards/accuracies": 0.5, "rewards/chosen": -34.61642837524414, "rewards/margins": -0.5985345840454102, "rewards/rejected": -34.01789474487305, "step": 2198 }, { "epoch": 0.2994281045751634, "grad_norm": 39.44540269924119, "learning_rate": 7.069873819787111e-07, "logits/chosen": 11.61970329284668, "logits/rejected": 11.955142974853516, "logps/chosen": -3.3911499977111816, "logps/rejected": -3.478672504425049, "loss": 3.8412, "rewards/accuracies": 0.5, "rewards/chosen": -33.9114990234375, "rewards/margins": 0.8752269744873047, "rewards/rejected": -34.78672790527344, "step": 2199 }, { "epoch": 0.2995642701525055, "grad_norm": 42.35905578391073, "learning_rate": 7.068654508982142e-07, "logits/chosen": 10.453097343444824, "logits/rejected": 11.375041007995605, "logps/chosen": -3.2559995651245117, "logps/rejected": -3.5750510692596436, "loss": 3.9797, "rewards/accuracies": 1.0, "rewards/chosen": -32.55999755859375, "rewards/margins": 3.190516471862793, "rewards/rejected": -35.750511169433594, "step": 2200 }, { "epoch": 0.2997004357298475, "grad_norm": 42.92899351136541, "learning_rate": 7.06743450478932e-07, "logits/chosen": 10.94963264465332, "logits/rejected": 11.890162467956543, "logps/chosen": -3.2915334701538086, "logps/rejected": -3.239027500152588, "loss": 4.0784, "rewards/accuracies": 0.5, "rewards/chosen": -32.91533279418945, "rewards/margins": -0.5250568389892578, "rewards/rejected": -32.39027404785156, "step": 2201 }, { "epoch": 0.29983660130718953, "grad_norm": 42.76463374766119, "learning_rate": 7.066213807484315e-07, "logits/chosen": 11.66266918182373, "logits/rejected": 11.749606132507324, "logps/chosen": -3.47049617767334, "logps/rejected": -3.713291645050049, "loss": 3.7856, "rewards/accuracies": 0.5, "rewards/chosen": -34.70496368408203, "rewards/margins": 2.427957057952881, "rewards/rejected": -37.13291931152344, "step": 2202 }, { "epoch": 0.2999727668845316, "grad_norm": 45.09626620972899, "learning_rate": 7.064992417342956e-07, "logits/chosen": 12.149323463439941, "logits/rejected": 12.51561164855957, "logps/chosen": -3.46895694732666, "logps/rejected": -3.589179277420044, "loss": 3.453, "rewards/accuracies": 0.5, "rewards/chosen": -34.68956756591797, "rewards/margins": 1.202225685119629, "rewards/rejected": -35.89179229736328, "step": 2203 }, { "epoch": 0.30010893246187365, "grad_norm": 46.816663602030836, "learning_rate": 7.063770334641224e-07, "logits/chosen": 11.554815292358398, "logits/rejected": 12.148208618164062, "logps/chosen": -3.505800247192383, "logps/rejected": -3.3677139282226562, "loss": 4.5418, "rewards/accuracies": 0.25, "rewards/chosen": -35.05800247192383, "rewards/margins": -1.3808622360229492, "rewards/rejected": -33.67713928222656, "step": 2204 }, { "epoch": 0.3002450980392157, "grad_norm": 45.56320960636055, "learning_rate": 7.062547559655261e-07, "logits/chosen": 11.36461067199707, "logits/rejected": 10.775622367858887, "logps/chosen": -3.2348294258117676, "logps/rejected": -3.192451000213623, "loss": 4.6715, "rewards/accuracies": 0.5, "rewards/chosen": -32.34829330444336, "rewards/margins": -0.4237818717956543, "rewards/rejected": -31.92451286315918, "step": 2205 }, { "epoch": 0.30038126361655776, "grad_norm": 39.346956332456486, "learning_rate": 7.06132409266136e-07, "logits/chosen": 11.478675842285156, "logits/rejected": 11.810663223266602, "logps/chosen": -2.9614124298095703, "logps/rejected": -3.3363757133483887, "loss": 4.3338, "rewards/accuracies": 1.0, "rewards/chosen": -29.614124298095703, "rewards/margins": 3.749631881713867, "rewards/rejected": -33.3637580871582, "step": 2206 }, { "epoch": 0.3005174291938998, "grad_norm": 39.28567217482046, "learning_rate": 7.060099933935976e-07, "logits/chosen": 11.773592948913574, "logits/rejected": 12.003703117370605, "logps/chosen": -3.355090856552124, "logps/rejected": -3.2988622188568115, "loss": 4.1466, "rewards/accuracies": 0.25, "rewards/chosen": -33.55091094970703, "rewards/margins": -0.562288761138916, "rewards/rejected": -32.98862075805664, "step": 2207 }, { "epoch": 0.3006535947712418, "grad_norm": 40.105047995041, "learning_rate": 7.058875083755718e-07, "logits/chosen": 11.218460083007812, "logits/rejected": 11.442258834838867, "logps/chosen": -3.411764621734619, "logps/rejected": -3.2937374114990234, "loss": 4.0095, "rewards/accuracies": 0.25, "rewards/chosen": -34.117645263671875, "rewards/margins": -1.180269718170166, "rewards/rejected": -32.937374114990234, "step": 2208 }, { "epoch": 0.3007897603485839, "grad_norm": 42.25963836216733, "learning_rate": 7.057649542397348e-07, "logits/chosen": 11.798910140991211, "logits/rejected": 11.874853134155273, "logps/chosen": -3.342560291290283, "logps/rejected": -3.188232898712158, "loss": 4.2217, "rewards/accuracies": 0.5, "rewards/chosen": -33.42560577392578, "rewards/margins": -1.54327392578125, "rewards/rejected": -31.8823299407959, "step": 2209 }, { "epoch": 0.30092592592592593, "grad_norm": 43.29803900617655, "learning_rate": 7.05642331013779e-07, "logits/chosen": 10.796884536743164, "logits/rejected": 11.922369003295898, "logps/chosen": -3.0951926708221436, "logps/rejected": -3.438579559326172, "loss": 3.5048, "rewards/accuracies": 1.0, "rewards/chosen": -30.951927185058594, "rewards/margins": 3.433870792388916, "rewards/rejected": -34.385799407958984, "step": 2210 }, { "epoch": 0.30106209150326796, "grad_norm": 37.941878073368656, "learning_rate": 7.055196387254119e-07, "logits/chosen": 10.0645751953125, "logits/rejected": 11.750747680664062, "logps/chosen": -2.845913887023926, "logps/rejected": -3.645974636077881, "loss": 3.8687, "rewards/accuracies": 1.0, "rewards/chosen": -28.459138870239258, "rewards/margins": 8.000606536865234, "rewards/rejected": -36.459747314453125, "step": 2211 }, { "epoch": 0.30119825708061004, "grad_norm": 37.493270209496266, "learning_rate": 7.053968774023571e-07, "logits/chosen": 10.9259033203125, "logits/rejected": 9.976348876953125, "logps/chosen": -3.1243093013763428, "logps/rejected": -3.0038235187530518, "loss": 3.7992, "rewards/accuracies": 0.25, "rewards/chosen": -31.243091583251953, "rewards/margins": -1.204857349395752, "rewards/rejected": -30.03823471069336, "step": 2212 }, { "epoch": 0.3013344226579521, "grad_norm": 44.36322837679535, "learning_rate": 7.052740470723535e-07, "logits/chosen": 9.754515647888184, "logits/rejected": 11.07333755493164, "logps/chosen": -3.018313407897949, "logps/rejected": -3.336205005645752, "loss": 4.0318, "rewards/accuracies": 0.75, "rewards/chosen": -30.183134078979492, "rewards/margins": 3.1789135932922363, "rewards/rejected": -33.3620491027832, "step": 2213 }, { "epoch": 0.3014705882352941, "grad_norm": 38.04918785398683, "learning_rate": 7.051511477631554e-07, "logits/chosen": 10.971918106079102, "logits/rejected": 10.857603073120117, "logps/chosen": -3.040834426879883, "logps/rejected": -3.0400755405426025, "loss": 4.0482, "rewards/accuracies": 0.5, "rewards/chosen": -30.408344268798828, "rewards/margins": -0.007588863372802734, "rewards/rejected": -30.400754928588867, "step": 2214 }, { "epoch": 0.3016067538126362, "grad_norm": 36.43740035624278, "learning_rate": 7.050281795025331e-07, "logits/chosen": 10.404448509216309, "logits/rejected": 12.00681209564209, "logps/chosen": -2.908475875854492, "logps/rejected": -3.4270875453948975, "loss": 3.6574, "rewards/accuracies": 1.0, "rewards/chosen": -29.084758758544922, "rewards/margins": 5.186115741729736, "rewards/rejected": -34.2708740234375, "step": 2215 }, { "epoch": 0.3017429193899782, "grad_norm": 43.068546040905346, "learning_rate": 7.049051423182721e-07, "logits/chosen": 10.324483871459961, "logits/rejected": 10.329492568969727, "logps/chosen": -3.3465659618377686, "logps/rejected": -3.3760085105895996, "loss": 4.1599, "rewards/accuracies": 0.25, "rewards/chosen": -33.465660095214844, "rewards/margins": 0.29442739486694336, "rewards/rejected": -33.76008605957031, "step": 2216 }, { "epoch": 0.30187908496732024, "grad_norm": 44.0958038291632, "learning_rate": 7.04782036238174e-07, "logits/chosen": 12.169591903686523, "logits/rejected": 11.181795120239258, "logps/chosen": -3.458894968032837, "logps/rejected": -3.475180149078369, "loss": 4.0798, "rewards/accuracies": 0.5, "rewards/chosen": -34.588951110839844, "rewards/margins": 0.16285037994384766, "rewards/rejected": -34.751800537109375, "step": 2217 }, { "epoch": 0.30201525054466233, "grad_norm": 43.10898962041292, "learning_rate": 7.046588612900555e-07, "logits/chosen": 12.021172523498535, "logits/rejected": 12.142984390258789, "logps/chosen": -3.5055088996887207, "logps/rejected": -3.6208622455596924, "loss": 4.4635, "rewards/accuracies": 0.5, "rewards/chosen": -35.055091857910156, "rewards/margins": 1.1535329818725586, "rewards/rejected": -36.208621978759766, "step": 2218 }, { "epoch": 0.30215141612200436, "grad_norm": 40.1090545356557, "learning_rate": 7.045356175017489e-07, "logits/chosen": 11.534444808959961, "logits/rejected": 11.41024398803711, "logps/chosen": -3.272155284881592, "logps/rejected": -3.2342007160186768, "loss": 4.3872, "rewards/accuracies": 0.5, "rewards/chosen": -32.721553802490234, "rewards/margins": -0.3795456886291504, "rewards/rejected": -32.34200668334961, "step": 2219 }, { "epoch": 0.3022875816993464, "grad_norm": 42.98831653655742, "learning_rate": 7.044123049011022e-07, "logits/chosen": 12.298223495483398, "logits/rejected": 11.900169372558594, "logps/chosen": -3.6238455772399902, "logps/rejected": -3.9571642875671387, "loss": 3.8159, "rewards/accuracies": 0.5, "rewards/chosen": -36.23845672607422, "rewards/margins": 3.3331851959228516, "rewards/rejected": -39.57164001464844, "step": 2220 }, { "epoch": 0.30242374727668847, "grad_norm": 38.16702246308936, "learning_rate": 7.042889235159789e-07, "logits/chosen": 10.84185791015625, "logits/rejected": 12.475227355957031, "logps/chosen": -3.25357723236084, "logps/rejected": -3.624072790145874, "loss": 4.1082, "rewards/accuracies": 0.75, "rewards/chosen": -32.535770416259766, "rewards/margins": 3.7049560546875, "rewards/rejected": -36.240726470947266, "step": 2221 }, { "epoch": 0.3025599128540305, "grad_norm": 43.138519700013504, "learning_rate": 7.041654733742581e-07, "logits/chosen": 11.794241905212402, "logits/rejected": 12.113641738891602, "logps/chosen": -3.2665295600891113, "logps/rejected": -3.7497942447662354, "loss": 3.7275, "rewards/accuracies": 1.0, "rewards/chosen": -32.6652946472168, "rewards/margins": 4.83264684677124, "rewards/rejected": -37.49794006347656, "step": 2222 }, { "epoch": 0.30269607843137253, "grad_norm": 41.20015969357148, "learning_rate": 7.040419545038344e-07, "logits/chosen": 11.122289657592773, "logits/rejected": 11.474456787109375, "logps/chosen": -3.432112455368042, "logps/rejected": -3.412248134613037, "loss": 3.9522, "rewards/accuracies": 0.25, "rewards/chosen": -34.32112121582031, "rewards/margins": -0.19864177703857422, "rewards/rejected": -34.12248229980469, "step": 2223 }, { "epoch": 0.3028322440087146, "grad_norm": 44.968688867343616, "learning_rate": 7.039183669326175e-07, "logits/chosen": 12.099332809448242, "logits/rejected": 11.430289268493652, "logps/chosen": -3.4024949073791504, "logps/rejected": -3.546409845352173, "loss": 4.753, "rewards/accuracies": 0.5, "rewards/chosen": -34.02494812011719, "rewards/margins": 1.4391484260559082, "rewards/rejected": -35.46409606933594, "step": 2224 }, { "epoch": 0.30296840958605664, "grad_norm": 41.69703535614305, "learning_rate": 7.037947106885336e-07, "logits/chosen": 11.758533477783203, "logits/rejected": 11.666925430297852, "logps/chosen": -3.213287115097046, "logps/rejected": -3.4297986030578613, "loss": 3.9342, "rewards/accuracies": 0.75, "rewards/chosen": -32.13287353515625, "rewards/margins": 2.165113925933838, "rewards/rejected": -34.29798126220703, "step": 2225 }, { "epoch": 0.30310457516339867, "grad_norm": 47.604576429718996, "learning_rate": 7.036709857995237e-07, "logits/chosen": 10.765058517456055, "logits/rejected": 12.361915588378906, "logps/chosen": -3.319671154022217, "logps/rejected": -3.71844482421875, "loss": 3.8364, "rewards/accuracies": 0.75, "rewards/chosen": -33.19670867919922, "rewards/margins": 3.9877371788024902, "rewards/rejected": -37.1844482421875, "step": 2226 }, { "epoch": 0.30324074074074076, "grad_norm": 40.48198290571867, "learning_rate": 7.035471922935445e-07, "logits/chosen": 11.674215316772461, "logits/rejected": 11.210480690002441, "logps/chosen": -3.26155424118042, "logps/rejected": -3.0193097591400146, "loss": 4.2771, "rewards/accuracies": 0.25, "rewards/chosen": -32.61553955078125, "rewards/margins": -2.42244291305542, "rewards/rejected": -30.193098068237305, "step": 2227 }, { "epoch": 0.3033769063180828, "grad_norm": 43.14379917799768, "learning_rate": 7.034233301985678e-07, "logits/chosen": 11.85120677947998, "logits/rejected": 11.481633186340332, "logps/chosen": -3.4553706645965576, "logps/rejected": -3.2574715614318848, "loss": 4.2869, "rewards/accuracies": 0.25, "rewards/chosen": -34.553707122802734, "rewards/margins": -1.9789905548095703, "rewards/rejected": -32.57471466064453, "step": 2228 }, { "epoch": 0.3035130718954248, "grad_norm": 44.79913408342048, "learning_rate": 7.032993995425815e-07, "logits/chosen": 10.921058654785156, "logits/rejected": 11.043956756591797, "logps/chosen": -3.37042498588562, "logps/rejected": -3.7878658771514893, "loss": 3.7667, "rewards/accuracies": 1.0, "rewards/chosen": -33.704246520996094, "rewards/margins": 4.174407958984375, "rewards/rejected": -37.87865447998047, "step": 2229 }, { "epoch": 0.3036492374727669, "grad_norm": 38.477002277271964, "learning_rate": 7.031754003535889e-07, "logits/chosen": 11.808006286621094, "logits/rejected": 12.971307754516602, "logps/chosen": -3.305497884750366, "logps/rejected": -3.7387473583221436, "loss": 3.9311, "rewards/accuracies": 1.0, "rewards/chosen": -33.05497741699219, "rewards/margins": 4.332496643066406, "rewards/rejected": -37.387474060058594, "step": 2230 }, { "epoch": 0.3037854030501089, "grad_norm": 39.88479084602674, "learning_rate": 7.030513326596085e-07, "logits/chosen": 11.382307052612305, "logits/rejected": 12.36121940612793, "logps/chosen": -3.3999741077423096, "logps/rejected": -3.5288970470428467, "loss": 4.0103, "rewards/accuracies": 0.5, "rewards/chosen": -33.99974060058594, "rewards/margins": 1.2892284393310547, "rewards/rejected": -35.288970947265625, "step": 2231 }, { "epoch": 0.30392156862745096, "grad_norm": 44.14010383856239, "learning_rate": 7.029271964886745e-07, "logits/chosen": 11.200643539428711, "logits/rejected": 10.87764835357666, "logps/chosen": -3.1968488693237305, "logps/rejected": -3.3509597778320312, "loss": 4.3739, "rewards/accuracies": 0.75, "rewards/chosen": -31.968488693237305, "rewards/margins": 1.5411100387573242, "rewards/rejected": -33.50959777832031, "step": 2232 }, { "epoch": 0.30405773420479304, "grad_norm": 43.34887152994324, "learning_rate": 7.028029918688364e-07, "logits/chosen": 11.203577041625977, "logits/rejected": 11.578418731689453, "logps/chosen": -3.577200412750244, "logps/rejected": -3.532623529434204, "loss": 4.1922, "rewards/accuracies": 0.25, "rewards/chosen": -35.772003173828125, "rewards/margins": -0.4457693099975586, "rewards/rejected": -35.32623291015625, "step": 2233 }, { "epoch": 0.30419389978213507, "grad_norm": 45.065465165340036, "learning_rate": 7.026787188281592e-07, "logits/chosen": 10.127873420715332, "logits/rejected": 11.48507022857666, "logps/chosen": -2.7268080711364746, "logps/rejected": -3.0325052738189697, "loss": 4.3829, "rewards/accuracies": 0.75, "rewards/chosen": -27.26807975769043, "rewards/margins": 3.056972026824951, "rewards/rejected": -30.32505226135254, "step": 2234 }, { "epoch": 0.3043300653594771, "grad_norm": 42.84435053914933, "learning_rate": 7.025543773947235e-07, "logits/chosen": 10.967060089111328, "logits/rejected": 11.441261291503906, "logps/chosen": -2.9617042541503906, "logps/rejected": -3.1872053146362305, "loss": 4.1037, "rewards/accuracies": 0.5, "rewards/chosen": -29.617042541503906, "rewards/margins": 2.2550101280212402, "rewards/rejected": -31.872053146362305, "step": 2235 }, { "epoch": 0.3044662309368192, "grad_norm": 44.511038698081514, "learning_rate": 7.024299675966255e-07, "logits/chosen": 11.258929252624512, "logits/rejected": 12.082538604736328, "logps/chosen": -3.43464994430542, "logps/rejected": -3.787750720977783, "loss": 4.2423, "rewards/accuracies": 1.0, "rewards/chosen": -34.346500396728516, "rewards/margins": 3.5310072898864746, "rewards/rejected": -37.87751007080078, "step": 2236 }, { "epoch": 0.3046023965141612, "grad_norm": 41.44968022110191, "learning_rate": 7.023054894619763e-07, "logits/chosen": 11.81938362121582, "logits/rejected": 11.507572174072266, "logps/chosen": -3.6734654903411865, "logps/rejected": -3.699580192565918, "loss": 4.5284, "rewards/accuracies": 0.5, "rewards/chosen": -36.734657287597656, "rewards/margins": 0.26114606857299805, "rewards/rejected": -36.99580001831055, "step": 2237 }, { "epoch": 0.3047385620915033, "grad_norm": 43.94488931066717, "learning_rate": 7.021809430189028e-07, "logits/chosen": 10.355047225952148, "logits/rejected": 10.84855842590332, "logps/chosen": -3.410794496536255, "logps/rejected": -3.6572265625, "loss": 3.4768, "rewards/accuracies": 1.0, "rewards/chosen": -34.10794448852539, "rewards/margins": 2.464322090148926, "rewards/rejected": -36.572265625, "step": 2238 }, { "epoch": 0.3048747276688453, "grad_norm": 40.969336156529344, "learning_rate": 7.020563282955474e-07, "logits/chosen": 11.590864181518555, "logits/rejected": 11.721116065979004, "logps/chosen": -3.7787957191467285, "logps/rejected": -3.4851794242858887, "loss": 4.6499, "rewards/accuracies": 0.25, "rewards/chosen": -37.78795623779297, "rewards/margins": -2.9361648559570312, "rewards/rejected": -34.85179138183594, "step": 2239 }, { "epoch": 0.30501089324618735, "grad_norm": 38.76646367467741, "learning_rate": 7.019316453200678e-07, "logits/chosen": 11.699911117553711, "logits/rejected": 11.029449462890625, "logps/chosen": -3.2596495151519775, "logps/rejected": -3.096848487854004, "loss": 4.0048, "rewards/accuracies": 0.5, "rewards/chosen": -32.59649658203125, "rewards/margins": -1.62800931930542, "rewards/rejected": -30.968486785888672, "step": 2240 }, { "epoch": 0.30514705882352944, "grad_norm": 39.7208232447834, "learning_rate": 7.018068941206372e-07, "logits/chosen": 11.892587661743164, "logits/rejected": 12.891881942749023, "logps/chosen": -3.5575149059295654, "logps/rejected": -3.761326551437378, "loss": 3.7733, "rewards/accuracies": 0.75, "rewards/chosen": -35.57514953613281, "rewards/margins": 2.038115978240967, "rewards/rejected": -37.61326599121094, "step": 2241 }, { "epoch": 0.30528322440087147, "grad_norm": 39.012684612520154, "learning_rate": 7.01682074725444e-07, "logits/chosen": 12.284144401550293, "logits/rejected": 11.455957412719727, "logps/chosen": -3.649069309234619, "logps/rejected": -3.549274444580078, "loss": 4.1666, "rewards/accuracies": 0.5, "rewards/chosen": -36.490692138671875, "rewards/margins": -0.997950553894043, "rewards/rejected": -35.49274444580078, "step": 2242 }, { "epoch": 0.3054193899782135, "grad_norm": 39.92271562562137, "learning_rate": 7.015571871626925e-07, "logits/chosen": 10.816485404968262, "logits/rejected": 11.687138557434082, "logps/chosen": -3.4414072036743164, "logps/rejected": -3.8875794410705566, "loss": 3.4776, "rewards/accuracies": 1.0, "rewards/chosen": -34.41407012939453, "rewards/margins": 4.461725234985352, "rewards/rejected": -38.87579345703125, "step": 2243 }, { "epoch": 0.3055555555555556, "grad_norm": 41.45903513000083, "learning_rate": 7.014322314606017e-07, "logits/chosen": 10.930960655212402, "logits/rejected": 11.029304504394531, "logps/chosen": -3.398651361465454, "logps/rejected": -3.3568758964538574, "loss": 3.7427, "rewards/accuracies": 0.5, "rewards/chosen": -33.986515045166016, "rewards/margins": -0.417755126953125, "rewards/rejected": -33.56875991821289, "step": 2244 }, { "epoch": 0.3056917211328976, "grad_norm": 40.61527134322448, "learning_rate": 7.013072076474065e-07, "logits/chosen": 12.399106979370117, "logits/rejected": 12.293024063110352, "logps/chosen": -3.9064691066741943, "logps/rejected": -4.052059173583984, "loss": 3.8968, "rewards/accuracies": 0.5, "rewards/chosen": -39.06468963623047, "rewards/margins": 1.4559011459350586, "rewards/rejected": -40.520591735839844, "step": 2245 }, { "epoch": 0.30582788671023964, "grad_norm": 42.14929247367481, "learning_rate": 7.011821157513572e-07, "logits/chosen": 11.85551929473877, "logits/rejected": 11.853687286376953, "logps/chosen": -3.553013563156128, "logps/rejected": -3.625584363937378, "loss": 4.1737, "rewards/accuracies": 0.75, "rewards/chosen": -35.53013610839844, "rewards/margins": 0.7257070541381836, "rewards/rejected": -36.25584411621094, "step": 2246 }, { "epoch": 0.3059640522875817, "grad_norm": 44.187192064182376, "learning_rate": 7.010569558007193e-07, "logits/chosen": 10.854665756225586, "logits/rejected": 11.887351989746094, "logps/chosen": -3.351332426071167, "logps/rejected": -3.6275744438171387, "loss": 4.0139, "rewards/accuracies": 0.75, "rewards/chosen": -33.51332473754883, "rewards/margins": 2.762418746948242, "rewards/rejected": -36.27574157714844, "step": 2247 }, { "epoch": 0.30610021786492375, "grad_norm": 40.797970483950216, "learning_rate": 7.009317278237735e-07, "logits/chosen": 12.10551929473877, "logits/rejected": 12.666051864624023, "logps/chosen": -3.59370756149292, "logps/rejected": -3.9134786128997803, "loss": 4.0695, "rewards/accuracies": 0.75, "rewards/chosen": -35.93707275390625, "rewards/margins": 3.197709560394287, "rewards/rejected": -39.13478469848633, "step": 2248 }, { "epoch": 0.3062363834422658, "grad_norm": 41.97176172122257, "learning_rate": 7.008064318488163e-07, "logits/chosen": 11.930152893066406, "logits/rejected": 12.275012969970703, "logps/chosen": -3.3842146396636963, "logps/rejected": -3.395012378692627, "loss": 3.8645, "rewards/accuracies": 0.5, "rewards/chosen": -33.84214782714844, "rewards/margins": 0.10797739028930664, "rewards/rejected": -33.95012283325195, "step": 2249 }, { "epoch": 0.30637254901960786, "grad_norm": 47.4711527819455, "learning_rate": 7.006810679041594e-07, "logits/chosen": 11.54547119140625, "logits/rejected": 12.403144836425781, "logps/chosen": -3.300914764404297, "logps/rejected": -3.734288454055786, "loss": 4.5141, "rewards/accuracies": 1.0, "rewards/chosen": -33.00914764404297, "rewards/margins": 4.333736896514893, "rewards/rejected": -37.34288787841797, "step": 2250 }, { "epoch": 0.3065087145969499, "grad_norm": 45.55929389395545, "learning_rate": 7.005556360181298e-07, "logits/chosen": 11.353856086730957, "logits/rejected": 11.537841796875, "logps/chosen": -3.4155709743499756, "logps/rejected": -3.7688000202178955, "loss": 4.2307, "rewards/accuracies": 0.75, "rewards/chosen": -34.15570831298828, "rewards/margins": 3.532289505004883, "rewards/rejected": -37.6879997253418, "step": 2251 }, { "epoch": 0.3066448801742919, "grad_norm": 41.32677208692552, "learning_rate": 7.004301362190698e-07, "logits/chosen": 11.266395568847656, "logits/rejected": 11.271211624145508, "logps/chosen": -3.489046812057495, "logps/rejected": -3.6633245944976807, "loss": 3.846, "rewards/accuracies": 0.75, "rewards/chosen": -34.89046859741211, "rewards/margins": 1.74277925491333, "rewards/rejected": -36.63324737548828, "step": 2252 }, { "epoch": 0.306781045751634, "grad_norm": 42.67176638944737, "learning_rate": 7.00304568535337e-07, "logits/chosen": 12.468703269958496, "logits/rejected": 10.966670989990234, "logps/chosen": -3.0363950729370117, "logps/rejected": -3.5772957801818848, "loss": 3.9337, "rewards/accuracies": 0.75, "rewards/chosen": -30.36394691467285, "rewards/margins": 5.4090118408203125, "rewards/rejected": -35.7729606628418, "step": 2253 }, { "epoch": 0.30691721132897604, "grad_norm": 44.83376399500748, "learning_rate": 7.001789329953048e-07, "logits/chosen": 11.273149490356445, "logits/rejected": 12.31314754486084, "logps/chosen": -3.30137300491333, "logps/rejected": -3.6994576454162598, "loss": 3.7218, "rewards/accuracies": 0.75, "rewards/chosen": -33.013729095458984, "rewards/margins": 3.9808483123779297, "rewards/rejected": -36.99457550048828, "step": 2254 }, { "epoch": 0.30705337690631807, "grad_norm": 50.26011283878689, "learning_rate": 7.000532296273612e-07, "logits/chosen": 11.406070709228516, "logits/rejected": 11.529441833496094, "logps/chosen": -3.3502769470214844, "logps/rejected": -4.026610374450684, "loss": 4.3561, "rewards/accuracies": 0.5, "rewards/chosen": -33.502769470214844, "rewards/margins": 6.763331413269043, "rewards/rejected": -40.2661018371582, "step": 2255 }, { "epoch": 0.30718954248366015, "grad_norm": 43.73701888457592, "learning_rate": 6.999274584599102e-07, "logits/chosen": 10.75625228881836, "logits/rejected": 11.077129364013672, "logps/chosen": -3.3004207611083984, "logps/rejected": -3.473236322402954, "loss": 4.1904, "rewards/accuracies": 0.5, "rewards/chosen": -33.004207611083984, "rewards/margins": 1.7281551361083984, "rewards/rejected": -34.732364654541016, "step": 2256 }, { "epoch": 0.3073257080610022, "grad_norm": 51.0692589184919, "learning_rate": 6.998016195213708e-07, "logits/chosen": 11.472497940063477, "logits/rejected": 11.845024108886719, "logps/chosen": -3.449608564376831, "logps/rejected": -3.673189640045166, "loss": 4.6091, "rewards/accuracies": 1.0, "rewards/chosen": -34.49608612060547, "rewards/margins": 2.235811710357666, "rewards/rejected": -36.731895446777344, "step": 2257 }, { "epoch": 0.3074618736383442, "grad_norm": 45.00337256649374, "learning_rate": 6.996757128401771e-07, "logits/chosen": 11.125436782836914, "logits/rejected": 11.624268531799316, "logps/chosen": -3.3395607471466064, "logps/rejected": -3.7065415382385254, "loss": 4.5346, "rewards/accuracies": 1.0, "rewards/chosen": -33.395606994628906, "rewards/margins": 3.6698074340820312, "rewards/rejected": -37.06541442871094, "step": 2258 }, { "epoch": 0.3075980392156863, "grad_norm": 44.553468954850516, "learning_rate": 6.995497384447791e-07, "logits/chosen": 11.756542205810547, "logits/rejected": 12.231535911560059, "logps/chosen": -3.583202362060547, "logps/rejected": -3.9779891967773438, "loss": 4.6611, "rewards/accuracies": 0.75, "rewards/chosen": -35.83202362060547, "rewards/margins": 3.947869300842285, "rewards/rejected": -39.7798957824707, "step": 2259 }, { "epoch": 0.3077342047930283, "grad_norm": 44.010174188710366, "learning_rate": 6.994236963636415e-07, "logits/chosen": 11.935039520263672, "logits/rejected": 11.285399436950684, "logps/chosen": -3.557860851287842, "logps/rejected": -3.6050872802734375, "loss": 4.1143, "rewards/accuracies": 0.5, "rewards/chosen": -35.578609466552734, "rewards/margins": 0.47226381301879883, "rewards/rejected": -36.050872802734375, "step": 2260 }, { "epoch": 0.30787037037037035, "grad_norm": 44.06557436655882, "learning_rate": 6.992975866252447e-07, "logits/chosen": 11.006881713867188, "logits/rejected": 11.431036949157715, "logps/chosen": -3.3809332847595215, "logps/rejected": -3.7042880058288574, "loss": 4.5656, "rewards/accuracies": 1.0, "rewards/chosen": -33.80933380126953, "rewards/margins": 3.2335448265075684, "rewards/rejected": -37.042877197265625, "step": 2261 }, { "epoch": 0.30800653594771243, "grad_norm": 43.82132010800348, "learning_rate": 6.991714092580842e-07, "logits/chosen": 10.823076248168945, "logits/rejected": 11.611648559570312, "logps/chosen": -3.416147470474243, "logps/rejected": -3.4946513175964355, "loss": 4.1786, "rewards/accuracies": 0.5, "rewards/chosen": -34.161476135253906, "rewards/margins": 0.7850379943847656, "rewards/rejected": -34.94651412963867, "step": 2262 }, { "epoch": 0.30814270152505446, "grad_norm": 42.993094318333696, "learning_rate": 6.990451642906708e-07, "logits/chosen": 11.864828109741211, "logits/rejected": 11.623956680297852, "logps/chosen": -3.437166213989258, "logps/rejected": -3.697451591491699, "loss": 4.5035, "rewards/accuracies": 1.0, "rewards/chosen": -34.37166213989258, "rewards/margins": 2.6028552055358887, "rewards/rejected": -36.974517822265625, "step": 2263 }, { "epoch": 0.3082788671023965, "grad_norm": 44.97597602990577, "learning_rate": 6.989188517515305e-07, "logits/chosen": 11.217645645141602, "logits/rejected": 11.293975830078125, "logps/chosen": -3.3013360500335693, "logps/rejected": -3.4179606437683105, "loss": 4.31, "rewards/accuracies": 0.5, "rewards/chosen": -33.01335906982422, "rewards/margins": 1.1662440299987793, "rewards/rejected": -34.179603576660156, "step": 2264 }, { "epoch": 0.3084150326797386, "grad_norm": 45.64371520214349, "learning_rate": 6.987924716692049e-07, "logits/chosen": 11.50311279296875, "logits/rejected": 10.804561614990234, "logps/chosen": -3.62668514251709, "logps/rejected": -3.3908963203430176, "loss": 4.2205, "rewards/accuracies": 0.5, "rewards/chosen": -36.26685333251953, "rewards/margins": -2.3578858375549316, "rewards/rejected": -33.908966064453125, "step": 2265 }, { "epoch": 0.3085511982570806, "grad_norm": 44.31906848167911, "learning_rate": 6.986660240722504e-07, "logits/chosen": 11.493392944335938, "logits/rejected": 11.402316093444824, "logps/chosen": -3.480926752090454, "logps/rejected": -3.404172420501709, "loss": 4.0349, "rewards/accuracies": 0.5, "rewards/chosen": -34.80926513671875, "rewards/margins": -0.7675442695617676, "rewards/rejected": -34.041725158691406, "step": 2266 }, { "epoch": 0.30868736383442263, "grad_norm": 39.70483745557178, "learning_rate": 6.985395089892391e-07, "logits/chosen": 11.076103210449219, "logits/rejected": 11.557138442993164, "logps/chosen": -3.324711799621582, "logps/rejected": -3.6125617027282715, "loss": 3.9277, "rewards/accuracies": 1.0, "rewards/chosen": -33.24711608886719, "rewards/margins": 2.8784966468811035, "rewards/rejected": -36.125614166259766, "step": 2267 }, { "epoch": 0.3088235294117647, "grad_norm": 43.06615673413248, "learning_rate": 6.984129264487578e-07, "logits/chosen": 11.001264572143555, "logits/rejected": 11.216363906860352, "logps/chosen": -3.492375612258911, "logps/rejected": -3.6507272720336914, "loss": 4.0121, "rewards/accuracies": 0.75, "rewards/chosen": -34.92375946044922, "rewards/margins": 1.5835165977478027, "rewards/rejected": -36.50727081298828, "step": 2268 }, { "epoch": 0.30895969498910675, "grad_norm": 43.16448823695723, "learning_rate": 6.982862764794091e-07, "logits/chosen": 9.963193893432617, "logits/rejected": 11.31355094909668, "logps/chosen": -3.3823494911193848, "logps/rejected": -3.6300604343414307, "loss": 3.8937, "rewards/accuracies": 0.5, "rewards/chosen": -33.82349395751953, "rewards/margins": 2.477112293243408, "rewards/rejected": -36.30060577392578, "step": 2269 }, { "epoch": 0.3090958605664488, "grad_norm": 42.54730749577177, "learning_rate": 6.981595591098106e-07, "logits/chosen": 11.343334197998047, "logits/rejected": 11.368042945861816, "logps/chosen": -3.5862996578216553, "logps/rejected": -3.6708030700683594, "loss": 3.6772, "rewards/accuracies": 0.5, "rewards/chosen": -35.86299514770508, "rewards/margins": 0.845034122467041, "rewards/rejected": -36.708030700683594, "step": 2270 }, { "epoch": 0.30923202614379086, "grad_norm": 40.09381225112101, "learning_rate": 6.980327743685951e-07, "logits/chosen": 9.599184036254883, "logits/rejected": 10.066062927246094, "logps/chosen": -3.284609317779541, "logps/rejected": -3.527657985687256, "loss": 3.6212, "rewards/accuracies": 0.5, "rewards/chosen": -32.846092224121094, "rewards/margins": 2.430487632751465, "rewards/rejected": -35.276580810546875, "step": 2271 }, { "epoch": 0.3093681917211329, "grad_norm": 38.150355428518935, "learning_rate": 6.979059222844107e-07, "logits/chosen": 12.107028007507324, "logits/rejected": 11.11212158203125, "logps/chosen": -3.5592164993286133, "logps/rejected": -3.437382459640503, "loss": 3.9397, "rewards/accuracies": 0.25, "rewards/chosen": -35.5921630859375, "rewards/margins": -1.218339443206787, "rewards/rejected": -34.37382507324219, "step": 2272 }, { "epoch": 0.3095043572984749, "grad_norm": 43.17203632380363, "learning_rate": 6.977790028859206e-07, "logits/chosen": 11.55173110961914, "logits/rejected": 10.822774887084961, "logps/chosen": -3.356204032897949, "logps/rejected": -3.5447912216186523, "loss": 4.1093, "rewards/accuracies": 1.0, "rewards/chosen": -33.562042236328125, "rewards/margins": 1.8858699798583984, "rewards/rejected": -35.44791030883789, "step": 2273 }, { "epoch": 0.309640522875817, "grad_norm": 39.947172585619775, "learning_rate": 6.976520162018033e-07, "logits/chosen": 11.118194580078125, "logits/rejected": 11.304895401000977, "logps/chosen": -3.17710542678833, "logps/rejected": -3.3352413177490234, "loss": 4.2596, "rewards/accuracies": 0.5, "rewards/chosen": -31.771053314208984, "rewards/margins": 1.5813593864440918, "rewards/rejected": -33.352413177490234, "step": 2274 }, { "epoch": 0.30977668845315903, "grad_norm": 46.13188492302895, "learning_rate": 6.975249622607525e-07, "logits/chosen": 10.442989349365234, "logits/rejected": 10.932384490966797, "logps/chosen": -3.4531471729278564, "logps/rejected": -3.490044593811035, "loss": 3.9696, "rewards/accuracies": 0.5, "rewards/chosen": -34.531471252441406, "rewards/margins": 0.3689751625061035, "rewards/rejected": -34.90044403076172, "step": 2275 }, { "epoch": 0.3099128540305011, "grad_norm": 43.562251203918265, "learning_rate": 6.973978410914773e-07, "logits/chosen": 9.402719497680664, "logits/rejected": 10.044441223144531, "logps/chosen": -2.9010300636291504, "logps/rejected": -3.191235065460205, "loss": 4.259, "rewards/accuracies": 0.5, "rewards/chosen": -29.010299682617188, "rewards/margins": 2.9020495414733887, "rewards/rejected": -31.912349700927734, "step": 2276 }, { "epoch": 0.31004901960784315, "grad_norm": 37.43481434362707, "learning_rate": 6.972706527227015e-07, "logits/chosen": 11.350761413574219, "logits/rejected": 11.712804794311523, "logps/chosen": -3.4104552268981934, "logps/rejected": -3.6606569290161133, "loss": 3.7977, "rewards/accuracies": 0.5, "rewards/chosen": -34.10455322265625, "rewards/margins": 2.5020151138305664, "rewards/rejected": -36.6065673828125, "step": 2277 }, { "epoch": 0.3101851851851852, "grad_norm": 46.904838717302404, "learning_rate": 6.971433971831644e-07, "logits/chosen": 11.267566680908203, "logits/rejected": 11.661576271057129, "logps/chosen": -3.232447862625122, "logps/rejected": -3.5657286643981934, "loss": 4.0272, "rewards/accuracies": 0.5, "rewards/chosen": -32.32447814941406, "rewards/margins": 3.3328075408935547, "rewards/rejected": -35.65728759765625, "step": 2278 }, { "epoch": 0.31032135076252726, "grad_norm": 39.64773955839634, "learning_rate": 6.970160745016205e-07, "logits/chosen": 10.626688003540039, "logits/rejected": 11.597478866577148, "logps/chosen": -3.249610185623169, "logps/rejected": -3.4963669776916504, "loss": 3.8241, "rewards/accuracies": 0.75, "rewards/chosen": -32.49610137939453, "rewards/margins": 2.4675683975219727, "rewards/rejected": -34.96366882324219, "step": 2279 }, { "epoch": 0.3104575163398693, "grad_norm": 39.86622830557821, "learning_rate": 6.968886847068394e-07, "logits/chosen": 9.936025619506836, "logits/rejected": 10.625967025756836, "logps/chosen": -3.0207443237304688, "logps/rejected": -3.3272464275360107, "loss": 4.2203, "rewards/accuracies": 0.75, "rewards/chosen": -30.207443237304688, "rewards/margins": 3.0650229454040527, "rewards/rejected": -33.272464752197266, "step": 2280 }, { "epoch": 0.3105936819172113, "grad_norm": 38.28645748999303, "learning_rate": 6.967612278276059e-07, "logits/chosen": 11.71755599975586, "logits/rejected": 11.0377197265625, "logps/chosen": -3.103604793548584, "logps/rejected": -3.3339309692382812, "loss": 3.8517, "rewards/accuracies": 0.75, "rewards/chosen": -31.036046981811523, "rewards/margins": 2.3032612800598145, "rewards/rejected": -33.33930969238281, "step": 2281 }, { "epoch": 0.3107298474945534, "grad_norm": 45.22263085174592, "learning_rate": 6.9663370389272e-07, "logits/chosen": 9.483844757080078, "logits/rejected": 11.139551162719727, "logps/chosen": -2.8116865158081055, "logps/rejected": -3.343360424041748, "loss": 3.7933, "rewards/accuracies": 1.0, "rewards/chosen": -28.116863250732422, "rewards/margins": 5.316740989685059, "rewards/rejected": -33.43360137939453, "step": 2282 }, { "epoch": 0.31086601307189543, "grad_norm": 37.123973368187094, "learning_rate": 6.965061129309965e-07, "logits/chosen": 10.703886032104492, "logits/rejected": 11.918354034423828, "logps/chosen": -3.587939977645874, "logps/rejected": -3.987164258956909, "loss": 3.6191, "rewards/accuracies": 0.75, "rewards/chosen": -35.87940216064453, "rewards/margins": 3.992246627807617, "rewards/rejected": -39.87164306640625, "step": 2283 }, { "epoch": 0.31100217864923746, "grad_norm": 40.23518424057586, "learning_rate": 6.963784549712661e-07, "logits/chosen": 10.766088485717773, "logits/rejected": 11.766496658325195, "logps/chosen": -3.2864413261413574, "logps/rejected": -3.640856981277466, "loss": 3.7275, "rewards/accuracies": 1.0, "rewards/chosen": -32.864410400390625, "rewards/margins": 3.5441579818725586, "rewards/rejected": -36.4085693359375, "step": 2284 }, { "epoch": 0.31113834422657954, "grad_norm": 42.689226328044754, "learning_rate": 6.962507300423738e-07, "logits/chosen": 11.870327949523926, "logits/rejected": 11.426044464111328, "logps/chosen": -3.520704507827759, "logps/rejected": -3.5796995162963867, "loss": 3.6125, "rewards/accuracies": 0.5, "rewards/chosen": -35.20704650878906, "rewards/margins": 0.5899500846862793, "rewards/rejected": -35.7969970703125, "step": 2285 }, { "epoch": 0.3112745098039216, "grad_norm": 47.91225870906378, "learning_rate": 6.961229381731801e-07, "logits/chosen": 11.354001998901367, "logits/rejected": 10.88693618774414, "logps/chosen": -3.2393035888671875, "logps/rejected": -3.4472203254699707, "loss": 4.5454, "rewards/accuracies": 0.75, "rewards/chosen": -32.39303970336914, "rewards/margins": 2.0791659355163574, "rewards/rejected": -34.472206115722656, "step": 2286 }, { "epoch": 0.3114106753812636, "grad_norm": 44.11893454296747, "learning_rate": 6.959950793925608e-07, "logits/chosen": 10.80520248413086, "logits/rejected": 11.088960647583008, "logps/chosen": -3.475717067718506, "logps/rejected": -3.6794118881225586, "loss": 3.8094, "rewards/accuracies": 0.75, "rewards/chosen": -34.757171630859375, "rewards/margins": 2.036945343017578, "rewards/rejected": -36.79411697387695, "step": 2287 }, { "epoch": 0.3115468409586057, "grad_norm": 40.23026531436552, "learning_rate": 6.958671537294067e-07, "logits/chosen": 10.44041633605957, "logits/rejected": 12.233506202697754, "logps/chosen": -3.4359614849090576, "logps/rejected": -3.910839796066284, "loss": 3.8365, "rewards/accuracies": 1.0, "rewards/chosen": -34.359615325927734, "rewards/margins": 4.748783111572266, "rewards/rejected": -39.1083984375, "step": 2288 }, { "epoch": 0.3116830065359477, "grad_norm": 41.54530159375748, "learning_rate": 6.957391612126235e-07, "logits/chosen": 11.49532699584961, "logits/rejected": 11.631202697753906, "logps/chosen": -3.906235456466675, "logps/rejected": -3.796292543411255, "loss": 4.363, "rewards/accuracies": 0.5, "rewards/chosen": -39.062355041503906, "rewards/margins": -1.0994291305541992, "rewards/rejected": -37.96292495727539, "step": 2289 }, { "epoch": 0.31181917211328974, "grad_norm": 42.34900202281179, "learning_rate": 6.956111018711322e-07, "logits/chosen": 11.679570198059082, "logits/rejected": 11.545347213745117, "logps/chosen": -3.1949756145477295, "logps/rejected": -3.3109967708587646, "loss": 3.686, "rewards/accuracies": 0.5, "rewards/chosen": -31.949756622314453, "rewards/margins": 1.160210132598877, "rewards/rejected": -33.10996627807617, "step": 2290 }, { "epoch": 0.31195533769063183, "grad_norm": 46.70875245188932, "learning_rate": 6.954829757338689e-07, "logits/chosen": 10.973085403442383, "logits/rejected": 11.360173225402832, "logps/chosen": -3.241300106048584, "logps/rejected": -3.498516082763672, "loss": 3.9596, "rewards/accuracies": 0.5, "rewards/chosen": -32.413002014160156, "rewards/margins": 2.57216215133667, "rewards/rejected": -34.98516082763672, "step": 2291 }, { "epoch": 0.31209150326797386, "grad_norm": 49.70569618700192, "learning_rate": 6.953547828297847e-07, "logits/chosen": 11.106077194213867, "logits/rejected": 10.956823348999023, "logps/chosen": -3.4583206176757812, "logps/rejected": -3.591766357421875, "loss": 4.5879, "rewards/accuracies": 0.5, "rewards/chosen": -34.58320617675781, "rewards/margins": 1.3344578742980957, "rewards/rejected": -35.91766357421875, "step": 2292 }, { "epoch": 0.3122276688453159, "grad_norm": 41.405702212450265, "learning_rate": 6.95226523187846e-07, "logits/chosen": 10.404648780822754, "logits/rejected": 11.42236042022705, "logps/chosen": -3.0098137855529785, "logps/rejected": -3.7804486751556396, "loss": 3.8449, "rewards/accuracies": 1.0, "rewards/chosen": -30.098140716552734, "rewards/margins": 7.706346035003662, "rewards/rejected": -37.80448532104492, "step": 2293 }, { "epoch": 0.31236383442265797, "grad_norm": 44.825383806708565, "learning_rate": 6.950981968370339e-07, "logits/chosen": 11.546395301818848, "logits/rejected": 11.838850021362305, "logps/chosen": -3.743117570877075, "logps/rejected": -4.159497261047363, "loss": 3.7697, "rewards/accuracies": 0.5, "rewards/chosen": -37.431175231933594, "rewards/margins": 4.163797378540039, "rewards/rejected": -41.594974517822266, "step": 2294 }, { "epoch": 0.3125, "grad_norm": 41.85255003924569, "learning_rate": 6.94969803806345e-07, "logits/chosen": 10.189860343933105, "logits/rejected": 11.056144714355469, "logps/chosen": -3.3984169960021973, "logps/rejected": -3.442509174346924, "loss": 4.0565, "rewards/accuracies": 0.75, "rewards/chosen": -33.984169006347656, "rewards/margins": 0.4409217834472656, "rewards/rejected": -34.42509078979492, "step": 2295 }, { "epoch": 0.31263616557734203, "grad_norm": 47.70790585854515, "learning_rate": 6.948413441247906e-07, "logits/chosen": 12.12860107421875, "logits/rejected": 12.09918212890625, "logps/chosen": -3.2979791164398193, "logps/rejected": -3.5602967739105225, "loss": 4.1574, "rewards/accuracies": 0.75, "rewards/chosen": -32.97978973388672, "rewards/margins": 2.6231751441955566, "rewards/rejected": -35.60296630859375, "step": 2296 }, { "epoch": 0.3127723311546841, "grad_norm": 41.166493886287526, "learning_rate": 6.947128178213974e-07, "logits/chosen": 10.569414138793945, "logits/rejected": 10.965458869934082, "logps/chosen": -3.4819092750549316, "logps/rejected": -3.4316439628601074, "loss": 4.2048, "rewards/accuracies": 0.25, "rewards/chosen": -34.819091796875, "rewards/margins": -0.5026512145996094, "rewards/rejected": -34.31644058227539, "step": 2297 }, { "epoch": 0.31290849673202614, "grad_norm": 41.915898951319555, "learning_rate": 6.945842249252068e-07, "logits/chosen": 11.456920623779297, "logits/rejected": 11.613164901733398, "logps/chosen": -3.6055164337158203, "logps/rejected": -3.4273862838745117, "loss": 4.2493, "rewards/accuracies": 0.0, "rewards/chosen": -36.05516815185547, "rewards/margins": -1.7813019752502441, "rewards/rejected": -34.27386474609375, "step": 2298 }, { "epoch": 0.31304466230936817, "grad_norm": 42.40523595466666, "learning_rate": 6.944555654652756e-07, "logits/chosen": 10.403803825378418, "logits/rejected": 10.938528060913086, "logps/chosen": -3.6322100162506104, "logps/rejected": -3.976914167404175, "loss": 3.9541, "rewards/accuracies": 1.0, "rewards/chosen": -36.32209777832031, "rewards/margins": 3.4470415115356445, "rewards/rejected": -39.769142150878906, "step": 2299 }, { "epoch": 0.31318082788671026, "grad_norm": 45.115031156994654, "learning_rate": 6.943268394706754e-07, "logits/chosen": 9.742878913879395, "logits/rejected": 10.1004638671875, "logps/chosen": -2.976290225982666, "logps/rejected": -3.160391330718994, "loss": 4.6936, "rewards/accuracies": 0.75, "rewards/chosen": -29.76290512084961, "rewards/margins": 1.8410096168518066, "rewards/rejected": -31.603914260864258, "step": 2300 }, { "epoch": 0.3133169934640523, "grad_norm": 38.58251530119952, "learning_rate": 6.941980469704928e-07, "logits/chosen": 10.260574340820312, "logits/rejected": 10.709478378295898, "logps/chosen": -3.1251778602600098, "logps/rejected": -3.526427745819092, "loss": 3.5154, "rewards/accuracies": 0.75, "rewards/chosen": -31.251779556274414, "rewards/margins": 4.0124993324279785, "rewards/rejected": -35.264278411865234, "step": 2301 }, { "epoch": 0.3134531590413943, "grad_norm": 39.3912152114386, "learning_rate": 6.940691879938297e-07, "logits/chosen": 10.53492546081543, "logits/rejected": 11.510799407958984, "logps/chosen": -3.3895058631896973, "logps/rejected": -3.8983981609344482, "loss": 3.846, "rewards/accuracies": 1.0, "rewards/chosen": -33.895057678222656, "rewards/margins": 5.088922500610352, "rewards/rejected": -38.983978271484375, "step": 2302 }, { "epoch": 0.3135893246187364, "grad_norm": 37.787544953411484, "learning_rate": 6.939402625698027e-07, "logits/chosen": 11.223822593688965, "logits/rejected": 11.503158569335938, "logps/chosen": -3.4472885131835938, "logps/rejected": -3.5747876167297363, "loss": 3.5583, "rewards/accuracies": 0.75, "rewards/chosen": -34.47288513183594, "rewards/margins": 1.274991512298584, "rewards/rejected": -35.74787521362305, "step": 2303 }, { "epoch": 0.3137254901960784, "grad_norm": 39.60738714435242, "learning_rate": 6.938112707275437e-07, "logits/chosen": 10.609771728515625, "logits/rejected": 10.733989715576172, "logps/chosen": -3.3342299461364746, "logps/rejected": -3.436621904373169, "loss": 3.5651, "rewards/accuracies": 0.5, "rewards/chosen": -33.34230041503906, "rewards/margins": 1.0239219665527344, "rewards/rejected": -34.36621856689453, "step": 2304 }, { "epoch": 0.31386165577342046, "grad_norm": 40.66024182122422, "learning_rate": 6.936822124961994e-07, "logits/chosen": 10.606162071228027, "logits/rejected": 10.904239654541016, "logps/chosen": -3.183971881866455, "logps/rejected": -3.2097721099853516, "loss": 3.5093, "rewards/accuracies": 0.75, "rewards/chosen": -31.839717864990234, "rewards/margins": 0.25800180435180664, "rewards/rejected": -32.097721099853516, "step": 2305 }, { "epoch": 0.31399782135076254, "grad_norm": 39.99937800704626, "learning_rate": 6.935530879049317e-07, "logits/chosen": 10.417257308959961, "logits/rejected": 10.886758804321289, "logps/chosen": -3.2220921516418457, "logps/rejected": -3.9729089736938477, "loss": 3.5506, "rewards/accuracies": 1.0, "rewards/chosen": -32.220924377441406, "rewards/margins": 7.508167266845703, "rewards/rejected": -39.729087829589844, "step": 2306 }, { "epoch": 0.31413398692810457, "grad_norm": 44.79925845192639, "learning_rate": 6.93423896982917e-07, "logits/chosen": 10.067211151123047, "logits/rejected": 10.734917640686035, "logps/chosen": -3.1478066444396973, "logps/rejected": -3.2337684631347656, "loss": 4.9144, "rewards/accuracies": 0.5, "rewards/chosen": -31.47806739807129, "rewards/margins": 0.859616756439209, "rewards/rejected": -32.337684631347656, "step": 2307 }, { "epoch": 0.3142701525054466, "grad_norm": 41.71474453779803, "learning_rate": 6.932946397593475e-07, "logits/chosen": 10.0125093460083, "logits/rejected": 10.493181228637695, "logps/chosen": -3.1431169509887695, "logps/rejected": -3.359297752380371, "loss": 4.7044, "rewards/accuracies": 0.75, "rewards/chosen": -31.431169509887695, "rewards/margins": 2.161807060241699, "rewards/rejected": -33.592979431152344, "step": 2308 }, { "epoch": 0.3144063180827887, "grad_norm": 44.44847496380986, "learning_rate": 6.931653162634296e-07, "logits/chosen": 10.11424732208252, "logits/rejected": 10.849357604980469, "logps/chosen": -3.4535584449768066, "logps/rejected": -3.588343620300293, "loss": 3.9438, "rewards/accuracies": 0.75, "rewards/chosen": -34.53558349609375, "rewards/margins": 1.347851276397705, "rewards/rejected": -35.8834342956543, "step": 2309 }, { "epoch": 0.3145424836601307, "grad_norm": 40.48168560117965, "learning_rate": 6.930359265243853e-07, "logits/chosen": 10.457841873168945, "logits/rejected": 9.582741737365723, "logps/chosen": -3.105956792831421, "logps/rejected": -3.090733051300049, "loss": 4.8159, "rewards/accuracies": 0.25, "rewards/chosen": -31.059568405151367, "rewards/margins": -0.1522364616394043, "rewards/rejected": -30.907331466674805, "step": 2310 }, { "epoch": 0.31467864923747274, "grad_norm": 41.05299664502008, "learning_rate": 6.929064705714511e-07, "logits/chosen": 9.77215576171875, "logits/rejected": 9.737981796264648, "logps/chosen": -3.206878662109375, "logps/rejected": -3.53328013420105, "loss": 4.4742, "rewards/accuracies": 0.75, "rewards/chosen": -32.06878662109375, "rewards/margins": 3.264014720916748, "rewards/rejected": -35.332801818847656, "step": 2311 }, { "epoch": 0.3148148148148148, "grad_norm": 38.7698810267979, "learning_rate": 6.927769484338787e-07, "logits/chosen": 10.421945571899414, "logits/rejected": 10.32179069519043, "logps/chosen": -2.786407947540283, "logps/rejected": -3.331523895263672, "loss": 3.7627, "rewards/accuracies": 1.0, "rewards/chosen": -27.86408042907715, "rewards/margins": 5.45115852355957, "rewards/rejected": -33.31523895263672, "step": 2312 }, { "epoch": 0.31495098039215685, "grad_norm": 39.96214362540618, "learning_rate": 6.926473601409346e-07, "logits/chosen": 11.312337875366211, "logits/rejected": 11.23133659362793, "logps/chosen": -3.4609172344207764, "logps/rejected": -3.5056614875793457, "loss": 4.083, "rewards/accuracies": 0.75, "rewards/chosen": -34.60917282104492, "rewards/margins": 0.44744205474853516, "rewards/rejected": -35.05661392211914, "step": 2313 }, { "epoch": 0.3150871459694989, "grad_norm": 38.49873739874739, "learning_rate": 6.925177057219006e-07, "logits/chosen": 11.863322257995605, "logits/rejected": 10.857198715209961, "logps/chosen": -3.587153434753418, "logps/rejected": -3.239605188369751, "loss": 3.8069, "rewards/accuracies": 0.25, "rewards/chosen": -35.87153244018555, "rewards/margins": -3.4754815101623535, "rewards/rejected": -32.39604949951172, "step": 2314 }, { "epoch": 0.31522331154684097, "grad_norm": 39.46098267102925, "learning_rate": 6.923879852060729e-07, "logits/chosen": 10.897855758666992, "logits/rejected": 11.51303482055664, "logps/chosen": -3.1284079551696777, "logps/rejected": -3.254249095916748, "loss": 4.1959, "rewards/accuracies": 0.75, "rewards/chosen": -31.284076690673828, "rewards/margins": 1.2584123611450195, "rewards/rejected": -32.54248809814453, "step": 2315 }, { "epoch": 0.315359477124183, "grad_norm": 38.46395637813852, "learning_rate": 6.92258198622763e-07, "logits/chosen": 9.482101440429688, "logits/rejected": 10.501955032348633, "logps/chosen": -3.3747544288635254, "logps/rejected": -3.6064870357513428, "loss": 4.3018, "rewards/accuracies": 1.0, "rewards/chosen": -33.74754333496094, "rewards/margins": 2.3173274993896484, "rewards/rejected": -36.06487274169922, "step": 2316 }, { "epoch": 0.3154956427015251, "grad_norm": 37.703637066702015, "learning_rate": 6.921283460012974e-07, "logits/chosen": 10.590078353881836, "logits/rejected": 12.174808502197266, "logps/chosen": -3.459131956100464, "logps/rejected": -3.6862401962280273, "loss": 3.7387, "rewards/accuracies": 0.75, "rewards/chosen": -34.59131622314453, "rewards/margins": 2.271082878112793, "rewards/rejected": -36.862403869628906, "step": 2317 }, { "epoch": 0.3156318082788671, "grad_norm": 39.99242304098481, "learning_rate": 6.919984273710172e-07, "logits/chosen": 11.581854820251465, "logits/rejected": 11.067626953125, "logps/chosen": -3.1421613693237305, "logps/rejected": -3.2477238178253174, "loss": 4.1213, "rewards/accuracies": 0.75, "rewards/chosen": -31.421615600585938, "rewards/margins": 1.0556244850158691, "rewards/rejected": -32.47724151611328, "step": 2318 }, { "epoch": 0.31576797385620914, "grad_norm": 45.195968868294756, "learning_rate": 6.918684427612787e-07, "logits/chosen": 10.184133529663086, "logits/rejected": 10.593152046203613, "logps/chosen": -3.4102535247802734, "logps/rejected": -3.497100830078125, "loss": 4.0102, "rewards/accuracies": 0.75, "rewards/chosen": -34.1025390625, "rewards/margins": 0.8684711456298828, "rewards/rejected": -34.97100830078125, "step": 2319 }, { "epoch": 0.3159041394335512, "grad_norm": 39.36603043127723, "learning_rate": 6.917383922014527e-07, "logits/chosen": 9.745162963867188, "logits/rejected": 10.993203163146973, "logps/chosen": -3.13421893119812, "logps/rejected": -3.502133369445801, "loss": 4.1968, "rewards/accuracies": 0.75, "rewards/chosen": -31.34218978881836, "rewards/margins": 3.679142475128174, "rewards/rejected": -35.021331787109375, "step": 2320 }, { "epoch": 0.31604030501089325, "grad_norm": 40.11621389052736, "learning_rate": 6.916082757209258e-07, "logits/chosen": 11.363396644592285, "logits/rejected": 10.939159393310547, "logps/chosen": -3.3126041889190674, "logps/rejected": -3.5756633281707764, "loss": 4.0453, "rewards/accuracies": 0.5, "rewards/chosen": -33.126041412353516, "rewards/margins": 2.630591869354248, "rewards/rejected": -35.75663375854492, "step": 2321 }, { "epoch": 0.3161764705882353, "grad_norm": 38.89062600489645, "learning_rate": 6.914780933490984e-07, "logits/chosen": 10.016443252563477, "logits/rejected": 9.663918495178223, "logps/chosen": -2.956878185272217, "logps/rejected": -3.2655274868011475, "loss": 3.7738, "rewards/accuracies": 0.75, "rewards/chosen": -29.568782806396484, "rewards/margins": 3.086491584777832, "rewards/rejected": -32.6552734375, "step": 2322 }, { "epoch": 0.31631263616557737, "grad_norm": 41.28558624315879, "learning_rate": 6.913478451153864e-07, "logits/chosen": 11.935882568359375, "logits/rejected": 11.776313781738281, "logps/chosen": -3.5741043090820312, "logps/rejected": -3.6858010292053223, "loss": 4.4459, "rewards/accuracies": 0.5, "rewards/chosen": -35.74104309082031, "rewards/margins": 1.1169662475585938, "rewards/rejected": -36.858009338378906, "step": 2323 }, { "epoch": 0.3164488017429194, "grad_norm": 41.768550251148305, "learning_rate": 6.912175310492205e-07, "logits/chosen": 10.531024932861328, "logits/rejected": 11.723794937133789, "logps/chosen": -3.3910818099975586, "logps/rejected": -3.5855982303619385, "loss": 4.3403, "rewards/accuracies": 0.75, "rewards/chosen": -33.91082000732422, "rewards/margins": 1.9451656341552734, "rewards/rejected": -35.85598373413086, "step": 2324 }, { "epoch": 0.3165849673202614, "grad_norm": 39.796836504593685, "learning_rate": 6.910871511800462e-07, "logits/chosen": 9.716005325317383, "logits/rejected": 10.402896881103516, "logps/chosen": -2.8529326915740967, "logps/rejected": -3.144892930984497, "loss": 4.1667, "rewards/accuracies": 0.75, "rewards/chosen": -28.529327392578125, "rewards/margins": 2.919602870941162, "rewards/rejected": -31.448928833007812, "step": 2325 }, { "epoch": 0.3167211328976035, "grad_norm": 39.89501447955771, "learning_rate": 6.90956705537324e-07, "logits/chosen": 9.955850601196289, "logits/rejected": 9.987081527709961, "logps/chosen": -3.124558448791504, "logps/rejected": -3.0437393188476562, "loss": 4.0596, "rewards/accuracies": 0.5, "rewards/chosen": -31.245582580566406, "rewards/margins": -0.8081903457641602, "rewards/rejected": -30.437393188476562, "step": 2326 }, { "epoch": 0.31685729847494554, "grad_norm": 40.85269343911853, "learning_rate": 6.90826194150529e-07, "logits/chosen": 10.612957954406738, "logits/rejected": 11.805763244628906, "logps/chosen": -3.353206157684326, "logps/rejected": -3.644737958908081, "loss": 4.0603, "rewards/accuracies": 1.0, "rewards/chosen": -33.53205871582031, "rewards/margins": 2.915316581726074, "rewards/rejected": -36.4473762512207, "step": 2327 }, { "epoch": 0.31699346405228757, "grad_norm": 39.46442136819617, "learning_rate": 6.906956170491516e-07, "logits/chosen": 10.241668701171875, "logits/rejected": 10.951473236083984, "logps/chosen": -2.9994635581970215, "logps/rejected": -3.246098518371582, "loss": 3.6717, "rewards/accuracies": 1.0, "rewards/chosen": -29.99463653564453, "rewards/margins": 2.466348648071289, "rewards/rejected": -32.46098327636719, "step": 2328 }, { "epoch": 0.31712962962962965, "grad_norm": 41.89093128251974, "learning_rate": 6.905649742626966e-07, "logits/chosen": 10.341571807861328, "logits/rejected": 10.66937255859375, "logps/chosen": -3.2463877201080322, "logps/rejected": -3.632674217224121, "loss": 4.0461, "rewards/accuracies": 0.75, "rewards/chosen": -32.46387481689453, "rewards/margins": 3.862865924835205, "rewards/rejected": -36.32674026489258, "step": 2329 }, { "epoch": 0.3172657952069717, "grad_norm": 40.60486011365039, "learning_rate": 6.904342658206836e-07, "logits/chosen": 11.210859298706055, "logits/rejected": 11.533208847045898, "logps/chosen": -3.2934088706970215, "logps/rejected": -3.5498924255371094, "loss": 4.3132, "rewards/accuracies": 0.5, "rewards/chosen": -32.93408966064453, "rewards/margins": 2.5648345947265625, "rewards/rejected": -35.498924255371094, "step": 2330 }, { "epoch": 0.3174019607843137, "grad_norm": 39.6867120887699, "learning_rate": 6.903034917526478e-07, "logits/chosen": 10.82481575012207, "logits/rejected": 11.363116264343262, "logps/chosen": -3.1831002235412598, "logps/rejected": -3.5436620712280273, "loss": 4.0363, "rewards/accuracies": 0.75, "rewards/chosen": -31.83100128173828, "rewards/margins": 3.605618476867676, "rewards/rejected": -35.43661880493164, "step": 2331 }, { "epoch": 0.3175381263616558, "grad_norm": 37.279330074221406, "learning_rate": 6.901726520881382e-07, "logits/chosen": 10.366175651550293, "logits/rejected": 11.55254077911377, "logps/chosen": -3.2527947425842285, "logps/rejected": -3.9134950637817383, "loss": 3.6595, "rewards/accuracies": 1.0, "rewards/chosen": -32.52794647216797, "rewards/margins": 6.607001781463623, "rewards/rejected": -39.13494873046875, "step": 2332 }, { "epoch": 0.3176742919389978, "grad_norm": 39.366277572581964, "learning_rate": 6.900417468567193e-07, "logits/chosen": 10.739977836608887, "logits/rejected": 11.283428192138672, "logps/chosen": -3.1502203941345215, "logps/rejected": -3.4825544357299805, "loss": 3.9798, "rewards/accuracies": 1.0, "rewards/chosen": -31.5022029876709, "rewards/margins": 3.3233418464660645, "rewards/rejected": -34.82554626464844, "step": 2333 }, { "epoch": 0.31781045751633985, "grad_norm": 44.18051505741612, "learning_rate": 6.899107760879701e-07, "logits/chosen": 10.606242179870605, "logits/rejected": 11.522682189941406, "logps/chosen": -3.276050090789795, "logps/rejected": -3.6268811225891113, "loss": 4.2099, "rewards/accuracies": 0.75, "rewards/chosen": -32.760498046875, "rewards/margins": 3.5083117485046387, "rewards/rejected": -36.26881408691406, "step": 2334 }, { "epoch": 0.31794662309368193, "grad_norm": 40.08627404300547, "learning_rate": 6.897797398114847e-07, "logits/chosen": 12.337606430053711, "logits/rejected": 11.761277198791504, "logps/chosen": -3.0526914596557617, "logps/rejected": -3.227924346923828, "loss": 4.3553, "rewards/accuracies": 0.75, "rewards/chosen": -30.52691650390625, "rewards/margins": 1.752326488494873, "rewards/rejected": -32.27924346923828, "step": 2335 }, { "epoch": 0.31808278867102396, "grad_norm": 44.584896933379085, "learning_rate": 6.896486380568718e-07, "logits/chosen": 11.664896965026855, "logits/rejected": 11.548269271850586, "logps/chosen": -3.407991647720337, "logps/rejected": -3.4976937770843506, "loss": 4.0687, "rewards/accuracies": 0.25, "rewards/chosen": -34.079917907714844, "rewards/margins": 0.8970217704772949, "rewards/rejected": -34.97693634033203, "step": 2336 }, { "epoch": 0.318218954248366, "grad_norm": 42.19440790445765, "learning_rate": 6.895174708537548e-07, "logits/chosen": 11.10974407196045, "logits/rejected": 11.752776145935059, "logps/chosen": -3.2331676483154297, "logps/rejected": -3.501938581466675, "loss": 4.2928, "rewards/accuracies": 0.75, "rewards/chosen": -32.33167266845703, "rewards/margins": 2.687711238861084, "rewards/rejected": -35.019386291503906, "step": 2337 }, { "epoch": 0.3183551198257081, "grad_norm": 38.05728045022061, "learning_rate": 6.893862382317721e-07, "logits/chosen": 11.711198806762695, "logits/rejected": 12.280521392822266, "logps/chosen": -3.791278839111328, "logps/rejected": -3.8818910121917725, "loss": 3.6841, "rewards/accuracies": 0.5, "rewards/chosen": -37.91278839111328, "rewards/margins": 0.9061212539672852, "rewards/rejected": -38.81890869140625, "step": 2338 }, { "epoch": 0.3184912854030501, "grad_norm": 38.46098802077646, "learning_rate": 6.892549402205767e-07, "logits/chosen": 10.328153610229492, "logits/rejected": 9.522310256958008, "logps/chosen": -3.158191442489624, "logps/rejected": -3.110933542251587, "loss": 3.9889, "rewards/accuracies": 0.5, "rewards/chosen": -31.58191680908203, "rewards/margins": -0.4725794792175293, "rewards/rejected": -31.10933494567871, "step": 2339 }, { "epoch": 0.31862745098039214, "grad_norm": 42.806257198842836, "learning_rate": 6.891235768498367e-07, "logits/chosen": 12.203819274902344, "logits/rejected": 12.106572151184082, "logps/chosen": -3.9341683387756348, "logps/rejected": -3.8679728507995605, "loss": 3.7572, "rewards/accuracies": 0.75, "rewards/chosen": -39.34168243408203, "rewards/margins": -0.6619529724121094, "rewards/rejected": -38.67972946166992, "step": 2340 }, { "epoch": 0.3187636165577342, "grad_norm": 43.824359323659884, "learning_rate": 6.889921481492346e-07, "logits/chosen": 10.640384674072266, "logits/rejected": 11.168012619018555, "logps/chosen": -3.452500104904175, "logps/rejected": -3.70599365234375, "loss": 4.1644, "rewards/accuracies": 0.75, "rewards/chosen": -34.525001525878906, "rewards/margins": 2.5349349975585938, "rewards/rejected": -37.0599365234375, "step": 2341 }, { "epoch": 0.31889978213507625, "grad_norm": 41.01731325834775, "learning_rate": 6.888606541484677e-07, "logits/chosen": 11.02590560913086, "logits/rejected": 11.118907928466797, "logps/chosen": -3.3969974517822266, "logps/rejected": -3.483016014099121, "loss": 3.6748, "rewards/accuracies": 0.5, "rewards/chosen": -33.969974517822266, "rewards/margins": 0.8601846694946289, "rewards/rejected": -34.830162048339844, "step": 2342 }, { "epoch": 0.3190359477124183, "grad_norm": 38.134322405670794, "learning_rate": 6.887290948772482e-07, "logits/chosen": 11.003613471984863, "logits/rejected": 11.529211044311523, "logps/chosen": -3.156191349029541, "logps/rejected": -3.587721824645996, "loss": 3.6606, "rewards/accuracies": 1.0, "rewards/chosen": -31.561912536621094, "rewards/margins": 4.315305233001709, "rewards/rejected": -35.877220153808594, "step": 2343 }, { "epoch": 0.31917211328976036, "grad_norm": 39.67185208725668, "learning_rate": 6.885974703653032e-07, "logits/chosen": 10.545112609863281, "logits/rejected": 11.12525749206543, "logps/chosen": -3.547597646713257, "logps/rejected": -3.506850242614746, "loss": 3.9802, "rewards/accuracies": 0.25, "rewards/chosen": -35.475975036621094, "rewards/margins": -0.4074740409851074, "rewards/rejected": -35.06850051879883, "step": 2344 }, { "epoch": 0.3193082788671024, "grad_norm": 44.56288155798053, "learning_rate": 6.88465780642374e-07, "logits/chosen": 11.37762451171875, "logits/rejected": 12.820796966552734, "logps/chosen": -3.373072624206543, "logps/rejected": -4.090591907501221, "loss": 4.3452, "rewards/accuracies": 1.0, "rewards/chosen": -33.73072814941406, "rewards/margins": 7.1751933097839355, "rewards/rejected": -40.905921936035156, "step": 2345 }, { "epoch": 0.3194444444444444, "grad_norm": 41.13653687432481, "learning_rate": 6.883340257382174e-07, "logits/chosen": 11.923066139221191, "logits/rejected": 11.800232887268066, "logps/chosen": -3.449928045272827, "logps/rejected": -3.595125675201416, "loss": 4.115, "rewards/accuracies": 0.5, "rewards/chosen": -34.49928283691406, "rewards/margins": 1.4519762992858887, "rewards/rejected": -35.951255798339844, "step": 2346 }, { "epoch": 0.3195806100217865, "grad_norm": 39.133578392042644, "learning_rate": 6.882022056826041e-07, "logits/chosen": 11.006142616271973, "logits/rejected": 11.400239944458008, "logps/chosen": -3.649251937866211, "logps/rejected": -3.662264108657837, "loss": 4.1834, "rewards/accuracies": 0.5, "rewards/chosen": -36.49251937866211, "rewards/margins": 0.13012313842773438, "rewards/rejected": -36.622642517089844, "step": 2347 }, { "epoch": 0.31971677559912853, "grad_norm": 39.03551255708832, "learning_rate": 6.880703205053203e-07, "logits/chosen": 11.636302947998047, "logits/rejected": 11.421570777893066, "logps/chosen": -3.473548412322998, "logps/rejected": -3.6125805377960205, "loss": 4.0105, "rewards/accuracies": 0.5, "rewards/chosen": -34.7354850769043, "rewards/margins": 1.3903207778930664, "rewards/rejected": -36.12580490112305, "step": 2348 }, { "epoch": 0.31985294117647056, "grad_norm": 38.89157516611024, "learning_rate": 6.879383702361663e-07, "logits/chosen": 11.495723724365234, "logits/rejected": 11.385316848754883, "logps/chosen": -3.1763548851013184, "logps/rejected": -3.358358383178711, "loss": 3.7179, "rewards/accuracies": 1.0, "rewards/chosen": -31.763547897338867, "rewards/margins": 1.8200349807739258, "rewards/rejected": -33.583580017089844, "step": 2349 }, { "epoch": 0.31998910675381265, "grad_norm": 44.45683228531695, "learning_rate": 6.878063549049573e-07, "logits/chosen": 12.308921813964844, "logits/rejected": 12.050840377807617, "logps/chosen": -3.671795129776001, "logps/rejected": -3.485353469848633, "loss": 4.3982, "rewards/accuracies": 0.0, "rewards/chosen": -36.71794891357422, "rewards/margins": -1.8644137382507324, "rewards/rejected": -34.85353469848633, "step": 2350 }, { "epoch": 0.3201252723311547, "grad_norm": 42.70154880215521, "learning_rate": 6.876742745415235e-07, "logits/chosen": 11.723523139953613, "logits/rejected": 12.365583419799805, "logps/chosen": -3.793532609939575, "logps/rejected": -3.8375000953674316, "loss": 3.8495, "rewards/accuracies": 0.25, "rewards/chosen": -37.935325622558594, "rewards/margins": 0.43967533111572266, "rewards/rejected": -38.375, "step": 2351 }, { "epoch": 0.3202614379084967, "grad_norm": 42.907338316282285, "learning_rate": 6.875421291757094e-07, "logits/chosen": 11.52694320678711, "logits/rejected": 12.445865631103516, "logps/chosen": -3.6550467014312744, "logps/rejected": -3.7811477184295654, "loss": 4.1377, "rewards/accuracies": 0.5, "rewards/chosen": -36.55046844482422, "rewards/margins": 1.2610101699829102, "rewards/rejected": -37.81147766113281, "step": 2352 }, { "epoch": 0.3203976034858388, "grad_norm": 37.63413358971941, "learning_rate": 6.874099188373743e-07, "logits/chosen": 11.740222930908203, "logits/rejected": 11.919612884521484, "logps/chosen": -3.420077085494995, "logps/rejected": -3.6029226779937744, "loss": 3.7774, "rewards/accuracies": 0.5, "rewards/chosen": -34.20077133178711, "rewards/margins": 1.8284554481506348, "rewards/rejected": -36.02922821044922, "step": 2353 }, { "epoch": 0.3205337690631808, "grad_norm": 46.863096067412876, "learning_rate": 6.872776435563924e-07, "logits/chosen": 11.203953742980957, "logits/rejected": 12.119603157043457, "logps/chosen": -3.2843470573425293, "logps/rejected": -3.383439302444458, "loss": 5.0108, "rewards/accuracies": 0.5, "rewards/chosen": -32.843467712402344, "rewards/margins": 0.9909219741821289, "rewards/rejected": -33.83439254760742, "step": 2354 }, { "epoch": 0.3206699346405229, "grad_norm": 42.53282311756554, "learning_rate": 6.871453033626522e-07, "logits/chosen": 12.350500106811523, "logits/rejected": 11.851156234741211, "logps/chosen": -3.8432886600494385, "logps/rejected": -3.7559142112731934, "loss": 3.7885, "rewards/accuracies": 0.5, "rewards/chosen": -38.432884216308594, "rewards/margins": -0.8737449645996094, "rewards/rejected": -37.55914306640625, "step": 2355 }, { "epoch": 0.32080610021786493, "grad_norm": 41.870075188813274, "learning_rate": 6.870128982860573e-07, "logits/chosen": 10.89974594116211, "logits/rejected": 11.990875244140625, "logps/chosen": -3.1611461639404297, "logps/rejected": -3.6207144260406494, "loss": 4.1865, "rewards/accuracies": 1.0, "rewards/chosen": -31.61146354675293, "rewards/margins": 4.595681190490723, "rewards/rejected": -36.20714569091797, "step": 2356 }, { "epoch": 0.32094226579520696, "grad_norm": 43.392576593180124, "learning_rate": 6.868804283565254e-07, "logits/chosen": 11.003878593444824, "logits/rejected": 11.21590805053711, "logps/chosen": -3.499467134475708, "logps/rejected": -3.528179883956909, "loss": 4.5635, "rewards/accuracies": 0.5, "rewards/chosen": -34.99467086791992, "rewards/margins": 0.2871265411376953, "rewards/rejected": -35.28179931640625, "step": 2357 }, { "epoch": 0.32107843137254904, "grad_norm": 47.87064726854864, "learning_rate": 6.867478936039892e-07, "logits/chosen": 11.824474334716797, "logits/rejected": 11.646965026855469, "logps/chosen": -3.6511189937591553, "logps/rejected": -3.4681360721588135, "loss": 4.2482, "rewards/accuracies": 0.5, "rewards/chosen": -36.51118850708008, "rewards/margins": -1.8298263549804688, "rewards/rejected": -34.68136215209961, "step": 2358 }, { "epoch": 0.3212145969498911, "grad_norm": 39.695191665363765, "learning_rate": 6.866152940583964e-07, "logits/chosen": 11.372699737548828, "logits/rejected": 11.838645935058594, "logps/chosen": -3.6035208702087402, "logps/rejected": -3.6755125522613525, "loss": 3.7485, "rewards/accuracies": 0.75, "rewards/chosen": -36.03520584106445, "rewards/margins": 0.7199192047119141, "rewards/rejected": -36.755126953125, "step": 2359 }, { "epoch": 0.3213507625272331, "grad_norm": 39.0700999356888, "learning_rate": 6.864826297497086e-07, "logits/chosen": 11.225936889648438, "logits/rejected": 11.580270767211914, "logps/chosen": -3.646361827850342, "logps/rejected": -3.8061611652374268, "loss": 3.9515, "rewards/accuracies": 0.75, "rewards/chosen": -36.463619232177734, "rewards/margins": 1.5979938507080078, "rewards/rejected": -38.06161117553711, "step": 2360 }, { "epoch": 0.3214869281045752, "grad_norm": 41.749852074196816, "learning_rate": 6.863499007079026e-07, "logits/chosen": 10.6565580368042, "logits/rejected": 11.2509183883667, "logps/chosen": -3.1339235305786133, "logps/rejected": -3.4958057403564453, "loss": 3.7336, "rewards/accuracies": 1.0, "rewards/chosen": -31.3392333984375, "rewards/margins": 3.618823528289795, "rewards/rejected": -34.95805740356445, "step": 2361 }, { "epoch": 0.3216230936819172, "grad_norm": 41.40021038823849, "learning_rate": 6.862171069629695e-07, "logits/chosen": 12.642380714416504, "logits/rejected": 11.769697189331055, "logps/chosen": -3.6538150310516357, "logps/rejected": -3.5344882011413574, "loss": 4.3205, "rewards/accuracies": 0.25, "rewards/chosen": -36.538150787353516, "rewards/margins": -1.193270206451416, "rewards/rejected": -35.344879150390625, "step": 2362 }, { "epoch": 0.32175925925925924, "grad_norm": 37.606197400971325, "learning_rate": 6.860842485449153e-07, "logits/chosen": 11.735447883605957, "logits/rejected": 11.210132598876953, "logps/chosen": -3.5605950355529785, "logps/rejected": -3.7604284286499023, "loss": 4.3228, "rewards/accuracies": 0.75, "rewards/chosen": -35.60594940185547, "rewards/margins": 1.998335838317871, "rewards/rejected": -37.604286193847656, "step": 2363 }, { "epoch": 0.32189542483660133, "grad_norm": 39.863274731300606, "learning_rate": 6.859513254837601e-07, "logits/chosen": 11.80320930480957, "logits/rejected": 11.339590072631836, "logps/chosen": -3.4026312828063965, "logps/rejected": -3.3994603157043457, "loss": 3.7142, "rewards/accuracies": 0.5, "rewards/chosen": -34.026309967041016, "rewards/margins": -0.03170585632324219, "rewards/rejected": -33.994606018066406, "step": 2364 }, { "epoch": 0.32203159041394336, "grad_norm": 39.475222645915956, "learning_rate": 6.858183378095394e-07, "logits/chosen": 10.839672088623047, "logits/rejected": 11.416674613952637, "logps/chosen": -3.653787612915039, "logps/rejected": -3.5925300121307373, "loss": 3.7791, "rewards/accuracies": 0.5, "rewards/chosen": -36.537872314453125, "rewards/margins": -0.6125736236572266, "rewards/rejected": -35.92530059814453, "step": 2365 }, { "epoch": 0.3221677559912854, "grad_norm": 41.72439243015702, "learning_rate": 6.856852855523026e-07, "logits/chosen": 11.099271774291992, "logits/rejected": 11.426923751831055, "logps/chosen": -3.561774969100952, "logps/rejected": -3.4544575214385986, "loss": 4.0872, "rewards/accuracies": 0.5, "rewards/chosen": -35.61774826049805, "rewards/margins": -1.0731735229492188, "rewards/rejected": -34.544578552246094, "step": 2366 }, { "epoch": 0.32230392156862747, "grad_norm": 41.545973361838165, "learning_rate": 6.855521687421141e-07, "logits/chosen": 11.147321701049805, "logits/rejected": 11.528324127197266, "logps/chosen": -3.313966751098633, "logps/rejected": -3.4423563480377197, "loss": 4.6299, "rewards/accuracies": 0.5, "rewards/chosen": -33.13966751098633, "rewards/margins": 1.2838945388793945, "rewards/rejected": -34.42356491088867, "step": 2367 }, { "epoch": 0.3224400871459695, "grad_norm": 44.3663597824487, "learning_rate": 6.854189874090525e-07, "logits/chosen": 10.809209823608398, "logits/rejected": 11.088930130004883, "logps/chosen": -3.3180840015411377, "logps/rejected": -3.5796422958374023, "loss": 3.7348, "rewards/accuracies": 1.0, "rewards/chosen": -33.18083953857422, "rewards/margins": 2.615584373474121, "rewards/rejected": -35.796424865722656, "step": 2368 }, { "epoch": 0.32257625272331153, "grad_norm": 38.256646570277866, "learning_rate": 6.852857415832117e-07, "logits/chosen": 11.800439834594727, "logits/rejected": 11.408489227294922, "logps/chosen": -3.498725414276123, "logps/rejected": -3.604100465774536, "loss": 3.8326, "rewards/accuracies": 0.25, "rewards/chosen": -34.98725509643555, "rewards/margins": 1.0537505149841309, "rewards/rejected": -36.0410041809082, "step": 2369 }, { "epoch": 0.3227124183006536, "grad_norm": 40.75421734074943, "learning_rate": 6.851524312946992e-07, "logits/chosen": 11.005657196044922, "logits/rejected": 10.817557334899902, "logps/chosen": -3.252781867980957, "logps/rejected": -3.207268714904785, "loss": 3.8793, "rewards/accuracies": 0.5, "rewards/chosen": -32.52781677246094, "rewards/margins": -0.45513343811035156, "rewards/rejected": -32.07268524169922, "step": 2370 }, { "epoch": 0.32284858387799564, "grad_norm": 40.19993488010465, "learning_rate": 6.850190565736378e-07, "logits/chosen": 12.200525283813477, "logits/rejected": 11.305086135864258, "logps/chosen": -3.952782154083252, "logps/rejected": -3.564882278442383, "loss": 4.0149, "rewards/accuracies": 0.0, "rewards/chosen": -39.52782440185547, "rewards/margins": -3.878999710083008, "rewards/rejected": -35.64882278442383, "step": 2371 }, { "epoch": 0.32298474945533767, "grad_norm": 41.25406611512053, "learning_rate": 6.848856174501645e-07, "logits/chosen": 10.3936767578125, "logits/rejected": 11.305038452148438, "logps/chosen": -3.434608221054077, "logps/rejected": -3.673402786254883, "loss": 4.228, "rewards/accuracies": 1.0, "rewards/chosen": -34.3460807800293, "rewards/margins": 2.3879456520080566, "rewards/rejected": -36.73402786254883, "step": 2372 }, { "epoch": 0.32312091503267976, "grad_norm": 40.27515944065841, "learning_rate": 6.84752113954431e-07, "logits/chosen": 10.791790962219238, "logits/rejected": 11.071369171142578, "logps/chosen": -3.7389934062957764, "logps/rejected": -4.098642349243164, "loss": 3.4007, "rewards/accuracies": 0.75, "rewards/chosen": -37.38993453979492, "rewards/margins": 3.5964860916137695, "rewards/rejected": -40.986419677734375, "step": 2373 }, { "epoch": 0.3232570806100218, "grad_norm": 39.160914659144, "learning_rate": 6.846185461166036e-07, "logits/chosen": 11.150976181030273, "logits/rejected": 11.665474891662598, "logps/chosen": -3.345658779144287, "logps/rejected": -3.5224838256835938, "loss": 4.4187, "rewards/accuracies": 0.5, "rewards/chosen": -33.45658874511719, "rewards/margins": 1.7682528495788574, "rewards/rejected": -35.22483825683594, "step": 2374 }, { "epoch": 0.3233932461873638, "grad_norm": 40.991647626157174, "learning_rate": 6.844849139668632e-07, "logits/chosen": 13.01555347442627, "logits/rejected": 12.06711196899414, "logps/chosen": -3.5858218669891357, "logps/rejected": -3.8358922004699707, "loss": 4.0452, "rewards/accuracies": 0.75, "rewards/chosen": -35.85821533203125, "rewards/margins": 2.5007057189941406, "rewards/rejected": -38.358924865722656, "step": 2375 }, { "epoch": 0.3235294117647059, "grad_norm": 44.473756972848314, "learning_rate": 6.843512175354048e-07, "logits/chosen": 11.073738098144531, "logits/rejected": 11.490766525268555, "logps/chosen": -3.531489849090576, "logps/rejected": -3.7244701385498047, "loss": 4.7233, "rewards/accuracies": 0.5, "rewards/chosen": -35.31489562988281, "rewards/margins": 1.929800033569336, "rewards/rejected": -37.24469757080078, "step": 2376 }, { "epoch": 0.3236655773420479, "grad_norm": 45.81931603055669, "learning_rate": 6.842174568524382e-07, "logits/chosen": 11.172016143798828, "logits/rejected": 11.390512466430664, "logps/chosen": -3.4858877658843994, "logps/rejected": -3.6489763259887695, "loss": 4.5153, "rewards/accuracies": 0.75, "rewards/chosen": -34.85887908935547, "rewards/margins": 1.6308870315551758, "rewards/rejected": -36.48976516723633, "step": 2377 }, { "epoch": 0.32380174291938996, "grad_norm": 40.647593317221414, "learning_rate": 6.840836319481882e-07, "logits/chosen": 10.569877624511719, "logits/rejected": 10.849387168884277, "logps/chosen": -3.4901974201202393, "logps/rejected": -3.6262030601501465, "loss": 4.2219, "rewards/accuracies": 0.75, "rewards/chosen": -34.901973724365234, "rewards/margins": 1.3600564002990723, "rewards/rejected": -36.26203155517578, "step": 2378 }, { "epoch": 0.32393790849673204, "grad_norm": 45.79890075880451, "learning_rate": 6.839497428528931e-07, "logits/chosen": 11.386639595031738, "logits/rejected": 11.073934555053711, "logps/chosen": -3.4257330894470215, "logps/rejected": -3.383218288421631, "loss": 4.1699, "rewards/accuracies": 0.25, "rewards/chosen": -34.25733184814453, "rewards/margins": -0.42514801025390625, "rewards/rejected": -33.832183837890625, "step": 2379 }, { "epoch": 0.32407407407407407, "grad_norm": 43.91758267092829, "learning_rate": 6.838157895968064e-07, "logits/chosen": 10.645866394042969, "logits/rejected": 10.893595695495605, "logps/chosen": -3.213831901550293, "logps/rejected": -3.601292848587036, "loss": 4.1173, "rewards/accuracies": 0.75, "rewards/chosen": -32.13832092285156, "rewards/margins": 3.8746085166931152, "rewards/rejected": -36.0129280090332, "step": 2380 }, { "epoch": 0.3242102396514161, "grad_norm": 45.172747262737175, "learning_rate": 6.836817722101961e-07, "logits/chosen": 11.267715454101562, "logits/rejected": 10.342935562133789, "logps/chosen": -3.6648051738739014, "logps/rejected": -3.169821262359619, "loss": 4.5244, "rewards/accuracies": 0.25, "rewards/chosen": -36.64805221557617, "rewards/margins": -4.949839115142822, "rewards/rejected": -31.698213577270508, "step": 2381 }, { "epoch": 0.3243464052287582, "grad_norm": 40.21443815497393, "learning_rate": 6.835476907233443e-07, "logits/chosen": 11.150402069091797, "logits/rejected": 11.388385772705078, "logps/chosen": -3.33128023147583, "logps/rejected": -3.717470645904541, "loss": 4.1121, "rewards/accuracies": 1.0, "rewards/chosen": -33.312801361083984, "rewards/margins": 3.8619046211242676, "rewards/rejected": -37.174705505371094, "step": 2382 }, { "epoch": 0.3244825708061002, "grad_norm": 39.62037356097582, "learning_rate": 6.83413545166548e-07, "logits/chosen": 10.471717834472656, "logits/rejected": 10.688578605651855, "logps/chosen": -3.4417519569396973, "logps/rejected": -3.4122917652130127, "loss": 3.6512, "rewards/accuracies": 0.5, "rewards/chosen": -34.417518615722656, "rewards/margins": -0.2946000099182129, "rewards/rejected": -34.12291717529297, "step": 2383 }, { "epoch": 0.32461873638344224, "grad_norm": 46.96064186067611, "learning_rate": 6.832793355701184e-07, "logits/chosen": 11.730637550354004, "logits/rejected": 11.786170959472656, "logps/chosen": -3.276674270629883, "logps/rejected": -3.621347188949585, "loss": 4.1147, "rewards/accuracies": 0.75, "rewards/chosen": -32.76674270629883, "rewards/margins": 3.446728229522705, "rewards/rejected": -36.213470458984375, "step": 2384 }, { "epoch": 0.3247549019607843, "grad_norm": 40.88342841385815, "learning_rate": 6.831450619643815e-07, "logits/chosen": 11.37907600402832, "logits/rejected": 11.078346252441406, "logps/chosen": -3.398911237716675, "logps/rejected": -3.331390380859375, "loss": 4.401, "rewards/accuracies": 0.25, "rewards/chosen": -33.989112854003906, "rewards/margins": -0.675208568572998, "rewards/rejected": -33.31390380859375, "step": 2385 }, { "epoch": 0.32489106753812635, "grad_norm": 36.62908855292159, "learning_rate": 6.830107243796771e-07, "logits/chosen": 11.89653491973877, "logits/rejected": 12.425559043884277, "logps/chosen": -3.4735803604125977, "logps/rejected": -3.527564525604248, "loss": 4.0236, "rewards/accuracies": 0.5, "rewards/chosen": -34.73580551147461, "rewards/margins": 0.5398387908935547, "rewards/rejected": -35.27564239501953, "step": 2386 }, { "epoch": 0.3250272331154684, "grad_norm": 55.29148616633885, "learning_rate": 6.828763228463603e-07, "logits/chosen": 10.644601821899414, "logits/rejected": 11.49150276184082, "logps/chosen": -3.519564151763916, "logps/rejected": -3.623478651046753, "loss": 3.8214, "rewards/accuracies": 0.5, "rewards/chosen": -35.195640563964844, "rewards/margins": 1.0391440391540527, "rewards/rejected": -36.23478698730469, "step": 2387 }, { "epoch": 0.32516339869281047, "grad_norm": 43.037618534625715, "learning_rate": 6.827418573948001e-07, "logits/chosen": 11.734323501586914, "logits/rejected": 11.686302185058594, "logps/chosen": -3.4708166122436523, "logps/rejected": -3.7618017196655273, "loss": 3.9672, "rewards/accuracies": 1.0, "rewards/chosen": -34.708168029785156, "rewards/margins": 2.9098496437072754, "rewards/rejected": -37.61801528930664, "step": 2388 }, { "epoch": 0.3252995642701525, "grad_norm": 36.7110339917596, "learning_rate": 6.826073280553799e-07, "logits/chosen": 11.05903148651123, "logits/rejected": 12.011375427246094, "logps/chosen": -3.1069087982177734, "logps/rejected": -3.553614854812622, "loss": 4.4181, "rewards/accuracies": 1.0, "rewards/chosen": -31.069087982177734, "rewards/margins": 4.467058181762695, "rewards/rejected": -35.53614807128906, "step": 2389 }, { "epoch": 0.3254357298474945, "grad_norm": 39.20121305420789, "learning_rate": 6.824727348584981e-07, "logits/chosen": 11.186736106872559, "logits/rejected": 10.513071060180664, "logps/chosen": -3.3829402923583984, "logps/rejected": -3.396115303039551, "loss": 4.3273, "rewards/accuracies": 0.5, "rewards/chosen": -33.82940673828125, "rewards/margins": 0.13174819946289062, "rewards/rejected": -33.961151123046875, "step": 2390 }, { "epoch": 0.3255718954248366, "grad_norm": 36.283016943605574, "learning_rate": 6.823380778345667e-07, "logits/chosen": 11.74185562133789, "logits/rejected": 11.006836891174316, "logps/chosen": -3.4898595809936523, "logps/rejected": -3.274279832839966, "loss": 3.959, "rewards/accuracies": 0.0, "rewards/chosen": -34.89859390258789, "rewards/margins": -2.155795097351074, "rewards/rejected": -32.7427978515625, "step": 2391 }, { "epoch": 0.32570806100217864, "grad_norm": 45.103108804155276, "learning_rate": 6.822033570140129e-07, "logits/chosen": 11.039863586425781, "logits/rejected": 11.933906555175781, "logps/chosen": -3.354067802429199, "logps/rejected": -4.084918975830078, "loss": 4.3003, "rewards/accuracies": 0.75, "rewards/chosen": -33.540679931640625, "rewards/margins": 7.3085126876831055, "rewards/rejected": -40.84918975830078, "step": 2392 }, { "epoch": 0.3258442265795207, "grad_norm": 39.74384476164356, "learning_rate": 6.820685724272779e-07, "logits/chosen": 11.069385528564453, "logits/rejected": 11.771363258361816, "logps/chosen": -3.4625680446624756, "logps/rejected": -3.6833436489105225, "loss": 3.7974, "rewards/accuracies": 0.75, "rewards/chosen": -34.62567901611328, "rewards/margins": 2.2077560424804688, "rewards/rejected": -36.83343505859375, "step": 2393 }, { "epoch": 0.32598039215686275, "grad_norm": 37.41449216390149, "learning_rate": 6.819337241048172e-07, "logits/chosen": 10.486189842224121, "logits/rejected": 11.274019241333008, "logps/chosen": -3.2640793323516846, "logps/rejected": -3.8199055194854736, "loss": 3.7288, "rewards/accuracies": 1.0, "rewards/chosen": -32.64079284667969, "rewards/margins": 5.558260440826416, "rewards/rejected": -38.19905471801758, "step": 2394 }, { "epoch": 0.3261165577342048, "grad_norm": 41.801612963028774, "learning_rate": 6.817988120771012e-07, "logits/chosen": 11.167377471923828, "logits/rejected": 11.628398895263672, "logps/chosen": -3.8131179809570312, "logps/rejected": -3.6974282264709473, "loss": 4.0666, "rewards/accuracies": 0.25, "rewards/chosen": -38.13117980957031, "rewards/margins": -1.1568961143493652, "rewards/rejected": -36.974281311035156, "step": 2395 }, { "epoch": 0.32625272331154687, "grad_norm": 43.47756896166121, "learning_rate": 6.816638363746142e-07, "logits/chosen": 11.65481185913086, "logits/rejected": 11.451980590820312, "logps/chosen": -3.655179262161255, "logps/rejected": -3.81266713142395, "loss": 4.153, "rewards/accuracies": 0.5, "rewards/chosen": -36.551795959472656, "rewards/margins": 1.5748767852783203, "rewards/rejected": -38.126670837402344, "step": 2396 }, { "epoch": 0.3263888888888889, "grad_norm": 54.784664328159344, "learning_rate": 6.81528797027855e-07, "logits/chosen": 11.999448776245117, "logits/rejected": 12.342702865600586, "logps/chosen": -3.596691846847534, "logps/rejected": -3.7155158519744873, "loss": 3.532, "rewards/accuracies": 0.75, "rewards/chosen": -35.9669189453125, "rewards/margins": 1.1882424354553223, "rewards/rejected": -37.15515899658203, "step": 2397 }, { "epoch": 0.3265250544662309, "grad_norm": 52.95389097334412, "learning_rate": 6.81393694067337e-07, "logits/chosen": 11.849331855773926, "logits/rejected": 12.5577974319458, "logps/chosen": -3.676041841506958, "logps/rejected": -4.02791166305542, "loss": 4.2966, "rewards/accuracies": 0.75, "rewards/chosen": -36.76041793823242, "rewards/margins": 3.5186996459960938, "rewards/rejected": -40.27911376953125, "step": 2398 }, { "epoch": 0.326661220043573, "grad_norm": 41.1794982055665, "learning_rate": 6.81258527523588e-07, "logits/chosen": 11.77042007446289, "logits/rejected": 11.885879516601562, "logps/chosen": -3.6487646102905273, "logps/rejected": -3.73999285697937, "loss": 4.1805, "rewards/accuracies": 0.5, "rewards/chosen": -36.487648010253906, "rewards/margins": 0.9122810363769531, "rewards/rejected": -37.399925231933594, "step": 2399 }, { "epoch": 0.32679738562091504, "grad_norm": 42.69119496384913, "learning_rate": 6.811232974271496e-07, "logits/chosen": 11.422042846679688, "logits/rejected": 12.380301475524902, "logps/chosen": -3.413229465484619, "logps/rejected": -3.7116146087646484, "loss": 4.3024, "rewards/accuracies": 0.75, "rewards/chosen": -34.132293701171875, "rewards/margins": 2.983854293823242, "rewards/rejected": -37.116146087646484, "step": 2400 }, { "epoch": 0.32693355119825707, "grad_norm": 41.909105708971644, "learning_rate": 6.809880038085784e-07, "logits/chosen": 11.065110206604004, "logits/rejected": 11.516874313354492, "logps/chosen": -3.329744815826416, "logps/rejected": -3.734302520751953, "loss": 3.6331, "rewards/accuracies": 0.75, "rewards/chosen": -33.29745101928711, "rewards/margins": 4.045575141906738, "rewards/rejected": -37.34302520751953, "step": 2401 }, { "epoch": 0.32706971677559915, "grad_norm": 41.08160284611268, "learning_rate": 6.808526466984451e-07, "logits/chosen": 11.307229995727539, "logits/rejected": 10.976245880126953, "logps/chosen": -3.2609703540802, "logps/rejected": -3.1564674377441406, "loss": 4.3187, "rewards/accuracies": 0.25, "rewards/chosen": -32.609703063964844, "rewards/margins": -1.0450315475463867, "rewards/rejected": -31.564674377441406, "step": 2402 }, { "epoch": 0.3272058823529412, "grad_norm": 44.61703522276992, "learning_rate": 6.807172261273347e-07, "logits/chosen": 11.317743301391602, "logits/rejected": 11.332496643066406, "logps/chosen": -3.6312384605407715, "logps/rejected": -3.626765727996826, "loss": 3.9993, "rewards/accuracies": 0.5, "rewards/chosen": -36.31238555908203, "rewards/margins": -0.04472970962524414, "rewards/rejected": -36.26765823364258, "step": 2403 }, { "epoch": 0.3273420479302832, "grad_norm": 39.925768400242944, "learning_rate": 6.805817421258467e-07, "logits/chosen": 10.812847137451172, "logits/rejected": 11.557600021362305, "logps/chosen": -3.514835834503174, "logps/rejected": -3.7562711238861084, "loss": 3.7108, "rewards/accuracies": 1.0, "rewards/chosen": -35.14835739135742, "rewards/margins": 2.414351463317871, "rewards/rejected": -37.562713623046875, "step": 2404 }, { "epoch": 0.3274782135076253, "grad_norm": 38.71592605770375, "learning_rate": 6.804461947245947e-07, "logits/chosen": 11.032794952392578, "logits/rejected": 11.685798645019531, "logps/chosen": -3.7000837326049805, "logps/rejected": -3.9011690616607666, "loss": 3.3648, "rewards/accuracies": 0.75, "rewards/chosen": -37.00083923339844, "rewards/margins": 2.010854721069336, "rewards/rejected": -39.01169204711914, "step": 2405 }, { "epoch": 0.3276143790849673, "grad_norm": 41.62033489470501, "learning_rate": 6.803105839542068e-07, "logits/chosen": 11.964083671569824, "logits/rejected": 11.510446548461914, "logps/chosen": -3.4547767639160156, "logps/rejected": -3.541443347930908, "loss": 4.1978, "rewards/accuracies": 0.5, "rewards/chosen": -34.54776382446289, "rewards/margins": 0.8666682243347168, "rewards/rejected": -35.41443634033203, "step": 2406 }, { "epoch": 0.32775054466230935, "grad_norm": 43.05415842155588, "learning_rate": 6.801749098453253e-07, "logits/chosen": 11.682910919189453, "logits/rejected": 12.494060516357422, "logps/chosen": -3.180659532546997, "logps/rejected": -3.9538941383361816, "loss": 3.694, "rewards/accuracies": 1.0, "rewards/chosen": -31.806594848632812, "rewards/margins": 7.73234224319458, "rewards/rejected": -39.538936614990234, "step": 2407 }, { "epoch": 0.32788671023965144, "grad_norm": 40.32087182801884, "learning_rate": 6.800391724286072e-07, "logits/chosen": 11.04088020324707, "logits/rejected": 10.220317840576172, "logps/chosen": -3.6973838806152344, "logps/rejected": -3.3934507369995117, "loss": 4.1956, "rewards/accuracies": 0.25, "rewards/chosen": -36.973838806152344, "rewards/margins": -3.0393314361572266, "rewards/rejected": -33.93450927734375, "step": 2408 }, { "epoch": 0.32802287581699346, "grad_norm": 36.33489951804484, "learning_rate": 6.799033717347229e-07, "logits/chosen": 10.80783748626709, "logits/rejected": 11.344963073730469, "logps/chosen": -3.3935441970825195, "logps/rejected": -3.2950615882873535, "loss": 4.3832, "rewards/accuracies": 0.25, "rewards/chosen": -33.93544006347656, "rewards/margins": -0.9848260879516602, "rewards/rejected": -32.95061492919922, "step": 2409 }, { "epoch": 0.3281590413943355, "grad_norm": 39.1125580292447, "learning_rate": 6.797675077943583e-07, "logits/chosen": 11.298299789428711, "logits/rejected": 11.59166431427002, "logps/chosen": -3.4296011924743652, "logps/rejected": -3.6788363456726074, "loss": 3.8785, "rewards/accuracies": 0.75, "rewards/chosen": -34.29601287841797, "rewards/margins": 2.4923505783081055, "rewards/rejected": -36.788360595703125, "step": 2410 }, { "epoch": 0.3282952069716776, "grad_norm": 37.27022758435023, "learning_rate": 6.796315806382129e-07, "logits/chosen": 10.863136291503906, "logits/rejected": 11.790401458740234, "logps/chosen": -3.3864247798919678, "logps/rejected": -3.7751400470733643, "loss": 4.1302, "rewards/accuracies": 1.0, "rewards/chosen": -33.86425018310547, "rewards/margins": 3.8871517181396484, "rewards/rejected": -37.751399993896484, "step": 2411 }, { "epoch": 0.3284313725490196, "grad_norm": 42.505074695339594, "learning_rate": 6.794955902970001e-07, "logits/chosen": 11.619718551635742, "logits/rejected": 12.013072967529297, "logps/chosen": -3.3364460468292236, "logps/rejected": -3.824242115020752, "loss": 3.7708, "rewards/accuracies": 0.75, "rewards/chosen": -33.36445999145508, "rewards/margins": 4.8779616355896, "rewards/rejected": -38.24242401123047, "step": 2412 }, { "epoch": 0.32856753812636164, "grad_norm": 37.62486117289205, "learning_rate": 6.793595368014485e-07, "logits/chosen": 11.527307510375977, "logits/rejected": 10.752664566040039, "logps/chosen": -3.5805001258850098, "logps/rejected": -3.4181246757507324, "loss": 3.5917, "rewards/accuracies": 0.25, "rewards/chosen": -35.80500030517578, "rewards/margins": -1.6237540245056152, "rewards/rejected": -34.18124771118164, "step": 2413 }, { "epoch": 0.3287037037037037, "grad_norm": 39.253492350548676, "learning_rate": 6.792234201823003e-07, "logits/chosen": 10.907960891723633, "logits/rejected": 12.32911205291748, "logps/chosen": -2.997180700302124, "logps/rejected": -3.4472708702087402, "loss": 4.1877, "rewards/accuracies": 1.0, "rewards/chosen": -29.9718074798584, "rewards/margins": 4.5009026527404785, "rewards/rejected": -34.47270965576172, "step": 2414 }, { "epoch": 0.32883986928104575, "grad_norm": 37.43859063279034, "learning_rate": 6.790872404703122e-07, "logits/chosen": 11.02441692352295, "logits/rejected": 12.434043884277344, "logps/chosen": -3.4191970825195312, "logps/rejected": -3.7074766159057617, "loss": 3.7814, "rewards/accuracies": 0.75, "rewards/chosen": -34.19197082519531, "rewards/margins": 2.8827943801879883, "rewards/rejected": -37.07476806640625, "step": 2415 }, { "epoch": 0.3289760348583878, "grad_norm": 37.8049425011719, "learning_rate": 6.789509976962553e-07, "logits/chosen": 11.088939666748047, "logits/rejected": 11.495670318603516, "logps/chosen": -3.080509662628174, "logps/rejected": -3.45225191116333, "loss": 3.8112, "rewards/accuracies": 0.75, "rewards/chosen": -30.805099487304688, "rewards/margins": 3.7174205780029297, "rewards/rejected": -34.522518157958984, "step": 2416 }, { "epoch": 0.32911220043572986, "grad_norm": 41.83363759857361, "learning_rate": 6.788146918909144e-07, "logits/chosen": 10.798053741455078, "logits/rejected": 11.652963638305664, "logps/chosen": -3.169900417327881, "logps/rejected": -3.422208309173584, "loss": 4.911, "rewards/accuracies": 0.75, "rewards/chosen": -31.699003219604492, "rewards/margins": 2.5230793952941895, "rewards/rejected": -34.222084045410156, "step": 2417 }, { "epoch": 0.3292483660130719, "grad_norm": 38.847684679538, "learning_rate": 6.786783230850892e-07, "logits/chosen": 11.239402770996094, "logits/rejected": 11.22475814819336, "logps/chosen": -3.4145591259002686, "logps/rejected": -3.570024251937866, "loss": 4.2583, "rewards/accuracies": 0.5, "rewards/chosen": -34.145591735839844, "rewards/margins": 1.5546526908874512, "rewards/rejected": -35.70024108886719, "step": 2418 }, { "epoch": 0.3293845315904139, "grad_norm": 34.45008478994481, "learning_rate": 6.785418913095935e-07, "logits/chosen": 11.512414932250977, "logits/rejected": 11.412050247192383, "logps/chosen": -3.0098636150360107, "logps/rejected": -3.203023672103882, "loss": 3.9374, "rewards/accuracies": 0.75, "rewards/chosen": -30.0986328125, "rewards/margins": 1.9316020011901855, "rewards/rejected": -32.030235290527344, "step": 2419 }, { "epoch": 0.329520697167756, "grad_norm": 41.98474961403763, "learning_rate": 6.784053965952549e-07, "logits/chosen": 11.448389053344727, "logits/rejected": 11.822025299072266, "logps/chosen": -3.1152231693267822, "logps/rejected": -3.2579824924468994, "loss": 4.3957, "rewards/accuracies": 0.5, "rewards/chosen": -31.152233123779297, "rewards/margins": 1.4275932312011719, "rewards/rejected": -32.57982635498047, "step": 2420 }, { "epoch": 0.32965686274509803, "grad_norm": 39.79594774826345, "learning_rate": 6.782688389729156e-07, "logits/chosen": 11.34501838684082, "logits/rejected": 10.959856033325195, "logps/chosen": -3.3720340728759766, "logps/rejected": -2.9733481407165527, "loss": 3.8916, "rewards/accuracies": 0.5, "rewards/chosen": -33.7203369140625, "rewards/margins": -3.986858367919922, "rewards/rejected": -29.73348045349121, "step": 2421 }, { "epoch": 0.32979302832244006, "grad_norm": 37.14359635338397, "learning_rate": 6.781322184734319e-07, "logits/chosen": 10.762704849243164, "logits/rejected": 11.849250793457031, "logps/chosen": -2.916468620300293, "logps/rejected": -3.2745869159698486, "loss": 4.0636, "rewards/accuracies": 1.0, "rewards/chosen": -29.164684295654297, "rewards/margins": 3.5811848640441895, "rewards/rejected": -32.74586868286133, "step": 2422 }, { "epoch": 0.32992919389978215, "grad_norm": 35.62122736483658, "learning_rate": 6.779955351276746e-07, "logits/chosen": 11.762153625488281, "logits/rejected": 11.459575653076172, "logps/chosen": -3.689452648162842, "logps/rejected": -3.8060786724090576, "loss": 4.2925, "rewards/accuracies": 0.75, "rewards/chosen": -36.89452362060547, "rewards/margins": 1.1662616729736328, "rewards/rejected": -38.060787200927734, "step": 2423 }, { "epoch": 0.3300653594771242, "grad_norm": 37.02976351819052, "learning_rate": 6.77858788966528e-07, "logits/chosen": 11.535642623901367, "logits/rejected": 11.52219009399414, "logps/chosen": -3.2586305141448975, "logps/rejected": -3.3926706314086914, "loss": 4.2597, "rewards/accuracies": 0.5, "rewards/chosen": -32.5863037109375, "rewards/margins": 1.340400218963623, "rewards/rejected": -33.92670440673828, "step": 2424 }, { "epoch": 0.3302015250544662, "grad_norm": 35.745413711572766, "learning_rate": 6.777219800208913e-07, "logits/chosen": 11.90524673461914, "logits/rejected": 11.695975303649902, "logps/chosen": -3.1171205043792725, "logps/rejected": -3.370619297027588, "loss": 3.9398, "rewards/accuracies": 0.75, "rewards/chosen": -31.171207427978516, "rewards/margins": 2.5349860191345215, "rewards/rejected": -33.70619201660156, "step": 2425 }, { "epoch": 0.3303376906318083, "grad_norm": 37.91278931672474, "learning_rate": 6.775851083216773e-07, "logits/chosen": 11.943349838256836, "logits/rejected": 12.373053550720215, "logps/chosen": -3.1283750534057617, "logps/rejected": -3.2836196422576904, "loss": 4.0851, "rewards/accuracies": 0.75, "rewards/chosen": -31.283750534057617, "rewards/margins": 1.552445888519287, "rewards/rejected": -32.83619689941406, "step": 2426 }, { "epoch": 0.3304738562091503, "grad_norm": 43.096059309964026, "learning_rate": 6.774481738998138e-07, "logits/chosen": 10.865118026733398, "logits/rejected": 12.233558654785156, "logps/chosen": -3.254545211791992, "logps/rejected": -3.746340751647949, "loss": 4.0226, "rewards/accuracies": 1.0, "rewards/chosen": -32.54545593261719, "rewards/margins": 4.917953968048096, "rewards/rejected": -37.463409423828125, "step": 2427 }, { "epoch": 0.33061002178649235, "grad_norm": 42.31251599202115, "learning_rate": 6.77311176786242e-07, "logits/chosen": 10.984476089477539, "logits/rejected": 12.188848495483398, "logps/chosen": -3.36312198638916, "logps/rejected": -3.2444067001342773, "loss": 4.7778, "rewards/accuracies": 0.5, "rewards/chosen": -33.63121795654297, "rewards/margins": -1.187150478363037, "rewards/rejected": -32.444068908691406, "step": 2428 }, { "epoch": 0.33074618736383443, "grad_norm": 44.76137205181234, "learning_rate": 6.771741170119174e-07, "logits/chosen": 11.689233779907227, "logits/rejected": 11.600166320800781, "logps/chosen": -3.2417125701904297, "logps/rejected": -3.754450798034668, "loss": 2.9441, "rewards/accuracies": 1.0, "rewards/chosen": -32.41712188720703, "rewards/margins": 5.127381801605225, "rewards/rejected": -37.54450607299805, "step": 2429 }, { "epoch": 0.33088235294117646, "grad_norm": 37.970436115888326, "learning_rate": 6.7703699460781e-07, "logits/chosen": 11.827491760253906, "logits/rejected": 12.579876899719238, "logps/chosen": -3.331298351287842, "logps/rejected": -3.48344087600708, "loss": 4.0443, "rewards/accuracies": 0.75, "rewards/chosen": -33.31298065185547, "rewards/margins": 1.52142333984375, "rewards/rejected": -34.834407806396484, "step": 2430 }, { "epoch": 0.33101851851851855, "grad_norm": 42.25148966976065, "learning_rate": 6.768998096049037e-07, "logits/chosen": 11.685308456420898, "logits/rejected": 11.469346046447754, "logps/chosen": -3.695796251296997, "logps/rejected": -3.55118465423584, "loss": 4.1448, "rewards/accuracies": 0.25, "rewards/chosen": -36.95796203613281, "rewards/margins": -1.4461174011230469, "rewards/rejected": -35.51184844970703, "step": 2431 }, { "epoch": 0.3311546840958606, "grad_norm": 48.680561604064515, "learning_rate": 6.767625620341965e-07, "logits/chosen": 11.386587142944336, "logits/rejected": 11.858386993408203, "logps/chosen": -3.300225257873535, "logps/rejected": -3.8576624393463135, "loss": 4.3451, "rewards/accuracies": 0.75, "rewards/chosen": -33.002254486083984, "rewards/margins": 5.574370384216309, "rewards/rejected": -38.576622009277344, "step": 2432 }, { "epoch": 0.3312908496732026, "grad_norm": 44.306162945893796, "learning_rate": 6.766252519267005e-07, "logits/chosen": 12.122675895690918, "logits/rejected": 11.730619430541992, "logps/chosen": -3.2789077758789062, "logps/rejected": -3.2787880897521973, "loss": 4.8808, "rewards/accuracies": 0.75, "rewards/chosen": -32.7890739440918, "rewards/margins": -0.0011949539184570312, "rewards/rejected": -32.787879943847656, "step": 2433 }, { "epoch": 0.3314270152505447, "grad_norm": 38.097112851269365, "learning_rate": 6.764878793134425e-07, "logits/chosen": 11.103058815002441, "logits/rejected": 11.700325965881348, "logps/chosen": -3.4164743423461914, "logps/rejected": -3.730738401412964, "loss": 4.4005, "rewards/accuracies": 0.75, "rewards/chosen": -34.16474151611328, "rewards/margins": 3.142641067504883, "rewards/rejected": -37.3073844909668, "step": 2434 }, { "epoch": 0.3315631808278867, "grad_norm": 38.35467984567533, "learning_rate": 6.763504442254626e-07, "logits/chosen": 11.597888946533203, "logits/rejected": 11.427230834960938, "logps/chosen": -3.169839382171631, "logps/rejected": -3.398237466812134, "loss": 4.095, "rewards/accuracies": 0.5, "rewards/chosen": -31.698396682739258, "rewards/margins": 2.283979892730713, "rewards/rejected": -33.98237609863281, "step": 2435 }, { "epoch": 0.33169934640522875, "grad_norm": 38.322831810109754, "learning_rate": 6.762129466938153e-07, "logits/chosen": 12.430116653442383, "logits/rejected": 12.086925506591797, "logps/chosen": -3.788809061050415, "logps/rejected": -3.6203575134277344, "loss": 4.0643, "rewards/accuracies": 0.25, "rewards/chosen": -37.888092041015625, "rewards/margins": -1.6845149993896484, "rewards/rejected": -36.203575134277344, "step": 2436 }, { "epoch": 0.33183551198257083, "grad_norm": 42.805343124719066, "learning_rate": 6.760753867495698e-07, "logits/chosen": 12.175886154174805, "logits/rejected": 12.156757354736328, "logps/chosen": -3.712474822998047, "logps/rejected": -3.5928261280059814, "loss": 4.5806, "rewards/accuracies": 0.25, "rewards/chosen": -37.12474822998047, "rewards/margins": -1.1964874267578125, "rewards/rejected": -35.928260803222656, "step": 2437 }, { "epoch": 0.33197167755991286, "grad_norm": 48.01090412550645, "learning_rate": 6.759377644238083e-07, "logits/chosen": 11.842062950134277, "logits/rejected": 12.220512390136719, "logps/chosen": -3.438504695892334, "logps/rejected": -3.643217086791992, "loss": 3.9377, "rewards/accuracies": 0.75, "rewards/chosen": -34.385047912597656, "rewards/margins": 2.047125816345215, "rewards/rejected": -36.43217086791992, "step": 2438 }, { "epoch": 0.3321078431372549, "grad_norm": 40.067599006833206, "learning_rate": 6.758000797476283e-07, "logits/chosen": 11.30494499206543, "logits/rejected": 11.868200302124023, "logps/chosen": -3.172349452972412, "logps/rejected": -3.290342330932617, "loss": 3.9533, "rewards/accuracies": 1.0, "rewards/chosen": -31.723493576049805, "rewards/margins": 1.1799287796020508, "rewards/rejected": -32.90342330932617, "step": 2439 }, { "epoch": 0.332244008714597, "grad_norm": 54.490237317074076, "learning_rate": 6.756623327521403e-07, "logits/chosen": 11.560576438903809, "logits/rejected": 11.250187873840332, "logps/chosen": -3.5238256454467773, "logps/rejected": -3.477066993713379, "loss": 3.7243, "rewards/accuracies": 0.25, "rewards/chosen": -35.238258361816406, "rewards/margins": -0.4675865173339844, "rewards/rejected": -34.770668029785156, "step": 2440 }, { "epoch": 0.332380174291939, "grad_norm": 46.478295307180346, "learning_rate": 6.755245234684696e-07, "logits/chosen": 10.688556671142578, "logits/rejected": 11.698226928710938, "logps/chosen": -3.18792724609375, "logps/rejected": -3.2033848762512207, "loss": 4.3083, "rewards/accuracies": 0.5, "rewards/chosen": -31.8792724609375, "rewards/margins": 0.15457868576049805, "rewards/rejected": -32.033851623535156, "step": 2441 }, { "epoch": 0.33251633986928103, "grad_norm": 40.14630657755002, "learning_rate": 6.753866519277554e-07, "logits/chosen": 11.333212852478027, "logits/rejected": 10.897886276245117, "logps/chosen": -3.251479387283325, "logps/rejected": -3.457669496536255, "loss": 4.1897, "rewards/accuracies": 0.5, "rewards/chosen": -32.514793395996094, "rewards/margins": 2.061899185180664, "rewards/rejected": -34.57669448852539, "step": 2442 }, { "epoch": 0.3326525054466231, "grad_norm": 42.12052628944934, "learning_rate": 6.752487181611507e-07, "logits/chosen": 11.83597183227539, "logits/rejected": 12.560290336608887, "logps/chosen": -3.771916389465332, "logps/rejected": -3.9883809089660645, "loss": 3.4733, "rewards/accuracies": 0.75, "rewards/chosen": -37.71916580200195, "rewards/margins": 2.1646456718444824, "rewards/rejected": -39.883811950683594, "step": 2443 }, { "epoch": 0.33278867102396514, "grad_norm": 42.811826985428894, "learning_rate": 6.751107221998231e-07, "logits/chosen": 11.416937828063965, "logits/rejected": 11.570005416870117, "logps/chosen": -3.2827305793762207, "logps/rejected": -3.422605037689209, "loss": 3.5104, "rewards/accuracies": 0.25, "rewards/chosen": -32.82730484008789, "rewards/margins": 1.3987469673156738, "rewards/rejected": -34.226051330566406, "step": 2444 }, { "epoch": 0.3329248366013072, "grad_norm": 40.12909750256067, "learning_rate": 6.749726640749534e-07, "logits/chosen": 11.60281753540039, "logits/rejected": 11.828518867492676, "logps/chosen": -3.4329538345336914, "logps/rejected": -3.642449140548706, "loss": 3.5364, "rewards/accuracies": 0.75, "rewards/chosen": -34.32953643798828, "rewards/margins": 2.094954490661621, "rewards/rejected": -36.42449188232422, "step": 2445 }, { "epoch": 0.33306100217864926, "grad_norm": 39.95361674445754, "learning_rate": 6.748345438177375e-07, "logits/chosen": 11.141894340515137, "logits/rejected": 12.053014755249023, "logps/chosen": -3.548933982849121, "logps/rejected": -4.099447727203369, "loss": 4.4291, "rewards/accuracies": 1.0, "rewards/chosen": -35.489341735839844, "rewards/margins": 5.505140781402588, "rewards/rejected": -40.99448013305664, "step": 2446 }, { "epoch": 0.3331971677559913, "grad_norm": 43.339467404833165, "learning_rate": 6.746963614593846e-07, "logits/chosen": 11.313920974731445, "logits/rejected": 11.557199478149414, "logps/chosen": -3.1465747356414795, "logps/rejected": -3.462712049484253, "loss": 4.5405, "rewards/accuracies": 0.75, "rewards/chosen": -31.465747833251953, "rewards/margins": 3.161372184753418, "rewards/rejected": -34.62712097167969, "step": 2447 }, { "epoch": 0.3333333333333333, "grad_norm": 40.93113520944378, "learning_rate": 6.745581170311183e-07, "logits/chosen": 11.458028793334961, "logits/rejected": 11.459222793579102, "logps/chosen": -3.652294874191284, "logps/rejected": -3.4992480278015137, "loss": 4.4475, "rewards/accuracies": 0.5, "rewards/chosen": -36.52294921875, "rewards/margins": -1.530470848083496, "rewards/rejected": -34.99248123168945, "step": 2448 }, { "epoch": 0.3334694989106754, "grad_norm": 73.24162316407141, "learning_rate": 6.744198105641758e-07, "logits/chosen": 10.80964469909668, "logits/rejected": 10.868841171264648, "logps/chosen": -3.0845227241516113, "logps/rejected": -3.3862478733062744, "loss": 3.364, "rewards/accuracies": 0.75, "rewards/chosen": -30.84522819519043, "rewards/margins": 3.0172505378723145, "rewards/rejected": -33.86248016357422, "step": 2449 }, { "epoch": 0.33360566448801743, "grad_norm": 38.32287106752955, "learning_rate": 6.742814420898086e-07, "logits/chosen": 11.627120018005371, "logits/rejected": 11.53665542602539, "logps/chosen": -3.6982550621032715, "logps/rejected": -3.4313182830810547, "loss": 3.9914, "rewards/accuracies": 0.25, "rewards/chosen": -36.98255157470703, "rewards/margins": -2.669368267059326, "rewards/rejected": -34.31318283081055, "step": 2450 }, { "epoch": 0.33374183006535946, "grad_norm": 37.547963056351975, "learning_rate": 6.741430116392826e-07, "logits/chosen": 10.468429565429688, "logits/rejected": 11.530323028564453, "logps/chosen": -3.3722033500671387, "logps/rejected": -3.7495439052581787, "loss": 3.9928, "rewards/accuracies": 0.75, "rewards/chosen": -33.72203063964844, "rewards/margins": 3.773407459259033, "rewards/rejected": -37.49543762207031, "step": 2451 }, { "epoch": 0.33387799564270154, "grad_norm": 38.18155124615584, "learning_rate": 6.740045192438769e-07, "logits/chosen": 11.314773559570312, "logits/rejected": 12.376693725585938, "logps/chosen": -2.9476094245910645, "logps/rejected": -3.6814076900482178, "loss": 4.0587, "rewards/accuracies": 0.75, "rewards/chosen": -29.476093292236328, "rewards/margins": 7.33798360824585, "rewards/rejected": -36.81407928466797, "step": 2452 }, { "epoch": 0.33401416122004357, "grad_norm": 39.04770232076314, "learning_rate": 6.738659649348852e-07, "logits/chosen": 11.12932014465332, "logits/rejected": 11.618197441101074, "logps/chosen": -3.4345293045043945, "logps/rejected": -3.3646867275238037, "loss": 4.2648, "rewards/accuracies": 0.25, "rewards/chosen": -34.34529113769531, "rewards/margins": -0.6984267234802246, "rewards/rejected": -33.64686584472656, "step": 2453 }, { "epoch": 0.3341503267973856, "grad_norm": 41.15224577467889, "learning_rate": 6.737273487436148e-07, "logits/chosen": 11.660284996032715, "logits/rejected": 12.206746101379395, "logps/chosen": -3.4949111938476562, "logps/rejected": -3.805683135986328, "loss": 4.3516, "rewards/accuracies": 0.5, "rewards/chosen": -34.94911193847656, "rewards/margins": 3.1077189445495605, "rewards/rejected": -38.05683135986328, "step": 2454 }, { "epoch": 0.3342864923747277, "grad_norm": 40.059247232450794, "learning_rate": 6.735886707013874e-07, "logits/chosen": 10.80998706817627, "logits/rejected": 11.135387420654297, "logps/chosen": -3.2677688598632812, "logps/rejected": -3.3998470306396484, "loss": 3.6249, "rewards/accuracies": 0.5, "rewards/chosen": -32.67768859863281, "rewards/margins": 1.3207826614379883, "rewards/rejected": -33.99847412109375, "step": 2455 }, { "epoch": 0.3344226579520697, "grad_norm": 50.50708962952296, "learning_rate": 6.734499308395382e-07, "logits/chosen": 11.71220588684082, "logits/rejected": 10.719895362854004, "logps/chosen": -3.8593225479125977, "logps/rejected": -3.4954092502593994, "loss": 4.7428, "rewards/accuracies": 0.0, "rewards/chosen": -38.59322738647461, "rewards/margins": -3.6391334533691406, "rewards/rejected": -34.95409393310547, "step": 2456 }, { "epoch": 0.33455882352941174, "grad_norm": 37.131705140784604, "learning_rate": 6.733111291894168e-07, "logits/chosen": 11.432750701904297, "logits/rejected": 11.376855850219727, "logps/chosen": -3.3624086380004883, "logps/rejected": -3.4123337268829346, "loss": 4.0916, "rewards/accuracies": 0.5, "rewards/chosen": -33.62408447265625, "rewards/margins": 0.4992518424987793, "rewards/rejected": -34.12333679199219, "step": 2457 }, { "epoch": 0.3346949891067538, "grad_norm": 39.593618134492665, "learning_rate": 6.731722657823867e-07, "logits/chosen": 10.79188346862793, "logits/rejected": 10.407054901123047, "logps/chosen": -3.5359175205230713, "logps/rejected": -3.2545461654663086, "loss": 4.0078, "rewards/accuracies": 0.25, "rewards/chosen": -35.35917663574219, "rewards/margins": -2.8137130737304688, "rewards/rejected": -32.54545974731445, "step": 2458 }, { "epoch": 0.33483115468409586, "grad_norm": 39.50044796370003, "learning_rate": 6.73033340649825e-07, "logits/chosen": 11.038649559020996, "logits/rejected": 11.263190269470215, "logps/chosen": -3.1586568355560303, "logps/rejected": -3.5563290119171143, "loss": 3.6932, "rewards/accuracies": 1.0, "rewards/chosen": -31.58656883239746, "rewards/margins": 3.97672176361084, "rewards/rejected": -35.563289642333984, "step": 2459 }, { "epoch": 0.3349673202614379, "grad_norm": 36.68928180611318, "learning_rate": 6.728943538231231e-07, "logits/chosen": 10.950149536132812, "logits/rejected": 11.360689163208008, "logps/chosen": -3.350597858428955, "logps/rejected": -3.485647678375244, "loss": 3.815, "rewards/accuracies": 0.5, "rewards/chosen": -33.505977630615234, "rewards/margins": 1.3504981994628906, "rewards/rejected": -34.85647964477539, "step": 2460 }, { "epoch": 0.33510348583877997, "grad_norm": 40.380257645206285, "learning_rate": 6.727553053336861e-07, "logits/chosen": 11.376803398132324, "logits/rejected": 11.468786239624023, "logps/chosen": -3.5250303745269775, "logps/rejected": -3.5203537940979004, "loss": 4.1582, "rewards/accuracies": 0.5, "rewards/chosen": -35.25030517578125, "rewards/margins": -0.04676532745361328, "rewards/rejected": -35.20353698730469, "step": 2461 }, { "epoch": 0.335239651416122, "grad_norm": 40.29232161314545, "learning_rate": 6.726161952129334e-07, "logits/chosen": 11.484277725219727, "logits/rejected": 11.325215339660645, "logps/chosen": -3.480149745941162, "logps/rejected": -3.2960293292999268, "loss": 4.2663, "rewards/accuracies": 0.5, "rewards/chosen": -34.80149841308594, "rewards/margins": -1.841203212738037, "rewards/rejected": -32.960296630859375, "step": 2462 }, { "epoch": 0.335375816993464, "grad_norm": 37.83487417147255, "learning_rate": 6.724770234922977e-07, "logits/chosen": 12.020246505737305, "logits/rejected": 10.80555534362793, "logps/chosen": -3.5852787494659424, "logps/rejected": -3.3763427734375, "loss": 4.2252, "rewards/accuracies": 0.0, "rewards/chosen": -35.85279083251953, "rewards/margins": -2.0893616676330566, "rewards/rejected": -33.763427734375, "step": 2463 }, { "epoch": 0.3355119825708061, "grad_norm": 37.28386913042898, "learning_rate": 6.723377902032264e-07, "logits/chosen": 10.491735458374023, "logits/rejected": 11.300434112548828, "logps/chosen": -3.4181931018829346, "logps/rejected": -3.6622581481933594, "loss": 3.8844, "rewards/accuracies": 0.75, "rewards/chosen": -34.18193054199219, "rewards/margins": 2.4406495094299316, "rewards/rejected": -36.622581481933594, "step": 2464 }, { "epoch": 0.33564814814814814, "grad_norm": 38.044062830911514, "learning_rate": 6.721984953771802e-07, "logits/chosen": 11.536765098571777, "logits/rejected": 12.015121459960938, "logps/chosen": -3.2928340435028076, "logps/rejected": -3.657222270965576, "loss": 3.8717, "rewards/accuracies": 0.75, "rewards/chosen": -32.928340911865234, "rewards/margins": 3.643880844116211, "rewards/rejected": -36.57221984863281, "step": 2465 }, { "epoch": 0.33578431372549017, "grad_norm": 41.91418218370699, "learning_rate": 6.720591390456339e-07, "logits/chosen": 11.484643936157227, "logits/rejected": 11.689783096313477, "logps/chosen": -3.1188268661499023, "logps/rejected": -3.3944454193115234, "loss": 4.2701, "rewards/accuracies": 0.75, "rewards/chosen": -31.188270568847656, "rewards/margins": 2.756186008453369, "rewards/rejected": -33.944454193115234, "step": 2466 }, { "epoch": 0.33592047930283225, "grad_norm": 41.25700098167894, "learning_rate": 6.719197212400763e-07, "logits/chosen": 11.4829740524292, "logits/rejected": 11.239792823791504, "logps/chosen": -2.9516608715057373, "logps/rejected": -3.166766405105591, "loss": 4.4437, "rewards/accuracies": 0.5, "rewards/chosen": -29.51660919189453, "rewards/margins": 2.1510558128356934, "rewards/rejected": -31.667665481567383, "step": 2467 }, { "epoch": 0.3360566448801743, "grad_norm": 50.73278515055687, "learning_rate": 6.717802419920099e-07, "logits/chosen": 11.523174285888672, "logits/rejected": 11.215195655822754, "logps/chosen": -3.6667888164520264, "logps/rejected": -3.6688032150268555, "loss": 4.5231, "rewards/accuracies": 0.5, "rewards/chosen": -36.66788864135742, "rewards/margins": 0.02014446258544922, "rewards/rejected": -36.68803405761719, "step": 2468 }, { "epoch": 0.33619281045751637, "grad_norm": 39.91435566780187, "learning_rate": 6.716407013329514e-07, "logits/chosen": 11.833623886108398, "logits/rejected": 11.002376556396484, "logps/chosen": -3.5031707286834717, "logps/rejected": -3.487649440765381, "loss": 3.3611, "rewards/accuracies": 0.5, "rewards/chosen": -35.031707763671875, "rewards/margins": -0.15521478652954102, "rewards/rejected": -34.876495361328125, "step": 2469 }, { "epoch": 0.3363289760348584, "grad_norm": 40.95759374772324, "learning_rate": 6.715010992944309e-07, "logits/chosen": 11.848712921142578, "logits/rejected": 11.10047721862793, "logps/chosen": -3.704972743988037, "logps/rejected": -3.5622189044952393, "loss": 3.9194, "rewards/accuracies": 0.5, "rewards/chosen": -37.04972839355469, "rewards/margins": -1.427539348602295, "rewards/rejected": -35.6221923828125, "step": 2470 }, { "epoch": 0.3364651416122004, "grad_norm": 40.27531318823392, "learning_rate": 6.713614359079929e-07, "logits/chosen": 10.956925392150879, "logits/rejected": 11.348546981811523, "logps/chosen": -3.2828078269958496, "logps/rejected": -3.394584894180298, "loss": 3.9359, "rewards/accuracies": 0.75, "rewards/chosen": -32.82807922363281, "rewards/margins": 1.1177682876586914, "rewards/rejected": -33.94584655761719, "step": 2471 }, { "epoch": 0.3366013071895425, "grad_norm": 44.592391275727245, "learning_rate": 6.712217112051952e-07, "logits/chosen": 11.927906036376953, "logits/rejected": 12.245853424072266, "logps/chosen": -3.5429561138153076, "logps/rejected": -3.66585636138916, "loss": 3.8894, "rewards/accuracies": 0.5, "rewards/chosen": -35.42955780029297, "rewards/margins": 1.2290029525756836, "rewards/rejected": -36.65856170654297, "step": 2472 }, { "epoch": 0.33673747276688454, "grad_norm": 45.00732018751998, "learning_rate": 6.710819252176101e-07, "logits/chosen": 11.257254600524902, "logits/rejected": 11.610925674438477, "logps/chosen": -3.2820162773132324, "logps/rejected": -3.74408221244812, "loss": 4.6225, "rewards/accuracies": 0.75, "rewards/chosen": -32.82016372680664, "rewards/margins": 4.620657920837402, "rewards/rejected": -37.44082260131836, "step": 2473 }, { "epoch": 0.33687363834422657, "grad_norm": 46.6324394043807, "learning_rate": 6.70942077976823e-07, "logits/chosen": 11.4310884475708, "logits/rejected": 11.367055892944336, "logps/chosen": -3.0631818771362305, "logps/rejected": -3.3037421703338623, "loss": 4.0853, "rewards/accuracies": 0.75, "rewards/chosen": -30.631818771362305, "rewards/margins": 2.4056029319763184, "rewards/rejected": -33.03742218017578, "step": 2474 }, { "epoch": 0.33700980392156865, "grad_norm": 46.89596725503807, "learning_rate": 6.708021695144338e-07, "logits/chosen": 10.87270736694336, "logits/rejected": 11.241073608398438, "logps/chosen": -3.3192739486694336, "logps/rejected": -3.502912998199463, "loss": 3.84, "rewards/accuracies": 0.75, "rewards/chosen": -33.19274139404297, "rewards/margins": 1.8363909721374512, "rewards/rejected": -35.02912902832031, "step": 2475 }, { "epoch": 0.3371459694989107, "grad_norm": 37.6204862724195, "learning_rate": 6.70662199862056e-07, "logits/chosen": 11.386960983276367, "logits/rejected": 11.995487213134766, "logps/chosen": -3.6175384521484375, "logps/rejected": -3.966783046722412, "loss": 4.1427, "rewards/accuracies": 0.75, "rewards/chosen": -36.175384521484375, "rewards/margins": 3.492447853088379, "rewards/rejected": -39.66783142089844, "step": 2476 }, { "epoch": 0.3372821350762527, "grad_norm": 37.92080898128056, "learning_rate": 6.70522169051317e-07, "logits/chosen": 11.819548606872559, "logits/rejected": 12.096912384033203, "logps/chosen": -3.5936455726623535, "logps/rejected": -3.7222788333892822, "loss": 4.0378, "rewards/accuracies": 0.5, "rewards/chosen": -35.93645477294922, "rewards/margins": 1.286332130432129, "rewards/rejected": -37.2227897644043, "step": 2477 }, { "epoch": 0.3374183006535948, "grad_norm": 39.93981406803977, "learning_rate": 6.703820771138575e-07, "logits/chosen": 11.78361988067627, "logits/rejected": 10.9173583984375, "logps/chosen": -3.3987412452697754, "logps/rejected": -3.3133604526519775, "loss": 3.903, "rewards/accuracies": 0.5, "rewards/chosen": -33.98741149902344, "rewards/margins": -0.8538107872009277, "rewards/rejected": -33.133602142333984, "step": 2478 }, { "epoch": 0.3375544662309368, "grad_norm": 38.42983125239452, "learning_rate": 6.702419240813327e-07, "logits/chosen": 11.657490730285645, "logits/rejected": 11.926239967346191, "logps/chosen": -3.1903276443481445, "logps/rejected": -3.5079002380371094, "loss": 3.7103, "rewards/accuracies": 0.75, "rewards/chosen": -31.903274536132812, "rewards/margins": 3.175727367401123, "rewards/rejected": -35.079002380371094, "step": 2479 }, { "epoch": 0.33769063180827885, "grad_norm": 42.7115043574222, "learning_rate": 6.701017099854115e-07, "logits/chosen": 11.175338745117188, "logits/rejected": 11.256803512573242, "logps/chosen": -3.286309003829956, "logps/rejected": -3.4310107231140137, "loss": 3.9415, "rewards/accuracies": 0.75, "rewards/chosen": -32.86309051513672, "rewards/margins": 1.4470162391662598, "rewards/rejected": -34.31010818481445, "step": 2480 }, { "epoch": 0.33782679738562094, "grad_norm": 41.70333982831683, "learning_rate": 6.699614348577759e-07, "logits/chosen": 10.009578704833984, "logits/rejected": 10.868982315063477, "logps/chosen": -2.8842921257019043, "logps/rejected": -3.10320782661438, "loss": 4.2774, "rewards/accuracies": 0.75, "rewards/chosen": -28.842920303344727, "rewards/margins": 2.1891579627990723, "rewards/rejected": -31.03207778930664, "step": 2481 }, { "epoch": 0.33796296296296297, "grad_norm": 41.9366885882785, "learning_rate": 6.698210987301228e-07, "logits/chosen": 11.005084991455078, "logits/rejected": 11.344755172729492, "logps/chosen": -3.3840177059173584, "logps/rejected": -3.6243844032287598, "loss": 4.3064, "rewards/accuracies": 0.75, "rewards/chosen": -33.840179443359375, "rewards/margins": 2.4036669731140137, "rewards/rejected": -36.24384689331055, "step": 2482 }, { "epoch": 0.338099128540305, "grad_norm": 38.1393762603592, "learning_rate": 6.696807016341621e-07, "logits/chosen": 10.317371368408203, "logits/rejected": 11.863908767700195, "logps/chosen": -3.1916921138763428, "logps/rejected": -3.576063394546509, "loss": 4.3304, "rewards/accuracies": 0.75, "rewards/chosen": -31.91692352294922, "rewards/margins": 3.8437108993530273, "rewards/rejected": -35.76063537597656, "step": 2483 }, { "epoch": 0.3382352941176471, "grad_norm": 39.6720549249142, "learning_rate": 6.695402436016175e-07, "logits/chosen": 11.288032531738281, "logits/rejected": 10.696954727172852, "logps/chosen": -3.322425365447998, "logps/rejected": -3.2339351177215576, "loss": 3.9783, "rewards/accuracies": 0.5, "rewards/chosen": -33.22425079345703, "rewards/margins": -0.8848996162414551, "rewards/rejected": -32.339351654052734, "step": 2484 }, { "epoch": 0.3383714596949891, "grad_norm": 39.366161514267226, "learning_rate": 6.69399724664227e-07, "logits/chosen": 11.634389877319336, "logits/rejected": 11.88782787322998, "logps/chosen": -3.4336345195770264, "logps/rejected": -3.4593849182128906, "loss": 3.7721, "rewards/accuracies": 0.25, "rewards/chosen": -34.33634567260742, "rewards/margins": 0.25750255584716797, "rewards/rejected": -34.593849182128906, "step": 2485 }, { "epoch": 0.33850762527233114, "grad_norm": 40.58062610894493, "learning_rate": 6.692591448537417e-07, "logits/chosen": 10.724350929260254, "logits/rejected": 11.739546775817871, "logps/chosen": -2.9849467277526855, "logps/rejected": -3.498971700668335, "loss": 3.9078, "rewards/accuracies": 1.0, "rewards/chosen": -29.849468231201172, "rewards/margins": 5.140247821807861, "rewards/rejected": -34.989715576171875, "step": 2486 }, { "epoch": 0.3386437908496732, "grad_norm": 41.4232480513329, "learning_rate": 6.691185042019269e-07, "logits/chosen": 10.353991508483887, "logits/rejected": 11.885727882385254, "logps/chosen": -3.221219062805176, "logps/rejected": -3.73956036567688, "loss": 4.0547, "rewards/accuracies": 1.0, "rewards/chosen": -32.212188720703125, "rewards/margins": 5.183414936065674, "rewards/rejected": -37.39560317993164, "step": 2487 }, { "epoch": 0.33877995642701525, "grad_norm": 44.76658765995501, "learning_rate": 6.689778027405616e-07, "logits/chosen": 11.271584510803223, "logits/rejected": 11.590193748474121, "logps/chosen": -3.4038658142089844, "logps/rejected": -3.283780097961426, "loss": 4.452, "rewards/accuracies": 0.5, "rewards/chosen": -34.038658142089844, "rewards/margins": -1.2008576393127441, "rewards/rejected": -32.83780288696289, "step": 2488 }, { "epoch": 0.3389161220043573, "grad_norm": 39.91673101515302, "learning_rate": 6.688370405014384e-07, "logits/chosen": 11.39179801940918, "logits/rejected": 11.900472640991211, "logps/chosen": -3.2754039764404297, "logps/rejected": -4.081955432891846, "loss": 3.748, "rewards/accuracies": 1.0, "rewards/chosen": -32.7540397644043, "rewards/margins": 8.065511703491211, "rewards/rejected": -40.81955337524414, "step": 2489 }, { "epoch": 0.33905228758169936, "grad_norm": 48.855422427444445, "learning_rate": 6.686962175163636e-07, "logits/chosen": 11.63494873046875, "logits/rejected": 11.912006378173828, "logps/chosen": -3.578767776489258, "logps/rejected": -3.376702308654785, "loss": 4.3612, "rewards/accuracies": 0.0, "rewards/chosen": -35.78767776489258, "rewards/margins": -2.0206542015075684, "rewards/rejected": -33.76702117919922, "step": 2490 }, { "epoch": 0.3391884531590414, "grad_norm": 40.0970400068017, "learning_rate": 6.685553338171574e-07, "logits/chosen": 11.514181137084961, "logits/rejected": 11.528709411621094, "logps/chosen": -3.352708578109741, "logps/rejected": -3.47735333442688, "loss": 4.2207, "rewards/accuracies": 0.5, "rewards/chosen": -33.52708435058594, "rewards/margins": 1.2464475631713867, "rewards/rejected": -34.773536682128906, "step": 2491 }, { "epoch": 0.3393246187363834, "grad_norm": 41.66473029613848, "learning_rate": 6.684143894356535e-07, "logits/chosen": 11.374519348144531, "logits/rejected": 11.91111946105957, "logps/chosen": -3.4193243980407715, "logps/rejected": -3.6500766277313232, "loss": 3.7797, "rewards/accuracies": 0.75, "rewards/chosen": -34.19324493408203, "rewards/margins": 2.3075222969055176, "rewards/rejected": -36.50076675415039, "step": 2492 }, { "epoch": 0.3394607843137255, "grad_norm": 37.34079782843252, "learning_rate": 6.682733844036997e-07, "logits/chosen": 10.820173263549805, "logits/rejected": 10.306888580322266, "logps/chosen": -3.060314655303955, "logps/rejected": -2.9685351848602295, "loss": 3.2362, "rewards/accuracies": 0.5, "rewards/chosen": -30.603147506713867, "rewards/margins": -0.9177932739257812, "rewards/rejected": -29.685352325439453, "step": 2493 }, { "epoch": 0.33959694989106753, "grad_norm": 45.54251570850195, "learning_rate": 6.681323187531572e-07, "logits/chosen": 11.106800079345703, "logits/rejected": 12.435245513916016, "logps/chosen": -3.24600887298584, "logps/rejected": -3.731553077697754, "loss": 4.2388, "rewards/accuracies": 1.0, "rewards/chosen": -32.46009063720703, "rewards/margins": 4.855441093444824, "rewards/rejected": -37.31553268432617, "step": 2494 }, { "epoch": 0.33973311546840956, "grad_norm": 36.87195207377234, "learning_rate": 6.679911925159008e-07, "logits/chosen": 11.713272094726562, "logits/rejected": 12.594165802001953, "logps/chosen": -3.3021464347839355, "logps/rejected": -3.623134136199951, "loss": 4.0374, "rewards/accuracies": 0.75, "rewards/chosen": -33.02146530151367, "rewards/margins": 3.2098779678344727, "rewards/rejected": -36.231346130371094, "step": 2495 }, { "epoch": 0.33986928104575165, "grad_norm": 39.38148533517545, "learning_rate": 6.678500057238192e-07, "logits/chosen": 11.049249649047852, "logits/rejected": 11.379775047302246, "logps/chosen": -3.3519811630249023, "logps/rejected": -3.4713821411132812, "loss": 4.405, "rewards/accuracies": 0.5, "rewards/chosen": -33.519813537597656, "rewards/margins": 1.1940102577209473, "rewards/rejected": -34.71382141113281, "step": 2496 }, { "epoch": 0.3400054466230937, "grad_norm": 42.9626622409219, "learning_rate": 6.677087584088147e-07, "logits/chosen": 11.443765640258789, "logits/rejected": 12.039107322692871, "logps/chosen": -3.4132091999053955, "logps/rejected": -3.5630898475646973, "loss": 4.2893, "rewards/accuracies": 0.5, "rewards/chosen": -34.13209533691406, "rewards/margins": 1.4988059997558594, "rewards/rejected": -35.630897521972656, "step": 2497 }, { "epoch": 0.3401416122004357, "grad_norm": 37.29611194681901, "learning_rate": 6.675674506028034e-07, "logits/chosen": 11.067873001098633, "logits/rejected": 12.12748908996582, "logps/chosen": -3.1645493507385254, "logps/rejected": -3.846240997314453, "loss": 3.8083, "rewards/accuracies": 1.0, "rewards/chosen": -31.64549446105957, "rewards/margins": 6.816915988922119, "rewards/rejected": -38.46240997314453, "step": 2498 }, { "epoch": 0.3402777777777778, "grad_norm": 38.1231138158686, "learning_rate": 6.674260823377149e-07, "logits/chosen": 11.186925888061523, "logits/rejected": 10.971114158630371, "logps/chosen": -3.1586718559265137, "logps/rejected": -3.3052384853363037, "loss": 3.8982, "rewards/accuracies": 0.5, "rewards/chosen": -31.586715698242188, "rewards/margins": 1.4656667709350586, "rewards/rejected": -33.05238342285156, "step": 2499 }, { "epoch": 0.3404139433551198, "grad_norm": 35.517598716287935, "learning_rate": 6.672846536454924e-07, "logits/chosen": 11.599363327026367, "logits/rejected": 11.44483757019043, "logps/chosen": -3.1405649185180664, "logps/rejected": -3.2350335121154785, "loss": 4.0097, "rewards/accuracies": 0.5, "rewards/chosen": -31.405649185180664, "rewards/margins": 0.9446868896484375, "rewards/rejected": -32.350337982177734, "step": 2500 }, { "epoch": 0.34055010893246185, "grad_norm": 39.379040278875564, "learning_rate": 6.671431645580933e-07, "logits/chosen": 11.820212364196777, "logits/rejected": 12.197735786437988, "logps/chosen": -3.7661032676696777, "logps/rejected": -3.8022537231445312, "loss": 4.3861, "rewards/accuracies": 0.25, "rewards/chosen": -37.661033630371094, "rewards/margins": 0.36150550842285156, "rewards/rejected": -38.02253723144531, "step": 2501 }, { "epoch": 0.34068627450980393, "grad_norm": 58.530922652536944, "learning_rate": 6.670016151074877e-07, "logits/chosen": 11.713550567626953, "logits/rejected": 11.735122680664062, "logps/chosen": -3.959041118621826, "logps/rejected": -3.74699330329895, "loss": 4.4332, "rewards/accuracies": 0.5, "rewards/chosen": -39.59041213989258, "rewards/margins": -2.1204757690429688, "rewards/rejected": -37.469932556152344, "step": 2502 }, { "epoch": 0.34082244008714596, "grad_norm": 43.264844519026894, "learning_rate": 6.668600053256601e-07, "logits/chosen": 11.511255264282227, "logits/rejected": 11.432790756225586, "logps/chosen": -3.554215669631958, "logps/rejected": -3.939243793487549, "loss": 4.0036, "rewards/accuracies": 0.75, "rewards/chosen": -35.54216003417969, "rewards/margins": 3.850281238555908, "rewards/rejected": -39.39244079589844, "step": 2503 }, { "epoch": 0.340958605664488, "grad_norm": 40.43239701674588, "learning_rate": 6.667183352446085e-07, "logits/chosen": 11.500007629394531, "logits/rejected": 12.091400146484375, "logps/chosen": -3.5949082374572754, "logps/rejected": -3.9258508682250977, "loss": 4.0212, "rewards/accuracies": 1.0, "rewards/chosen": -35.94908142089844, "rewards/margins": 3.309427261352539, "rewards/rejected": -39.25851058959961, "step": 2504 }, { "epoch": 0.3410947712418301, "grad_norm": 40.49544658837574, "learning_rate": 6.665766048963443e-07, "logits/chosen": 11.726059913635254, "logits/rejected": 12.685402870178223, "logps/chosen": -3.24196720123291, "logps/rejected": -3.623441219329834, "loss": 4.4807, "rewards/accuracies": 0.75, "rewards/chosen": -32.41967010498047, "rewards/margins": 3.814741611480713, "rewards/rejected": -36.234413146972656, "step": 2505 }, { "epoch": 0.3412309368191721, "grad_norm": 39.62492226068134, "learning_rate": 6.664348143128928e-07, "logits/chosen": 11.611457824707031, "logits/rejected": 12.044462203979492, "logps/chosen": -3.760646343231201, "logps/rejected": -3.918133020401001, "loss": 3.2361, "rewards/accuracies": 0.75, "rewards/chosen": -37.606468200683594, "rewards/margins": 1.5748662948608398, "rewards/rejected": -39.181331634521484, "step": 2506 }, { "epoch": 0.3413671023965142, "grad_norm": 43.83484104446669, "learning_rate": 6.662929635262925e-07, "logits/chosen": 11.69355583190918, "logits/rejected": 11.647302627563477, "logps/chosen": -3.591338872909546, "logps/rejected": -3.9292595386505127, "loss": 4.03, "rewards/accuracies": 1.0, "rewards/chosen": -35.913387298583984, "rewards/margins": 3.3792076110839844, "rewards/rejected": -39.29259490966797, "step": 2507 }, { "epoch": 0.3415032679738562, "grad_norm": 42.11037392453613, "learning_rate": 6.661510525685958e-07, "logits/chosen": 11.792393684387207, "logits/rejected": 12.0761137008667, "logps/chosen": -3.7233872413635254, "logps/rejected": -3.8091347217559814, "loss": 4.0357, "rewards/accuracies": 0.5, "rewards/chosen": -37.23387145996094, "rewards/margins": 0.8574743270874023, "rewards/rejected": -38.091346740722656, "step": 2508 }, { "epoch": 0.34163943355119825, "grad_norm": 43.80680509155069, "learning_rate": 6.660090814718689e-07, "logits/chosen": 10.964420318603516, "logits/rejected": 12.206031799316406, "logps/chosen": -3.740607500076294, "logps/rejected": -3.9208059310913086, "loss": 4.0489, "rewards/accuracies": 0.75, "rewards/chosen": -37.40607452392578, "rewards/margins": 1.8019843101501465, "rewards/rejected": -39.20806121826172, "step": 2509 }, { "epoch": 0.34177559912854033, "grad_norm": 35.593701953110724, "learning_rate": 6.658670502681911e-07, "logits/chosen": 11.125091552734375, "logits/rejected": 12.407391548156738, "logps/chosen": -3.1442952156066895, "logps/rejected": -3.4326090812683105, "loss": 3.9931, "rewards/accuracies": 0.75, "rewards/chosen": -31.442951202392578, "rewards/margins": 2.883138656616211, "rewards/rejected": -34.32609176635742, "step": 2510 }, { "epoch": 0.34191176470588236, "grad_norm": 38.11902548832071, "learning_rate": 6.657249589896557e-07, "logits/chosen": 11.751721382141113, "logits/rejected": 12.767827987670898, "logps/chosen": -3.552138566970825, "logps/rejected": -3.944387197494507, "loss": 4.0544, "rewards/accuracies": 0.75, "rewards/chosen": -35.521385192871094, "rewards/margins": 3.922488212585449, "rewards/rejected": -39.44387435913086, "step": 2511 }, { "epoch": 0.3420479302832244, "grad_norm": 36.209978377823575, "learning_rate": 6.655828076683693e-07, "logits/chosen": 11.767380714416504, "logits/rejected": 12.487137794494629, "logps/chosen": -3.570852756500244, "logps/rejected": -3.7888779640197754, "loss": 4.2104, "rewards/accuracies": 0.5, "rewards/chosen": -35.708526611328125, "rewards/margins": 2.180255889892578, "rewards/rejected": -37.8887825012207, "step": 2512 }, { "epoch": 0.3421840958605665, "grad_norm": 44.752079879387516, "learning_rate": 6.654405963364521e-07, "logits/chosen": 11.586591720581055, "logits/rejected": 11.669647216796875, "logps/chosen": -3.444441080093384, "logps/rejected": -3.412097692489624, "loss": 3.8408, "rewards/accuracies": 0.5, "rewards/chosen": -34.44441223144531, "rewards/margins": -0.32343530654907227, "rewards/rejected": -34.12097930908203, "step": 2513 }, { "epoch": 0.3423202614379085, "grad_norm": 34.51875003921537, "learning_rate": 6.65298325026038e-07, "logits/chosen": 11.389888763427734, "logits/rejected": 12.123651504516602, "logps/chosen": -3.4147696495056152, "logps/rejected": -3.8206896781921387, "loss": 4.328, "rewards/accuracies": 1.0, "rewards/chosen": -34.1476936340332, "rewards/margins": 4.059202194213867, "rewards/rejected": -38.2068977355957, "step": 2514 }, { "epoch": 0.34245642701525053, "grad_norm": 37.38397998198995, "learning_rate": 6.651559937692745e-07, "logits/chosen": 11.038064002990723, "logits/rejected": 12.362791061401367, "logps/chosen": -3.216756820678711, "logps/rejected": -3.803895950317383, "loss": 3.8756, "rewards/accuracies": 1.0, "rewards/chosen": -32.16756820678711, "rewards/margins": 5.871392250061035, "rewards/rejected": -38.03895950317383, "step": 2515 }, { "epoch": 0.3425925925925926, "grad_norm": 36.007680323632414, "learning_rate": 6.650136025983224e-07, "logits/chosen": 11.979940414428711, "logits/rejected": 12.035900115966797, "logps/chosen": -3.5141007900238037, "logps/rejected": -3.957746982574463, "loss": 4.1416, "rewards/accuracies": 0.75, "rewards/chosen": -35.14100646972656, "rewards/margins": 4.436460494995117, "rewards/rejected": -39.57746887207031, "step": 2516 }, { "epoch": 0.34272875816993464, "grad_norm": 33.85263072800786, "learning_rate": 6.648711515453561e-07, "logits/chosen": 12.553772926330566, "logits/rejected": 11.790185928344727, "logps/chosen": -3.4600932598114014, "logps/rejected": -3.5061182975769043, "loss": 3.4299, "rewards/accuracies": 0.5, "rewards/chosen": -34.60093307495117, "rewards/margins": 0.4602503776550293, "rewards/rejected": -35.061180114746094, "step": 2517 }, { "epoch": 0.3428649237472767, "grad_norm": 36.783123868033776, "learning_rate": 6.647286406425636e-07, "logits/chosen": 11.363763809204102, "logits/rejected": 11.868794441223145, "logps/chosen": -3.3669753074645996, "logps/rejected": -3.4452812671661377, "loss": 3.5611, "rewards/accuracies": 0.75, "rewards/chosen": -33.66975402832031, "rewards/margins": 0.7830572128295898, "rewards/rejected": -34.45281219482422, "step": 2518 }, { "epoch": 0.34300108932461876, "grad_norm": 42.2097976611409, "learning_rate": 6.645860699221466e-07, "logits/chosen": 12.6693115234375, "logits/rejected": 11.957266807556152, "logps/chosen": -3.9610860347747803, "logps/rejected": -3.6744322776794434, "loss": 5.0439, "rewards/accuracies": 0.0, "rewards/chosen": -39.610862731933594, "rewards/margins": -2.8665356636047363, "rewards/rejected": -36.74432373046875, "step": 2519 }, { "epoch": 0.3431372549019608, "grad_norm": 34.28165312508199, "learning_rate": 6.644434394163199e-07, "logits/chosen": 11.984434127807617, "logits/rejected": 12.157806396484375, "logps/chosen": -3.6359102725982666, "logps/rejected": -3.6200037002563477, "loss": 4.0709, "rewards/accuracies": 0.75, "rewards/chosen": -36.359100341796875, "rewards/margins": -0.15906333923339844, "rewards/rejected": -36.20003890991211, "step": 2520 }, { "epoch": 0.3432734204793028, "grad_norm": 36.90064211922644, "learning_rate": 6.643007491573122e-07, "logits/chosen": 12.348060607910156, "logits/rejected": 11.315930366516113, "logps/chosen": -3.3757433891296387, "logps/rejected": -3.285834312438965, "loss": 3.9759, "rewards/accuracies": 0.5, "rewards/chosen": -33.7574348449707, "rewards/margins": -0.8990898132324219, "rewards/rejected": -32.85834503173828, "step": 2521 }, { "epoch": 0.3434095860566449, "grad_norm": 39.92701875885322, "learning_rate": 6.641579991773655e-07, "logits/chosen": 11.38775634765625, "logits/rejected": 12.334589004516602, "logps/chosen": -3.4523377418518066, "logps/rejected": -3.694713592529297, "loss": 4.3544, "rewards/accuracies": 0.5, "rewards/chosen": -34.52337646484375, "rewards/margins": 2.423758029937744, "rewards/rejected": -36.94713592529297, "step": 2522 }, { "epoch": 0.34354575163398693, "grad_norm": 36.60144141097891, "learning_rate": 6.640151895087354e-07, "logits/chosen": 12.200206756591797, "logits/rejected": 12.975391387939453, "logps/chosen": -3.43062424659729, "logps/rejected": -3.7656493186950684, "loss": 3.9409, "rewards/accuracies": 1.0, "rewards/chosen": -34.306243896484375, "rewards/margins": 3.3502488136291504, "rewards/rejected": -37.656490325927734, "step": 2523 }, { "epoch": 0.34368191721132896, "grad_norm": 44.561694642390734, "learning_rate": 6.638723201836908e-07, "logits/chosen": 12.877595901489258, "logits/rejected": 12.948989868164062, "logps/chosen": -3.5308241844177246, "logps/rejected": -3.7968802452087402, "loss": 4.8808, "rewards/accuracies": 0.75, "rewards/chosen": -35.30824279785156, "rewards/margins": 2.6605606079101562, "rewards/rejected": -37.96880340576172, "step": 2524 }, { "epoch": 0.34381808278867104, "grad_norm": 41.185342067878935, "learning_rate": 6.637293912345143e-07, "logits/chosen": 11.028667449951172, "logits/rejected": 12.275452613830566, "logps/chosen": -3.356525182723999, "logps/rejected": -3.670956611633301, "loss": 3.4932, "rewards/accuracies": 1.0, "rewards/chosen": -33.56525421142578, "rewards/margins": 3.1443142890930176, "rewards/rejected": -36.709564208984375, "step": 2525 }, { "epoch": 0.34395424836601307, "grad_norm": 37.52395770040282, "learning_rate": 6.635864026935018e-07, "logits/chosen": 12.493292808532715, "logits/rejected": 12.109518051147461, "logps/chosen": -3.4755146503448486, "logps/rejected": -3.6416733264923096, "loss": 3.9604, "rewards/accuracies": 0.75, "rewards/chosen": -34.75514602661133, "rewards/margins": 1.6615877151489258, "rewards/rejected": -36.41673278808594, "step": 2526 }, { "epoch": 0.3440904139433551, "grad_norm": 39.22235665158618, "learning_rate": 6.634433545929628e-07, "logits/chosen": 12.148999214172363, "logits/rejected": 12.660844802856445, "logps/chosen": -3.194126605987549, "logps/rejected": -3.660130023956299, "loss": 3.7351, "rewards/accuracies": 0.75, "rewards/chosen": -31.941268920898438, "rewards/margins": 4.660029411315918, "rewards/rejected": -36.60129928588867, "step": 2527 }, { "epoch": 0.3442265795206972, "grad_norm": 34.53903274221017, "learning_rate": 6.633002469652201e-07, "logits/chosen": 10.554851531982422, "logits/rejected": 11.794492721557617, "logps/chosen": -3.300506591796875, "logps/rejected": -3.55462646484375, "loss": 3.871, "rewards/accuracies": 0.5, "rewards/chosen": -33.00506591796875, "rewards/margins": 2.541198253631592, "rewards/rejected": -35.5462646484375, "step": 2528 }, { "epoch": 0.3443627450980392, "grad_norm": 34.65003425895595, "learning_rate": 6.631570798426102e-07, "logits/chosen": 10.89413833618164, "logits/rejected": 11.663664817810059, "logps/chosen": -3.0447120666503906, "logps/rejected": -3.37148380279541, "loss": 4.1181, "rewards/accuracies": 1.0, "rewards/chosen": -30.447120666503906, "rewards/margins": 3.2677159309387207, "rewards/rejected": -33.71483612060547, "step": 2529 }, { "epoch": 0.34449891067538124, "grad_norm": 35.49395399149683, "learning_rate": 6.630138532574829e-07, "logits/chosen": 11.183723449707031, "logits/rejected": 11.787616729736328, "logps/chosen": -3.249988317489624, "logps/rejected": -3.407294750213623, "loss": 3.7975, "rewards/accuracies": 0.5, "rewards/chosen": -32.49988555908203, "rewards/margins": 1.5730619430541992, "rewards/rejected": -34.07294464111328, "step": 2530 }, { "epoch": 0.3446350762527233, "grad_norm": 36.33061150965772, "learning_rate": 6.628705672422013e-07, "logits/chosen": 12.339461326599121, "logits/rejected": 12.526865005493164, "logps/chosen": -3.3945531845092773, "logps/rejected": -3.6075329780578613, "loss": 4.5248, "rewards/accuracies": 0.5, "rewards/chosen": -33.945533752441406, "rewards/margins": 2.1297965049743652, "rewards/rejected": -36.07533264160156, "step": 2531 }, { "epoch": 0.34477124183006536, "grad_norm": 41.105607746203205, "learning_rate": 6.627272218291421e-07, "logits/chosen": 12.733198165893555, "logits/rejected": 13.047517776489258, "logps/chosen": -3.932356834411621, "logps/rejected": -4.287546157836914, "loss": 4.4939, "rewards/accuracies": 0.5, "rewards/chosen": -39.323570251464844, "rewards/margins": 3.551894187927246, "rewards/rejected": -42.875465393066406, "step": 2532 }, { "epoch": 0.3449074074074074, "grad_norm": 35.68799075609808, "learning_rate": 6.625838170506954e-07, "logits/chosen": 11.476659774780273, "logits/rejected": 12.354073524475098, "logps/chosen": -3.5775253772735596, "logps/rejected": -3.9351797103881836, "loss": 3.8347, "rewards/accuracies": 0.75, "rewards/chosen": -35.77525329589844, "rewards/margins": 3.5765419006347656, "rewards/rejected": -39.3517951965332, "step": 2533 }, { "epoch": 0.34504357298474947, "grad_norm": 32.206189084124524, "learning_rate": 6.624403529392647e-07, "logits/chosen": 13.24067497253418, "logits/rejected": 12.318748474121094, "logps/chosen": -3.8623359203338623, "logps/rejected": -3.6106796264648438, "loss": 4.0626, "rewards/accuracies": 0.25, "rewards/chosen": -38.62335968017578, "rewards/margins": -2.516561985015869, "rewards/rejected": -36.10679626464844, "step": 2534 }, { "epoch": 0.3451797385620915, "grad_norm": 35.33848567882146, "learning_rate": 6.622968295272669e-07, "logits/chosen": 11.80096435546875, "logits/rejected": 12.154352188110352, "logps/chosen": -3.7187042236328125, "logps/rejected": -3.9117484092712402, "loss": 4.4409, "rewards/accuracies": 0.5, "rewards/chosen": -37.187042236328125, "rewards/margins": 1.9304428100585938, "rewards/rejected": -39.11748504638672, "step": 2535 }, { "epoch": 0.3453159041394335, "grad_norm": 42.73472980587523, "learning_rate": 6.621532468471324e-07, "logits/chosen": 11.636857032775879, "logits/rejected": 11.77206802368164, "logps/chosen": -3.3751590251922607, "logps/rejected": -3.6442220211029053, "loss": 4.298, "rewards/accuracies": 0.75, "rewards/chosen": -33.7515869140625, "rewards/margins": 2.6906309127807617, "rewards/rejected": -36.442222595214844, "step": 2536 }, { "epoch": 0.3454520697167756, "grad_norm": 45.52733846993485, "learning_rate": 6.620096049313048e-07, "logits/chosen": 12.577853202819824, "logits/rejected": 12.195331573486328, "logps/chosen": -3.54597806930542, "logps/rejected": -3.7267911434173584, "loss": 4.1363, "rewards/accuracies": 0.75, "rewards/chosen": -35.45977783203125, "rewards/margins": 1.808131217956543, "rewards/rejected": -37.26791000366211, "step": 2537 }, { "epoch": 0.34558823529411764, "grad_norm": 34.06087193509041, "learning_rate": 6.61865903812241e-07, "logits/chosen": 10.669672966003418, "logits/rejected": 12.469511032104492, "logps/chosen": -3.447929859161377, "logps/rejected": -3.6655430793762207, "loss": 3.8242, "rewards/accuracies": 0.5, "rewards/chosen": -34.47929763793945, "rewards/margins": 2.176133155822754, "rewards/rejected": -36.655433654785156, "step": 2538 }, { "epoch": 0.34572440087145967, "grad_norm": 36.52772847108553, "learning_rate": 6.617221435224117e-07, "logits/chosen": 11.630537033081055, "logits/rejected": 12.389392852783203, "logps/chosen": -3.408320665359497, "logps/rejected": -3.539353847503662, "loss": 3.7144, "rewards/accuracies": 0.5, "rewards/chosen": -34.08320617675781, "rewards/margins": 1.310330867767334, "rewards/rejected": -35.39353942871094, "step": 2539 }, { "epoch": 0.34586056644880175, "grad_norm": 37.70346870341896, "learning_rate": 6.615783240943007e-07, "logits/chosen": 11.911436080932617, "logits/rejected": 12.037237167358398, "logps/chosen": -3.4981067180633545, "logps/rejected": -3.5895748138427734, "loss": 4.053, "rewards/accuracies": 0.5, "rewards/chosen": -34.98106384277344, "rewards/margins": 0.9146833419799805, "rewards/rejected": -35.895751953125, "step": 2540 }, { "epoch": 0.3459967320261438, "grad_norm": 39.24922876794796, "learning_rate": 6.614344455604051e-07, "logits/chosen": 11.971057891845703, "logits/rejected": 11.456520080566406, "logps/chosen": -3.2707979679107666, "logps/rejected": -3.1723198890686035, "loss": 4.1264, "rewards/accuracies": 0.25, "rewards/chosen": -32.707977294921875, "rewards/margins": -0.9847798347473145, "rewards/rejected": -31.72319984436035, "step": 2541 }, { "epoch": 0.3461328976034858, "grad_norm": 37.85144449146249, "learning_rate": 6.612905079532355e-07, "logits/chosen": 12.29742431640625, "logits/rejected": 12.341060638427734, "logps/chosen": -3.6399433612823486, "logps/rejected": -3.6535568237304688, "loss": 4.2565, "rewards/accuracies": 0.5, "rewards/chosen": -36.39943313598633, "rewards/margins": 0.13613605499267578, "rewards/rejected": -36.53556823730469, "step": 2542 }, { "epoch": 0.3462690631808279, "grad_norm": 39.79745973905567, "learning_rate": 6.611465113053158e-07, "logits/chosen": 11.606439590454102, "logits/rejected": 12.343082427978516, "logps/chosen": -3.6121768951416016, "logps/rejected": -3.7651257514953613, "loss": 3.8984, "rewards/accuracies": 0.75, "rewards/chosen": -36.121768951416016, "rewards/margins": 1.5294914245605469, "rewards/rejected": -37.65126037597656, "step": 2543 }, { "epoch": 0.3464052287581699, "grad_norm": 40.159639715528385, "learning_rate": 6.610024556491831e-07, "logits/chosen": 11.432809829711914, "logits/rejected": 12.07551097869873, "logps/chosen": -3.3816871643066406, "logps/rejected": -3.786721706390381, "loss": 4.2876, "rewards/accuracies": 1.0, "rewards/chosen": -33.816871643066406, "rewards/margins": 4.050345420837402, "rewards/rejected": -37.867218017578125, "step": 2544 }, { "epoch": 0.346541394335512, "grad_norm": 41.050225576570455, "learning_rate": 6.608583410173883e-07, "logits/chosen": 11.471519470214844, "logits/rejected": 12.059281349182129, "logps/chosen": -3.4429492950439453, "logps/rejected": -3.6517906188964844, "loss": 3.8386, "rewards/accuracies": 0.75, "rewards/chosen": -34.42949295043945, "rewards/margins": 2.0884132385253906, "rewards/rejected": -36.517906188964844, "step": 2545 }, { "epoch": 0.34667755991285404, "grad_norm": 41.53041504961949, "learning_rate": 6.60714167442495e-07, "logits/chosen": 11.249288558959961, "logits/rejected": 12.433526039123535, "logps/chosen": -3.4582395553588867, "logps/rejected": -3.962573766708374, "loss": 4.0095, "rewards/accuracies": 1.0, "rewards/chosen": -34.5823974609375, "rewards/margins": 5.043343544006348, "rewards/rejected": -39.62574005126953, "step": 2546 }, { "epoch": 0.34681372549019607, "grad_norm": 39.750180825472604, "learning_rate": 6.605699349570804e-07, "logits/chosen": 10.592053413391113, "logits/rejected": 12.160285949707031, "logps/chosen": -3.4957728385925293, "logps/rejected": -3.726304054260254, "loss": 4.484, "rewards/accuracies": 0.5, "rewards/chosen": -34.957725524902344, "rewards/margins": 2.3053135871887207, "rewards/rejected": -37.263038635253906, "step": 2547 }, { "epoch": 0.34694989106753815, "grad_norm": 65.44334947491397, "learning_rate": 6.604256435937351e-07, "logits/chosen": 11.988069534301758, "logits/rejected": 12.264278411865234, "logps/chosen": -3.8014416694641113, "logps/rejected": -3.9823529720306396, "loss": 4.0023, "rewards/accuracies": 0.5, "rewards/chosen": -38.0144157409668, "rewards/margins": 1.8091135025024414, "rewards/rejected": -39.82353210449219, "step": 2548 }, { "epoch": 0.3470860566448802, "grad_norm": 49.506953846989596, "learning_rate": 6.602812933850628e-07, "logits/chosen": 12.606208801269531, "logits/rejected": 12.469709396362305, "logps/chosen": -3.8750429153442383, "logps/rejected": -3.948547124862671, "loss": 4.164, "rewards/accuracies": 0.5, "rewards/chosen": -38.75042724609375, "rewards/margins": 0.7350425720214844, "rewards/rejected": -39.4854736328125, "step": 2549 }, { "epoch": 0.3472222222222222, "grad_norm": 49.22731786588155, "learning_rate": 6.60136884363681e-07, "logits/chosen": 11.910048484802246, "logits/rejected": 11.821632385253906, "logps/chosen": -3.460350751876831, "logps/rejected": -3.8200392723083496, "loss": 4.2011, "rewards/accuracies": 1.0, "rewards/chosen": -34.60350799560547, "rewards/margins": 3.5968823432922363, "rewards/rejected": -38.20038986206055, "step": 2550 }, { "epoch": 0.3473583877995643, "grad_norm": 41.50386584213202, "learning_rate": 6.599924165622198e-07, "logits/chosen": 11.354156494140625, "logits/rejected": 12.00836181640625, "logps/chosen": -3.6589174270629883, "logps/rejected": -3.4967429637908936, "loss": 4.1855, "rewards/accuracies": 0.25, "rewards/chosen": -36.58917236328125, "rewards/margins": -1.621744155883789, "rewards/rejected": -34.967430114746094, "step": 2551 }, { "epoch": 0.3474945533769063, "grad_norm": 40.859688268053375, "learning_rate": 6.598478900133229e-07, "logits/chosen": 12.500370025634766, "logits/rejected": 12.054121017456055, "logps/chosen": -4.008111953735352, "logps/rejected": -3.914168119430542, "loss": 4.1776, "rewards/accuracies": 0.25, "rewards/chosen": -40.081119537353516, "rewards/margins": -0.9394397735595703, "rewards/rejected": -39.14167785644531, "step": 2552 }, { "epoch": 0.34763071895424835, "grad_norm": 41.52434759941503, "learning_rate": 6.597033047496474e-07, "logits/chosen": 11.590923309326172, "logits/rejected": 12.962902069091797, "logps/chosen": -3.6266276836395264, "logps/rejected": -3.897528648376465, "loss": 4.1681, "rewards/accuracies": 0.75, "rewards/chosen": -36.26627731323242, "rewards/margins": 2.7090096473693848, "rewards/rejected": -38.97528839111328, "step": 2553 }, { "epoch": 0.34776688453159044, "grad_norm": 39.636696989234586, "learning_rate": 6.595586608038634e-07, "logits/chosen": 10.474812507629395, "logits/rejected": 11.571603775024414, "logps/chosen": -3.6955623626708984, "logps/rejected": -3.876190662384033, "loss": 3.5758, "rewards/accuracies": 0.75, "rewards/chosen": -36.955623626708984, "rewards/margins": 1.806283950805664, "rewards/rejected": -38.76190948486328, "step": 2554 }, { "epoch": 0.34790305010893247, "grad_norm": 42.150381576261005, "learning_rate": 6.594139582086544e-07, "logits/chosen": 10.710216522216797, "logits/rejected": 11.291504859924316, "logps/chosen": -3.350592613220215, "logps/rejected": -3.7575149536132812, "loss": 4.0733, "rewards/accuracies": 0.75, "rewards/chosen": -33.50592803955078, "rewards/margins": 4.069225311279297, "rewards/rejected": -37.57514953613281, "step": 2555 }, { "epoch": 0.3480392156862745, "grad_norm": 44.69979111706894, "learning_rate": 6.592691969967174e-07, "logits/chosen": 11.06252670288086, "logits/rejected": 10.979654312133789, "logps/chosen": -3.5576601028442383, "logps/rejected": -3.4927239418029785, "loss": 4.2127, "rewards/accuracies": 0.75, "rewards/chosen": -35.576602935791016, "rewards/margins": -0.6493625640869141, "rewards/rejected": -34.92723846435547, "step": 2556 }, { "epoch": 0.3481753812636166, "grad_norm": 43.71221480563226, "learning_rate": 6.59124377200762e-07, "logits/chosen": 11.879563331604004, "logits/rejected": 11.854272842407227, "logps/chosen": -3.57596755027771, "logps/rejected": -3.9589059352874756, "loss": 3.4961, "rewards/accuracies": 0.75, "rewards/chosen": -35.75967788696289, "rewards/margins": 3.8293819427490234, "rewards/rejected": -39.58905792236328, "step": 2557 }, { "epoch": 0.3483115468409586, "grad_norm": 39.87798801110582, "learning_rate": 6.589794988535118e-07, "logits/chosen": 11.405868530273438, "logits/rejected": 11.610973358154297, "logps/chosen": -3.6590654850006104, "logps/rejected": -3.7210235595703125, "loss": 3.8704, "rewards/accuracies": 0.5, "rewards/chosen": -36.59065246582031, "rewards/margins": 0.6195802688598633, "rewards/rejected": -37.21023178100586, "step": 2558 }, { "epoch": 0.34844771241830064, "grad_norm": 45.76804838419252, "learning_rate": 6.588345619877028e-07, "logits/chosen": 12.831417083740234, "logits/rejected": 13.10523796081543, "logps/chosen": -3.9304418563842773, "logps/rejected": -3.905498504638672, "loss": 4.5565, "rewards/accuracies": 0.5, "rewards/chosen": -39.304420471191406, "rewards/margins": -0.2494335174560547, "rewards/rejected": -39.05498504638672, "step": 2559 }, { "epoch": 0.3485838779956427, "grad_norm": 57.473363490767944, "learning_rate": 6.586895666360852e-07, "logits/chosen": 11.622377395629883, "logits/rejected": 11.98760986328125, "logps/chosen": -3.8044326305389404, "logps/rejected": -4.0482659339904785, "loss": 4.0984, "rewards/accuracies": 0.75, "rewards/chosen": -38.04432678222656, "rewards/margins": 2.4383325576782227, "rewards/rejected": -40.48265838623047, "step": 2560 }, { "epoch": 0.34872004357298475, "grad_norm": 41.70248277000551, "learning_rate": 6.585445128314217e-07, "logits/chosen": 11.337228775024414, "logits/rejected": 12.996850967407227, "logps/chosen": -3.209151268005371, "logps/rejected": -4.001906394958496, "loss": 4.2869, "rewards/accuracies": 1.0, "rewards/chosen": -32.091514587402344, "rewards/margins": 7.927546977996826, "rewards/rejected": -40.01905822753906, "step": 2561 }, { "epoch": 0.3488562091503268, "grad_norm": 41.40116787320568, "learning_rate": 6.583994006064883e-07, "logits/chosen": 11.293066024780273, "logits/rejected": 13.053232192993164, "logps/chosen": -3.3884739875793457, "logps/rejected": -3.908656597137451, "loss": 3.5951, "rewards/accuracies": 0.75, "rewards/chosen": -33.884742736816406, "rewards/margins": 5.2018256187438965, "rewards/rejected": -39.08656311035156, "step": 2562 }, { "epoch": 0.34899237472766886, "grad_norm": 51.13096245155539, "learning_rate": 6.582542299940744e-07, "logits/chosen": 12.476245880126953, "logits/rejected": 12.49885082244873, "logps/chosen": -3.9631056785583496, "logps/rejected": -4.015259742736816, "loss": 3.6958, "rewards/accuracies": 0.5, "rewards/chosen": -39.63105773925781, "rewards/margins": 0.5215435028076172, "rewards/rejected": -40.15260314941406, "step": 2563 }, { "epoch": 0.3491285403050109, "grad_norm": 43.27642618383524, "learning_rate": 6.581090010269825e-07, "logits/chosen": 12.574786186218262, "logits/rejected": 12.381410598754883, "logps/chosen": -3.884871482849121, "logps/rejected": -3.739455223083496, "loss": 4.0437, "rewards/accuracies": 0.5, "rewards/chosen": -38.848716735839844, "rewards/margins": -1.4541630744934082, "rewards/rejected": -37.394554138183594, "step": 2564 }, { "epoch": 0.3492647058823529, "grad_norm": 43.241972338258705, "learning_rate": 6.579637137380282e-07, "logits/chosen": 12.848904609680176, "logits/rejected": 12.018932342529297, "logps/chosen": -3.733402729034424, "logps/rejected": -3.494046688079834, "loss": 4.1367, "rewards/accuracies": 0.25, "rewards/chosen": -37.33403015136719, "rewards/margins": -2.3935580253601074, "rewards/rejected": -34.940467834472656, "step": 2565 }, { "epoch": 0.349400871459695, "grad_norm": 44.72330949672399, "learning_rate": 6.578183681600405e-07, "logits/chosen": 12.112100601196289, "logits/rejected": 12.100804328918457, "logps/chosen": -3.600151538848877, "logps/rejected": -3.9968318939208984, "loss": 4.0144, "rewards/accuracies": 0.75, "rewards/chosen": -36.00151443481445, "rewards/margins": 3.966803550720215, "rewards/rejected": -39.96831512451172, "step": 2566 }, { "epoch": 0.34953703703703703, "grad_norm": 41.0065873550226, "learning_rate": 6.576729643258613e-07, "logits/chosen": 11.593473434448242, "logits/rejected": 11.824127197265625, "logps/chosen": -3.1684343814849854, "logps/rejected": -3.61498761177063, "loss": 3.5938, "rewards/accuracies": 0.75, "rewards/chosen": -31.684343338012695, "rewards/margins": 4.465532302856445, "rewards/rejected": -36.14987564086914, "step": 2567 }, { "epoch": 0.34967320261437906, "grad_norm": 39.69483441968218, "learning_rate": 6.575275022683459e-07, "logits/chosen": 11.097533226013184, "logits/rejected": 11.297365188598633, "logps/chosen": -3.6029624938964844, "logps/rejected": -3.845276355743408, "loss": 4.1429, "rewards/accuracies": 1.0, "rewards/chosen": -36.029624938964844, "rewards/margins": 2.4231371879577637, "rewards/rejected": -38.452762603759766, "step": 2568 }, { "epoch": 0.34980936819172115, "grad_norm": 37.18366585630156, "learning_rate": 6.573819820203627e-07, "logits/chosen": 11.480932235717773, "logits/rejected": 12.291081428527832, "logps/chosen": -3.4519171714782715, "logps/rejected": -3.6491620540618896, "loss": 3.453, "rewards/accuracies": 0.75, "rewards/chosen": -34.51917266845703, "rewards/margins": 1.9724507331848145, "rewards/rejected": -36.49162292480469, "step": 2569 }, { "epoch": 0.3499455337690632, "grad_norm": 40.030344575166914, "learning_rate": 6.572364036147931e-07, "logits/chosen": 11.583667755126953, "logits/rejected": 11.982927322387695, "logps/chosen": -3.4469892978668213, "logps/rejected": -3.808469295501709, "loss": 4.102, "rewards/accuracies": 0.75, "rewards/chosen": -34.46989440917969, "rewards/margins": 3.6147985458374023, "rewards/rejected": -38.084693908691406, "step": 2570 }, { "epoch": 0.3500816993464052, "grad_norm": 36.91492857297092, "learning_rate": 6.570907670845316e-07, "logits/chosen": 11.12828254699707, "logits/rejected": 11.196438789367676, "logps/chosen": -2.986656665802002, "logps/rejected": -3.430727958679199, "loss": 3.5623, "rewards/accuracies": 1.0, "rewards/chosen": -29.866567611694336, "rewards/margins": 4.4407148361206055, "rewards/rejected": -34.307281494140625, "step": 2571 }, { "epoch": 0.3502178649237473, "grad_norm": 41.21864390025073, "learning_rate": 6.569450724624863e-07, "logits/chosen": 11.35545539855957, "logits/rejected": 10.890380859375, "logps/chosen": -3.355705499649048, "logps/rejected": -3.399707794189453, "loss": 3.8469, "rewards/accuracies": 0.5, "rewards/chosen": -33.55705261230469, "rewards/margins": 0.44002485275268555, "rewards/rejected": -33.99707794189453, "step": 2572 }, { "epoch": 0.3503540305010893, "grad_norm": 36.835499653051855, "learning_rate": 6.567993197815779e-07, "logits/chosen": 10.81479549407959, "logits/rejected": 11.535726547241211, "logps/chosen": -3.602616786956787, "logps/rejected": -3.8672006130218506, "loss": 3.7047, "rewards/accuracies": 0.75, "rewards/chosen": -36.02616882324219, "rewards/margins": 2.64583683013916, "rewards/rejected": -38.67200469970703, "step": 2573 }, { "epoch": 0.35049019607843135, "grad_norm": 37.47342652264354, "learning_rate": 6.566535090747404e-07, "logits/chosen": 11.140817642211914, "logits/rejected": 11.07159423828125, "logps/chosen": -3.549149513244629, "logps/rejected": -3.7471282482147217, "loss": 3.5773, "rewards/accuracies": 0.75, "rewards/chosen": -35.491493225097656, "rewards/margins": 1.9797868728637695, "rewards/rejected": -37.471282958984375, "step": 2574 }, { "epoch": 0.35062636165577343, "grad_norm": 40.347977322402734, "learning_rate": 6.565076403749211e-07, "logits/chosen": 11.057852745056152, "logits/rejected": 11.331098556518555, "logps/chosen": -3.6069302558898926, "logps/rejected": -3.6204023361206055, "loss": 4.0085, "rewards/accuracies": 0.5, "rewards/chosen": -36.069305419921875, "rewards/margins": 0.1347203254699707, "rewards/rejected": -36.20402526855469, "step": 2575 }, { "epoch": 0.35076252723311546, "grad_norm": 44.80077619366419, "learning_rate": 6.563617137150801e-07, "logits/chosen": 11.257038116455078, "logits/rejected": 11.609855651855469, "logps/chosen": -3.4371373653411865, "logps/rejected": -3.742253303527832, "loss": 3.8432, "rewards/accuracies": 0.5, "rewards/chosen": -34.371376037597656, "rewards/margins": 3.0511598587036133, "rewards/rejected": -37.42253494262695, "step": 2576 }, { "epoch": 0.3508986928104575, "grad_norm": 41.577629079172326, "learning_rate": 6.562157291281908e-07, "logits/chosen": 11.444422721862793, "logits/rejected": 11.707406997680664, "logps/chosen": -3.201617956161499, "logps/rejected": -3.6693077087402344, "loss": 3.908, "rewards/accuracies": 1.0, "rewards/chosen": -32.01618194580078, "rewards/margins": 4.67689847946167, "rewards/rejected": -36.693077087402344, "step": 2577 }, { "epoch": 0.3510348583877996, "grad_norm": 40.28179792678638, "learning_rate": 6.560696866472396e-07, "logits/chosen": 11.86513900756836, "logits/rejected": 12.434845924377441, "logps/chosen": -3.663839817047119, "logps/rejected": -3.6120657920837402, "loss": 4.2563, "rewards/accuracies": 0.5, "rewards/chosen": -36.638397216796875, "rewards/margins": -0.5177431106567383, "rewards/rejected": -36.12065505981445, "step": 2578 }, { "epoch": 0.3511710239651416, "grad_norm": 37.21155290335487, "learning_rate": 6.559235863052259e-07, "logits/chosen": 10.693817138671875, "logits/rejected": 11.572002410888672, "logps/chosen": -3.587472438812256, "logps/rejected": -4.050544738769531, "loss": 3.8841, "rewards/accuracies": 1.0, "rewards/chosen": -35.874725341796875, "rewards/margins": 4.630724906921387, "rewards/rejected": -40.50544738769531, "step": 2579 }, { "epoch": 0.35130718954248363, "grad_norm": 41.82877860653921, "learning_rate": 6.557774281351626e-07, "logits/chosen": 10.451534271240234, "logits/rejected": 11.617424011230469, "logps/chosen": -3.1146163940429688, "logps/rejected": -3.7056353092193604, "loss": 4.0541, "rewards/accuracies": 1.0, "rewards/chosen": -31.146163940429688, "rewards/margins": 5.9101881980896, "rewards/rejected": -37.05635070800781, "step": 2580 }, { "epoch": 0.3514433551198257, "grad_norm": 44.78332347734177, "learning_rate": 6.556312121700751e-07, "logits/chosen": 10.446215629577637, "logits/rejected": 11.078575134277344, "logps/chosen": -3.0965471267700195, "logps/rejected": -3.494260787963867, "loss": 3.8344, "rewards/accuracies": 1.0, "rewards/chosen": -30.965473175048828, "rewards/margins": 3.9771342277526855, "rewards/rejected": -34.942604064941406, "step": 2581 }, { "epoch": 0.35157952069716775, "grad_norm": 51.55703959868924, "learning_rate": 6.55484938443002e-07, "logits/chosen": 10.539129257202148, "logits/rejected": 11.93136978149414, "logps/chosen": -3.43404483795166, "logps/rejected": -3.7351109981536865, "loss": 4.1838, "rewards/accuracies": 0.5, "rewards/chosen": -34.340450286865234, "rewards/margins": 3.010660171508789, "rewards/rejected": -37.351112365722656, "step": 2582 }, { "epoch": 0.35171568627450983, "grad_norm": 40.99504025551104, "learning_rate": 6.553386069869953e-07, "logits/chosen": 10.606315612792969, "logits/rejected": 11.481765747070312, "logps/chosen": -3.3238165378570557, "logps/rejected": -3.587697744369507, "loss": 3.9581, "rewards/accuracies": 0.75, "rewards/chosen": -33.23816680908203, "rewards/margins": 2.6388115882873535, "rewards/rejected": -35.876976013183594, "step": 2583 }, { "epoch": 0.35185185185185186, "grad_norm": 39.32000676860546, "learning_rate": 6.551922178351196e-07, "logits/chosen": 10.62682819366455, "logits/rejected": 11.199377059936523, "logps/chosen": -3.475156307220459, "logps/rejected": -3.583028793334961, "loss": 4.2457, "rewards/accuracies": 0.75, "rewards/chosen": -34.751564025878906, "rewards/margins": 1.0787248611450195, "rewards/rejected": -35.83028793334961, "step": 2584 }, { "epoch": 0.3519880174291939, "grad_norm": 38.894490898838555, "learning_rate": 6.55045771020453e-07, "logits/chosen": 11.212221145629883, "logits/rejected": 12.581198692321777, "logps/chosen": -3.0378992557525635, "logps/rejected": -3.7306630611419678, "loss": 3.9522, "rewards/accuracies": 1.0, "rewards/chosen": -30.37899398803711, "rewards/margins": 6.927638053894043, "rewards/rejected": -37.30663299560547, "step": 2585 }, { "epoch": 0.352124183006536, "grad_norm": 36.722013165719474, "learning_rate": 6.548992665760861e-07, "logits/chosen": 11.754928588867188, "logits/rejected": 10.990327835083008, "logps/chosen": -3.361264705657959, "logps/rejected": -3.2991058826446533, "loss": 4.0998, "rewards/accuracies": 0.5, "rewards/chosen": -33.612648010253906, "rewards/margins": -0.6215887069702148, "rewards/rejected": -32.991058349609375, "step": 2586 }, { "epoch": 0.352260348583878, "grad_norm": 39.397394160044165, "learning_rate": 6.547527045351228e-07, "logits/chosen": 11.673891067504883, "logits/rejected": 11.416221618652344, "logps/chosen": -3.380143404006958, "logps/rejected": -3.62404203414917, "loss": 3.7712, "rewards/accuracies": 0.75, "rewards/chosen": -33.80143356323242, "rewards/margins": 2.4389867782592773, "rewards/rejected": -36.24041748046875, "step": 2587 }, { "epoch": 0.35239651416122003, "grad_norm": 36.56380972491938, "learning_rate": 6.546060849306803e-07, "logits/chosen": 11.138622283935547, "logits/rejected": 10.274799346923828, "logps/chosen": -3.281320810317993, "logps/rejected": -3.3691728115081787, "loss": 3.7557, "rewards/accuracies": 0.5, "rewards/chosen": -32.813209533691406, "rewards/margins": 0.8785200119018555, "rewards/rejected": -33.69172668457031, "step": 2588 }, { "epoch": 0.3525326797385621, "grad_norm": 47.96750975924945, "learning_rate": 6.544594077958882e-07, "logits/chosen": 11.02418327331543, "logits/rejected": 11.472063064575195, "logps/chosen": -3.4791746139526367, "logps/rejected": -3.7079501152038574, "loss": 4.4409, "rewards/accuracies": 0.75, "rewards/chosen": -34.791744232177734, "rewards/margins": 2.2877583503723145, "rewards/rejected": -37.07950210571289, "step": 2589 }, { "epoch": 0.35266884531590414, "grad_norm": 38.95129412584552, "learning_rate": 6.543126731638896e-07, "logits/chosen": 11.26034927368164, "logits/rejected": 11.94973373413086, "logps/chosen": -3.442995071411133, "logps/rejected": -3.644871473312378, "loss": 4.4626, "rewards/accuracies": 0.75, "rewards/chosen": -34.42995071411133, "rewards/margins": 2.0187625885009766, "rewards/rejected": -36.44871520996094, "step": 2590 }, { "epoch": 0.3528050108932462, "grad_norm": 47.64803777234132, "learning_rate": 6.541658810678404e-07, "logits/chosen": 10.945446014404297, "logits/rejected": 12.080703735351562, "logps/chosen": -3.5930447578430176, "logps/rejected": -3.5163872241973877, "loss": 4.2433, "rewards/accuracies": 0.25, "rewards/chosen": -35.93044662475586, "rewards/margins": -0.7665739059448242, "rewards/rejected": -35.16387176513672, "step": 2591 }, { "epoch": 0.35294117647058826, "grad_norm": 40.455704566001934, "learning_rate": 6.540190315409092e-07, "logits/chosen": 10.034444808959961, "logits/rejected": 11.478012084960938, "logps/chosen": -3.099290370941162, "logps/rejected": -3.7032830715179443, "loss": 4.2416, "rewards/accuracies": 0.75, "rewards/chosen": -30.992904663085938, "rewards/margins": 6.039925575256348, "rewards/rejected": -37.03282928466797, "step": 2592 }, { "epoch": 0.3530773420479303, "grad_norm": 37.885213416323055, "learning_rate": 6.538721246162783e-07, "logits/chosen": 11.896596908569336, "logits/rejected": 11.102038383483887, "logps/chosen": -3.7707417011260986, "logps/rejected": -3.6748390197753906, "loss": 4.4551, "rewards/accuracies": 0.25, "rewards/chosen": -37.70741653442383, "rewards/margins": -0.9590277671813965, "rewards/rejected": -36.748390197753906, "step": 2593 }, { "epoch": 0.3532135076252723, "grad_norm": 44.49281176713341, "learning_rate": 6.537251603271421e-07, "logits/chosen": 11.775226593017578, "logits/rejected": 11.579885482788086, "logps/chosen": -3.497762441635132, "logps/rejected": -3.5198066234588623, "loss": 4.5675, "rewards/accuracies": 0.5, "rewards/chosen": -34.977622985839844, "rewards/margins": 0.22044038772583008, "rewards/rejected": -35.19806671142578, "step": 2594 }, { "epoch": 0.3533496732026144, "grad_norm": 39.28383084025157, "learning_rate": 6.535781387067088e-07, "logits/chosen": 12.070554733276367, "logits/rejected": 11.393267631530762, "logps/chosen": -3.787743091583252, "logps/rejected": -3.778968095779419, "loss": 3.6521, "rewards/accuracies": 0.5, "rewards/chosen": -37.8774299621582, "rewards/margins": -0.08774852752685547, "rewards/rejected": -37.78968048095703, "step": 2595 }, { "epoch": 0.35348583877995643, "grad_norm": 58.27610633764184, "learning_rate": 6.534310597881989e-07, "logits/chosen": 10.917036056518555, "logits/rejected": 11.433764457702637, "logps/chosen": -3.73576283454895, "logps/rejected": -3.8724517822265625, "loss": 4.481, "rewards/accuracies": 0.5, "rewards/chosen": -37.357627868652344, "rewards/margins": 1.3668899536132812, "rewards/rejected": -38.724517822265625, "step": 2596 }, { "epoch": 0.35362200435729846, "grad_norm": 38.96533176126957, "learning_rate": 6.532839236048461e-07, "logits/chosen": 11.950557708740234, "logits/rejected": 11.466595649719238, "logps/chosen": -3.8553714752197266, "logps/rejected": -3.704019069671631, "loss": 3.9819, "rewards/accuracies": 0.25, "rewards/chosen": -38.55371856689453, "rewards/margins": -1.513524055480957, "rewards/rejected": -37.040191650390625, "step": 2597 }, { "epoch": 0.35375816993464054, "grad_norm": 39.30525844914929, "learning_rate": 6.53136730189897e-07, "logits/chosen": 10.438521385192871, "logits/rejected": 10.89183521270752, "logps/chosen": -3.61251163482666, "logps/rejected": -3.824293613433838, "loss": 3.831, "rewards/accuracies": 0.75, "rewards/chosen": -36.12511444091797, "rewards/margins": 2.1178221702575684, "rewards/rejected": -38.24293518066406, "step": 2598 }, { "epoch": 0.35389433551198257, "grad_norm": 42.49539054277587, "learning_rate": 6.529894795766114e-07, "logits/chosen": 11.227675437927246, "logits/rejected": 11.449004173278809, "logps/chosen": -3.64900279045105, "logps/rejected": -3.5700008869171143, "loss": 4.4724, "rewards/accuracies": 0.5, "rewards/chosen": -36.490028381347656, "rewards/margins": -0.7900180816650391, "rewards/rejected": -35.700008392333984, "step": 2599 }, { "epoch": 0.3540305010893246, "grad_norm": 50.784810383664286, "learning_rate": 6.528421717982616e-07, "logits/chosen": 11.906231880187988, "logits/rejected": 11.48879623413086, "logps/chosen": -4.021893501281738, "logps/rejected": -3.922773599624634, "loss": 4.1041, "rewards/accuracies": 0.5, "rewards/chosen": -40.21893310546875, "rewards/margins": -0.9911966323852539, "rewards/rejected": -39.22773742675781, "step": 2600 }, { "epoch": 0.3541666666666667, "grad_norm": 42.56498881005107, "learning_rate": 6.526948068881332e-07, "logits/chosen": 10.593505859375, "logits/rejected": 11.082985877990723, "logps/chosen": -3.559006690979004, "logps/rejected": -3.7480289936065674, "loss": 3.9073, "rewards/accuracies": 0.75, "rewards/chosen": -35.590065002441406, "rewards/margins": 1.8902239799499512, "rewards/rejected": -37.48029327392578, "step": 2601 }, { "epoch": 0.3543028322440087, "grad_norm": 38.406405462249374, "learning_rate": 6.525473848795243e-07, "logits/chosen": 11.089881896972656, "logits/rejected": 11.187216758728027, "logps/chosen": -3.6232471466064453, "logps/rejected": -3.5703368186950684, "loss": 4.3063, "rewards/accuracies": 0.75, "rewards/chosen": -36.23247528076172, "rewards/margins": -0.5291042327880859, "rewards/rejected": -35.703369140625, "step": 2602 }, { "epoch": 0.35443899782135074, "grad_norm": 45.89491013186262, "learning_rate": 6.523999058057462e-07, "logits/chosen": 12.185530662536621, "logits/rejected": 11.26700210571289, "logps/chosen": -3.684791088104248, "logps/rejected": -3.5604162216186523, "loss": 4.3979, "rewards/accuracies": 0.5, "rewards/chosen": -36.8479118347168, "rewards/margins": -1.2437496185302734, "rewards/rejected": -35.60416030883789, "step": 2603 }, { "epoch": 0.3545751633986928, "grad_norm": 41.0046748472499, "learning_rate": 6.522523697001231e-07, "logits/chosen": 11.082655906677246, "logits/rejected": 10.30771255493164, "logps/chosen": -3.5950284004211426, "logps/rejected": -3.3469560146331787, "loss": 3.6595, "rewards/accuracies": 0.25, "rewards/chosen": -35.950286865234375, "rewards/margins": -2.4807262420654297, "rewards/rejected": -33.46955871582031, "step": 2604 }, { "epoch": 0.35471132897603486, "grad_norm": 41.829725347601766, "learning_rate": 6.521047765959919e-07, "logits/chosen": 10.467968940734863, "logits/rejected": 10.928155899047852, "logps/chosen": -3.464801073074341, "logps/rejected": -3.614236354827881, "loss": 4.6713, "rewards/accuracies": 0.75, "rewards/chosen": -34.64801025390625, "rewards/margins": 1.4943485260009766, "rewards/rejected": -36.14236068725586, "step": 2605 }, { "epoch": 0.3548474945533769, "grad_norm": 41.38131890746075, "learning_rate": 6.519571265267025e-07, "logits/chosen": 11.40889835357666, "logits/rejected": 10.443014144897461, "logps/chosen": -3.4660840034484863, "logps/rejected": -3.1490352153778076, "loss": 4.3813, "rewards/accuracies": 0.25, "rewards/chosen": -34.66083908081055, "rewards/margins": -3.170487403869629, "rewards/rejected": -31.490352630615234, "step": 2606 }, { "epoch": 0.35498366013071897, "grad_norm": 39.81081050903925, "learning_rate": 6.518094195256175e-07, "logits/chosen": 10.68980598449707, "logits/rejected": 10.985042572021484, "logps/chosen": -3.237222671508789, "logps/rejected": -3.4498915672302246, "loss": 4.1656, "rewards/accuracies": 0.5, "rewards/chosen": -32.37222671508789, "rewards/margins": 2.1266889572143555, "rewards/rejected": -34.49891662597656, "step": 2607 }, { "epoch": 0.355119825708061, "grad_norm": 41.38114588728068, "learning_rate": 6.516616556261129e-07, "logits/chosen": 11.210620880126953, "logits/rejected": 11.087647438049316, "logps/chosen": -4.172677040100098, "logps/rejected": -4.04877233505249, "loss": 4.5248, "rewards/accuracies": 0.25, "rewards/chosen": -41.72677230834961, "rewards/margins": -1.2390508651733398, "rewards/rejected": -40.48772048950195, "step": 2608 }, { "epoch": 0.355255991285403, "grad_norm": 44.33370850153083, "learning_rate": 6.51513834861577e-07, "logits/chosen": 10.581985473632812, "logits/rejected": 11.50780963897705, "logps/chosen": -3.5110106468200684, "logps/rejected": -3.5814461708068848, "loss": 4.0017, "rewards/accuracies": 0.5, "rewards/chosen": -35.110103607177734, "rewards/margins": 0.7043571472167969, "rewards/rejected": -35.81446075439453, "step": 2609 }, { "epoch": 0.3553921568627451, "grad_norm": 41.790079734866715, "learning_rate": 6.513659572654108e-07, "logits/chosen": 11.357799530029297, "logits/rejected": 12.021283149719238, "logps/chosen": -3.409646511077881, "logps/rejected": -3.5070788860321045, "loss": 4.1283, "rewards/accuracies": 0.5, "rewards/chosen": -34.096466064453125, "rewards/margins": 0.9743227958679199, "rewards/rejected": -35.0707893371582, "step": 2610 }, { "epoch": 0.35552832244008714, "grad_norm": 39.71777449703056, "learning_rate": 6.512180228710288e-07, "logits/chosen": 10.321197509765625, "logits/rejected": 11.306962966918945, "logps/chosen": -3.1069259643554688, "logps/rejected": -3.186741828918457, "loss": 4.0486, "rewards/accuracies": 0.5, "rewards/chosen": -31.069259643554688, "rewards/margins": 0.7981595993041992, "rewards/rejected": -31.867420196533203, "step": 2611 }, { "epoch": 0.35566448801742917, "grad_norm": 46.258704144273146, "learning_rate": 6.510700317118582e-07, "logits/chosen": 11.093389511108398, "logits/rejected": 12.436522483825684, "logps/chosen": -3.1828718185424805, "logps/rejected": -3.636338233947754, "loss": 3.919, "rewards/accuracies": 0.75, "rewards/chosen": -31.828720092773438, "rewards/margins": 4.534665107727051, "rewards/rejected": -36.36338424682617, "step": 2612 }, { "epoch": 0.35580065359477125, "grad_norm": 36.39140335518771, "learning_rate": 6.509219838213383e-07, "logits/chosen": 9.62155532836914, "logits/rejected": 10.909435272216797, "logps/chosen": -2.850895881652832, "logps/rejected": -3.30570650100708, "loss": 4.351, "rewards/accuracies": 0.75, "rewards/chosen": -28.508960723876953, "rewards/margins": 4.548104763031006, "rewards/rejected": -33.05706787109375, "step": 2613 }, { "epoch": 0.3559368191721133, "grad_norm": 40.04574641812795, "learning_rate": 6.507738792329222e-07, "logits/chosen": 11.635550498962402, "logits/rejected": 11.47933578491211, "logps/chosen": -3.4908719062805176, "logps/rejected": -3.7291088104248047, "loss": 4.1723, "rewards/accuracies": 0.75, "rewards/chosen": -34.908721923828125, "rewards/margins": 2.3823676109313965, "rewards/rejected": -37.29108810424805, "step": 2614 }, { "epoch": 0.3560729847494553, "grad_norm": 71.10222185857117, "learning_rate": 6.506257179800751e-07, "logits/chosen": 11.07835578918457, "logits/rejected": 10.864418983459473, "logps/chosen": -3.8124217987060547, "logps/rejected": -3.598759889602661, "loss": 4.3026, "rewards/accuracies": 0.25, "rewards/chosen": -38.12421417236328, "rewards/margins": -2.136618137359619, "rewards/rejected": -35.98759841918945, "step": 2615 }, { "epoch": 0.3562091503267974, "grad_norm": 37.42507686748991, "learning_rate": 6.504775000962752e-07, "logits/chosen": 11.582796096801758, "logits/rejected": 11.92117691040039, "logps/chosen": -3.392448902130127, "logps/rejected": -3.6701407432556152, "loss": 3.5381, "rewards/accuracies": 0.75, "rewards/chosen": -33.92448425292969, "rewards/margins": 2.776919364929199, "rewards/rejected": -36.7014045715332, "step": 2616 }, { "epoch": 0.3563453159041394, "grad_norm": 39.1491563949031, "learning_rate": 6.503292256150139e-07, "logits/chosen": 9.651887893676758, "logits/rejected": 10.97449016571045, "logps/chosen": -3.334868907928467, "logps/rejected": -3.722658157348633, "loss": 4.1009, "rewards/accuracies": 0.5, "rewards/chosen": -33.348690032958984, "rewards/margins": 3.8778934478759766, "rewards/rejected": -37.226585388183594, "step": 2617 }, { "epoch": 0.35648148148148145, "grad_norm": 42.56231085140262, "learning_rate": 6.501808945697947e-07, "logits/chosen": 10.508206367492676, "logits/rejected": 11.893125534057617, "logps/chosen": -3.1354260444641113, "logps/rejected": -3.4172916412353516, "loss": 4.2786, "rewards/accuracies": 0.75, "rewards/chosen": -31.35426139831543, "rewards/margins": 2.818657398223877, "rewards/rejected": -34.17292022705078, "step": 2618 }, { "epoch": 0.35661764705882354, "grad_norm": 49.201479924438196, "learning_rate": 6.500325069941343e-07, "logits/chosen": 11.352843284606934, "logits/rejected": 10.902658462524414, "logps/chosen": -3.536437511444092, "logps/rejected": -3.583329200744629, "loss": 3.9883, "rewards/accuracies": 0.75, "rewards/chosen": -35.364376068115234, "rewards/margins": 0.4689188003540039, "rewards/rejected": -35.83329391479492, "step": 2619 }, { "epoch": 0.35675381263616557, "grad_norm": 39.11223053944991, "learning_rate": 6.498840629215623e-07, "logits/chosen": 11.216634750366211, "logits/rejected": 11.821516036987305, "logps/chosen": -3.5163397789001465, "logps/rejected": -3.8946778774261475, "loss": 4.0172, "rewards/accuracies": 1.0, "rewards/chosen": -35.163394927978516, "rewards/margins": 3.7833828926086426, "rewards/rejected": -38.94677734375, "step": 2620 }, { "epoch": 0.35688997821350765, "grad_norm": 37.208769782051874, "learning_rate": 6.497355623856207e-07, "logits/chosen": 12.018465995788574, "logits/rejected": 11.391618728637695, "logps/chosen": -3.7156662940979004, "logps/rejected": -3.800558090209961, "loss": 3.7654, "rewards/accuracies": 0.5, "rewards/chosen": -37.15666198730469, "rewards/margins": 0.8489160537719727, "rewards/rejected": -38.00558090209961, "step": 2621 }, { "epoch": 0.3570261437908497, "grad_norm": 39.182194875998384, "learning_rate": 6.495870054198644e-07, "logits/chosen": 11.318939208984375, "logits/rejected": 10.896125793457031, "logps/chosen": -3.2180166244506836, "logps/rejected": -3.2341878414154053, "loss": 4.315, "rewards/accuracies": 0.5, "rewards/chosen": -32.18016815185547, "rewards/margins": 0.161712646484375, "rewards/rejected": -32.341880798339844, "step": 2622 }, { "epoch": 0.3571623093681917, "grad_norm": 43.70425512064614, "learning_rate": 6.494383920578612e-07, "logits/chosen": 10.245820999145508, "logits/rejected": 11.260616302490234, "logps/chosen": -2.9314465522766113, "logps/rejected": -3.181260108947754, "loss": 4.2706, "rewards/accuracies": 0.75, "rewards/chosen": -29.314464569091797, "rewards/margins": 2.4981346130371094, "rewards/rejected": -31.812599182128906, "step": 2623 }, { "epoch": 0.3572984749455338, "grad_norm": 40.87810501734275, "learning_rate": 6.492897223331913e-07, "logits/chosen": 10.368705749511719, "logits/rejected": 10.960101127624512, "logps/chosen": -3.2266316413879395, "logps/rejected": -3.886050224304199, "loss": 4.3437, "rewards/accuracies": 1.0, "rewards/chosen": -32.266319274902344, "rewards/margins": 6.594184398651123, "rewards/rejected": -38.86050033569336, "step": 2624 }, { "epoch": 0.3574346405228758, "grad_norm": 41.810510988000864, "learning_rate": 6.49140996279448e-07, "logits/chosen": 10.253844261169434, "logits/rejected": 10.503377914428711, "logps/chosen": -2.8210856914520264, "logps/rejected": -3.16786527633667, "loss": 3.8995, "rewards/accuracies": 0.75, "rewards/chosen": -28.210857391357422, "rewards/margins": 3.46779727935791, "rewards/rejected": -31.678653717041016, "step": 2625 }, { "epoch": 0.35757080610021785, "grad_norm": 36.019473226111764, "learning_rate": 6.489922139302372e-07, "logits/chosen": 10.52083969116211, "logits/rejected": 10.836383819580078, "logps/chosen": -3.4539217948913574, "logps/rejected": -3.6969614028930664, "loss": 3.6838, "rewards/accuracies": 0.75, "rewards/chosen": -34.539215087890625, "rewards/margins": 2.4303956031799316, "rewards/rejected": -36.96961212158203, "step": 2626 }, { "epoch": 0.35770697167755994, "grad_norm": 42.30446742741355, "learning_rate": 6.488433753191776e-07, "logits/chosen": 12.534900665283203, "logits/rejected": 12.239276885986328, "logps/chosen": -3.9375064373016357, "logps/rejected": -4.216677665710449, "loss": 4.0974, "rewards/accuracies": 0.75, "rewards/chosen": -39.375064849853516, "rewards/margins": 2.7917118072509766, "rewards/rejected": -42.166778564453125, "step": 2627 }, { "epoch": 0.35784313725490197, "grad_norm": 39.272534807485606, "learning_rate": 6.486944804799002e-07, "logits/chosen": 10.672521591186523, "logits/rejected": 10.877843856811523, "logps/chosen": -2.9653520584106445, "logps/rejected": -3.2878310680389404, "loss": 4.117, "rewards/accuracies": 0.75, "rewards/chosen": -29.653520584106445, "rewards/margins": 3.224790096282959, "rewards/rejected": -32.87831115722656, "step": 2628 }, { "epoch": 0.357979302832244, "grad_norm": 42.03484735151999, "learning_rate": 6.485455294460494e-07, "logits/chosen": 10.910383224487305, "logits/rejected": 11.720233917236328, "logps/chosen": -3.3905506134033203, "logps/rejected": -3.5945870876312256, "loss": 4.1979, "rewards/accuracies": 0.75, "rewards/chosen": -33.9055061340332, "rewards/margins": 2.0403666496276855, "rewards/rejected": -35.94587326049805, "step": 2629 }, { "epoch": 0.3581154684095861, "grad_norm": 39.61067210096888, "learning_rate": 6.483965222512815e-07, "logits/chosen": 11.320602416992188, "logits/rejected": 11.480588912963867, "logps/chosen": -3.6419870853424072, "logps/rejected": -3.8786160945892334, "loss": 3.6617, "rewards/accuracies": 0.75, "rewards/chosen": -36.41987228393555, "rewards/margins": 2.3662872314453125, "rewards/rejected": -38.786163330078125, "step": 2630 }, { "epoch": 0.3582516339869281, "grad_norm": 41.750761055444435, "learning_rate": 6.482474589292662e-07, "logits/chosen": 11.140953063964844, "logits/rejected": 11.117298126220703, "logps/chosen": -3.516569137573242, "logps/rejected": -3.591484546661377, "loss": 4.4164, "rewards/accuracies": 0.75, "rewards/chosen": -35.16569137573242, "rewards/margins": 0.7491540908813477, "rewards/rejected": -35.91484832763672, "step": 2631 }, { "epoch": 0.35838779956427014, "grad_norm": 45.427477397725795, "learning_rate": 6.480983395136857e-07, "logits/chosen": 11.431562423706055, "logits/rejected": 11.925987243652344, "logps/chosen": -3.315042018890381, "logps/rejected": -3.7200160026550293, "loss": 4.2911, "rewards/accuracies": 1.0, "rewards/chosen": -33.150421142578125, "rewards/margins": 4.049737930297852, "rewards/rejected": -37.200157165527344, "step": 2632 }, { "epoch": 0.3585239651416122, "grad_norm": 39.71269498881784, "learning_rate": 6.479491640382343e-07, "logits/chosen": 11.83818244934082, "logits/rejected": 11.96247673034668, "logps/chosen": -3.6489243507385254, "logps/rejected": -3.4455032348632812, "loss": 4.0234, "rewards/accuracies": 0.25, "rewards/chosen": -36.4892463684082, "rewards/margins": -2.0342116355895996, "rewards/rejected": -34.45503234863281, "step": 2633 }, { "epoch": 0.35866013071895425, "grad_norm": 40.92214242584262, "learning_rate": 6.477999325366199e-07, "logits/chosen": 10.54652214050293, "logits/rejected": 11.421384811401367, "logps/chosen": -3.5155441761016846, "logps/rejected": -3.6731035709381104, "loss": 4.2905, "rewards/accuracies": 0.75, "rewards/chosen": -35.15544128417969, "rewards/margins": 1.5755949020385742, "rewards/rejected": -36.73103332519531, "step": 2634 }, { "epoch": 0.3587962962962963, "grad_norm": 58.18026063180023, "learning_rate": 6.476506450425624e-07, "logits/chosen": 9.106855392456055, "logits/rejected": 11.063724517822266, "logps/chosen": -3.0240743160247803, "logps/rejected": -3.7264678478240967, "loss": 4.2413, "rewards/accuracies": 1.0, "rewards/chosen": -30.240745544433594, "rewards/margins": 7.023933410644531, "rewards/rejected": -37.264678955078125, "step": 2635 }, { "epoch": 0.35893246187363836, "grad_norm": 39.30561220006262, "learning_rate": 6.475013015897945e-07, "logits/chosen": 10.931739807128906, "logits/rejected": 12.335254669189453, "logps/chosen": -3.1483073234558105, "logps/rejected": -3.4562952518463135, "loss": 3.8129, "rewards/accuracies": 0.75, "rewards/chosen": -31.48307228088379, "rewards/margins": 3.079878330230713, "rewards/rejected": -34.562950134277344, "step": 2636 }, { "epoch": 0.3590686274509804, "grad_norm": 48.70035173976094, "learning_rate": 6.473519022120616e-07, "logits/chosen": 10.138431549072266, "logits/rejected": 10.715166091918945, "logps/chosen": -3.47640323638916, "logps/rejected": -3.680243730545044, "loss": 4.1758, "rewards/accuracies": 1.0, "rewards/chosen": -34.764034271240234, "rewards/margins": 2.0384035110473633, "rewards/rejected": -36.80243682861328, "step": 2637 }, { "epoch": 0.3592047930283224, "grad_norm": 44.1837464576094, "learning_rate": 6.47202446943122e-07, "logits/chosen": 10.80662727355957, "logits/rejected": 11.522308349609375, "logps/chosen": -3.385047674179077, "logps/rejected": -3.8060953617095947, "loss": 3.7515, "rewards/accuracies": 0.75, "rewards/chosen": -33.85047912597656, "rewards/margins": 4.210477828979492, "rewards/rejected": -38.06095504760742, "step": 2638 }, { "epoch": 0.3593409586056645, "grad_norm": 37.95151297408086, "learning_rate": 6.470529358167459e-07, "logits/chosen": 10.459224700927734, "logits/rejected": 11.119461059570312, "logps/chosen": -3.393733501434326, "logps/rejected": -3.733696937561035, "loss": 3.758, "rewards/accuracies": 0.75, "rewards/chosen": -33.93733215332031, "rewards/margins": 3.399636745452881, "rewards/rejected": -37.33696746826172, "step": 2639 }, { "epoch": 0.35947712418300654, "grad_norm": 42.21872391041109, "learning_rate": 6.469033688667167e-07, "logits/chosen": 10.543841361999512, "logits/rejected": 11.389944076538086, "logps/chosen": -3.2768449783325195, "logps/rejected": -3.901893377304077, "loss": 4.041, "rewards/accuracies": 1.0, "rewards/chosen": -32.76844787597656, "rewards/margins": 6.250484466552734, "rewards/rejected": -39.0189323425293, "step": 2640 }, { "epoch": 0.35961328976034856, "grad_norm": 39.6202489406811, "learning_rate": 6.467537461268306e-07, "logits/chosen": 11.138126373291016, "logits/rejected": 10.453474044799805, "logps/chosen": -3.249150514602661, "logps/rejected": -3.3493857383728027, "loss": 3.8312, "rewards/accuracies": 0.5, "rewards/chosen": -32.49150466918945, "rewards/margins": 1.002352237701416, "rewards/rejected": -33.493858337402344, "step": 2641 }, { "epoch": 0.35974945533769065, "grad_norm": 39.63636596156635, "learning_rate": 6.466040676308959e-07, "logits/chosen": 10.60045051574707, "logits/rejected": 10.835835456848145, "logps/chosen": -3.318605661392212, "logps/rejected": -3.6754744052886963, "loss": 4.0195, "rewards/accuracies": 0.75, "rewards/chosen": -33.186058044433594, "rewards/margins": 3.5686874389648438, "rewards/rejected": -36.75474548339844, "step": 2642 }, { "epoch": 0.3598856209150327, "grad_norm": 38.52876624746526, "learning_rate": 6.464543334127334e-07, "logits/chosen": 10.529191970825195, "logits/rejected": 10.928796768188477, "logps/chosen": -3.393465042114258, "logps/rejected": -3.6521384716033936, "loss": 3.9703, "rewards/accuracies": 0.75, "rewards/chosen": -33.93465042114258, "rewards/margins": 2.5867342948913574, "rewards/rejected": -36.521385192871094, "step": 2643 }, { "epoch": 0.3600217864923747, "grad_norm": 42.41089518003115, "learning_rate": 6.463045435061772e-07, "logits/chosen": 11.153482437133789, "logits/rejected": 11.72752857208252, "logps/chosen": -3.4101977348327637, "logps/rejected": -3.8546183109283447, "loss": 4.0342, "rewards/accuracies": 1.0, "rewards/chosen": -34.10197830200195, "rewards/margins": 4.444203853607178, "rewards/rejected": -38.54618453979492, "step": 2644 }, { "epoch": 0.3601579520697168, "grad_norm": 43.54989216750022, "learning_rate": 6.461546979450736e-07, "logits/chosen": 10.832024574279785, "logits/rejected": 11.24879264831543, "logps/chosen": -3.3498473167419434, "logps/rejected": -3.7036519050598145, "loss": 4.2528, "rewards/accuracies": 0.75, "rewards/chosen": -33.49847412109375, "rewards/margins": 3.5380468368530273, "rewards/rejected": -37.036521911621094, "step": 2645 }, { "epoch": 0.3602941176470588, "grad_norm": 39.14942749702591, "learning_rate": 6.46004796763281e-07, "logits/chosen": 11.682327270507812, "logits/rejected": 11.705570220947266, "logps/chosen": -3.548367977142334, "logps/rejected": -3.5856218338012695, "loss": 3.8488, "rewards/accuracies": 0.75, "rewards/chosen": -35.483680725097656, "rewards/margins": 0.37253665924072266, "rewards/rejected": -35.85621643066406, "step": 2646 }, { "epoch": 0.36043028322440085, "grad_norm": 44.106859577618884, "learning_rate": 6.458548399946712e-07, "logits/chosen": 10.27782154083252, "logits/rejected": 11.554258346557617, "logps/chosen": -3.131624221801758, "logps/rejected": -3.682835340499878, "loss": 4.1195, "rewards/accuracies": 1.0, "rewards/chosen": -31.31624412536621, "rewards/margins": 5.512110233306885, "rewards/rejected": -36.82835388183594, "step": 2647 }, { "epoch": 0.36056644880174293, "grad_norm": 40.02515772881092, "learning_rate": 6.457048276731279e-07, "logits/chosen": 10.911136627197266, "logits/rejected": 11.270882606506348, "logps/chosen": -3.4351253509521484, "logps/rejected": -3.7789721488952637, "loss": 3.8692, "rewards/accuracies": 1.0, "rewards/chosen": -34.35124969482422, "rewards/margins": 3.438469886779785, "rewards/rejected": -37.78971862792969, "step": 2648 }, { "epoch": 0.36070261437908496, "grad_norm": 46.2090211992841, "learning_rate": 6.45554759832548e-07, "logits/chosen": 11.024904251098633, "logits/rejected": 11.517796516418457, "logps/chosen": -3.2849724292755127, "logps/rejected": -3.5588760375976562, "loss": 3.8852, "rewards/accuracies": 0.75, "rewards/chosen": -32.84972381591797, "rewards/margins": 2.7390356063842773, "rewards/rejected": -35.58876037597656, "step": 2649 }, { "epoch": 0.360838779956427, "grad_norm": 48.84504950031499, "learning_rate": 6.454046365068401e-07, "logits/chosen": 11.209186553955078, "logits/rejected": 11.697381973266602, "logps/chosen": -3.2502846717834473, "logps/rejected": -3.4551472663879395, "loss": 4.1342, "rewards/accuracies": 0.75, "rewards/chosen": -32.502845764160156, "rewards/margins": 2.048625946044922, "rewards/rejected": -34.55147171020508, "step": 2650 }, { "epoch": 0.3609749455337691, "grad_norm": 38.49195698993717, "learning_rate": 6.452544577299263e-07, "logits/chosen": 11.108678817749023, "logits/rejected": 11.59057331085205, "logps/chosen": -3.398017644882202, "logps/rejected": -3.2765440940856934, "loss": 3.8742, "rewards/accuracies": 0.5, "rewards/chosen": -33.98017883300781, "rewards/margins": -1.2147345542907715, "rewards/rejected": -32.76544189453125, "step": 2651 }, { "epoch": 0.3611111111111111, "grad_norm": 39.252001388584084, "learning_rate": 6.451042235357403e-07, "logits/chosen": 11.439384460449219, "logits/rejected": 12.119363784790039, "logps/chosen": -3.331598997116089, "logps/rejected": -3.8411545753479004, "loss": 3.3834, "rewards/accuracies": 1.0, "rewards/chosen": -33.31598663330078, "rewards/margins": 5.095555782318115, "rewards/rejected": -38.41154479980469, "step": 2652 }, { "epoch": 0.36124727668845313, "grad_norm": 37.84590115127072, "learning_rate": 6.44953933958229e-07, "logits/chosen": 11.736688613891602, "logits/rejected": 12.680835723876953, "logps/chosen": -3.655000686645508, "logps/rejected": -3.871525287628174, "loss": 3.5082, "rewards/accuracies": 0.75, "rewards/chosen": -36.55000686645508, "rewards/margins": 2.1652441024780273, "rewards/rejected": -38.715248107910156, "step": 2653 }, { "epoch": 0.3613834422657952, "grad_norm": 40.76822269715652, "learning_rate": 6.448035890313516e-07, "logits/chosen": 10.491907119750977, "logits/rejected": 10.71851921081543, "logps/chosen": -2.700018882751465, "logps/rejected": -3.36307954788208, "loss": 3.7802, "rewards/accuracies": 1.0, "rewards/chosen": -27.00018882751465, "rewards/margins": 6.630606174468994, "rewards/rejected": -33.630794525146484, "step": 2654 }, { "epoch": 0.36151960784313725, "grad_norm": 42.77354997022281, "learning_rate": 6.446531887890796e-07, "logits/chosen": 10.817298889160156, "logits/rejected": 10.852293014526367, "logps/chosen": -3.1128439903259277, "logps/rejected": -2.942293882369995, "loss": 3.6562, "rewards/accuracies": 0.25, "rewards/chosen": -31.12843894958496, "rewards/margins": -1.7054991722106934, "rewards/rejected": -29.42293930053711, "step": 2655 }, { "epoch": 0.3616557734204793, "grad_norm": 40.60438816239374, "learning_rate": 6.445027332653971e-07, "logits/chosen": 11.763593673706055, "logits/rejected": 10.249899864196777, "logps/chosen": -3.3002679347991943, "logps/rejected": -3.25639271736145, "loss": 3.79, "rewards/accuracies": 0.5, "rewards/chosen": -33.00267791748047, "rewards/margins": -0.4387526512145996, "rewards/rejected": -32.563926696777344, "step": 2656 }, { "epoch": 0.36179193899782136, "grad_norm": 39.64432608404716, "learning_rate": 6.443522224943013e-07, "logits/chosen": 10.443036079406738, "logits/rejected": 11.002013206481934, "logps/chosen": -3.2827916145324707, "logps/rejected": -3.447254180908203, "loss": 4.0234, "rewards/accuracies": 1.0, "rewards/chosen": -32.82791519165039, "rewards/margins": 1.6446242332458496, "rewards/rejected": -34.47254180908203, "step": 2657 }, { "epoch": 0.3619281045751634, "grad_norm": 74.55819239519809, "learning_rate": 6.442016565098006e-07, "logits/chosen": 10.876052856445312, "logits/rejected": 11.504600524902344, "logps/chosen": -3.1531949043273926, "logps/rejected": -3.8150134086608887, "loss": 4.2087, "rewards/accuracies": 1.0, "rewards/chosen": -31.531949996948242, "rewards/margins": 6.618185043334961, "rewards/rejected": -38.1501350402832, "step": 2658 }, { "epoch": 0.3620642701525055, "grad_norm": 44.57522081111625, "learning_rate": 6.440510353459173e-07, "logits/chosen": 11.341965675354004, "logits/rejected": 10.897714614868164, "logps/chosen": -3.5923280715942383, "logps/rejected": -3.5921759605407715, "loss": 4.3712, "rewards/accuracies": 0.5, "rewards/chosen": -35.92327880859375, "rewards/margins": -0.001522064208984375, "rewards/rejected": -35.92176055908203, "step": 2659 }, { "epoch": 0.3622004357298475, "grad_norm": 44.729631500470916, "learning_rate": 6.439003590366851e-07, "logits/chosen": 11.387866973876953, "logits/rejected": 13.186606407165527, "logps/chosen": -3.0801193714141846, "logps/rejected": -3.8230748176574707, "loss": 4.604, "rewards/accuracies": 1.0, "rewards/chosen": -30.801193237304688, "rewards/margins": 7.429553985595703, "rewards/rejected": -38.23074722290039, "step": 2660 }, { "epoch": 0.36233660130718953, "grad_norm": 44.99566730985947, "learning_rate": 6.437496276161507e-07, "logits/chosen": 11.99185848236084, "logits/rejected": 11.691229820251465, "logps/chosen": -3.6527597904205322, "logps/rejected": -3.593114137649536, "loss": 3.7215, "rewards/accuracies": 0.5, "rewards/chosen": -36.52759552001953, "rewards/margins": -0.5964570045471191, "rewards/rejected": -35.9311408996582, "step": 2661 }, { "epoch": 0.3624727668845316, "grad_norm": 50.39787585910225, "learning_rate": 6.435988411183732e-07, "logits/chosen": 10.525755882263184, "logits/rejected": 11.071115493774414, "logps/chosen": -3.465691328048706, "logps/rejected": -3.611660957336426, "loss": 4.4133, "rewards/accuracies": 0.5, "rewards/chosen": -34.65691375732422, "rewards/margins": 1.4596953392028809, "rewards/rejected": -36.116607666015625, "step": 2662 }, { "epoch": 0.36260893246187365, "grad_norm": 44.6335019166523, "learning_rate": 6.434479995774238e-07, "logits/chosen": 12.243464469909668, "logits/rejected": 12.020503997802734, "logps/chosen": -3.6287312507629395, "logps/rejected": -3.32196044921875, "loss": 4.0222, "rewards/accuracies": 0.25, "rewards/chosen": -36.28731155395508, "rewards/margins": -3.0677080154418945, "rewards/rejected": -33.2196044921875, "step": 2663 }, { "epoch": 0.3627450980392157, "grad_norm": 48.24477036216024, "learning_rate": 6.432971030273865e-07, "logits/chosen": 11.361210823059082, "logits/rejected": 11.581474304199219, "logps/chosen": -3.018613815307617, "logps/rejected": -3.505951166152954, "loss": 4.0182, "rewards/accuracies": 0.75, "rewards/chosen": -30.186138153076172, "rewards/margins": 4.873373508453369, "rewards/rejected": -35.059513092041016, "step": 2664 }, { "epoch": 0.36288126361655776, "grad_norm": 40.58264486698456, "learning_rate": 6.431461515023578e-07, "logits/chosen": 10.808753967285156, "logits/rejected": 11.812742233276367, "logps/chosen": -3.3208887577056885, "logps/rejected": -3.6332483291625977, "loss": 3.6309, "rewards/accuracies": 0.5, "rewards/chosen": -33.208885192871094, "rewards/margins": 3.12359619140625, "rewards/rejected": -36.33248519897461, "step": 2665 }, { "epoch": 0.3630174291938998, "grad_norm": 41.40294002834957, "learning_rate": 6.429951450364462e-07, "logits/chosen": 10.287322998046875, "logits/rejected": 12.201488494873047, "logps/chosen": -3.4314098358154297, "logps/rejected": -3.8159573078155518, "loss": 3.9468, "rewards/accuracies": 1.0, "rewards/chosen": -34.3140983581543, "rewards/margins": 3.8454766273498535, "rewards/rejected": -38.15957260131836, "step": 2666 }, { "epoch": 0.3631535947712418, "grad_norm": 42.46631157282104, "learning_rate": 6.42844083663773e-07, "logits/chosen": 10.93680477142334, "logits/rejected": 11.127435684204102, "logps/chosen": -3.5181121826171875, "logps/rejected": -3.4753003120422363, "loss": 4.1082, "rewards/accuracies": 0.5, "rewards/chosen": -35.181121826171875, "rewards/margins": -0.4281191825866699, "rewards/rejected": -34.75300598144531, "step": 2667 }, { "epoch": 0.3632897603485839, "grad_norm": 42.98460547631781, "learning_rate": 6.426929674184718e-07, "logits/chosen": 10.422689437866211, "logits/rejected": 11.436811447143555, "logps/chosen": -3.2715072631835938, "logps/rejected": -3.480483293533325, "loss": 3.8994, "rewards/accuracies": 0.5, "rewards/chosen": -32.71507263183594, "rewards/margins": 2.089761257171631, "rewards/rejected": -34.804832458496094, "step": 2668 }, { "epoch": 0.36342592592592593, "grad_norm": 40.05637838451055, "learning_rate": 6.425417963346884e-07, "logits/chosen": 10.833274841308594, "logits/rejected": 10.913703918457031, "logps/chosen": -3.285064697265625, "logps/rejected": -3.2196621894836426, "loss": 3.9082, "rewards/accuracies": 0.25, "rewards/chosen": -32.85064697265625, "rewards/margins": -0.654026985168457, "rewards/rejected": -32.19662094116211, "step": 2669 }, { "epoch": 0.36356209150326796, "grad_norm": 40.8158310716826, "learning_rate": 6.423905704465812e-07, "logits/chosen": 11.186281204223633, "logits/rejected": 11.482507705688477, "logps/chosen": -3.389752149581909, "logps/rejected": -3.6682651042938232, "loss": 3.6316, "rewards/accuracies": 0.75, "rewards/chosen": -33.89752197265625, "rewards/margins": 2.785130023956299, "rewards/rejected": -36.682647705078125, "step": 2670 }, { "epoch": 0.36369825708061004, "grad_norm": 56.54703461403649, "learning_rate": 6.42239289788321e-07, "logits/chosen": 11.48794937133789, "logits/rejected": 11.225482940673828, "logps/chosen": -3.644195795059204, "logps/rejected": -3.5857937335968018, "loss": 4.3758, "rewards/accuracies": 0.5, "rewards/chosen": -36.441959381103516, "rewards/margins": -0.5840215682983398, "rewards/rejected": -35.85793685913086, "step": 2671 }, { "epoch": 0.3638344226579521, "grad_norm": 42.10495955189933, "learning_rate": 6.42087954394091e-07, "logits/chosen": 11.461883544921875, "logits/rejected": 11.347885131835938, "logps/chosen": -3.6010711193084717, "logps/rejected": -3.3527631759643555, "loss": 4.7749, "rewards/accuracies": 0.25, "rewards/chosen": -36.010711669921875, "rewards/margins": -2.4830799102783203, "rewards/rejected": -33.52763366699219, "step": 2672 }, { "epoch": 0.3639705882352941, "grad_norm": 44.221201011800815, "learning_rate": 6.419365642980866e-07, "logits/chosen": 11.158262252807617, "logits/rejected": 12.222555160522461, "logps/chosen": -3.5571508407592773, "logps/rejected": -3.757815361022949, "loss": 3.5453, "rewards/accuracies": 0.75, "rewards/chosen": -35.57150650024414, "rewards/margins": 2.0066452026367188, "rewards/rejected": -37.578155517578125, "step": 2673 }, { "epoch": 0.3641067538126362, "grad_norm": 41.87150272883631, "learning_rate": 6.417851195345155e-07, "logits/chosen": 12.620550155639648, "logits/rejected": 12.646662712097168, "logps/chosen": -3.4158177375793457, "logps/rejected": -4.020669937133789, "loss": 3.8588, "rewards/accuracies": 1.0, "rewards/chosen": -34.158180236816406, "rewards/margins": 6.048523426055908, "rewards/rejected": -40.206703186035156, "step": 2674 }, { "epoch": 0.3642429193899782, "grad_norm": 40.05929154981517, "learning_rate": 6.416336201375981e-07, "logits/chosen": 10.680730819702148, "logits/rejected": 11.34451675415039, "logps/chosen": -3.0651392936706543, "logps/rejected": -3.467621326446533, "loss": 3.8817, "rewards/accuracies": 1.0, "rewards/chosen": -30.65139389038086, "rewards/margins": 4.024819850921631, "rewards/rejected": -34.676212310791016, "step": 2675 }, { "epoch": 0.36437908496732024, "grad_norm": 39.005154833202106, "learning_rate": 6.414820661415667e-07, "logits/chosen": 11.72358512878418, "logits/rejected": 11.990755081176758, "logps/chosen": -3.2863821983337402, "logps/rejected": -3.57843017578125, "loss": 4.3501, "rewards/accuracies": 0.75, "rewards/chosen": -32.86382293701172, "rewards/margins": 2.920478343963623, "rewards/rejected": -35.7843017578125, "step": 2676 }, { "epoch": 0.36451525054466233, "grad_norm": 39.232691906201104, "learning_rate": 6.413304575806667e-07, "logits/chosen": 11.396709442138672, "logits/rejected": 11.87984848022461, "logps/chosen": -3.328644037246704, "logps/rejected": -3.485840082168579, "loss": 3.6062, "rewards/accuracies": 0.5, "rewards/chosen": -33.28643798828125, "rewards/margins": 1.5719594955444336, "rewards/rejected": -34.8583984375, "step": 2677 }, { "epoch": 0.36465141612200436, "grad_norm": 50.100379692885156, "learning_rate": 6.411787944891547e-07, "logits/chosen": 11.523422241210938, "logits/rejected": 11.48672866821289, "logps/chosen": -3.3642828464508057, "logps/rejected": -3.5466513633728027, "loss": 3.769, "rewards/accuracies": 0.5, "rewards/chosen": -33.64282989501953, "rewards/margins": 1.823686122894287, "rewards/rejected": -35.466514587402344, "step": 2678 }, { "epoch": 0.3647875816993464, "grad_norm": 41.15848853416161, "learning_rate": 6.410270769013005e-07, "logits/chosen": 11.729106903076172, "logits/rejected": 11.201919555664062, "logps/chosen": -3.591698408126831, "logps/rejected": -3.552183151245117, "loss": 4.4162, "rewards/accuracies": 0.5, "rewards/chosen": -35.91698455810547, "rewards/margins": -0.3951549530029297, "rewards/rejected": -35.521827697753906, "step": 2679 }, { "epoch": 0.36492374727668847, "grad_norm": 42.4536871413159, "learning_rate": 6.408753048513859e-07, "logits/chosen": 11.250012397766113, "logits/rejected": 11.289525985717773, "logps/chosen": -3.1483399868011475, "logps/rejected": -3.141223907470703, "loss": 3.8535, "rewards/accuracies": 0.5, "rewards/chosen": -31.4833984375, "rewards/margins": -0.07115888595581055, "rewards/rejected": -31.41223907470703, "step": 2680 }, { "epoch": 0.3650599128540305, "grad_norm": 38.84073755789771, "learning_rate": 6.407234783737052e-07, "logits/chosen": 10.676820755004883, "logits/rejected": 11.07958984375, "logps/chosen": -3.063817024230957, "logps/rejected": -3.2643423080444336, "loss": 3.8666, "rewards/accuracies": 0.75, "rewards/chosen": -30.638172149658203, "rewards/margins": 2.005251407623291, "rewards/rejected": -32.64342498779297, "step": 2681 }, { "epoch": 0.36519607843137253, "grad_norm": 38.09330047751919, "learning_rate": 6.405715975025646e-07, "logits/chosen": 10.896081924438477, "logits/rejected": 11.141549110412598, "logps/chosen": -3.3620975017547607, "logps/rejected": -3.387439727783203, "loss": 3.8186, "rewards/accuracies": 0.5, "rewards/chosen": -33.620975494384766, "rewards/margins": 0.253420352935791, "rewards/rejected": -33.87439727783203, "step": 2682 }, { "epoch": 0.3653322440087146, "grad_norm": 41.93502832929124, "learning_rate": 6.40419662272283e-07, "logits/chosen": 11.275003433227539, "logits/rejected": 11.427936553955078, "logps/chosen": -3.6731295585632324, "logps/rejected": -3.818495273590088, "loss": 4.2019, "rewards/accuracies": 0.75, "rewards/chosen": -36.731300354003906, "rewards/margins": 1.4536561965942383, "rewards/rejected": -38.18495559692383, "step": 2683 }, { "epoch": 0.36546840958605664, "grad_norm": 40.82512090671653, "learning_rate": 6.402676727171913e-07, "logits/chosen": 10.927488327026367, "logits/rejected": 12.330787658691406, "logps/chosen": -3.676826000213623, "logps/rejected": -3.902054786682129, "loss": 4.189, "rewards/accuracies": 0.75, "rewards/chosen": -36.76826095581055, "rewards/margins": 2.252285957336426, "rewards/rejected": -39.020545959472656, "step": 2684 }, { "epoch": 0.36560457516339867, "grad_norm": 38.94317690542441, "learning_rate": 6.401156288716331e-07, "logits/chosen": 11.38361930847168, "logits/rejected": 11.088547706604004, "logps/chosen": -3.6851744651794434, "logps/rejected": -3.7886862754821777, "loss": 3.8826, "rewards/accuracies": 0.75, "rewards/chosen": -36.851741790771484, "rewards/margins": 1.0351176261901855, "rewards/rejected": -37.88685989379883, "step": 2685 }, { "epoch": 0.36574074074074076, "grad_norm": 42.83830985990653, "learning_rate": 6.399635307699636e-07, "logits/chosen": 11.550971984863281, "logits/rejected": 11.454635620117188, "logps/chosen": -3.3795366287231445, "logps/rejected": -3.401127815246582, "loss": 4.2593, "rewards/accuracies": 0.5, "rewards/chosen": -33.79536437988281, "rewards/margins": 0.21591615676879883, "rewards/rejected": -34.01128005981445, "step": 2686 }, { "epoch": 0.3658769063180828, "grad_norm": 46.42149280540445, "learning_rate": 6.398113784465508e-07, "logits/chosen": 10.747377395629883, "logits/rejected": 11.571191787719727, "logps/chosen": -3.632683753967285, "logps/rejected": -3.665827512741089, "loss": 4.72, "rewards/accuracies": 0.5, "rewards/chosen": -36.326839447021484, "rewards/margins": 0.3314371109008789, "rewards/rejected": -36.65827560424805, "step": 2687 }, { "epoch": 0.3660130718954248, "grad_norm": 42.55192586479218, "learning_rate": 6.396591719357746e-07, "logits/chosen": 11.243120193481445, "logits/rejected": 10.81003189086914, "logps/chosen": -3.627122402191162, "logps/rejected": -3.5206286907196045, "loss": 4.4017, "rewards/accuracies": 0.5, "rewards/chosen": -36.27122497558594, "rewards/margins": -1.0649375915527344, "rewards/rejected": -35.20628356933594, "step": 2688 }, { "epoch": 0.3661492374727669, "grad_norm": 43.68099499945994, "learning_rate": 6.395069112720275e-07, "logits/chosen": 10.747234344482422, "logits/rejected": 11.7366304397583, "logps/chosen": -3.282910108566284, "logps/rejected": -3.634418487548828, "loss": 3.6936, "rewards/accuracies": 0.75, "rewards/chosen": -32.8291015625, "rewards/margins": 3.5150814056396484, "rewards/rejected": -36.34418487548828, "step": 2689 }, { "epoch": 0.3662854030501089, "grad_norm": 42.845295837882546, "learning_rate": 6.393545964897142e-07, "logits/chosen": 10.39924430847168, "logits/rejected": 11.475729942321777, "logps/chosen": -3.616368293762207, "logps/rejected": -3.8991174697875977, "loss": 4.2969, "rewards/accuracies": 0.5, "rewards/chosen": -36.16368103027344, "rewards/margins": 2.8274903297424316, "rewards/rejected": -38.991172790527344, "step": 2690 }, { "epoch": 0.36642156862745096, "grad_norm": 39.92954097555474, "learning_rate": 6.392022276232511e-07, "logits/chosen": 10.19000244140625, "logits/rejected": 9.950773239135742, "logps/chosen": -3.1059153079986572, "logps/rejected": -2.9165821075439453, "loss": 4.1331, "rewards/accuracies": 0.25, "rewards/chosen": -31.05915069580078, "rewards/margins": -1.893331527709961, "rewards/rejected": -29.165821075439453, "step": 2691 }, { "epoch": 0.36655773420479304, "grad_norm": 40.09022102628338, "learning_rate": 6.390498047070675e-07, "logits/chosen": 11.920345306396484, "logits/rejected": 11.639822006225586, "logps/chosen": -3.7596631050109863, "logps/rejected": -3.5880980491638184, "loss": 4.2037, "rewards/accuracies": 0.5, "rewards/chosen": -37.59663009643555, "rewards/margins": -1.7156481742858887, "rewards/rejected": -35.8809814453125, "step": 2692 }, { "epoch": 0.36669389978213507, "grad_norm": 59.607542536691945, "learning_rate": 6.388973277756045e-07, "logits/chosen": 10.731636047363281, "logits/rejected": 10.973604202270508, "logps/chosen": -3.3803186416625977, "logps/rejected": -3.526723623275757, "loss": 4.663, "rewards/accuracies": 0.75, "rewards/chosen": -33.803184509277344, "rewards/margins": 1.4640507698059082, "rewards/rejected": -35.267234802246094, "step": 2693 }, { "epoch": 0.3668300653594771, "grad_norm": 45.58438307985827, "learning_rate": 6.387447968633156e-07, "logits/chosen": 11.649284362792969, "logits/rejected": 10.996683120727539, "logps/chosen": -3.7772789001464844, "logps/rejected": -3.6675655841827393, "loss": 3.9069, "rewards/accuracies": 0.5, "rewards/chosen": -37.772789001464844, "rewards/margins": -1.0971331596374512, "rewards/rejected": -36.675655364990234, "step": 2694 }, { "epoch": 0.3669662309368192, "grad_norm": 50.01869814235294, "learning_rate": 6.385922120046663e-07, "logits/chosen": 11.425853729248047, "logits/rejected": 11.979584693908691, "logps/chosen": -3.7154974937438965, "logps/rejected": -4.127840042114258, "loss": 4.5102, "rewards/accuracies": 1.0, "rewards/chosen": -37.15497589111328, "rewards/margins": 4.1234283447265625, "rewards/rejected": -41.278404235839844, "step": 2695 }, { "epoch": 0.3671023965141612, "grad_norm": 40.599990601400236, "learning_rate": 6.384395732341344e-07, "logits/chosen": 11.065418243408203, "logits/rejected": 11.257621765136719, "logps/chosen": -3.6263203620910645, "logps/rejected": -3.575289011001587, "loss": 4.0089, "rewards/accuracies": 0.5, "rewards/chosen": -36.26320266723633, "rewards/margins": -0.5103139877319336, "rewards/rejected": -35.752891540527344, "step": 2696 }, { "epoch": 0.3672385620915033, "grad_norm": 36.73081883684728, "learning_rate": 6.382868805862101e-07, "logits/chosen": 10.729084014892578, "logits/rejected": 11.669330596923828, "logps/chosen": -3.4479687213897705, "logps/rejected": -3.740476369857788, "loss": 3.4303, "rewards/accuracies": 0.75, "rewards/chosen": -34.47968673706055, "rewards/margins": 2.925077438354492, "rewards/rejected": -37.40476608276367, "step": 2697 }, { "epoch": 0.3673747276688453, "grad_norm": 35.38407235166962, "learning_rate": 6.381341340953953e-07, "logits/chosen": 11.960851669311523, "logits/rejected": 10.792455673217773, "logps/chosen": -3.628018617630005, "logps/rejected": -3.6385693550109863, "loss": 4.1557, "rewards/accuracies": 0.5, "rewards/chosen": -36.280181884765625, "rewards/margins": 0.10550880432128906, "rewards/rejected": -36.38569259643555, "step": 2698 }, { "epoch": 0.36751089324618735, "grad_norm": 44.000945991888436, "learning_rate": 6.379813337962046e-07, "logits/chosen": 11.321290969848633, "logits/rejected": 12.084778785705566, "logps/chosen": -3.6657161712646484, "logps/rejected": -3.8396568298339844, "loss": 4.0969, "rewards/accuracies": 0.5, "rewards/chosen": -36.657161712646484, "rewards/margins": 1.7394051551818848, "rewards/rejected": -38.396568298339844, "step": 2699 }, { "epoch": 0.36764705882352944, "grad_norm": 37.043538110510376, "learning_rate": 6.378284797231643e-07, "logits/chosen": 10.553665161132812, "logits/rejected": 10.83138656616211, "logps/chosen": -3.3795199394226074, "logps/rejected": -3.5298681259155273, "loss": 3.5793, "rewards/accuracies": 0.75, "rewards/chosen": -33.79520034790039, "rewards/margins": 1.5034799575805664, "rewards/rejected": -35.29867935180664, "step": 2700 }, { "epoch": 0.36778322440087147, "grad_norm": 38.19897868703697, "learning_rate": 6.376755719108131e-07, "logits/chosen": 11.4415283203125, "logits/rejected": 11.532058715820312, "logps/chosen": -3.540809392929077, "logps/rejected": -3.7928049564361572, "loss": 4.0452, "rewards/accuracies": 0.75, "rewards/chosen": -35.40809631347656, "rewards/margins": 2.5199575424194336, "rewards/rejected": -37.92805480957031, "step": 2701 }, { "epoch": 0.3679193899782135, "grad_norm": 41.367013384627846, "learning_rate": 6.375226103937019e-07, "logits/chosen": 10.867501258850098, "logits/rejected": 12.067461013793945, "logps/chosen": -3.364372491836548, "logps/rejected": -3.739408016204834, "loss": 4.3409, "rewards/accuracies": 1.0, "rewards/chosen": -33.64372634887695, "rewards/margins": 3.7503552436828613, "rewards/rejected": -37.394081115722656, "step": 2702 }, { "epoch": 0.3680555555555556, "grad_norm": 42.462547839431416, "learning_rate": 6.373695952063933e-07, "logits/chosen": 11.615215301513672, "logits/rejected": 11.958698272705078, "logps/chosen": -3.67619252204895, "logps/rejected": -3.911698341369629, "loss": 3.7324, "rewards/accuracies": 1.0, "rewards/chosen": -36.761924743652344, "rewards/margins": 2.3550586700439453, "rewards/rejected": -39.116981506347656, "step": 2703 }, { "epoch": 0.3681917211328976, "grad_norm": 38.384386466736956, "learning_rate": 6.372165263834625e-07, "logits/chosen": 10.989320755004883, "logits/rejected": 10.943825721740723, "logps/chosen": -3.604036808013916, "logps/rejected": -3.6447575092315674, "loss": 3.6223, "rewards/accuracies": 0.5, "rewards/chosen": -36.040367126464844, "rewards/margins": 0.40720653533935547, "rewards/rejected": -36.447574615478516, "step": 2704 }, { "epoch": 0.36832788671023964, "grad_norm": 46.695280292872006, "learning_rate": 6.370634039594969e-07, "logits/chosen": 12.129887580871582, "logits/rejected": 12.433523178100586, "logps/chosen": -3.6385841369628906, "logps/rejected": -4.0477070808410645, "loss": 4.1502, "rewards/accuracies": 0.75, "rewards/chosen": -36.385841369628906, "rewards/margins": 4.0912275314331055, "rewards/rejected": -40.47706604003906, "step": 2705 }, { "epoch": 0.3684640522875817, "grad_norm": 39.92801876410855, "learning_rate": 6.369102279690955e-07, "logits/chosen": 10.981517791748047, "logits/rejected": 12.192347526550293, "logps/chosen": -3.5169734954833984, "logps/rejected": -4.005954742431641, "loss": 4.0302, "rewards/accuracies": 1.0, "rewards/chosen": -35.169734954833984, "rewards/margins": 4.889813423156738, "rewards/rejected": -40.059547424316406, "step": 2706 }, { "epoch": 0.36860021786492375, "grad_norm": 39.71211333623737, "learning_rate": 6.367569984468698e-07, "logits/chosen": 12.257170677185059, "logits/rejected": 11.747140884399414, "logps/chosen": -3.693406581878662, "logps/rejected": -3.59391188621521, "loss": 4.8825, "rewards/accuracies": 0.25, "rewards/chosen": -36.93406677246094, "rewards/margins": -0.9949493408203125, "rewards/rejected": -35.939117431640625, "step": 2707 }, { "epoch": 0.3687363834422658, "grad_norm": 39.18972401936811, "learning_rate": 6.366037154274433e-07, "logits/chosen": 11.316959381103516, "logits/rejected": 11.832755088806152, "logps/chosen": -3.4577128887176514, "logps/rejected": -3.6964755058288574, "loss": 3.8482, "rewards/accuracies": 0.75, "rewards/chosen": -34.57712936401367, "rewards/margins": 2.3876280784606934, "rewards/rejected": -36.964752197265625, "step": 2708 }, { "epoch": 0.36887254901960786, "grad_norm": 39.60929627543409, "learning_rate": 6.364503789454514e-07, "logits/chosen": 10.831954002380371, "logits/rejected": 11.2481050491333, "logps/chosen": -3.2264535427093506, "logps/rejected": -3.4633355140686035, "loss": 3.8734, "rewards/accuracies": 0.75, "rewards/chosen": -32.26453399658203, "rewards/margins": 2.3688220977783203, "rewards/rejected": -34.633358001708984, "step": 2709 }, { "epoch": 0.3690087145969499, "grad_norm": 41.90399589533562, "learning_rate": 6.362969890355419e-07, "logits/chosen": 11.509921073913574, "logits/rejected": 11.51401424407959, "logps/chosen": -3.5445919036865234, "logps/rejected": -3.692124843597412, "loss": 3.9324, "rewards/accuracies": 0.5, "rewards/chosen": -35.445919036865234, "rewards/margins": 1.475329875946045, "rewards/rejected": -36.92124938964844, "step": 2710 }, { "epoch": 0.3691448801742919, "grad_norm": 44.83323077644838, "learning_rate": 6.361435457323745e-07, "logits/chosen": 10.64832592010498, "logits/rejected": 11.530344009399414, "logps/chosen": -3.2249748706817627, "logps/rejected": -3.3755979537963867, "loss": 4.2619, "rewards/accuracies": 0.5, "rewards/chosen": -32.24974822998047, "rewards/margins": 1.5062294006347656, "rewards/rejected": -33.755977630615234, "step": 2711 }, { "epoch": 0.369281045751634, "grad_norm": 41.8880869972784, "learning_rate": 6.359900490706209e-07, "logits/chosen": 11.33950138092041, "logits/rejected": 11.453523635864258, "logps/chosen": -3.6542563438415527, "logps/rejected": -3.666581630706787, "loss": 3.874, "rewards/accuracies": 0.75, "rewards/chosen": -36.542564392089844, "rewards/margins": 0.12325286865234375, "rewards/rejected": -36.66581726074219, "step": 2712 }, { "epoch": 0.36941721132897604, "grad_norm": 40.534868320161706, "learning_rate": 6.358364990849651e-07, "logits/chosen": 11.086053848266602, "logits/rejected": 12.2618989944458, "logps/chosen": -3.305185079574585, "logps/rejected": -3.9220480918884277, "loss": 4.3591, "rewards/accuracies": 0.75, "rewards/chosen": -33.051849365234375, "rewards/margins": 6.168628692626953, "rewards/rejected": -39.220481872558594, "step": 2713 }, { "epoch": 0.36955337690631807, "grad_norm": 36.8571182017376, "learning_rate": 6.35682895810103e-07, "logits/chosen": 10.543853759765625, "logits/rejected": 10.705693244934082, "logps/chosen": -3.2394590377807617, "logps/rejected": -3.415131092071533, "loss": 3.7973, "rewards/accuracies": 0.5, "rewards/chosen": -32.39459228515625, "rewards/margins": 1.756721019744873, "rewards/rejected": -34.15131378173828, "step": 2714 }, { "epoch": 0.36968954248366015, "grad_norm": 35.633852847543714, "learning_rate": 6.35529239280742e-07, "logits/chosen": 11.653717041015625, "logits/rejected": 12.32548713684082, "logps/chosen": -3.9011940956115723, "logps/rejected": -4.010651111602783, "loss": 3.6318, "rewards/accuracies": 0.5, "rewards/chosen": -39.011940002441406, "rewards/margins": 1.0945701599121094, "rewards/rejected": -40.10651397705078, "step": 2715 }, { "epoch": 0.3698257080610022, "grad_norm": 39.31736867996435, "learning_rate": 6.353755295316029e-07, "logits/chosen": 11.820140838623047, "logits/rejected": 11.567087173461914, "logps/chosen": -3.4459316730499268, "logps/rejected": -3.6439638137817383, "loss": 3.756, "rewards/accuracies": 0.5, "rewards/chosen": -34.45931625366211, "rewards/margins": 1.9803218841552734, "rewards/rejected": -36.439640045166016, "step": 2716 }, { "epoch": 0.3699618736383442, "grad_norm": 40.533805532215496, "learning_rate": 6.352217665974171e-07, "logits/chosen": 11.780830383300781, "logits/rejected": 12.35391616821289, "logps/chosen": -3.753602981567383, "logps/rejected": -3.8178699016571045, "loss": 4.1184, "rewards/accuracies": 0.5, "rewards/chosen": -37.53602981567383, "rewards/margins": 0.6426706314086914, "rewards/rejected": -38.1786994934082, "step": 2717 }, { "epoch": 0.3700980392156863, "grad_norm": 37.96987634834981, "learning_rate": 6.350679505129287e-07, "logits/chosen": 10.22819995880127, "logits/rejected": 10.861888885498047, "logps/chosen": -3.7104873657226562, "logps/rejected": -3.775421142578125, "loss": 3.5253, "rewards/accuracies": 0.5, "rewards/chosen": -37.10487365722656, "rewards/margins": 0.6493368148803711, "rewards/rejected": -37.754207611083984, "step": 2718 }, { "epoch": 0.3702342047930283, "grad_norm": 39.67558927057442, "learning_rate": 6.34914081312894e-07, "logits/chosen": 10.652542114257812, "logits/rejected": 10.456239700317383, "logps/chosen": -3.4100136756896973, "logps/rejected": -3.5108251571655273, "loss": 4.0852, "rewards/accuracies": 0.5, "rewards/chosen": -34.100135803222656, "rewards/margins": 1.0081138610839844, "rewards/rejected": -35.108253479003906, "step": 2719 }, { "epoch": 0.37037037037037035, "grad_norm": 48.44713044157318, "learning_rate": 6.347601590320806e-07, "logits/chosen": 12.221946716308594, "logits/rejected": 11.697735786437988, "logps/chosen": -3.4974472522735596, "logps/rejected": -3.7826387882232666, "loss": 3.5342, "rewards/accuracies": 0.75, "rewards/chosen": -34.97447204589844, "rewards/margins": 2.8519177436828613, "rewards/rejected": -37.82638931274414, "step": 2720 }, { "epoch": 0.37050653594771243, "grad_norm": 34.16191879844342, "learning_rate": 6.346061837052687e-07, "logits/chosen": 11.02855110168457, "logits/rejected": 10.854756355285645, "logps/chosen": -3.6377432346343994, "logps/rejected": -3.46360445022583, "loss": 3.6543, "rewards/accuracies": 0.25, "rewards/chosen": -36.37743377685547, "rewards/margins": -1.7413887977600098, "rewards/rejected": -34.636043548583984, "step": 2721 }, { "epoch": 0.37064270152505446, "grad_norm": 41.239340971305765, "learning_rate": 6.344521553672505e-07, "logits/chosen": 11.13661003112793, "logits/rejected": 11.997891426086426, "logps/chosen": -3.469367742538452, "logps/rejected": -3.52480411529541, "loss": 4.3161, "rewards/accuracies": 0.75, "rewards/chosen": -34.69367980957031, "rewards/margins": 0.5543622970581055, "rewards/rejected": -35.24803924560547, "step": 2722 }, { "epoch": 0.3707788671023965, "grad_norm": 42.95612133489487, "learning_rate": 6.342980740528297e-07, "logits/chosen": 11.048442840576172, "logits/rejected": 12.095406532287598, "logps/chosen": -3.569385290145874, "logps/rejected": -3.938723087310791, "loss": 4.0633, "rewards/accuracies": 1.0, "rewards/chosen": -35.693851470947266, "rewards/margins": 3.693380355834961, "rewards/rejected": -39.387229919433594, "step": 2723 }, { "epoch": 0.3709150326797386, "grad_norm": 44.30747270971746, "learning_rate": 6.341439397968222e-07, "logits/chosen": 11.729562759399414, "logits/rejected": 11.769956588745117, "logps/chosen": -3.639939308166504, "logps/rejected": -3.8339476585388184, "loss": 3.7711, "rewards/accuracies": 1.0, "rewards/chosen": -36.399391174316406, "rewards/margins": 1.9400863647460938, "rewards/rejected": -38.3394775390625, "step": 2724 }, { "epoch": 0.3710511982570806, "grad_norm": 38.87556773838822, "learning_rate": 6.339897526340562e-07, "logits/chosen": 11.628795623779297, "logits/rejected": 12.60947036743164, "logps/chosen": -3.6836116313934326, "logps/rejected": -3.897857666015625, "loss": 3.7598, "rewards/accuracies": 0.5, "rewards/chosen": -36.836116790771484, "rewards/margins": 2.1424598693847656, "rewards/rejected": -38.97857666015625, "step": 2725 }, { "epoch": 0.37118736383442263, "grad_norm": 45.399621425152915, "learning_rate": 6.338355125993715e-07, "logits/chosen": 12.386524200439453, "logits/rejected": 11.62690258026123, "logps/chosen": -3.6134233474731445, "logps/rejected": -3.6087751388549805, "loss": 4.0147, "rewards/accuracies": 0.5, "rewards/chosen": -36.13423156738281, "rewards/margins": -0.04648113250732422, "rewards/rejected": -36.08775329589844, "step": 2726 }, { "epoch": 0.3713235294117647, "grad_norm": 44.33254546695618, "learning_rate": 6.336812197276197e-07, "logits/chosen": 11.325145721435547, "logits/rejected": 11.045051574707031, "logps/chosen": -3.087352752685547, "logps/rejected": -3.618772506713867, "loss": 3.8887, "rewards/accuracies": 1.0, "rewards/chosen": -30.87352752685547, "rewards/margins": 5.314197063446045, "rewards/rejected": -36.187721252441406, "step": 2727 }, { "epoch": 0.37145969498910675, "grad_norm": 42.135358699424344, "learning_rate": 6.335268740536648e-07, "logits/chosen": 10.888672828674316, "logits/rejected": 11.293240547180176, "logps/chosen": -3.5259203910827637, "logps/rejected": -3.6932103633880615, "loss": 4.4494, "rewards/accuracies": 0.5, "rewards/chosen": -35.25920486450195, "rewards/margins": 1.6728992462158203, "rewards/rejected": -36.932106018066406, "step": 2728 }, { "epoch": 0.3715958605664488, "grad_norm": 45.243350103439376, "learning_rate": 6.333724756123823e-07, "logits/chosen": 11.001068115234375, "logits/rejected": 11.194116592407227, "logps/chosen": -3.226637601852417, "logps/rejected": -3.6342642307281494, "loss": 4.3358, "rewards/accuracies": 0.5, "rewards/chosen": -32.26637649536133, "rewards/margins": 4.076266288757324, "rewards/rejected": -36.34264373779297, "step": 2729 }, { "epoch": 0.37173202614379086, "grad_norm": 42.91998790689562, "learning_rate": 6.332180244386597e-07, "logits/chosen": 11.049654006958008, "logits/rejected": 11.912729263305664, "logps/chosen": -3.8174052238464355, "logps/rejected": -3.9224495887756348, "loss": 4.6493, "rewards/accuracies": 0.75, "rewards/chosen": -38.17405319213867, "rewards/margins": 1.0504446029663086, "rewards/rejected": -39.2244987487793, "step": 2730 }, { "epoch": 0.3718681917211329, "grad_norm": 41.437386408635426, "learning_rate": 6.330635205673968e-07, "logits/chosen": 10.884221076965332, "logits/rejected": 11.326661109924316, "logps/chosen": -3.4305381774902344, "logps/rejected": -3.56622314453125, "loss": 3.8961, "rewards/accuracies": 0.75, "rewards/chosen": -34.305381774902344, "rewards/margins": 1.3568496704101562, "rewards/rejected": -35.6622314453125, "step": 2731 }, { "epoch": 0.3720043572984749, "grad_norm": 40.27440974410651, "learning_rate": 6.32908964033505e-07, "logits/chosen": 10.400005340576172, "logits/rejected": 11.137811660766602, "logps/chosen": -3.2235231399536133, "logps/rejected": -3.637083053588867, "loss": 3.7793, "rewards/accuracies": 1.0, "rewards/chosen": -32.2352294921875, "rewards/margins": 4.1355977058410645, "rewards/rejected": -36.370826721191406, "step": 2732 }, { "epoch": 0.372140522875817, "grad_norm": 50.70792973405881, "learning_rate": 6.327543548719074e-07, "logits/chosen": 10.945608139038086, "logits/rejected": 11.32711410522461, "logps/chosen": -3.7625045776367188, "logps/rejected": -3.919429063796997, "loss": 3.7679, "rewards/accuracies": 0.75, "rewards/chosen": -37.62504577636719, "rewards/margins": 1.5692434310913086, "rewards/rejected": -39.19429016113281, "step": 2733 }, { "epoch": 0.37227668845315903, "grad_norm": 48.44568762625796, "learning_rate": 6.325996931175393e-07, "logits/chosen": 11.583578109741211, "logits/rejected": 11.905406951904297, "logps/chosen": -3.8852553367614746, "logps/rejected": -3.8705592155456543, "loss": 4.0734, "rewards/accuracies": 0.5, "rewards/chosen": -38.85255432128906, "rewards/margins": -0.14696311950683594, "rewards/rejected": -38.705589294433594, "step": 2734 }, { "epoch": 0.3724128540305011, "grad_norm": 40.672442892846945, "learning_rate": 6.32444978805348e-07, "logits/chosen": 11.130887985229492, "logits/rejected": 11.792201042175293, "logps/chosen": -3.419016122817993, "logps/rejected": -3.732919216156006, "loss": 3.4777, "rewards/accuracies": 0.75, "rewards/chosen": -34.190162658691406, "rewards/margins": 3.139030933380127, "rewards/rejected": -37.329193115234375, "step": 2735 }, { "epoch": 0.37254901960784315, "grad_norm": 48.872095073866085, "learning_rate": 6.322902119702922e-07, "logits/chosen": 11.15712833404541, "logits/rejected": 11.699310302734375, "logps/chosen": -3.609142303466797, "logps/rejected": -3.8919119834899902, "loss": 4.9244, "rewards/accuracies": 0.75, "rewards/chosen": -36.09142303466797, "rewards/margins": 2.8276944160461426, "rewards/rejected": -38.91911697387695, "step": 2736 }, { "epoch": 0.3726851851851852, "grad_norm": 42.67204629756815, "learning_rate": 6.321353926473429e-07, "logits/chosen": 10.654350280761719, "logits/rejected": 11.178597450256348, "logps/chosen": -3.6069650650024414, "logps/rejected": -3.8495521545410156, "loss": 4.0361, "rewards/accuracies": 0.75, "rewards/chosen": -36.06964874267578, "rewards/margins": 2.425873279571533, "rewards/rejected": -38.495521545410156, "step": 2737 }, { "epoch": 0.37282135076252726, "grad_norm": 35.88908232048052, "learning_rate": 6.319805208714829e-07, "logits/chosen": 11.694823265075684, "logits/rejected": 11.757247924804688, "logps/chosen": -3.5723259449005127, "logps/rejected": -3.731757164001465, "loss": 4.239, "rewards/accuracies": 0.75, "rewards/chosen": -35.72325897216797, "rewards/margins": 1.594313621520996, "rewards/rejected": -37.31757354736328, "step": 2738 }, { "epoch": 0.3729575163398693, "grad_norm": 41.69123343357849, "learning_rate": 6.318255966777065e-07, "logits/chosen": 11.753676414489746, "logits/rejected": 11.836545944213867, "logps/chosen": -3.98595929145813, "logps/rejected": -3.978414297103882, "loss": 4.2061, "rewards/accuracies": 0.5, "rewards/chosen": -39.85959243774414, "rewards/margins": -0.07544898986816406, "rewards/rejected": -39.78414535522461, "step": 2739 }, { "epoch": 0.3730936819172113, "grad_norm": 57.47135091677284, "learning_rate": 6.316706201010204e-07, "logits/chosen": 11.804532051086426, "logits/rejected": 11.558061599731445, "logps/chosen": -3.7863030433654785, "logps/rejected": -3.655534267425537, "loss": 4.2706, "rewards/accuracies": 0.25, "rewards/chosen": -37.86302947998047, "rewards/margins": -1.3076858520507812, "rewards/rejected": -36.55534362792969, "step": 2740 }, { "epoch": 0.3732298474945534, "grad_norm": 44.29041033697641, "learning_rate": 6.315155911764426e-07, "logits/chosen": 11.460830688476562, "logits/rejected": 11.269377708435059, "logps/chosen": -3.3588099479675293, "logps/rejected": -3.6957716941833496, "loss": 4.6854, "rewards/accuracies": 0.75, "rewards/chosen": -33.588096618652344, "rewards/margins": 3.3696188926696777, "rewards/rejected": -36.95771408081055, "step": 2741 }, { "epoch": 0.37336601307189543, "grad_norm": 38.233379557111114, "learning_rate": 6.313605099390032e-07, "logits/chosen": 11.906259536743164, "logits/rejected": 11.386191368103027, "logps/chosen": -3.4314868450164795, "logps/rejected": -3.4388136863708496, "loss": 3.4087, "rewards/accuracies": 0.5, "rewards/chosen": -34.31486511230469, "rewards/margins": 0.07326841354370117, "rewards/rejected": -34.38813781738281, "step": 2742 }, { "epoch": 0.37350217864923746, "grad_norm": 38.750777408897314, "learning_rate": 6.312053764237441e-07, "logits/chosen": 11.127494812011719, "logits/rejected": 11.663419723510742, "logps/chosen": -3.5096757411956787, "logps/rejected": -3.6514523029327393, "loss": 3.853, "rewards/accuracies": 0.75, "rewards/chosen": -35.09675598144531, "rewards/margins": 1.4177656173706055, "rewards/rejected": -36.514522552490234, "step": 2743 }, { "epoch": 0.37363834422657954, "grad_norm": 52.44170917120017, "learning_rate": 6.310501906657192e-07, "logits/chosen": 11.784875869750977, "logits/rejected": 11.467841148376465, "logps/chosen": -3.8444602489471436, "logps/rejected": -3.6512300968170166, "loss": 3.4912, "rewards/accuracies": 0.25, "rewards/chosen": -38.444602966308594, "rewards/margins": -1.9323034286499023, "rewards/rejected": -36.512298583984375, "step": 2744 }, { "epoch": 0.3737745098039216, "grad_norm": 39.8156863531222, "learning_rate": 6.308949526999937e-07, "logits/chosen": 12.007052421569824, "logits/rejected": 12.37297248840332, "logps/chosen": -3.973226547241211, "logps/rejected": -3.96720027923584, "loss": 4.17, "rewards/accuracies": 0.5, "rewards/chosen": -39.73226547241211, "rewards/margins": -0.060260772705078125, "rewards/rejected": -39.67200469970703, "step": 2745 }, { "epoch": 0.3739106753812636, "grad_norm": 47.13595559926178, "learning_rate": 6.30739662561645e-07, "logits/chosen": 12.435944557189941, "logits/rejected": 12.666913986206055, "logps/chosen": -3.5948472023010254, "logps/rejected": -3.8068742752075195, "loss": 3.55, "rewards/accuracies": 0.5, "rewards/chosen": -35.9484748840332, "rewards/margins": 2.120269775390625, "rewards/rejected": -38.06874084472656, "step": 2746 }, { "epoch": 0.3740468409586057, "grad_norm": 47.58095259316451, "learning_rate": 6.305843202857624e-07, "logits/chosen": 10.891878128051758, "logits/rejected": 11.871479034423828, "logps/chosen": -3.649428606033325, "logps/rejected": -3.877087116241455, "loss": 4.5405, "rewards/accuracies": 0.5, "rewards/chosen": -36.494285583496094, "rewards/margins": 2.276585102081299, "rewards/rejected": -38.7708740234375, "step": 2747 }, { "epoch": 0.3741830065359477, "grad_norm": 37.49519387199698, "learning_rate": 6.304289259074464e-07, "logits/chosen": 11.262935638427734, "logits/rejected": 11.661056518554688, "logps/chosen": -3.2585434913635254, "logps/rejected": -3.771084785461426, "loss": 3.7836, "rewards/accuracies": 1.0, "rewards/chosen": -32.5854377746582, "rewards/margins": 5.125410079956055, "rewards/rejected": -37.710845947265625, "step": 2748 }, { "epoch": 0.37431917211328974, "grad_norm": 39.21763047384419, "learning_rate": 6.302734794618099e-07, "logits/chosen": 10.739143371582031, "logits/rejected": 12.296947479248047, "logps/chosen": -3.238407611846924, "logps/rejected": -3.759939670562744, "loss": 3.8627, "rewards/accuracies": 0.75, "rewards/chosen": -32.38407897949219, "rewards/margins": 5.2153191566467285, "rewards/rejected": -37.599395751953125, "step": 2749 }, { "epoch": 0.37445533769063183, "grad_norm": 39.57558801772471, "learning_rate": 6.301179809839774e-07, "logits/chosen": 11.909011840820312, "logits/rejected": 12.277244567871094, "logps/chosen": -4.102020263671875, "logps/rejected": -3.625579357147217, "loss": 3.9673, "rewards/accuracies": 0.0, "rewards/chosen": -41.02020263671875, "rewards/margins": -4.764409065246582, "rewards/rejected": -36.255794525146484, "step": 2750 }, { "epoch": 0.37459150326797386, "grad_norm": 41.842462832081786, "learning_rate": 6.299624305090848e-07, "logits/chosen": 11.441886901855469, "logits/rejected": 11.94148063659668, "logps/chosen": -3.242424249649048, "logps/rejected": -3.6897594928741455, "loss": 4.5213, "rewards/accuracies": 0.75, "rewards/chosen": -32.42424011230469, "rewards/margins": 4.473352909088135, "rewards/rejected": -36.8975944519043, "step": 2751 }, { "epoch": 0.3747276688453159, "grad_norm": 57.0275498904763, "learning_rate": 6.298068280722802e-07, "logits/chosen": 12.433853149414062, "logits/rejected": 12.819023132324219, "logps/chosen": -3.765307903289795, "logps/rejected": -4.142189025878906, "loss": 3.8387, "rewards/accuracies": 0.75, "rewards/chosen": -37.653079986572266, "rewards/margins": 3.7688093185424805, "rewards/rejected": -41.42189025878906, "step": 2752 }, { "epoch": 0.37486383442265797, "grad_norm": 73.06957341120896, "learning_rate": 6.296511737087232e-07, "logits/chosen": 11.817070007324219, "logits/rejected": 12.069344520568848, "logps/chosen": -3.6425974369049072, "logps/rejected": -3.498823642730713, "loss": 4.2514, "rewards/accuracies": 0.25, "rewards/chosen": -36.42597198486328, "rewards/margins": -1.4377384185791016, "rewards/rejected": -34.98823547363281, "step": 2753 }, { "epoch": 0.375, "grad_norm": 41.50810855087639, "learning_rate": 6.29495467453585e-07, "logits/chosen": 11.116071701049805, "logits/rejected": 11.473939895629883, "logps/chosen": -3.296320676803589, "logps/rejected": -3.6325793266296387, "loss": 4.0653, "rewards/accuracies": 1.0, "rewards/chosen": -32.96320724487305, "rewards/margins": 3.3625850677490234, "rewards/rejected": -36.32579040527344, "step": 2754 }, { "epoch": 0.37513616557734203, "grad_norm": 37.95819213137921, "learning_rate": 6.293397093420492e-07, "logits/chosen": 10.356832504272461, "logits/rejected": 10.873163223266602, "logps/chosen": -3.455869674682617, "logps/rejected": -3.667752504348755, "loss": 3.7062, "rewards/accuracies": 0.75, "rewards/chosen": -34.55869674682617, "rewards/margins": 2.1188297271728516, "rewards/rejected": -36.677528381347656, "step": 2755 }, { "epoch": 0.3752723311546841, "grad_norm": 46.29140786600947, "learning_rate": 6.291838994093101e-07, "logits/chosen": 11.88539981842041, "logits/rejected": 12.285192489624023, "logps/chosen": -4.086435317993164, "logps/rejected": -4.277326583862305, "loss": 3.9621, "rewards/accuracies": 0.75, "rewards/chosen": -40.86435317993164, "rewards/margins": 1.9089164733886719, "rewards/rejected": -42.77326965332031, "step": 2756 }, { "epoch": 0.37540849673202614, "grad_norm": 40.88860785298326, "learning_rate": 6.290280376905745e-07, "logits/chosen": 11.96645736694336, "logits/rejected": 12.601537704467773, "logps/chosen": -3.5757064819335938, "logps/rejected": -3.744704246520996, "loss": 4.0652, "rewards/accuracies": 1.0, "rewards/chosen": -35.75706481933594, "rewards/margins": 1.689976692199707, "rewards/rejected": -37.447044372558594, "step": 2757 }, { "epoch": 0.37554466230936817, "grad_norm": 38.411631160598546, "learning_rate": 6.288721242210608e-07, "logits/chosen": 12.694602966308594, "logits/rejected": 11.27817153930664, "logps/chosen": -4.01650333404541, "logps/rejected": -3.487305164337158, "loss": 4.0566, "rewards/accuracies": 0.0, "rewards/chosen": -40.1650390625, "rewards/margins": -5.291984558105469, "rewards/rejected": -34.873050689697266, "step": 2758 }, { "epoch": 0.37568082788671026, "grad_norm": 39.99030209678582, "learning_rate": 6.287161590359986e-07, "logits/chosen": 11.240236282348633, "logits/rejected": 11.835063934326172, "logps/chosen": -3.852358341217041, "logps/rejected": -3.7294797897338867, "loss": 4.2477, "rewards/accuracies": 0.25, "rewards/chosen": -38.523582458496094, "rewards/margins": -1.228785514831543, "rewards/rejected": -37.2947998046875, "step": 2759 }, { "epoch": 0.3758169934640523, "grad_norm": 40.85461957228786, "learning_rate": 6.285601421706296e-07, "logits/chosen": 12.625076293945312, "logits/rejected": 12.724032402038574, "logps/chosen": -4.309150695800781, "logps/rejected": -4.283775329589844, "loss": 4.4282, "rewards/accuracies": 0.5, "rewards/chosen": -43.09149932861328, "rewards/margins": -0.25374794006347656, "rewards/rejected": -42.83775329589844, "step": 2760 }, { "epoch": 0.3759531590413943, "grad_norm": 46.945946326729874, "learning_rate": 6.284040736602074e-07, "logits/chosen": 11.87452507019043, "logits/rejected": 12.436971664428711, "logps/chosen": -3.5354208946228027, "logps/rejected": -3.5034539699554443, "loss": 4.5291, "rewards/accuracies": 0.5, "rewards/chosen": -35.354209899902344, "rewards/margins": -0.3196687698364258, "rewards/rejected": -35.03453826904297, "step": 2761 }, { "epoch": 0.3760893246187364, "grad_norm": 40.96009856344889, "learning_rate": 6.282479535399966e-07, "logits/chosen": 11.646564483642578, "logits/rejected": 12.311349868774414, "logps/chosen": -3.800914764404297, "logps/rejected": -3.958127737045288, "loss": 4.3509, "rewards/accuracies": 0.75, "rewards/chosen": -38.00914764404297, "rewards/margins": 1.572127342224121, "rewards/rejected": -39.581275939941406, "step": 2762 }, { "epoch": 0.3762254901960784, "grad_norm": 38.12744527717779, "learning_rate": 6.280917818452741e-07, "logits/chosen": 12.308237075805664, "logits/rejected": 12.169984817504883, "logps/chosen": -3.7202091217041016, "logps/rejected": -4.017120361328125, "loss": 3.4633, "rewards/accuracies": 0.75, "rewards/chosen": -37.202091217041016, "rewards/margins": 2.9691085815429688, "rewards/rejected": -40.171199798583984, "step": 2763 }, { "epoch": 0.37636165577342046, "grad_norm": 40.78823586053825, "learning_rate": 6.279355586113279e-07, "logits/chosen": 11.747437477111816, "logits/rejected": 12.016120910644531, "logps/chosen": -3.7042236328125, "logps/rejected": -4.036575794219971, "loss": 4.1752, "rewards/accuracies": 1.0, "rewards/chosen": -37.042240142822266, "rewards/margins": 3.3235206604003906, "rewards/rejected": -40.365760803222656, "step": 2764 }, { "epoch": 0.37649782135076254, "grad_norm": 39.77438640582924, "learning_rate": 6.277792838734582e-07, "logits/chosen": 10.99984359741211, "logits/rejected": 11.930171966552734, "logps/chosen": -3.7223634719848633, "logps/rejected": -3.843895435333252, "loss": 3.9906, "rewards/accuracies": 0.75, "rewards/chosen": -37.2236328125, "rewards/margins": 1.215322494506836, "rewards/rejected": -38.43895721435547, "step": 2765 }, { "epoch": 0.37663398692810457, "grad_norm": 43.34255257311297, "learning_rate": 6.276229576669765e-07, "logits/chosen": 11.885610580444336, "logits/rejected": 11.72317886352539, "logps/chosen": -3.5711169242858887, "logps/rejected": -3.925086498260498, "loss": 4.1775, "rewards/accuracies": 0.5, "rewards/chosen": -35.71116638183594, "rewards/margins": 3.5396976470947266, "rewards/rejected": -39.25086975097656, "step": 2766 }, { "epoch": 0.3767701525054466, "grad_norm": 42.18589484224524, "learning_rate": 6.274665800272059e-07, "logits/chosen": 11.1283540725708, "logits/rejected": 12.06690788269043, "logps/chosen": -3.499023199081421, "logps/rejected": -3.7118728160858154, "loss": 4.1658, "rewards/accuracies": 0.75, "rewards/chosen": -34.990234375, "rewards/margins": 2.128493309020996, "rewards/rejected": -37.11872863769531, "step": 2767 }, { "epoch": 0.3769063180827887, "grad_norm": 42.43919210576562, "learning_rate": 6.273101509894813e-07, "logits/chosen": 11.82758903503418, "logits/rejected": 11.354317665100098, "logps/chosen": -3.7201547622680664, "logps/rejected": -3.47332763671875, "loss": 4.125, "rewards/accuracies": 0.0, "rewards/chosen": -37.2015495300293, "rewards/margins": -2.4682717323303223, "rewards/rejected": -34.7332763671875, "step": 2768 }, { "epoch": 0.3770424836601307, "grad_norm": 40.26610169045129, "learning_rate": 6.27153670589149e-07, "logits/chosen": 11.565337181091309, "logits/rejected": 11.393953323364258, "logps/chosen": -3.477006673812866, "logps/rejected": -3.8447635173797607, "loss": 4.1135, "rewards/accuracies": 0.75, "rewards/chosen": -34.77006530761719, "rewards/margins": 3.6775689125061035, "rewards/rejected": -38.447635650634766, "step": 2769 }, { "epoch": 0.37717864923747274, "grad_norm": 39.12513463754278, "learning_rate": 6.269971388615674e-07, "logits/chosen": 11.840351104736328, "logits/rejected": 11.501615524291992, "logps/chosen": -3.6693131923675537, "logps/rejected": -3.619093656539917, "loss": 3.743, "rewards/accuracies": 0.5, "rewards/chosen": -36.69313049316406, "rewards/margins": -0.5021953582763672, "rewards/rejected": -36.19093704223633, "step": 2770 }, { "epoch": 0.3773148148148148, "grad_norm": 39.73728677174764, "learning_rate": 6.268405558421057e-07, "logits/chosen": 11.283737182617188, "logits/rejected": 11.573997497558594, "logps/chosen": -3.3241078853607178, "logps/rejected": -3.50679874420166, "loss": 3.5643, "rewards/accuracies": 1.0, "rewards/chosen": -33.24108123779297, "rewards/margins": 1.8269076347351074, "rewards/rejected": -35.06798553466797, "step": 2771 }, { "epoch": 0.37745098039215685, "grad_norm": 40.737198345992454, "learning_rate": 6.266839215661454e-07, "logits/chosen": 11.910882949829102, "logits/rejected": 12.320589065551758, "logps/chosen": -3.6724023818969727, "logps/rejected": -4.057432174682617, "loss": 4.001, "rewards/accuracies": 1.0, "rewards/chosen": -36.724021911621094, "rewards/margins": 3.8502988815307617, "rewards/rejected": -40.57432556152344, "step": 2772 }, { "epoch": 0.3775871459694989, "grad_norm": 42.75174860202917, "learning_rate": 6.265272360690793e-07, "logits/chosen": 12.611271858215332, "logits/rejected": 13.42342472076416, "logps/chosen": -4.033651351928711, "logps/rejected": -4.2389373779296875, "loss": 4.602, "rewards/accuracies": 0.75, "rewards/chosen": -40.336517333984375, "rewards/margins": 2.0528554916381836, "rewards/rejected": -42.38936996459961, "step": 2773 }, { "epoch": 0.37772331154684097, "grad_norm": 40.06260459450556, "learning_rate": 6.263704993863116e-07, "logits/chosen": 11.748634338378906, "logits/rejected": 12.269678115844727, "logps/chosen": -3.923438549041748, "logps/rejected": -4.063390731811523, "loss": 4.2164, "rewards/accuracies": 0.75, "rewards/chosen": -39.2343864440918, "rewards/margins": 1.3995208740234375, "rewards/rejected": -40.633907318115234, "step": 2774 }, { "epoch": 0.377859477124183, "grad_norm": 40.144424375439485, "learning_rate": 6.262137115532584e-07, "logits/chosen": 11.859931945800781, "logits/rejected": 12.291930198669434, "logps/chosen": -3.7407965660095215, "logps/rejected": -4.283015251159668, "loss": 4.0197, "rewards/accuracies": 1.0, "rewards/chosen": -37.40796661376953, "rewards/margins": 5.422186851501465, "rewards/rejected": -42.83015441894531, "step": 2775 }, { "epoch": 0.3779956427015251, "grad_norm": 44.811376455015086, "learning_rate": 6.260568726053472e-07, "logits/chosen": 11.031900405883789, "logits/rejected": 11.607938766479492, "logps/chosen": -3.463780164718628, "logps/rejected": -3.593632936477661, "loss": 4.4373, "rewards/accuracies": 0.75, "rewards/chosen": -34.63779830932617, "rewards/margins": 1.2985296249389648, "rewards/rejected": -35.93633270263672, "step": 2776 }, { "epoch": 0.3781318082788671, "grad_norm": 42.39734829544318, "learning_rate": 6.25899982578017e-07, "logits/chosen": 12.218301773071289, "logits/rejected": 12.412128448486328, "logps/chosen": -4.123432636260986, "logps/rejected": -3.9258737564086914, "loss": 4.3066, "rewards/accuracies": 0.25, "rewards/chosen": -41.23432540893555, "rewards/margins": -1.9755897521972656, "rewards/rejected": -39.25873565673828, "step": 2777 }, { "epoch": 0.37826797385620914, "grad_norm": 58.091708342554654, "learning_rate": 6.257430415067185e-07, "logits/chosen": 10.903139114379883, "logits/rejected": 11.10206127166748, "logps/chosen": -3.5987629890441895, "logps/rejected": -3.467349052429199, "loss": 4.0803, "rewards/accuracies": 0.25, "rewards/chosen": -35.98762893676758, "rewards/margins": -1.3141393661499023, "rewards/rejected": -34.673492431640625, "step": 2778 }, { "epoch": 0.3784041394335512, "grad_norm": 40.06528845747757, "learning_rate": 6.255860494269137e-07, "logits/chosen": 10.374937057495117, "logits/rejected": 11.493617057800293, "logps/chosen": -3.545992851257324, "logps/rejected": -3.8458774089813232, "loss": 3.8075, "rewards/accuracies": 1.0, "rewards/chosen": -35.459930419921875, "rewards/margins": 2.9988460540771484, "rewards/rejected": -38.45877456665039, "step": 2779 }, { "epoch": 0.37854030501089325, "grad_norm": 41.06960216595299, "learning_rate": 6.254290063740763e-07, "logits/chosen": 11.377979278564453, "logits/rejected": 11.822065353393555, "logps/chosen": -3.687561511993408, "logps/rejected": -3.8757083415985107, "loss": 4.2781, "rewards/accuracies": 0.5, "rewards/chosen": -36.875614166259766, "rewards/margins": 1.8814697265625, "rewards/rejected": -38.757083892822266, "step": 2780 }, { "epoch": 0.3786764705882353, "grad_norm": 43.48554947068115, "learning_rate": 6.252719123836915e-07, "logits/chosen": 11.159952163696289, "logits/rejected": 11.28255844116211, "logps/chosen": -3.38997745513916, "logps/rejected": -3.7171883583068848, "loss": 4.2804, "rewards/accuracies": 0.75, "rewards/chosen": -33.89977264404297, "rewards/margins": 3.2721076011657715, "rewards/rejected": -37.17188262939453, "step": 2781 }, { "epoch": 0.37881263616557737, "grad_norm": 43.872940175930076, "learning_rate": 6.251147674912561e-07, "logits/chosen": 11.134208679199219, "logits/rejected": 12.229923248291016, "logps/chosen": -3.755467176437378, "logps/rejected": -3.8242135047912598, "loss": 3.7055, "rewards/accuracies": 0.75, "rewards/chosen": -37.55467224121094, "rewards/margins": 0.687464714050293, "rewards/rejected": -38.24213790893555, "step": 2782 }, { "epoch": 0.3789488017429194, "grad_norm": 41.23913737077721, "learning_rate": 6.249575717322779e-07, "logits/chosen": 11.258947372436523, "logits/rejected": 11.696654319763184, "logps/chosen": -3.7799038887023926, "logps/rejected": -3.690396785736084, "loss": 4.1868, "rewards/accuracies": 0.25, "rewards/chosen": -37.799041748046875, "rewards/margins": -0.8950724601745605, "rewards/rejected": -36.903968811035156, "step": 2783 }, { "epoch": 0.3790849673202614, "grad_norm": 40.161541724143554, "learning_rate": 6.248003251422771e-07, "logits/chosen": 10.860586166381836, "logits/rejected": 11.185322761535645, "logps/chosen": -3.4402592182159424, "logps/rejected": -3.6572604179382324, "loss": 3.9908, "rewards/accuracies": 0.75, "rewards/chosen": -34.402591705322266, "rewards/margins": 2.1700100898742676, "rewards/rejected": -36.57260513305664, "step": 2784 }, { "epoch": 0.3792211328976035, "grad_norm": 41.15343516416872, "learning_rate": 6.246430277567846e-07, "logits/chosen": 12.011421203613281, "logits/rejected": 11.810083389282227, "logps/chosen": -3.7235381603240967, "logps/rejected": -3.9983246326446533, "loss": 4.2106, "rewards/accuracies": 0.5, "rewards/chosen": -37.235382080078125, "rewards/margins": 2.74786376953125, "rewards/rejected": -39.983245849609375, "step": 2785 }, { "epoch": 0.37935729847494554, "grad_norm": 39.834865242973656, "learning_rate": 6.244856796113429e-07, "logits/chosen": 11.737469673156738, "logits/rejected": 11.398242950439453, "logps/chosen": -3.7740769386291504, "logps/rejected": -3.9782233238220215, "loss": 4.0911, "rewards/accuracies": 0.75, "rewards/chosen": -37.74077224731445, "rewards/margins": 2.041461944580078, "rewards/rejected": -39.78223419189453, "step": 2786 }, { "epoch": 0.37949346405228757, "grad_norm": 38.062174816803115, "learning_rate": 6.243282807415063e-07, "logits/chosen": 11.444845199584961, "logits/rejected": 11.624078750610352, "logps/chosen": -3.415412425994873, "logps/rejected": -3.571594715118408, "loss": 4.1746, "rewards/accuracies": 0.75, "rewards/chosen": -34.15412521362305, "rewards/margins": 1.5618228912353516, "rewards/rejected": -35.715946197509766, "step": 2787 }, { "epoch": 0.37962962962962965, "grad_norm": 38.8347142714287, "learning_rate": 6.241708311828406e-07, "logits/chosen": 11.08292007446289, "logits/rejected": 11.350175857543945, "logps/chosen": -3.534064531326294, "logps/rejected": -3.699225425720215, "loss": 4.1622, "rewards/accuracies": 0.5, "rewards/chosen": -35.34064483642578, "rewards/margins": 1.6516094207763672, "rewards/rejected": -36.99225616455078, "step": 2788 }, { "epoch": 0.3797657952069717, "grad_norm": 40.032034319330315, "learning_rate": 6.240133309709223e-07, "logits/chosen": 10.662435531616211, "logits/rejected": 11.994196891784668, "logps/chosen": -3.2374401092529297, "logps/rejected": -3.7436439990997314, "loss": 4.3564, "rewards/accuracies": 1.0, "rewards/chosen": -32.3744010925293, "rewards/margins": 5.062038421630859, "rewards/rejected": -37.436439514160156, "step": 2789 }, { "epoch": 0.3799019607843137, "grad_norm": 37.56897951819368, "learning_rate": 6.238557801413402e-07, "logits/chosen": 11.014331817626953, "logits/rejected": 11.34471607208252, "logps/chosen": -3.03348970413208, "logps/rejected": -3.485832452774048, "loss": 3.8316, "rewards/accuracies": 0.75, "rewards/chosen": -30.334896087646484, "rewards/margins": 4.523427486419678, "rewards/rejected": -34.85832214355469, "step": 2790 }, { "epoch": 0.3800381263616558, "grad_norm": 38.894092285178196, "learning_rate": 6.236981787296942e-07, "logits/chosen": 10.9620361328125, "logits/rejected": 10.288963317871094, "logps/chosen": -3.1725029945373535, "logps/rejected": -3.429452896118164, "loss": 4.0079, "rewards/accuracies": 1.0, "rewards/chosen": -31.72502899169922, "rewards/margins": 2.5694994926452637, "rewards/rejected": -34.29452896118164, "step": 2791 }, { "epoch": 0.3801742919389978, "grad_norm": 39.82956061345571, "learning_rate": 6.235405267715955e-07, "logits/chosen": 11.894038200378418, "logits/rejected": 11.627378463745117, "logps/chosen": -3.726433753967285, "logps/rejected": -4.151719093322754, "loss": 4.146, "rewards/accuracies": 1.0, "rewards/chosen": -37.26433563232422, "rewards/margins": 4.252850532531738, "rewards/rejected": -41.517189025878906, "step": 2792 }, { "epoch": 0.38031045751633985, "grad_norm": 41.99543116547211, "learning_rate": 6.233828243026673e-07, "logits/chosen": 10.810543060302734, "logits/rejected": 11.274911880493164, "logps/chosen": -3.543018102645874, "logps/rejected": -3.656141757965088, "loss": 4.1365, "rewards/accuracies": 0.5, "rewards/chosen": -35.43018341064453, "rewards/margins": 1.131235122680664, "rewards/rejected": -36.56141662597656, "step": 2793 }, { "epoch": 0.38044662309368193, "grad_norm": 40.254594678999894, "learning_rate": 6.232250713585432e-07, "logits/chosen": 10.70111083984375, "logits/rejected": 10.951201438903809, "logps/chosen": -3.391165256500244, "logps/rejected": -3.427539348602295, "loss": 3.5609, "rewards/accuracies": 0.25, "rewards/chosen": -33.91165542602539, "rewards/margins": 0.3637418746948242, "rewards/rejected": -34.27539825439453, "step": 2794 }, { "epoch": 0.38058278867102396, "grad_norm": 44.08035252164952, "learning_rate": 6.230672679748691e-07, "logits/chosen": 10.10248851776123, "logits/rejected": 10.79544734954834, "logps/chosen": -3.1521193981170654, "logps/rejected": -3.3304014205932617, "loss": 4.0313, "rewards/accuracies": 0.5, "rewards/chosen": -31.521190643310547, "rewards/margins": 1.7828216552734375, "rewards/rejected": -33.30401611328125, "step": 2795 }, { "epoch": 0.380718954248366, "grad_norm": 41.457934861158506, "learning_rate": 6.229094141873019e-07, "logits/chosen": 10.713919639587402, "logits/rejected": 11.172164916992188, "logps/chosen": -3.6697964668273926, "logps/rejected": -3.577158212661743, "loss": 4.1764, "rewards/accuracies": 0.5, "rewards/chosen": -36.69796371459961, "rewards/margins": -0.9263820648193359, "rewards/rejected": -35.771583557128906, "step": 2796 }, { "epoch": 0.3808551198257081, "grad_norm": 41.40381392844372, "learning_rate": 6.227515100315099e-07, "logits/chosen": 10.353885650634766, "logits/rejected": 10.738638877868652, "logps/chosen": -3.25991153717041, "logps/rejected": -3.410778045654297, "loss": 3.8523, "rewards/accuracies": 0.75, "rewards/chosen": -32.599117279052734, "rewards/margins": 1.508664608001709, "rewards/rejected": -34.10778045654297, "step": 2797 }, { "epoch": 0.3809912854030501, "grad_norm": 40.41919696699738, "learning_rate": 6.22593555543173e-07, "logits/chosen": 10.83482551574707, "logits/rejected": 10.953510284423828, "logps/chosen": -3.789486885070801, "logps/rejected": -4.0141448974609375, "loss": 3.4969, "rewards/accuracies": 0.75, "rewards/chosen": -37.89487075805664, "rewards/margins": 2.246580123901367, "rewards/rejected": -40.141448974609375, "step": 2798 }, { "epoch": 0.38112745098039214, "grad_norm": 41.34080905301357, "learning_rate": 6.224355507579822e-07, "logits/chosen": 11.093055725097656, "logits/rejected": 11.186675071716309, "logps/chosen": -3.4980039596557617, "logps/rejected": -3.584545612335205, "loss": 3.5706, "rewards/accuracies": 0.75, "rewards/chosen": -34.98004150390625, "rewards/margins": 0.8654146194458008, "rewards/rejected": -35.845458984375, "step": 2799 }, { "epoch": 0.3812636165577342, "grad_norm": 41.51534127105124, "learning_rate": 6.222774957116401e-07, "logits/chosen": 10.663168907165527, "logits/rejected": 10.157751083374023, "logps/chosen": -3.4511256217956543, "logps/rejected": -3.5884456634521484, "loss": 3.7914, "rewards/accuracies": 0.75, "rewards/chosen": -34.51125717163086, "rewards/margins": 1.3731989860534668, "rewards/rejected": -35.884456634521484, "step": 2800 }, { "epoch": 0.38139978213507625, "grad_norm": 40.685265957686944, "learning_rate": 6.221193904398604e-07, "logits/chosen": 10.78728199005127, "logits/rejected": 11.312198638916016, "logps/chosen": -3.535571336746216, "logps/rejected": -3.682358741760254, "loss": 4.0385, "rewards/accuracies": 0.75, "rewards/chosen": -35.355712890625, "rewards/margins": 1.467874526977539, "rewards/rejected": -36.82358932495117, "step": 2801 }, { "epoch": 0.3815359477124183, "grad_norm": 45.2328345242701, "learning_rate": 6.219612349783684e-07, "logits/chosen": 10.865930557250977, "logits/rejected": 11.08458137512207, "logps/chosen": -3.3735907077789307, "logps/rejected": -3.5106041431427, "loss": 3.9969, "rewards/accuracies": 0.5, "rewards/chosen": -33.73590850830078, "rewards/margins": 1.3701329231262207, "rewards/rejected": -35.106040954589844, "step": 2802 }, { "epoch": 0.38167211328976036, "grad_norm": 47.4719038775165, "learning_rate": 6.218030293629007e-07, "logits/chosen": 11.182491302490234, "logits/rejected": 11.551233291625977, "logps/chosen": -3.59682559967041, "logps/rejected": -3.724379062652588, "loss": 4.6824, "rewards/accuracies": 0.75, "rewards/chosen": -35.96825408935547, "rewards/margins": 1.2755351066589355, "rewards/rejected": -37.24378967285156, "step": 2803 }, { "epoch": 0.3818082788671024, "grad_norm": 43.234324706340225, "learning_rate": 6.21644773629205e-07, "logits/chosen": 10.687432289123535, "logits/rejected": 10.712371826171875, "logps/chosen": -3.539032220840454, "logps/rejected": -3.5347981452941895, "loss": 3.8316, "rewards/accuracies": 0.25, "rewards/chosen": -35.390323638916016, "rewards/margins": -0.04234027862548828, "rewards/rejected": -35.347984313964844, "step": 2804 }, { "epoch": 0.3819444444444444, "grad_norm": 47.835169723841084, "learning_rate": 6.214864678130405e-07, "logits/chosen": 10.584817886352539, "logits/rejected": 11.407679557800293, "logps/chosen": -3.068859100341797, "logps/rejected": -3.7007877826690674, "loss": 3.9564, "rewards/accuracies": 1.0, "rewards/chosen": -30.68859100341797, "rewards/margins": 6.3192853927612305, "rewards/rejected": -37.007877349853516, "step": 2805 }, { "epoch": 0.3820806100217865, "grad_norm": 51.861412141778885, "learning_rate": 6.213281119501779e-07, "logits/chosen": 10.102540016174316, "logits/rejected": 10.08596420288086, "logps/chosen": -3.1192450523376465, "logps/rejected": -2.9419660568237305, "loss": 4.3002, "rewards/accuracies": 0.25, "rewards/chosen": -31.19245147705078, "rewards/margins": -1.7727904319763184, "rewards/rejected": -29.419660568237305, "step": 2806 }, { "epoch": 0.38221677559912853, "grad_norm": 51.102311036886185, "learning_rate": 6.211697060763989e-07, "logits/chosen": 10.082094192504883, "logits/rejected": 9.270705223083496, "logps/chosen": -3.0163168907165527, "logps/rejected": -3.259972333908081, "loss": 4.0523, "rewards/accuracies": 0.5, "rewards/chosen": -30.163169860839844, "rewards/margins": 2.4365549087524414, "rewards/rejected": -32.59972381591797, "step": 2807 }, { "epoch": 0.38235294117647056, "grad_norm": 40.9052378945993, "learning_rate": 6.210112502274964e-07, "logits/chosen": 11.084375381469727, "logits/rejected": 10.982465744018555, "logps/chosen": -3.0678606033325195, "logps/rejected": -3.5417938232421875, "loss": 4.2431, "rewards/accuracies": 1.0, "rewards/chosen": -30.678607940673828, "rewards/margins": 4.739331245422363, "rewards/rejected": -35.417938232421875, "step": 2808 }, { "epoch": 0.38248910675381265, "grad_norm": 47.04289141032167, "learning_rate": 6.208527444392752e-07, "logits/chosen": 8.544591903686523, "logits/rejected": 10.336690902709961, "logps/chosen": -3.372960090637207, "logps/rejected": -3.803006172180176, "loss": 4.754, "rewards/accuracies": 0.75, "rewards/chosen": -33.72959899902344, "rewards/margins": 4.300461769104004, "rewards/rejected": -38.030059814453125, "step": 2809 }, { "epoch": 0.3826252723311547, "grad_norm": 48.397862864315684, "learning_rate": 6.206941887475507e-07, "logits/chosen": 10.721668243408203, "logits/rejected": 10.716002464294434, "logps/chosen": -3.5260989665985107, "logps/rejected": -3.4375228881835938, "loss": 4.5991, "rewards/accuracies": 0.5, "rewards/chosen": -35.260990142822266, "rewards/margins": -0.8857583999633789, "rewards/rejected": -34.37522888183594, "step": 2810 }, { "epoch": 0.3827614379084967, "grad_norm": 170.94730074817255, "learning_rate": 6.2053558318815e-07, "logits/chosen": 10.241354942321777, "logits/rejected": 10.610977172851562, "logps/chosen": -3.0451738834381104, "logps/rejected": -3.7480690479278564, "loss": 4.4334, "rewards/accuracies": 0.75, "rewards/chosen": -30.451738357543945, "rewards/margins": 7.028951644897461, "rewards/rejected": -37.480690002441406, "step": 2811 }, { "epoch": 0.3828976034858388, "grad_norm": 43.15314135238595, "learning_rate": 6.203769277969113e-07, "logits/chosen": 10.730875015258789, "logits/rejected": 10.982956886291504, "logps/chosen": -3.683825969696045, "logps/rejected": -3.8328194618225098, "loss": 4.3739, "rewards/accuracies": 0.75, "rewards/chosen": -36.8382568359375, "rewards/margins": 1.489936351776123, "rewards/rejected": -38.32819366455078, "step": 2812 }, { "epoch": 0.3830337690631808, "grad_norm": 58.73300114684624, "learning_rate": 6.202182226096842e-07, "logits/chosen": 11.806467056274414, "logits/rejected": 11.600946426391602, "logps/chosen": -3.7240235805511475, "logps/rejected": -3.729536771774292, "loss": 4.0101, "rewards/accuracies": 0.75, "rewards/chosen": -37.240238189697266, "rewards/margins": 0.055130958557128906, "rewards/rejected": -37.29536819458008, "step": 2813 }, { "epoch": 0.3831699346405229, "grad_norm": 44.37388821671674, "learning_rate": 6.200594676623293e-07, "logits/chosen": 10.560644149780273, "logits/rejected": 10.997857093811035, "logps/chosen": -3.7542147636413574, "logps/rejected": -3.789904832839966, "loss": 3.747, "rewards/accuracies": 0.75, "rewards/chosen": -37.54214859008789, "rewards/margins": 0.35689878463745117, "rewards/rejected": -37.8990478515625, "step": 2814 }, { "epoch": 0.38330610021786493, "grad_norm": 57.00197336141675, "learning_rate": 6.199006629907186e-07, "logits/chosen": 12.206646919250488, "logits/rejected": 11.710387229919434, "logps/chosen": -3.7775440216064453, "logps/rejected": -3.7392427921295166, "loss": 3.7525, "rewards/accuracies": 0.25, "rewards/chosen": -37.77544021606445, "rewards/margins": -0.3830127716064453, "rewards/rejected": -37.392425537109375, "step": 2815 }, { "epoch": 0.38344226579520696, "grad_norm": 39.77477161750449, "learning_rate": 6.197418086307355e-07, "logits/chosen": 9.860816955566406, "logits/rejected": 11.424310684204102, "logps/chosen": -3.6076791286468506, "logps/rejected": -3.7762787342071533, "loss": 4.1897, "rewards/accuracies": 0.5, "rewards/chosen": -36.07678985595703, "rewards/margins": 1.6859960556030273, "rewards/rejected": -37.762786865234375, "step": 2816 }, { "epoch": 0.38357843137254904, "grad_norm": 42.91744948312833, "learning_rate": 6.195829046182742e-07, "logits/chosen": 10.099895477294922, "logits/rejected": 11.380786895751953, "logps/chosen": -3.5141685009002686, "logps/rejected": -3.7301089763641357, "loss": 4.1562, "rewards/accuracies": 0.75, "rewards/chosen": -35.141685485839844, "rewards/margins": 2.1594033241271973, "rewards/rejected": -37.301090240478516, "step": 2817 }, { "epoch": 0.3837145969498911, "grad_norm": 41.304255651715636, "learning_rate": 6.194239509892407e-07, "logits/chosen": 10.987491607666016, "logits/rejected": 12.124859809875488, "logps/chosen": -3.349825382232666, "logps/rejected": -3.872025489807129, "loss": 4.1962, "rewards/accuracies": 1.0, "rewards/chosen": -33.498252868652344, "rewards/margins": 5.2220001220703125, "rewards/rejected": -38.720252990722656, "step": 2818 }, { "epoch": 0.3838507625272331, "grad_norm": 55.399369966109184, "learning_rate": 6.192649477795515e-07, "logits/chosen": 11.261507034301758, "logits/rejected": 11.295373916625977, "logps/chosen": -3.4196629524230957, "logps/rejected": -3.5968990325927734, "loss": 4.7161, "rewards/accuracies": 0.5, "rewards/chosen": -34.19662857055664, "rewards/margins": 1.7723617553710938, "rewards/rejected": -35.968994140625, "step": 2819 }, { "epoch": 0.3839869281045752, "grad_norm": 36.85115811918259, "learning_rate": 6.19105895025135e-07, "logits/chosen": 11.196310043334961, "logits/rejected": 11.19373893737793, "logps/chosen": -3.6608786582946777, "logps/rejected": -3.9117093086242676, "loss": 3.4162, "rewards/accuracies": 0.75, "rewards/chosen": -36.608787536621094, "rewards/margins": 2.508307456970215, "rewards/rejected": -39.117095947265625, "step": 2820 }, { "epoch": 0.3841230936819172, "grad_norm": 46.94047854877858, "learning_rate": 6.189467927619304e-07, "logits/chosen": 11.473787307739258, "logits/rejected": 11.657217025756836, "logps/chosen": -3.1293323040008545, "logps/rejected": -3.222342014312744, "loss": 3.9567, "rewards/accuracies": 0.75, "rewards/chosen": -31.293323516845703, "rewards/margins": 0.9300966262817383, "rewards/rejected": -32.223419189453125, "step": 2821 }, { "epoch": 0.38425925925925924, "grad_norm": 42.37336788761941, "learning_rate": 6.18787641025888e-07, "logits/chosen": 11.697856903076172, "logits/rejected": 10.155649185180664, "logps/chosen": -3.310556650161743, "logps/rejected": -3.1429059505462646, "loss": 4.5321, "rewards/accuracies": 0.25, "rewards/chosen": -33.105567932128906, "rewards/margins": -1.676508903503418, "rewards/rejected": -31.429058074951172, "step": 2822 }, { "epoch": 0.38439542483660133, "grad_norm": 36.046295820656674, "learning_rate": 6.186284398529696e-07, "logits/chosen": 10.149824142456055, "logits/rejected": 10.519075393676758, "logps/chosen": -3.120936870574951, "logps/rejected": -3.5289876461029053, "loss": 3.8696, "rewards/accuracies": 1.0, "rewards/chosen": -31.209369659423828, "rewards/margins": 4.080506801605225, "rewards/rejected": -35.289878845214844, "step": 2823 }, { "epoch": 0.38453159041394336, "grad_norm": 45.53095927176091, "learning_rate": 6.184691892791482e-07, "logits/chosen": 10.594881057739258, "logits/rejected": 10.575215339660645, "logps/chosen": -3.4461021423339844, "logps/rejected": -3.3608131408691406, "loss": 4.3548, "rewards/accuracies": 0.25, "rewards/chosen": -34.461021423339844, "rewards/margins": -0.8528885841369629, "rewards/rejected": -33.608131408691406, "step": 2824 }, { "epoch": 0.3846677559912854, "grad_norm": 40.68860796329409, "learning_rate": 6.183098893404075e-07, "logits/chosen": 10.914474487304688, "logits/rejected": 11.302587509155273, "logps/chosen": -3.3697657585144043, "logps/rejected": -3.4615907669067383, "loss": 3.6574, "rewards/accuracies": 0.5, "rewards/chosen": -33.697654724121094, "rewards/margins": 0.9182500839233398, "rewards/rejected": -34.61590576171875, "step": 2825 }, { "epoch": 0.38480392156862747, "grad_norm": 38.788985276423986, "learning_rate": 6.181505400727428e-07, "logits/chosen": 10.732072830200195, "logits/rejected": 11.830225944519043, "logps/chosen": -3.4171223640441895, "logps/rejected": -3.6049838066101074, "loss": 3.873, "rewards/accuracies": 0.5, "rewards/chosen": -34.17122268676758, "rewards/margins": 1.878617286682129, "rewards/rejected": -36.049842834472656, "step": 2826 }, { "epoch": 0.3849400871459695, "grad_norm": 38.24578053777446, "learning_rate": 6.179911415121602e-07, "logits/chosen": 11.044820785522461, "logits/rejected": 12.447056770324707, "logps/chosen": -3.334068775177002, "logps/rejected": -3.8241872787475586, "loss": 3.768, "rewards/accuracies": 1.0, "rewards/chosen": -33.34069061279297, "rewards/margins": 4.90118408203125, "rewards/rejected": -38.24187469482422, "step": 2827 }, { "epoch": 0.38507625272331153, "grad_norm": 43.01875889396086, "learning_rate": 6.178316936946772e-07, "logits/chosen": 11.458093643188477, "logits/rejected": 12.900623321533203, "logps/chosen": -3.488124370574951, "logps/rejected": -3.848557710647583, "loss": 3.9396, "rewards/accuracies": 0.75, "rewards/chosen": -34.88124084472656, "rewards/margins": 3.6043343544006348, "rewards/rejected": -38.48557662963867, "step": 2828 }, { "epoch": 0.3852124183006536, "grad_norm": 41.67741123090604, "learning_rate": 6.176721966563224e-07, "logits/chosen": 11.631407737731934, "logits/rejected": 11.942605972290039, "logps/chosen": -3.5400307178497314, "logps/rejected": -3.7748570442199707, "loss": 4.2005, "rewards/accuracies": 0.75, "rewards/chosen": -35.400306701660156, "rewards/margins": 2.3482666015625, "rewards/rejected": -37.748573303222656, "step": 2829 }, { "epoch": 0.38534858387799564, "grad_norm": 42.388856779630004, "learning_rate": 6.175126504331357e-07, "logits/chosen": 11.408380508422852, "logits/rejected": 11.074871063232422, "logps/chosen": -3.373530149459839, "logps/rejected": -3.5149946212768555, "loss": 3.5782, "rewards/accuracies": 0.5, "rewards/chosen": -33.73530197143555, "rewards/margins": 1.4146442413330078, "rewards/rejected": -35.14994812011719, "step": 2830 }, { "epoch": 0.38548474945533767, "grad_norm": 61.7339469937412, "learning_rate": 6.173530550611675e-07, "logits/chosen": 11.154632568359375, "logits/rejected": 11.302404403686523, "logps/chosen": -3.9148712158203125, "logps/rejected": -3.816885232925415, "loss": 4.4401, "rewards/accuracies": 0.5, "rewards/chosen": -39.148712158203125, "rewards/margins": -0.9798593521118164, "rewards/rejected": -38.168853759765625, "step": 2831 }, { "epoch": 0.38562091503267976, "grad_norm": 40.326008518815776, "learning_rate": 6.171934105764797e-07, "logits/chosen": 8.935749053955078, "logits/rejected": 10.88282585144043, "logps/chosen": -3.2824535369873047, "logps/rejected": -3.633375644683838, "loss": 4.1192, "rewards/accuracies": 0.75, "rewards/chosen": -32.82453918457031, "rewards/margins": 3.509218215942383, "rewards/rejected": -36.33375549316406, "step": 2832 }, { "epoch": 0.3857570806100218, "grad_norm": 39.128686577383576, "learning_rate": 6.170337170151457e-07, "logits/chosen": 10.317025184631348, "logits/rejected": 10.791410446166992, "logps/chosen": -3.1951253414154053, "logps/rejected": -3.470348834991455, "loss": 4.0854, "rewards/accuracies": 0.75, "rewards/chosen": -31.951251983642578, "rewards/margins": 2.752232551574707, "rewards/rejected": -34.703487396240234, "step": 2833 }, { "epoch": 0.3858932461873638, "grad_norm": 41.90633645251499, "learning_rate": 6.168739744132492e-07, "logits/chosen": 12.267902374267578, "logits/rejected": 12.007791519165039, "logps/chosen": -3.567171573638916, "logps/rejected": -3.7108988761901855, "loss": 3.8317, "rewards/accuracies": 0.5, "rewards/chosen": -35.671714782714844, "rewards/margins": 1.437272071838379, "rewards/rejected": -37.108985900878906, "step": 2834 }, { "epoch": 0.3860294117647059, "grad_norm": 42.28639606078314, "learning_rate": 6.167141828068855e-07, "logits/chosen": 10.592418670654297, "logits/rejected": 12.402029037475586, "logps/chosen": -3.5022456645965576, "logps/rejected": -3.86079740524292, "loss": 3.0149, "rewards/accuracies": 0.75, "rewards/chosen": -35.022457122802734, "rewards/margins": 3.585516929626465, "rewards/rejected": -38.60797119140625, "step": 2835 }, { "epoch": 0.3861655773420479, "grad_norm": 43.41000460689929, "learning_rate": 6.165543422321609e-07, "logits/chosen": 12.241260528564453, "logits/rejected": 12.23749828338623, "logps/chosen": -3.815500020980835, "logps/rejected": -3.960658311843872, "loss": 4.3279, "rewards/accuracies": 0.5, "rewards/chosen": -38.154998779296875, "rewards/margins": 1.451582908630371, "rewards/rejected": -39.60658264160156, "step": 2836 }, { "epoch": 0.38630174291938996, "grad_norm": 55.260337250059514, "learning_rate": 6.163944527251925e-07, "logits/chosen": 12.217737197875977, "logits/rejected": 12.906700134277344, "logps/chosen": -4.014433860778809, "logps/rejected": -4.022622585296631, "loss": 3.4293, "rewards/accuracies": 0.5, "rewards/chosen": -40.14434051513672, "rewards/margins": 0.08188915252685547, "rewards/rejected": -40.226226806640625, "step": 2837 }, { "epoch": 0.38643790849673204, "grad_norm": 44.0228162711442, "learning_rate": 6.162345143221088e-07, "logits/chosen": 12.395513534545898, "logits/rejected": 12.028558731079102, "logps/chosen": -3.814279079437256, "logps/rejected": -3.8207790851593018, "loss": 4.2946, "rewards/accuracies": 0.25, "rewards/chosen": -38.14278793334961, "rewards/margins": 0.0650014877319336, "rewards/rejected": -38.20779037475586, "step": 2838 }, { "epoch": 0.38657407407407407, "grad_norm": 44.67914623865153, "learning_rate": 6.160745270590493e-07, "logits/chosen": 10.773482322692871, "logits/rejected": 12.705166816711426, "logps/chosen": -3.422049045562744, "logps/rejected": -3.764646291732788, "loss": 3.8591, "rewards/accuracies": 0.5, "rewards/chosen": -34.220489501953125, "rewards/margins": 3.4259724617004395, "rewards/rejected": -37.646461486816406, "step": 2839 }, { "epoch": 0.3867102396514161, "grad_norm": 40.90738002902756, "learning_rate": 6.15914490972164e-07, "logits/chosen": 11.612055778503418, "logits/rejected": 11.94788932800293, "logps/chosen": -3.485065460205078, "logps/rejected": -3.7398197650909424, "loss": 4.1637, "rewards/accuracies": 0.75, "rewards/chosen": -34.85065460205078, "rewards/margins": 2.547544479370117, "rewards/rejected": -37.39820098876953, "step": 2840 }, { "epoch": 0.3868464052287582, "grad_norm": 45.160637181570095, "learning_rate": 6.15754406097615e-07, "logits/chosen": 11.437095642089844, "logits/rejected": 13.289985656738281, "logps/chosen": -3.350968837738037, "logps/rejected": -4.085592269897461, "loss": 4.6297, "rewards/accuracies": 0.75, "rewards/chosen": -33.50968933105469, "rewards/margins": 7.346232891082764, "rewards/rejected": -40.855918884277344, "step": 2841 }, { "epoch": 0.3869825708061002, "grad_norm": 54.12286303634396, "learning_rate": 6.155942724715744e-07, "logits/chosen": 12.3884916305542, "logits/rejected": 12.787382125854492, "logps/chosen": -3.688951015472412, "logps/rejected": -4.133667945861816, "loss": 3.9891, "rewards/accuracies": 0.75, "rewards/chosen": -36.88951110839844, "rewards/margins": 4.447169303894043, "rewards/rejected": -41.3366813659668, "step": 2842 }, { "epoch": 0.38711873638344224, "grad_norm": 40.816246596445204, "learning_rate": 6.154340901302257e-07, "logits/chosen": 12.329235076904297, "logits/rejected": 12.270393371582031, "logps/chosen": -4.018379211425781, "logps/rejected": -4.025310039520264, "loss": 3.8031, "rewards/accuracies": 0.5, "rewards/chosen": -40.18379211425781, "rewards/margins": 0.06930732727050781, "rewards/rejected": -40.25309753417969, "step": 2843 }, { "epoch": 0.3872549019607843, "grad_norm": 40.14711583561954, "learning_rate": 6.152738591097637e-07, "logits/chosen": 12.641592979431152, "logits/rejected": 13.780121803283691, "logps/chosen": -3.5004191398620605, "logps/rejected": -4.012096405029297, "loss": 3.6712, "rewards/accuracies": 0.75, "rewards/chosen": -35.00419235229492, "rewards/margins": 5.116767406463623, "rewards/rejected": -40.12095642089844, "step": 2844 }, { "epoch": 0.38739106753812635, "grad_norm": 42.00981063744423, "learning_rate": 6.151135794463937e-07, "logits/chosen": 12.340055465698242, "logits/rejected": 12.733728408813477, "logps/chosen": -3.4442520141601562, "logps/rejected": -3.7587761878967285, "loss": 4.2824, "rewards/accuracies": 1.0, "rewards/chosen": -34.44252014160156, "rewards/margins": 3.1452407836914062, "rewards/rejected": -37.58776092529297, "step": 2845 }, { "epoch": 0.3875272331154684, "grad_norm": 49.04761216187689, "learning_rate": 6.149532511763323e-07, "logits/chosen": 11.79130744934082, "logits/rejected": 12.173758506774902, "logps/chosen": -3.8502182960510254, "logps/rejected": -4.008823394775391, "loss": 4.2567, "rewards/accuracies": 0.75, "rewards/chosen": -38.5021858215332, "rewards/margins": 1.586047649383545, "rewards/rejected": -40.088233947753906, "step": 2846 }, { "epoch": 0.38766339869281047, "grad_norm": 42.465035270443764, "learning_rate": 6.147928743358071e-07, "logits/chosen": 11.807202339172363, "logits/rejected": 12.601645469665527, "logps/chosen": -3.718050479888916, "logps/rejected": -3.9325854778289795, "loss": 3.8026, "rewards/accuracies": 0.75, "rewards/chosen": -37.180503845214844, "rewards/margins": 2.1453490257263184, "rewards/rejected": -39.32585144042969, "step": 2847 }, { "epoch": 0.3877995642701525, "grad_norm": 41.408367087778714, "learning_rate": 6.146324489610563e-07, "logits/chosen": 12.023853302001953, "logits/rejected": 12.333858489990234, "logps/chosen": -3.598735809326172, "logps/rejected": -3.8269166946411133, "loss": 4.2475, "rewards/accuracies": 0.75, "rewards/chosen": -35.98735809326172, "rewards/margins": 2.2818098068237305, "rewards/rejected": -38.2691650390625, "step": 2848 }, { "epoch": 0.3879357298474945, "grad_norm": 44.24939270280641, "learning_rate": 6.144719750883294e-07, "logits/chosen": 12.082515716552734, "logits/rejected": 12.664101600646973, "logps/chosen": -3.9980292320251465, "logps/rejected": -3.836606025695801, "loss": 3.9624, "rewards/accuracies": 0.25, "rewards/chosen": -39.98029327392578, "rewards/margins": -1.6142311096191406, "rewards/rejected": -38.366058349609375, "step": 2849 }, { "epoch": 0.3880718954248366, "grad_norm": 37.96611601499597, "learning_rate": 6.14311452753887e-07, "logits/chosen": 12.223604202270508, "logits/rejected": 11.831867218017578, "logps/chosen": -3.739572763442993, "logps/rejected": -3.7206292152404785, "loss": 3.646, "rewards/accuracies": 0.25, "rewards/chosen": -37.395729064941406, "rewards/margins": -0.1894378662109375, "rewards/rejected": -37.20629119873047, "step": 2850 }, { "epoch": 0.38820806100217864, "grad_norm": 55.65472131096962, "learning_rate": 6.141508819940004e-07, "logits/chosen": 12.340414047241211, "logits/rejected": 12.438549041748047, "logps/chosen": -3.67539381980896, "logps/rejected": -3.930471897125244, "loss": 4.4174, "rewards/accuracies": 0.75, "rewards/chosen": -36.753936767578125, "rewards/margins": 2.5507802963256836, "rewards/rejected": -39.304718017578125, "step": 2851 }, { "epoch": 0.3883442265795207, "grad_norm": 43.09886085375482, "learning_rate": 6.139902628449517e-07, "logits/chosen": 12.515193939208984, "logits/rejected": 12.58046817779541, "logps/chosen": -3.5654096603393555, "logps/rejected": -3.677598476409912, "loss": 3.9943, "rewards/accuracies": 0.75, "rewards/chosen": -35.65409469604492, "rewards/margins": 1.1218905448913574, "rewards/rejected": -36.77598571777344, "step": 2852 }, { "epoch": 0.38848039215686275, "grad_norm": 39.90166926308392, "learning_rate": 6.138295953430343e-07, "logits/chosen": 12.162089347839355, "logits/rejected": 13.134681701660156, "logps/chosen": -3.7100648880004883, "logps/rejected": -4.184390544891357, "loss": 4.1761, "rewards/accuracies": 1.0, "rewards/chosen": -37.10064697265625, "rewards/margins": 4.743257522583008, "rewards/rejected": -41.84390640258789, "step": 2853 }, { "epoch": 0.3886165577342048, "grad_norm": 41.61512460445132, "learning_rate": 6.136688795245523e-07, "logits/chosen": 12.702457427978516, "logits/rejected": 12.69108772277832, "logps/chosen": -3.764395236968994, "logps/rejected": -4.033480644226074, "loss": 4.5651, "rewards/accuracies": 0.75, "rewards/chosen": -37.643951416015625, "rewards/margins": 2.6908512115478516, "rewards/rejected": -40.33480453491211, "step": 2854 }, { "epoch": 0.38875272331154687, "grad_norm": 66.6950387188668, "learning_rate": 6.135081154258208e-07, "logits/chosen": 12.275836944580078, "logits/rejected": 11.473597526550293, "logps/chosen": -3.5387516021728516, "logps/rejected": -3.476407527923584, "loss": 4.3425, "rewards/accuracies": 0.25, "rewards/chosen": -35.387516021728516, "rewards/margins": -0.6234416961669922, "rewards/rejected": -34.76407241821289, "step": 2855 }, { "epoch": 0.3888888888888889, "grad_norm": 42.061799648765486, "learning_rate": 6.133473030831657e-07, "logits/chosen": 12.996759414672852, "logits/rejected": 13.063352584838867, "logps/chosen": -3.7459239959716797, "logps/rejected": -3.833688259124756, "loss": 3.9162, "rewards/accuracies": 0.5, "rewards/chosen": -37.4592399597168, "rewards/margins": 0.8776426315307617, "rewards/rejected": -38.336883544921875, "step": 2856 }, { "epoch": 0.3890250544662309, "grad_norm": 37.530558707305005, "learning_rate": 6.131864425329239e-07, "logits/chosen": 10.778421401977539, "logits/rejected": 11.860065460205078, "logps/chosen": -3.3721418380737305, "logps/rejected": -3.54189133644104, "loss": 4.2687, "rewards/accuracies": 0.75, "rewards/chosen": -33.72141647338867, "rewards/margins": 1.697495937347412, "rewards/rejected": -35.418914794921875, "step": 2857 }, { "epoch": 0.389161220043573, "grad_norm": 42.74813860453052, "learning_rate": 6.130255338114432e-07, "logits/chosen": 12.500041007995605, "logits/rejected": 13.552946090698242, "logps/chosen": -4.003173351287842, "logps/rejected": -4.031623840332031, "loss": 4.4762, "rewards/accuracies": 0.25, "rewards/chosen": -40.03173065185547, "rewards/margins": 0.28450584411621094, "rewards/rejected": -40.31623840332031, "step": 2858 }, { "epoch": 0.38929738562091504, "grad_norm": 36.17591010539329, "learning_rate": 6.128645769550823e-07, "logits/chosen": 13.09321117401123, "logits/rejected": 13.333900451660156, "logps/chosen": -3.4964094161987305, "logps/rejected": -3.8536252975463867, "loss": 4.1376, "rewards/accuracies": 1.0, "rewards/chosen": -34.96409606933594, "rewards/margins": 3.5721583366394043, "rewards/rejected": -38.536251068115234, "step": 2859 }, { "epoch": 0.38943355119825707, "grad_norm": 44.20567326681375, "learning_rate": 6.127035720002107e-07, "logits/chosen": 13.0008544921875, "logits/rejected": 12.840636253356934, "logps/chosen": -3.6444709300994873, "logps/rejected": -4.015650272369385, "loss": 4.8006, "rewards/accuracies": 0.75, "rewards/chosen": -36.44470977783203, "rewards/margins": 3.7117929458618164, "rewards/rejected": -40.15650177001953, "step": 2860 }, { "epoch": 0.38956971677559915, "grad_norm": 45.52414340315272, "learning_rate": 6.125425189832086e-07, "logits/chosen": 11.816852569580078, "logits/rejected": 12.801911354064941, "logps/chosen": -3.5313193798065186, "logps/rejected": -3.801553249359131, "loss": 4.4582, "rewards/accuracies": 0.75, "rewards/chosen": -35.313194274902344, "rewards/margins": 2.702338695526123, "rewards/rejected": -38.015533447265625, "step": 2861 }, { "epoch": 0.3897058823529412, "grad_norm": 65.08933181814693, "learning_rate": 6.123814179404677e-07, "logits/chosen": 11.957304000854492, "logits/rejected": 12.453842163085938, "logps/chosen": -3.3540639877319336, "logps/rejected": -3.7589194774627686, "loss": 3.9777, "rewards/accuracies": 0.75, "rewards/chosen": -33.54064178466797, "rewards/margins": 4.048554420471191, "rewards/rejected": -37.589195251464844, "step": 2862 }, { "epoch": 0.3898420479302832, "grad_norm": 46.22413677329376, "learning_rate": 6.122202689083896e-07, "logits/chosen": 12.32382583618164, "logits/rejected": 12.832408905029297, "logps/chosen": -3.597567558288574, "logps/rejected": -3.8931069374084473, "loss": 3.959, "rewards/accuracies": 0.75, "rewards/chosen": -35.975677490234375, "rewards/margins": 2.9553937911987305, "rewards/rejected": -38.931068420410156, "step": 2863 }, { "epoch": 0.3899782135076253, "grad_norm": 37.85358066037813, "learning_rate": 6.120590719233876e-07, "logits/chosen": 11.388923645019531, "logits/rejected": 11.766500473022461, "logps/chosen": -3.6093955039978027, "logps/rejected": -4.046748161315918, "loss": 3.8528, "rewards/accuracies": 1.0, "rewards/chosen": -36.093955993652344, "rewards/margins": 4.373526573181152, "rewards/rejected": -40.46748352050781, "step": 2864 }, { "epoch": 0.3901143790849673, "grad_norm": 38.66301391609447, "learning_rate": 6.118978270218854e-07, "logits/chosen": 12.267841339111328, "logits/rejected": 12.890658378601074, "logps/chosen": -3.7198822498321533, "logps/rejected": -3.795055627822876, "loss": 3.9625, "rewards/accuracies": 0.5, "rewards/chosen": -37.198822021484375, "rewards/margins": 0.7517328262329102, "rewards/rejected": -37.95055389404297, "step": 2865 }, { "epoch": 0.39025054466230935, "grad_norm": 37.57155789043147, "learning_rate": 6.117365342403177e-07, "logits/chosen": 11.878514289855957, "logits/rejected": 11.362804412841797, "logps/chosen": -3.770141124725342, "logps/rejected": -3.600830078125, "loss": 3.8483, "rewards/accuracies": 0.25, "rewards/chosen": -37.70140838623047, "rewards/margins": -1.6931095123291016, "rewards/rejected": -36.00830078125, "step": 2866 }, { "epoch": 0.39038671023965144, "grad_norm": 42.634320308093415, "learning_rate": 6.115751936151298e-07, "logits/chosen": 11.618452072143555, "logits/rejected": 12.174501419067383, "logps/chosen": -3.4276621341705322, "logps/rejected": -3.734644889831543, "loss": 4.169, "rewards/accuracies": 0.75, "rewards/chosen": -34.2766227722168, "rewards/margins": 3.069826126098633, "rewards/rejected": -37.34645080566406, "step": 2867 }, { "epoch": 0.39052287581699346, "grad_norm": 39.76711736025574, "learning_rate": 6.114138051827779e-07, "logits/chosen": 11.554269790649414, "logits/rejected": 12.35765266418457, "logps/chosen": -3.6591103076934814, "logps/rejected": -3.7971346378326416, "loss": 3.6019, "rewards/accuracies": 0.75, "rewards/chosen": -36.591102600097656, "rewards/margins": 1.3802428245544434, "rewards/rejected": -37.971343994140625, "step": 2868 }, { "epoch": 0.3906590413943355, "grad_norm": 45.58399121945031, "learning_rate": 6.112523689797294e-07, "logits/chosen": 12.55367660522461, "logits/rejected": 12.035085678100586, "logps/chosen": -3.864694118499756, "logps/rejected": -3.663057565689087, "loss": 4.1921, "rewards/accuracies": 0.25, "rewards/chosen": -38.646942138671875, "rewards/margins": -2.016366958618164, "rewards/rejected": -36.63057327270508, "step": 2869 }, { "epoch": 0.3907952069716776, "grad_norm": 47.87125299534254, "learning_rate": 6.110908850424617e-07, "logits/chosen": 12.091875076293945, "logits/rejected": 13.261639595031738, "logps/chosen": -3.754936933517456, "logps/rejected": -4.083664417266846, "loss": 3.1208, "rewards/accuracies": 1.0, "rewards/chosen": -37.54936981201172, "rewards/margins": 3.287276268005371, "rewards/rejected": -40.83664321899414, "step": 2870 }, { "epoch": 0.3909313725490196, "grad_norm": 42.11052057442986, "learning_rate": 6.109293534074637e-07, "logits/chosen": 13.349283218383789, "logits/rejected": 12.824934005737305, "logps/chosen": -3.700608253479004, "logps/rejected": -3.6241629123687744, "loss": 4.2206, "rewards/accuracies": 0.25, "rewards/chosen": -37.006080627441406, "rewards/margins": -0.7644548416137695, "rewards/rejected": -36.24163055419922, "step": 2871 }, { "epoch": 0.39106753812636164, "grad_norm": 59.6772578826874, "learning_rate": 6.107677741112348e-07, "logits/chosen": 12.248309135437012, "logits/rejected": 11.466922760009766, "logps/chosen": -3.647317409515381, "logps/rejected": -3.753369092941284, "loss": 4.2718, "rewards/accuracies": 0.75, "rewards/chosen": -36.473175048828125, "rewards/margins": 1.060516357421875, "rewards/rejected": -37.53369140625, "step": 2872 }, { "epoch": 0.3912037037037037, "grad_norm": 42.996241232483584, "learning_rate": 6.10606147190285e-07, "logits/chosen": 11.65426254272461, "logits/rejected": 12.306957244873047, "logps/chosen": -3.662609577178955, "logps/rejected": -3.7349472045898438, "loss": 4.3148, "rewards/accuracies": 0.5, "rewards/chosen": -36.6260986328125, "rewards/margins": 0.7233762741088867, "rewards/rejected": -37.34947204589844, "step": 2873 }, { "epoch": 0.39133986928104575, "grad_norm": 39.57061664947983, "learning_rate": 6.104444726811355e-07, "logits/chosen": 10.881011009216309, "logits/rejected": 11.58372974395752, "logps/chosen": -3.4973182678222656, "logps/rejected": -3.7886886596679688, "loss": 3.6945, "rewards/accuracies": 0.75, "rewards/chosen": -34.973182678222656, "rewards/margins": 2.9137039184570312, "rewards/rejected": -37.88688659667969, "step": 2874 }, { "epoch": 0.3914760348583878, "grad_norm": 38.13129671889833, "learning_rate": 6.102827506203176e-07, "logits/chosen": 11.155238151550293, "logits/rejected": 12.077349662780762, "logps/chosen": -3.4871912002563477, "logps/rejected": -3.5096936225891113, "loss": 3.7282, "rewards/accuracies": 0.25, "rewards/chosen": -34.871910095214844, "rewards/margins": 0.22502422332763672, "rewards/rejected": -35.0969352722168, "step": 2875 }, { "epoch": 0.39161220043572986, "grad_norm": 46.680811736561196, "learning_rate": 6.101209810443742e-07, "logits/chosen": 11.053869247436523, "logits/rejected": 12.483612060546875, "logps/chosen": -3.282834529876709, "logps/rejected": -3.72359561920166, "loss": 4.1594, "rewards/accuracies": 1.0, "rewards/chosen": -32.828346252441406, "rewards/margins": 4.407609939575195, "rewards/rejected": -37.23595428466797, "step": 2876 }, { "epoch": 0.3917483660130719, "grad_norm": 42.45665624516623, "learning_rate": 6.099591639898582e-07, "logits/chosen": 12.830646514892578, "logits/rejected": 12.232202529907227, "logps/chosen": -3.8512840270996094, "logps/rejected": -3.8199374675750732, "loss": 4.209, "rewards/accuracies": 0.25, "rewards/chosen": -38.512840270996094, "rewards/margins": -0.31346607208251953, "rewards/rejected": -38.19937515258789, "step": 2877 }, { "epoch": 0.3918845315904139, "grad_norm": 38.6504286998847, "learning_rate": 6.097972994933336e-07, "logits/chosen": 13.067342758178711, "logits/rejected": 12.89677619934082, "logps/chosen": -4.0310211181640625, "logps/rejected": -3.525857448577881, "loss": 4.3395, "rewards/accuracies": 0.25, "rewards/chosen": -40.310211181640625, "rewards/margins": -5.0516357421875, "rewards/rejected": -35.258575439453125, "step": 2878 }, { "epoch": 0.392020697167756, "grad_norm": 47.30461224849457, "learning_rate": 6.096353875913749e-07, "logits/chosen": 11.437665939331055, "logits/rejected": 11.965836524963379, "logps/chosen": -3.460509777069092, "logps/rejected": -3.937466621398926, "loss": 3.982, "rewards/accuracies": 1.0, "rewards/chosen": -34.60509490966797, "rewards/margins": 4.7695698738098145, "rewards/rejected": -39.37466812133789, "step": 2879 }, { "epoch": 0.39215686274509803, "grad_norm": 45.39600841745077, "learning_rate": 6.094734283205675e-07, "logits/chosen": 12.991416931152344, "logits/rejected": 12.250171661376953, "logps/chosen": -4.176530838012695, "logps/rejected": -3.8602004051208496, "loss": 4.0706, "rewards/accuracies": 0.0, "rewards/chosen": -41.76530456542969, "rewards/margins": -3.163302421569824, "rewards/rejected": -38.60200500488281, "step": 2880 }, { "epoch": 0.39229302832244006, "grad_norm": 42.3192657263461, "learning_rate": 6.093114217175075e-07, "logits/chosen": 10.995668411254883, "logits/rejected": 12.368009567260742, "logps/chosen": -3.243518114089966, "logps/rejected": -3.3407626152038574, "loss": 4.1499, "rewards/accuracies": 0.5, "rewards/chosen": -32.4351806640625, "rewards/margins": 0.972445011138916, "rewards/rejected": -33.407623291015625, "step": 2881 }, { "epoch": 0.39242919389978215, "grad_norm": 49.80674623590167, "learning_rate": 6.091493678188015e-07, "logits/chosen": 12.371052742004395, "logits/rejected": 12.554136276245117, "logps/chosen": -4.036773681640625, "logps/rejected": -4.190086364746094, "loss": 3.665, "rewards/accuracies": 0.5, "rewards/chosen": -40.36773681640625, "rewards/margins": 1.5331239700317383, "rewards/rejected": -41.90085983276367, "step": 2882 }, { "epoch": 0.3925653594771242, "grad_norm": 44.37003708599141, "learning_rate": 6.089872666610671e-07, "logits/chosen": 11.983697891235352, "logits/rejected": 12.065916061401367, "logps/chosen": -4.017556190490723, "logps/rejected": -3.8633944988250732, "loss": 4.0153, "rewards/accuracies": 0.75, "rewards/chosen": -40.17556381225586, "rewards/margins": -1.5416183471679688, "rewards/rejected": -38.63394546508789, "step": 2883 }, { "epoch": 0.3927015250544662, "grad_norm": 41.245530838700475, "learning_rate": 6.088251182809323e-07, "logits/chosen": 11.73112678527832, "logits/rejected": 12.549591064453125, "logps/chosen": -3.555882692337036, "logps/rejected": -4.153528690338135, "loss": 3.8215, "rewards/accuracies": 1.0, "rewards/chosen": -35.5588264465332, "rewards/margins": 5.9764556884765625, "rewards/rejected": -41.53528594970703, "step": 2884 }, { "epoch": 0.3928376906318083, "grad_norm": 43.94649459975172, "learning_rate": 6.086629227150357e-07, "logits/chosen": 12.255315780639648, "logits/rejected": 12.638070106506348, "logps/chosen": -3.4396677017211914, "logps/rejected": -3.8507232666015625, "loss": 4.5268, "rewards/accuracies": 0.75, "rewards/chosen": -34.39667892456055, "rewards/margins": 4.110553741455078, "rewards/rejected": -38.507232666015625, "step": 2885 }, { "epoch": 0.3929738562091503, "grad_norm": 49.830618953087594, "learning_rate": 6.08500680000027e-07, "logits/chosen": 11.957727432250977, "logits/rejected": 12.46635627746582, "logps/chosen": -3.8625717163085938, "logps/rejected": -4.281052589416504, "loss": 4.0377, "rewards/accuracies": 0.75, "rewards/chosen": -38.62571716308594, "rewards/margins": 4.184810638427734, "rewards/rejected": -42.810523986816406, "step": 2886 }, { "epoch": 0.39311002178649235, "grad_norm": 43.07579044325133, "learning_rate": 6.083383901725662e-07, "logits/chosen": 12.066902160644531, "logits/rejected": 12.670015335083008, "logps/chosen": -3.658097982406616, "logps/rejected": -3.7431492805480957, "loss": 4.4269, "rewards/accuracies": 0.5, "rewards/chosen": -36.58097839355469, "rewards/margins": 0.8505144119262695, "rewards/rejected": -37.431495666503906, "step": 2887 }, { "epoch": 0.39324618736383443, "grad_norm": 45.60098343164459, "learning_rate": 6.08176053269324e-07, "logits/chosen": 12.494026184082031, "logits/rejected": 12.659250259399414, "logps/chosen": -3.342459201812744, "logps/rejected": -3.3693552017211914, "loss": 4.577, "rewards/accuracies": 0.5, "rewards/chosen": -33.424591064453125, "rewards/margins": 0.26895999908447266, "rewards/rejected": -33.69355010986328, "step": 2888 }, { "epoch": 0.39338235294117646, "grad_norm": 40.02965971360979, "learning_rate": 6.080136693269816e-07, "logits/chosen": 12.739418983459473, "logits/rejected": 12.510226249694824, "logps/chosen": -3.8836617469787598, "logps/rejected": -4.241971969604492, "loss": 3.9821, "rewards/accuracies": 1.0, "rewards/chosen": -38.83661651611328, "rewards/margins": 3.583102226257324, "rewards/rejected": -42.41972351074219, "step": 2889 }, { "epoch": 0.39351851851851855, "grad_norm": 40.60235307882984, "learning_rate": 6.078512383822314e-07, "logits/chosen": 12.993919372558594, "logits/rejected": 12.441520690917969, "logps/chosen": -3.7161781787872314, "logps/rejected": -3.8048739433288574, "loss": 4.4466, "rewards/accuracies": 0.5, "rewards/chosen": -37.161781311035156, "rewards/margins": 0.886960506439209, "rewards/rejected": -38.048744201660156, "step": 2890 }, { "epoch": 0.3936546840958606, "grad_norm": 39.53415396947117, "learning_rate": 6.076887604717756e-07, "logits/chosen": 12.28707218170166, "logits/rejected": 12.330644607543945, "logps/chosen": -3.7788233757019043, "logps/rejected": -4.051403999328613, "loss": 4.3528, "rewards/accuracies": 0.75, "rewards/chosen": -37.788230895996094, "rewards/margins": 2.7258071899414062, "rewards/rejected": -40.5140380859375, "step": 2891 }, { "epoch": 0.3937908496732026, "grad_norm": 39.1324682191657, "learning_rate": 6.075262356323277e-07, "logits/chosen": 11.181321144104004, "logits/rejected": 13.012500762939453, "logps/chosen": -3.609384059906006, "logps/rejected": -4.307079792022705, "loss": 4.0315, "rewards/accuracies": 1.0, "rewards/chosen": -36.093841552734375, "rewards/margins": 6.976955413818359, "rewards/rejected": -43.070796966552734, "step": 2892 }, { "epoch": 0.3939270152505447, "grad_norm": 39.75186097874026, "learning_rate": 6.073636639006113e-07, "logits/chosen": 11.89573860168457, "logits/rejected": 11.676584243774414, "logps/chosen": -3.8182945251464844, "logps/rejected": -3.822645664215088, "loss": 3.6993, "rewards/accuracies": 0.5, "rewards/chosen": -38.182945251464844, "rewards/margins": 0.04351043701171875, "rewards/rejected": -38.22645568847656, "step": 2893 }, { "epoch": 0.3940631808278867, "grad_norm": 37.06587005862837, "learning_rate": 6.07201045313361e-07, "logits/chosen": 11.004801750183105, "logits/rejected": 12.375295639038086, "logps/chosen": -3.170758008956909, "logps/rejected": -3.597222328186035, "loss": 3.989, "rewards/accuracies": 1.0, "rewards/chosen": -31.707578659057617, "rewards/margins": 4.264645576477051, "rewards/rejected": -35.972225189208984, "step": 2894 }, { "epoch": 0.39419934640522875, "grad_norm": 40.86271272203776, "learning_rate": 6.070383799073219e-07, "logits/chosen": 13.02468490600586, "logits/rejected": 12.921380996704102, "logps/chosen": -3.929719924926758, "logps/rejected": -4.062885761260986, "loss": 4.7487, "rewards/accuracies": 0.75, "rewards/chosen": -39.29719543457031, "rewards/margins": 1.3316564559936523, "rewards/rejected": -40.62885284423828, "step": 2895 }, { "epoch": 0.39433551198257083, "grad_norm": 41.87898342154235, "learning_rate": 6.068756677192493e-07, "logits/chosen": 12.034543991088867, "logits/rejected": 12.312078475952148, "logps/chosen": -3.71165132522583, "logps/rejected": -3.884953022003174, "loss": 4.3321, "rewards/accuracies": 0.5, "rewards/chosen": -37.116512298583984, "rewards/margins": 1.733017921447754, "rewards/rejected": -38.84953308105469, "step": 2896 }, { "epoch": 0.39447167755991286, "grad_norm": 40.46429321044703, "learning_rate": 6.067129087859095e-07, "logits/chosen": 12.84486198425293, "logits/rejected": 11.833746910095215, "logps/chosen": -4.040854454040527, "logps/rejected": -3.856504440307617, "loss": 4.1338, "rewards/accuracies": 0.25, "rewards/chosen": -40.408538818359375, "rewards/margins": -1.843496322631836, "rewards/rejected": -38.56504821777344, "step": 2897 }, { "epoch": 0.3946078431372549, "grad_norm": 39.643883463844205, "learning_rate": 6.065501031440793e-07, "logits/chosen": 11.946805953979492, "logits/rejected": 12.082998275756836, "logps/chosen": -3.660162925720215, "logps/rejected": -3.8634026050567627, "loss": 3.8332, "rewards/accuracies": 0.75, "rewards/chosen": -36.60163116455078, "rewards/margins": 2.032395839691162, "rewards/rejected": -38.63402557373047, "step": 2898 }, { "epoch": 0.394744008714597, "grad_norm": 45.012419274873075, "learning_rate": 6.063872508305461e-07, "logits/chosen": 13.243749618530273, "logits/rejected": 13.542845726013184, "logps/chosen": -4.329341888427734, "logps/rejected": -4.633475303649902, "loss": 4.1143, "rewards/accuracies": 0.75, "rewards/chosen": -43.293418884277344, "rewards/margins": 3.041337013244629, "rewards/rejected": -46.334754943847656, "step": 2899 }, { "epoch": 0.394880174291939, "grad_norm": 38.496855850227945, "learning_rate": 6.062243518821075e-07, "logits/chosen": 12.340656280517578, "logits/rejected": 12.554422378540039, "logps/chosen": -3.888619899749756, "logps/rejected": -3.7452235221862793, "loss": 4.1326, "rewards/accuracies": 0.0, "rewards/chosen": -38.886199951171875, "rewards/margins": -1.4339637756347656, "rewards/rejected": -37.45223617553711, "step": 2900 }, { "epoch": 0.39501633986928103, "grad_norm": 40.54921318405289, "learning_rate": 6.060614063355718e-07, "logits/chosen": 11.907033920288086, "logits/rejected": 12.412088394165039, "logps/chosen": -3.8751463890075684, "logps/rejected": -4.185377597808838, "loss": 3.9642, "rewards/accuracies": 0.75, "rewards/chosen": -38.75146484375, "rewards/margins": 3.102313995361328, "rewards/rejected": -41.85377883911133, "step": 2901 }, { "epoch": 0.3951525054466231, "grad_norm": 46.03231135110544, "learning_rate": 6.058984142277582e-07, "logits/chosen": 12.769245147705078, "logits/rejected": 12.632854461669922, "logps/chosen": -3.791835308074951, "logps/rejected": -4.103149890899658, "loss": 3.678, "rewards/accuracies": 0.75, "rewards/chosen": -37.91835403442383, "rewards/margins": 3.1131439208984375, "rewards/rejected": -41.031497955322266, "step": 2902 }, { "epoch": 0.39528867102396514, "grad_norm": 37.11639205980829, "learning_rate": 6.057353755954957e-07, "logits/chosen": 11.468233108520508, "logits/rejected": 12.758403778076172, "logps/chosen": -3.7875988483428955, "logps/rejected": -4.069204807281494, "loss": 4.3426, "rewards/accuracies": 0.75, "rewards/chosen": -37.8759880065918, "rewards/margins": 2.816061019897461, "rewards/rejected": -40.692047119140625, "step": 2903 }, { "epoch": 0.3954248366013072, "grad_norm": 40.9778614988901, "learning_rate": 6.055722904756246e-07, "logits/chosen": 11.946826934814453, "logits/rejected": 12.849037170410156, "logps/chosen": -3.5985054969787598, "logps/rejected": -3.909496545791626, "loss": 3.7578, "rewards/accuracies": 0.5, "rewards/chosen": -35.98505401611328, "rewards/margins": 3.1099090576171875, "rewards/rejected": -39.09496307373047, "step": 2904 }, { "epoch": 0.39556100217864926, "grad_norm": 41.75657025793043, "learning_rate": 6.054091589049951e-07, "logits/chosen": 11.545517921447754, "logits/rejected": 12.079883575439453, "logps/chosen": -3.678851842880249, "logps/rejected": -3.957296133041382, "loss": 3.3616, "rewards/accuracies": 1.0, "rewards/chosen": -36.78852081298828, "rewards/margins": 2.7844438552856445, "rewards/rejected": -39.57296371459961, "step": 2905 }, { "epoch": 0.3956971677559913, "grad_norm": 40.19818358850009, "learning_rate": 6.052459809204683e-07, "logits/chosen": 11.949050903320312, "logits/rejected": 12.015079498291016, "logps/chosen": -3.598581552505493, "logps/rejected": -3.634904384613037, "loss": 4.3788, "rewards/accuracies": 0.5, "rewards/chosen": -35.985816955566406, "rewards/margins": 0.36322784423828125, "rewards/rejected": -36.34904479980469, "step": 2906 }, { "epoch": 0.3958333333333333, "grad_norm": 42.53071229428509, "learning_rate": 6.050827565589156e-07, "logits/chosen": 12.920613288879395, "logits/rejected": 12.34234619140625, "logps/chosen": -3.912381172180176, "logps/rejected": -3.623453140258789, "loss": 4.1758, "rewards/accuracies": 0.25, "rewards/chosen": -39.123809814453125, "rewards/margins": -2.889279365539551, "rewards/rejected": -36.23453140258789, "step": 2907 }, { "epoch": 0.3959694989106754, "grad_norm": 38.25984917900326, "learning_rate": 6.049194858572187e-07, "logits/chosen": 12.179241180419922, "logits/rejected": 13.208433151245117, "logps/chosen": -3.605010509490967, "logps/rejected": -3.6275477409362793, "loss": 4.1267, "rewards/accuracies": 0.5, "rewards/chosen": -36.050106048583984, "rewards/margins": 0.225372314453125, "rewards/rejected": -36.27547836303711, "step": 2908 }, { "epoch": 0.39610566448801743, "grad_norm": 44.50574821856858, "learning_rate": 6.047561688522701e-07, "logits/chosen": 12.984481811523438, "logits/rejected": 12.10848331451416, "logps/chosen": -3.7790262699127197, "logps/rejected": -3.7505533695220947, "loss": 4.2913, "rewards/accuracies": 0.5, "rewards/chosen": -37.79026412963867, "rewards/margins": -0.2847285270690918, "rewards/rejected": -37.50553512573242, "step": 2909 }, { "epoch": 0.39624183006535946, "grad_norm": 38.04859964678833, "learning_rate": 6.045928055809726e-07, "logits/chosen": 11.846149444580078, "logits/rejected": 12.512989044189453, "logps/chosen": -3.825655698776245, "logps/rejected": -3.7130234241485596, "loss": 4.3195, "rewards/accuracies": 0.5, "rewards/chosen": -38.256553649902344, "rewards/margins": -1.1263227462768555, "rewards/rejected": -37.13023376464844, "step": 2910 }, { "epoch": 0.39637799564270154, "grad_norm": 39.978434159295446, "learning_rate": 6.044293960802395e-07, "logits/chosen": 13.151144027709961, "logits/rejected": 12.291293144226074, "logps/chosen": -3.826478958129883, "logps/rejected": -3.803288459777832, "loss": 3.8166, "rewards/accuracies": 0.5, "rewards/chosen": -38.26478576660156, "rewards/margins": -0.231903076171875, "rewards/rejected": -38.03288650512695, "step": 2911 }, { "epoch": 0.39651416122004357, "grad_norm": 39.441560852048966, "learning_rate": 6.042659403869945e-07, "logits/chosen": 11.782063484191895, "logits/rejected": 12.633942604064941, "logps/chosen": -3.696462631225586, "logps/rejected": -3.7832512855529785, "loss": 3.9862, "rewards/accuracies": 0.5, "rewards/chosen": -36.96462631225586, "rewards/margins": 0.8678874969482422, "rewards/rejected": -37.83251190185547, "step": 2912 }, { "epoch": 0.3966503267973856, "grad_norm": 43.52782245991136, "learning_rate": 6.04102438538172e-07, "logits/chosen": 13.018007278442383, "logits/rejected": 13.04157829284668, "logps/chosen": -3.851759910583496, "logps/rejected": -3.6992263793945312, "loss": 4.4695, "rewards/accuracies": 0.5, "rewards/chosen": -38.51759719848633, "rewards/margins": -1.5253310203552246, "rewards/rejected": -36.99226379394531, "step": 2913 }, { "epoch": 0.3967864923747277, "grad_norm": 40.543368152169045, "learning_rate": 6.039388905707162e-07, "logits/chosen": 12.192272186279297, "logits/rejected": 12.774189949035645, "logps/chosen": -3.549191474914551, "logps/rejected": -3.977077007293701, "loss": 4.3708, "rewards/accuracies": 0.75, "rewards/chosen": -35.491912841796875, "rewards/margins": 4.278858184814453, "rewards/rejected": -39.770774841308594, "step": 2914 }, { "epoch": 0.3969226579520697, "grad_norm": 41.09284283745288, "learning_rate": 6.037752965215824e-07, "logits/chosen": 11.785619735717773, "logits/rejected": 12.507745742797852, "logps/chosen": -3.5698530673980713, "logps/rejected": -3.996225357055664, "loss": 4.3948, "rewards/accuracies": 0.75, "rewards/chosen": -35.69853210449219, "rewards/margins": 4.263723373413086, "rewards/rejected": -39.96225357055664, "step": 2915 }, { "epoch": 0.39705882352941174, "grad_norm": 40.72599704320091, "learning_rate": 6.036116564277358e-07, "logits/chosen": 13.127922058105469, "logits/rejected": 12.45012092590332, "logps/chosen": -3.800771713256836, "logps/rejected": -3.7992424964904785, "loss": 4.0436, "rewards/accuracies": 0.75, "rewards/chosen": -38.00771713256836, "rewards/margins": -0.015294075012207031, "rewards/rejected": -37.99242401123047, "step": 2916 }, { "epoch": 0.3971949891067538, "grad_norm": 39.57472677757098, "learning_rate": 6.034479703261524e-07, "logits/chosen": 12.935197830200195, "logits/rejected": 12.267455101013184, "logps/chosen": -3.8091342449188232, "logps/rejected": -3.940835952758789, "loss": 3.698, "rewards/accuracies": 0.75, "rewards/chosen": -38.09134292602539, "rewards/margins": 1.3170175552368164, "rewards/rejected": -39.408363342285156, "step": 2917 }, { "epoch": 0.39733115468409586, "grad_norm": 37.91447294066483, "learning_rate": 6.032842382538184e-07, "logits/chosen": 12.209033966064453, "logits/rejected": 12.51888656616211, "logps/chosen": -3.4007787704467773, "logps/rejected": -3.659757137298584, "loss": 3.5676, "rewards/accuracies": 0.75, "rewards/chosen": -34.007789611816406, "rewards/margins": 2.58978271484375, "rewards/rejected": -36.597572326660156, "step": 2918 }, { "epoch": 0.3974673202614379, "grad_norm": 39.237696094390785, "learning_rate": 6.031204602477304e-07, "logits/chosen": 12.813392639160156, "logits/rejected": 13.039782524108887, "logps/chosen": -4.055669784545898, "logps/rejected": -4.0660858154296875, "loss": 4.0252, "rewards/accuracies": 0.75, "rewards/chosen": -40.556697845458984, "rewards/margins": 0.10415840148925781, "rewards/rejected": -40.660858154296875, "step": 2919 }, { "epoch": 0.39760348583877997, "grad_norm": 43.251164974827866, "learning_rate": 6.029566363448954e-07, "logits/chosen": 12.378608703613281, "logits/rejected": 12.158199310302734, "logps/chosen": -3.679750919342041, "logps/rejected": -3.876779556274414, "loss": 4.0681, "rewards/accuracies": 0.5, "rewards/chosen": -36.797508239746094, "rewards/margins": 1.9702835083007812, "rewards/rejected": -38.767791748046875, "step": 2920 }, { "epoch": 0.397739651416122, "grad_norm": 47.29695686910142, "learning_rate": 6.027927665823307e-07, "logits/chosen": 12.229639053344727, "logits/rejected": 12.790578842163086, "logps/chosen": -3.628929853439331, "logps/rejected": -4.076796531677246, "loss": 3.4323, "rewards/accuracies": 1.0, "rewards/chosen": -36.28929901123047, "rewards/margins": 4.478667259216309, "rewards/rejected": -40.767967224121094, "step": 2921 }, { "epoch": 0.397875816993464, "grad_norm": 40.66566204789606, "learning_rate": 6.026288509970643e-07, "logits/chosen": 12.612410545349121, "logits/rejected": 12.652503967285156, "logps/chosen": -3.95961332321167, "logps/rejected": -4.032916069030762, "loss": 4.1262, "rewards/accuracies": 0.5, "rewards/chosen": -39.596134185791016, "rewards/margins": 0.7330236434936523, "rewards/rejected": -40.32915496826172, "step": 2922 }, { "epoch": 0.3980119825708061, "grad_norm": 47.42104361023625, "learning_rate": 6.024648896261339e-07, "logits/chosen": 12.009159088134766, "logits/rejected": 12.593694686889648, "logps/chosen": -3.7216086387634277, "logps/rejected": -3.9126698970794678, "loss": 4.372, "rewards/accuracies": 0.5, "rewards/chosen": -37.216087341308594, "rewards/margins": 1.9106111526489258, "rewards/rejected": -39.12670135498047, "step": 2923 }, { "epoch": 0.39814814814814814, "grad_norm": 42.03834627839435, "learning_rate": 6.023008825065881e-07, "logits/chosen": 11.939929008483887, "logits/rejected": 12.980857849121094, "logps/chosen": -3.916879177093506, "logps/rejected": -4.192840099334717, "loss": 4.0382, "rewards/accuracies": 1.0, "rewards/chosen": -39.168792724609375, "rewards/margins": 2.7596092224121094, "rewards/rejected": -41.928401947021484, "step": 2924 }, { "epoch": 0.39828431372549017, "grad_norm": 38.65032645554322, "learning_rate": 6.021368296754857e-07, "logits/chosen": 12.298175811767578, "logits/rejected": 12.847094535827637, "logps/chosen": -3.8642208576202393, "logps/rejected": -4.302093505859375, "loss": 3.6326, "rewards/accuracies": 1.0, "rewards/chosen": -38.64220428466797, "rewards/margins": 4.378725051879883, "rewards/rejected": -43.020931243896484, "step": 2925 }, { "epoch": 0.39842047930283225, "grad_norm": 43.38022317968642, "learning_rate": 6.019727311698957e-07, "logits/chosen": 11.926177978515625, "logits/rejected": 12.016902923583984, "logps/chosen": -3.4977662563323975, "logps/rejected": -3.5222692489624023, "loss": 4.4763, "rewards/accuracies": 0.75, "rewards/chosen": -34.9776611328125, "rewards/margins": 0.24502849578857422, "rewards/rejected": -35.222694396972656, "step": 2926 }, { "epoch": 0.3985566448801743, "grad_norm": 46.412723375118304, "learning_rate": 6.018085870268976e-07, "logits/chosen": 12.309606552124023, "logits/rejected": 12.736682891845703, "logps/chosen": -3.700937271118164, "logps/rejected": -3.8923189640045166, "loss": 4.9248, "rewards/accuracies": 0.5, "rewards/chosen": -37.009376525878906, "rewards/margins": 1.9138174057006836, "rewards/rejected": -38.92319107055664, "step": 2927 }, { "epoch": 0.39869281045751637, "grad_norm": 44.213265878389656, "learning_rate": 6.016443972835811e-07, "logits/chosen": 11.472698211669922, "logits/rejected": 11.84424877166748, "logps/chosen": -3.518857955932617, "logps/rejected": -3.6513254642486572, "loss": 3.5971, "rewards/accuracies": 0.5, "rewards/chosen": -35.18857955932617, "rewards/margins": 1.3246746063232422, "rewards/rejected": -36.51325225830078, "step": 2928 }, { "epoch": 0.3988289760348584, "grad_norm": 42.58711004743122, "learning_rate": 6.014801619770463e-07, "logits/chosen": 11.260065078735352, "logits/rejected": 11.993070602416992, "logps/chosen": -3.617746591567993, "logps/rejected": -3.856649398803711, "loss": 4.0984, "rewards/accuracies": 0.75, "rewards/chosen": -36.177467346191406, "rewards/margins": 2.389026641845703, "rewards/rejected": -38.56649398803711, "step": 2929 }, { "epoch": 0.3989651416122004, "grad_norm": 38.826112328720576, "learning_rate": 6.013158811444033e-07, "logits/chosen": 12.310585021972656, "logits/rejected": 13.134584426879883, "logps/chosen": -3.6008710861206055, "logps/rejected": -3.97462797164917, "loss": 4.1925, "rewards/accuracies": 0.75, "rewards/chosen": -36.00871276855469, "rewards/margins": 3.7375659942626953, "rewards/rejected": -39.74627685546875, "step": 2930 }, { "epoch": 0.3991013071895425, "grad_norm": 41.493123667220964, "learning_rate": 6.01151554822773e-07, "logits/chosen": 12.39293384552002, "logits/rejected": 12.068370819091797, "logps/chosen": -3.7101423740386963, "logps/rejected": -3.6983156204223633, "loss": 4.1788, "rewards/accuracies": 0.5, "rewards/chosen": -37.10142517089844, "rewards/margins": -0.11826896667480469, "rewards/rejected": -36.983154296875, "step": 2931 }, { "epoch": 0.39923747276688454, "grad_norm": 38.803443729093665, "learning_rate": 6.00987183049286e-07, "logits/chosen": 11.116558074951172, "logits/rejected": 11.725442886352539, "logps/chosen": -3.5504088401794434, "logps/rejected": -3.7783353328704834, "loss": 4.1422, "rewards/accuracies": 0.75, "rewards/chosen": -35.50408935546875, "rewards/margins": 2.2792654037475586, "rewards/rejected": -37.783355712890625, "step": 2932 }, { "epoch": 0.39937363834422657, "grad_norm": 35.700761190470274, "learning_rate": 6.008227658610838e-07, "logits/chosen": 11.532968521118164, "logits/rejected": 13.446161270141602, "logps/chosen": -3.528500556945801, "logps/rejected": -3.865875482559204, "loss": 3.8222, "rewards/accuracies": 0.5, "rewards/chosen": -35.28500747680664, "rewards/margins": 3.3737478256225586, "rewards/rejected": -38.65875244140625, "step": 2933 }, { "epoch": 0.39950980392156865, "grad_norm": 36.82996002889216, "learning_rate": 6.006583032953175e-07, "logits/chosen": 11.80742359161377, "logits/rejected": 12.560272216796875, "logps/chosen": -3.3092546463012695, "logps/rejected": -3.5933942794799805, "loss": 3.6355, "rewards/accuracies": 0.75, "rewards/chosen": -33.09254837036133, "rewards/margins": 2.841397285461426, "rewards/rejected": -35.93394470214844, "step": 2934 }, { "epoch": 0.3996459694989107, "grad_norm": 44.03934365365353, "learning_rate": 6.00493795389149e-07, "logits/chosen": 11.697123527526855, "logits/rejected": 12.38904094696045, "logps/chosen": -3.5669076442718506, "logps/rejected": -3.5527634620666504, "loss": 4.8473, "rewards/accuracies": 0.5, "rewards/chosen": -35.66907501220703, "rewards/margins": -0.14144372940063477, "rewards/rejected": -35.52763366699219, "step": 2935 }, { "epoch": 0.3997821350762527, "grad_norm": 427.1079540007381, "learning_rate": 6.0032924217975e-07, "logits/chosen": 11.830560684204102, "logits/rejected": 12.98311996459961, "logps/chosen": -3.4819047451019287, "logps/rejected": -3.905100107192993, "loss": 3.9845, "rewards/accuracies": 1.0, "rewards/chosen": -34.81904983520508, "rewards/margins": 4.231952667236328, "rewards/rejected": -39.051002502441406, "step": 2936 }, { "epoch": 0.3999183006535948, "grad_norm": 40.75423486607735, "learning_rate": 6.00164643704303e-07, "logits/chosen": 11.721689224243164, "logits/rejected": 12.028861045837402, "logps/chosen": -3.311443567276001, "logps/rejected": -3.4994587898254395, "loss": 4.1003, "rewards/accuracies": 0.75, "rewards/chosen": -33.11443328857422, "rewards/margins": 1.880155086517334, "rewards/rejected": -34.994590759277344, "step": 2937 }, { "epoch": 0.4000544662309368, "grad_norm": 43.34559222575924, "learning_rate": 6e-07, "logits/chosen": 12.501483917236328, "logits/rejected": 12.603199005126953, "logps/chosen": -3.9088377952575684, "logps/rejected": -3.7057530879974365, "loss": 4.381, "rewards/accuracies": 0.25, "rewards/chosen": -39.08837890625, "rewards/margins": -2.03084659576416, "rewards/rejected": -37.057533264160156, "step": 2938 }, { "epoch": 0.40019063180827885, "grad_norm": 37.74424726852854, "learning_rate": 5.998353111040437e-07, "logits/chosen": 12.640405654907227, "logits/rejected": 11.833928108215332, "logps/chosen": -3.582454204559326, "logps/rejected": -3.329986333847046, "loss": 3.8652, "rewards/accuracies": 0.25, "rewards/chosen": -35.82453918457031, "rewards/margins": -2.5246782302856445, "rewards/rejected": -33.29986572265625, "step": 2939 }, { "epoch": 0.40032679738562094, "grad_norm": 43.49840394434846, "learning_rate": 5.996705770536472e-07, "logits/chosen": 11.937108993530273, "logits/rejected": 11.60904312133789, "logps/chosen": -3.6180734634399414, "logps/rejected": -4.134097099304199, "loss": 4.0173, "rewards/accuracies": 1.0, "rewards/chosen": -36.18073272705078, "rewards/margins": 5.160236358642578, "rewards/rejected": -41.340972900390625, "step": 2940 }, { "epoch": 0.40046296296296297, "grad_norm": 34.263020550548475, "learning_rate": 5.995057978860334e-07, "logits/chosen": 12.309921264648438, "logits/rejected": 12.3900146484375, "logps/chosen": -3.6975817680358887, "logps/rejected": -3.89040470123291, "loss": 3.9054, "rewards/accuracies": 0.5, "rewards/chosen": -36.97581481933594, "rewards/margins": 1.9282293319702148, "rewards/rejected": -38.90404510498047, "step": 2941 }, { "epoch": 0.400599128540305, "grad_norm": 39.12973351879619, "learning_rate": 5.993409736384352e-07, "logits/chosen": 10.711471557617188, "logits/rejected": 12.329366683959961, "logps/chosen": -3.358163356781006, "logps/rejected": -3.850158929824829, "loss": 3.8497, "rewards/accuracies": 1.0, "rewards/chosen": -33.581634521484375, "rewards/margins": 4.919953346252441, "rewards/rejected": -38.5015869140625, "step": 2942 }, { "epoch": 0.4007352941176471, "grad_norm": 37.338181605312734, "learning_rate": 5.991761043480964e-07, "logits/chosen": 11.84289264678955, "logits/rejected": 12.283393859863281, "logps/chosen": -3.7423999309539795, "logps/rejected": -3.817781925201416, "loss": 3.9498, "rewards/accuracies": 0.75, "rewards/chosen": -37.42399978637695, "rewards/margins": 0.7538185119628906, "rewards/rejected": -38.177818298339844, "step": 2943 }, { "epoch": 0.4008714596949891, "grad_norm": 41.15198085497551, "learning_rate": 5.990111900522703e-07, "logits/chosen": 12.333915710449219, "logits/rejected": 10.722182273864746, "logps/chosen": -3.5773305892944336, "logps/rejected": -3.3222010135650635, "loss": 3.8907, "rewards/accuracies": 0.0, "rewards/chosen": -35.77330780029297, "rewards/margins": -2.551298141479492, "rewards/rejected": -33.222007751464844, "step": 2944 }, { "epoch": 0.40100762527233114, "grad_norm": 112.75411755853516, "learning_rate": 5.988462307882208e-07, "logits/chosen": 12.166533470153809, "logits/rejected": 12.718358993530273, "logps/chosen": -3.74208927154541, "logps/rejected": -3.7448949813842773, "loss": 4.9208, "rewards/accuracies": 0.5, "rewards/chosen": -37.420894622802734, "rewards/margins": 0.02805614471435547, "rewards/rejected": -37.448951721191406, "step": 2945 }, { "epoch": 0.4011437908496732, "grad_norm": 34.67416328593542, "learning_rate": 5.986812265932218e-07, "logits/chosen": 12.279565811157227, "logits/rejected": 11.787437438964844, "logps/chosen": -3.6342294216156006, "logps/rejected": -3.52248477935791, "loss": 4.1874, "rewards/accuracies": 0.5, "rewards/chosen": -36.34229278564453, "rewards/margins": -1.1174468994140625, "rewards/rejected": -35.22484588623047, "step": 2946 }, { "epoch": 0.40127995642701525, "grad_norm": 37.83574624155595, "learning_rate": 5.985161775045574e-07, "logits/chosen": 12.217123031616211, "logits/rejected": 12.54869270324707, "logps/chosen": -3.6747124195098877, "logps/rejected": -3.807939052581787, "loss": 4.2433, "rewards/accuracies": 0.75, "rewards/chosen": -36.74712371826172, "rewards/margins": 1.3322649002075195, "rewards/rejected": -38.07939147949219, "step": 2947 }, { "epoch": 0.4014161220043573, "grad_norm": 33.87963980758627, "learning_rate": 5.983510835595216e-07, "logits/chosen": 11.372591972351074, "logits/rejected": 12.049652099609375, "logps/chosen": -3.361032724380493, "logps/rejected": -3.6514534950256348, "loss": 3.9477, "rewards/accuracies": 0.75, "rewards/chosen": -33.610328674316406, "rewards/margins": 2.9042086601257324, "rewards/rejected": -36.51453399658203, "step": 2948 }, { "epoch": 0.40155228758169936, "grad_norm": 47.846405000233496, "learning_rate": 5.981859447954189e-07, "logits/chosen": 11.651925086975098, "logits/rejected": 11.777738571166992, "logps/chosen": -3.504406213760376, "logps/rejected": -3.4334659576416016, "loss": 3.824, "rewards/accuracies": 0.25, "rewards/chosen": -35.04405975341797, "rewards/margins": -0.709404468536377, "rewards/rejected": -34.33465576171875, "step": 2949 }, { "epoch": 0.4016884531590414, "grad_norm": 37.085431457314805, "learning_rate": 5.980207612495638e-07, "logits/chosen": 12.039728164672852, "logits/rejected": 12.021986961364746, "logps/chosen": -3.478177547454834, "logps/rejected": -3.5988948345184326, "loss": 4.0182, "rewards/accuracies": 0.75, "rewards/chosen": -34.781776428222656, "rewards/margins": 1.2071733474731445, "rewards/rejected": -35.988948822021484, "step": 2950 }, { "epoch": 0.4018246187363834, "grad_norm": 36.03747726637975, "learning_rate": 5.978555329592808e-07, "logits/chosen": 11.924930572509766, "logits/rejected": 12.197836875915527, "logps/chosen": -3.533690929412842, "logps/rejected": -3.6333565711975098, "loss": 4.1122, "rewards/accuracies": 0.5, "rewards/chosen": -35.336910247802734, "rewards/margins": 0.9966535568237305, "rewards/rejected": -36.33356475830078, "step": 2951 }, { "epoch": 0.4019607843137255, "grad_norm": 34.02337056177818, "learning_rate": 5.976902599619047e-07, "logits/chosen": 12.572151184082031, "logits/rejected": 12.786766052246094, "logps/chosen": -3.5518999099731445, "logps/rejected": -3.905263662338257, "loss": 3.3656, "rewards/accuracies": 0.5, "rewards/chosen": -35.51899719238281, "rewards/margins": 3.533637523651123, "rewards/rejected": -39.052635192871094, "step": 2952 }, { "epoch": 0.40209694989106753, "grad_norm": 38.93308071249707, "learning_rate": 5.975249422947802e-07, "logits/chosen": 12.16439437866211, "logits/rejected": 13.106101036071777, "logps/chosen": -3.738913059234619, "logps/rejected": -3.852672576904297, "loss": 4.9125, "rewards/accuracies": 0.75, "rewards/chosen": -37.389129638671875, "rewards/margins": 1.137594223022461, "rewards/rejected": -38.52672576904297, "step": 2953 }, { "epoch": 0.40223311546840956, "grad_norm": 39.46455479411176, "learning_rate": 5.973595799952622e-07, "logits/chosen": 12.037919998168945, "logits/rejected": 13.551340103149414, "logps/chosen": -3.592817783355713, "logps/rejected": -3.9801225662231445, "loss": 3.8418, "rewards/accuracies": 1.0, "rewards/chosen": -35.92817687988281, "rewards/margins": 3.8730478286743164, "rewards/rejected": -39.80122756958008, "step": 2954 }, { "epoch": 0.40236928104575165, "grad_norm": 32.57308478267774, "learning_rate": 5.971941731007158e-07, "logits/chosen": 12.75627613067627, "logits/rejected": 12.546211242675781, "logps/chosen": -3.8632514476776123, "logps/rejected": -3.9335482120513916, "loss": 4.0378, "rewards/accuracies": 0.5, "rewards/chosen": -38.63251495361328, "rewards/margins": 0.7029666900634766, "rewards/rejected": -39.335479736328125, "step": 2955 }, { "epoch": 0.4025054466230937, "grad_norm": 47.51209255007917, "learning_rate": 5.97028721648516e-07, "logits/chosen": 12.68276596069336, "logits/rejected": 12.893577575683594, "logps/chosen": -3.732923984527588, "logps/rejected": -3.962625741958618, "loss": 3.4738, "rewards/accuracies": 0.75, "rewards/chosen": -37.32923889160156, "rewards/margins": 2.2970175743103027, "rewards/rejected": -39.626258850097656, "step": 2956 }, { "epoch": 0.4026416122004357, "grad_norm": 36.33285397419207, "learning_rate": 5.968632256760477e-07, "logits/chosen": 11.80721664428711, "logits/rejected": 12.367555618286133, "logps/chosen": -3.5584349632263184, "logps/rejected": -3.8386776447296143, "loss": 4.1502, "rewards/accuracies": 0.75, "rewards/chosen": -35.5843505859375, "rewards/margins": 2.802428722381592, "rewards/rejected": -38.38677978515625, "step": 2957 }, { "epoch": 0.4027777777777778, "grad_norm": 34.752131817805044, "learning_rate": 5.966976852207064e-07, "logits/chosen": 12.912703514099121, "logits/rejected": 12.852296829223633, "logps/chosen": -3.8373255729675293, "logps/rejected": -3.712249994277954, "loss": 4.0887, "rewards/accuracies": 0.5, "rewards/chosen": -38.373260498046875, "rewards/margins": -1.2507572174072266, "rewards/rejected": -37.12249755859375, "step": 2958 }, { "epoch": 0.4029139433551198, "grad_norm": 36.9960709808267, "learning_rate": 5.965321003198972e-07, "logits/chosen": 11.918190002441406, "logits/rejected": 12.303718566894531, "logps/chosen": -3.3511409759521484, "logps/rejected": -3.576723098754883, "loss": 4.0032, "rewards/accuracies": 0.75, "rewards/chosen": -33.511409759521484, "rewards/margins": 2.255821704864502, "rewards/rejected": -35.76723098754883, "step": 2959 }, { "epoch": 0.40305010893246185, "grad_norm": 38.52916500866209, "learning_rate": 5.963664710110354e-07, "logits/chosen": 11.97546100616455, "logits/rejected": 12.007740020751953, "logps/chosen": -3.5798609256744385, "logps/rejected": -3.8473362922668457, "loss": 4.5885, "rewards/accuracies": 0.75, "rewards/chosen": -35.798606872558594, "rewards/margins": 2.6747546195983887, "rewards/rejected": -38.473365783691406, "step": 2960 }, { "epoch": 0.40318627450980393, "grad_norm": 38.27540741923537, "learning_rate": 5.962007973315462e-07, "logits/chosen": 12.824838638305664, "logits/rejected": 12.431238174438477, "logps/chosen": -3.7704145908355713, "logps/rejected": -3.723602294921875, "loss": 4.262, "rewards/accuracies": 0.5, "rewards/chosen": -37.70414733886719, "rewards/margins": -0.4681215286254883, "rewards/rejected": -37.23602294921875, "step": 2961 }, { "epoch": 0.40332244008714596, "grad_norm": 40.88890660862068, "learning_rate": 5.960350793188651e-07, "logits/chosen": 12.042070388793945, "logits/rejected": 12.044347763061523, "logps/chosen": -3.9128026962280273, "logps/rejected": -4.104647636413574, "loss": 3.5663, "rewards/accuracies": 0.75, "rewards/chosen": -39.12802505493164, "rewards/margins": 1.9184513092041016, "rewards/rejected": -41.046478271484375, "step": 2962 }, { "epoch": 0.403458605664488, "grad_norm": 36.617823626048214, "learning_rate": 5.958693170104373e-07, "logits/chosen": 12.809206008911133, "logits/rejected": 12.598251342773438, "logps/chosen": -3.8019845485687256, "logps/rejected": -3.467648983001709, "loss": 4.0567, "rewards/accuracies": 0.25, "rewards/chosen": -38.01984786987305, "rewards/margins": -3.3433570861816406, "rewards/rejected": -34.676490783691406, "step": 2963 }, { "epoch": 0.4035947712418301, "grad_norm": 36.97931119270216, "learning_rate": 5.957035104437183e-07, "logits/chosen": 12.077447891235352, "logits/rejected": 12.826348304748535, "logps/chosen": -3.7857956886291504, "logps/rejected": -4.118680000305176, "loss": 3.4782, "rewards/accuracies": 1.0, "rewards/chosen": -37.85795593261719, "rewards/margins": 3.3288445472717285, "rewards/rejected": -41.186798095703125, "step": 2964 }, { "epoch": 0.4037309368191721, "grad_norm": 37.102066911424146, "learning_rate": 5.955376596561735e-07, "logits/chosen": 11.79020881652832, "logits/rejected": 13.621015548706055, "logps/chosen": -3.588395595550537, "logps/rejected": -3.816974639892578, "loss": 3.9239, "rewards/accuracies": 0.75, "rewards/chosen": -35.88395690917969, "rewards/margins": 2.2857933044433594, "rewards/rejected": -38.16974639892578, "step": 2965 }, { "epoch": 0.4038671023965142, "grad_norm": 40.944180019306785, "learning_rate": 5.953717646852781e-07, "logits/chosen": 12.734160423278809, "logits/rejected": 12.686569213867188, "logps/chosen": -3.81683087348938, "logps/rejected": -4.09109354019165, "loss": 4.1474, "rewards/accuracies": 0.75, "rewards/chosen": -38.16830825805664, "rewards/margins": 2.7426252365112305, "rewards/rejected": -40.91093444824219, "step": 2966 }, { "epoch": 0.4040032679738562, "grad_norm": 44.37155814199838, "learning_rate": 5.952058255685175e-07, "logits/chosen": 12.942558288574219, "logits/rejected": 13.094708442687988, "logps/chosen": -3.774397611618042, "logps/rejected": -4.1078033447265625, "loss": 4.3322, "rewards/accuracies": 0.75, "rewards/chosen": -37.743980407714844, "rewards/margins": 3.3340535163879395, "rewards/rejected": -41.078033447265625, "step": 2967 }, { "epoch": 0.40413943355119825, "grad_norm": 39.23914402226371, "learning_rate": 5.950398423433871e-07, "logits/chosen": 11.844785690307617, "logits/rejected": 11.934147834777832, "logps/chosen": -3.7302637100219727, "logps/rejected": -3.6890814304351807, "loss": 4.1941, "rewards/accuracies": 0.5, "rewards/chosen": -37.30263900756836, "rewards/margins": -0.4118213653564453, "rewards/rejected": -36.89081573486328, "step": 2968 }, { "epoch": 0.40427559912854033, "grad_norm": 44.314963599595565, "learning_rate": 5.94873815047392e-07, "logits/chosen": 12.355996131896973, "logits/rejected": 13.329907417297363, "logps/chosen": -3.756291627883911, "logps/rejected": -4.280960559844971, "loss": 4.2176, "rewards/accuracies": 0.75, "rewards/chosen": -37.56291580200195, "rewards/margins": 5.246687889099121, "rewards/rejected": -42.80960464477539, "step": 2969 }, { "epoch": 0.40441176470588236, "grad_norm": 45.05723299480032, "learning_rate": 5.947077437180475e-07, "logits/chosen": 12.195576667785645, "logits/rejected": 12.287035942077637, "logps/chosen": -3.6916539669036865, "logps/rejected": -3.497663974761963, "loss": 4.0583, "rewards/accuracies": 0.5, "rewards/chosen": -36.916542053222656, "rewards/margins": -1.939901351928711, "rewards/rejected": -34.97663879394531, "step": 2970 }, { "epoch": 0.4045479302832244, "grad_norm": 42.37071058499893, "learning_rate": 5.94541628392879e-07, "logits/chosen": 12.881722450256348, "logits/rejected": 12.952688217163086, "logps/chosen": -3.90594482421875, "logps/rejected": -3.9386720657348633, "loss": 4.1271, "rewards/accuracies": 0.5, "rewards/chosen": -39.0594482421875, "rewards/margins": 0.3272724151611328, "rewards/rejected": -39.38671875, "step": 2971 }, { "epoch": 0.4046840958605665, "grad_norm": 43.98257084179841, "learning_rate": 5.943754691094213e-07, "logits/chosen": 12.918737411499023, "logits/rejected": 12.765989303588867, "logps/chosen": -3.600620746612549, "logps/rejected": -3.8418872356414795, "loss": 5.0358, "rewards/accuracies": 0.5, "rewards/chosen": -36.00621032714844, "rewards/margins": 2.412663459777832, "rewards/rejected": -38.41887283325195, "step": 2972 }, { "epoch": 0.4048202614379085, "grad_norm": 41.84073042768928, "learning_rate": 5.942092659052198e-07, "logits/chosen": 12.333351135253906, "logits/rejected": 11.882884979248047, "logps/chosen": -3.7972867488861084, "logps/rejected": -3.951077461242676, "loss": 4.0015, "rewards/accuracies": 0.75, "rewards/chosen": -37.97286605834961, "rewards/margins": 1.5379080772399902, "rewards/rejected": -39.510772705078125, "step": 2973 }, { "epoch": 0.40495642701525053, "grad_norm": 40.45471367293558, "learning_rate": 5.940430188178293e-07, "logits/chosen": 12.190817832946777, "logits/rejected": 12.575836181640625, "logps/chosen": -3.7278685569763184, "logps/rejected": -3.8497142791748047, "loss": 4.0794, "rewards/accuracies": 0.75, "rewards/chosen": -37.278682708740234, "rewards/margins": 1.2184596061706543, "rewards/rejected": -38.49714279174805, "step": 2974 }, { "epoch": 0.4050925925925926, "grad_norm": 34.5313894792163, "learning_rate": 5.938767278848146e-07, "logits/chosen": 12.297490119934082, "logits/rejected": 12.029794692993164, "logps/chosen": -3.6289002895355225, "logps/rejected": -3.717071771621704, "loss": 3.6272, "rewards/accuracies": 0.5, "rewards/chosen": -36.28900146484375, "rewards/margins": 0.8817148208618164, "rewards/rejected": -37.170719146728516, "step": 2975 }, { "epoch": 0.40522875816993464, "grad_norm": 40.596984198325174, "learning_rate": 5.937103931437507e-07, "logits/chosen": 12.37564468383789, "logits/rejected": 12.299576759338379, "logps/chosen": -4.19270133972168, "logps/rejected": -3.8369622230529785, "loss": 4.3729, "rewards/accuracies": 0.0, "rewards/chosen": -41.92700958251953, "rewards/margins": -3.5573883056640625, "rewards/rejected": -38.36962127685547, "step": 2976 }, { "epoch": 0.4053649237472767, "grad_norm": 39.302173504103635, "learning_rate": 5.935440146322223e-07, "logits/chosen": 12.248100280761719, "logits/rejected": 12.738546371459961, "logps/chosen": -3.740652561187744, "logps/rejected": -3.898038387298584, "loss": 3.4331, "rewards/accuracies": 0.75, "rewards/chosen": -37.406524658203125, "rewards/margins": 1.5738577842712402, "rewards/rejected": -38.980384826660156, "step": 2977 }, { "epoch": 0.40550108932461876, "grad_norm": 43.92218428603731, "learning_rate": 5.933775923878238e-07, "logits/chosen": 12.063749313354492, "logits/rejected": 12.93546199798584, "logps/chosen": -3.7024197578430176, "logps/rejected": -3.990994691848755, "loss": 3.6953, "rewards/accuracies": 1.0, "rewards/chosen": -37.024200439453125, "rewards/margins": 2.8857460021972656, "rewards/rejected": -39.909942626953125, "step": 2978 }, { "epoch": 0.4056372549019608, "grad_norm": 39.31704395395612, "learning_rate": 5.9321112644816e-07, "logits/chosen": 11.77320671081543, "logits/rejected": 12.686314582824707, "logps/chosen": -3.722895622253418, "logps/rejected": -4.036956310272217, "loss": 4.3483, "rewards/accuracies": 1.0, "rewards/chosen": -37.22895431518555, "rewards/margins": 3.1406078338623047, "rewards/rejected": -40.369564056396484, "step": 2979 }, { "epoch": 0.4057734204793028, "grad_norm": 43.870460054513956, "learning_rate": 5.93044616850845e-07, "logits/chosen": 11.296998977661133, "logits/rejected": 12.064302444458008, "logps/chosen": -3.4931745529174805, "logps/rejected": -3.8389430046081543, "loss": 4.4543, "rewards/accuracies": 0.75, "rewards/chosen": -34.93174362182617, "rewards/margins": 3.4576854705810547, "rewards/rejected": -38.38943099975586, "step": 2980 }, { "epoch": 0.4059095860566449, "grad_norm": 36.739315773408, "learning_rate": 5.92878063633503e-07, "logits/chosen": 11.736146926879883, "logits/rejected": 11.851946830749512, "logps/chosen": -3.7088565826416016, "logps/rejected": -3.9239373207092285, "loss": 3.8406, "rewards/accuracies": 1.0, "rewards/chosen": -37.08856201171875, "rewards/margins": 2.1508092880249023, "rewards/rejected": -39.23937225341797, "step": 2981 }, { "epoch": 0.40604575163398693, "grad_norm": 41.48586348179293, "learning_rate": 5.927114668337683e-07, "logits/chosen": 12.067859649658203, "logits/rejected": 12.305180549621582, "logps/chosen": -4.009631633758545, "logps/rejected": -3.831023931503296, "loss": 4.4049, "rewards/accuracies": 0.25, "rewards/chosen": -40.0963134765625, "rewards/margins": -1.786076545715332, "rewards/rejected": -38.31024169921875, "step": 2982 }, { "epoch": 0.40618191721132896, "grad_norm": 42.93767003846976, "learning_rate": 5.925448264892847e-07, "logits/chosen": 11.741094589233398, "logits/rejected": 11.80588150024414, "logps/chosen": -4.070340633392334, "logps/rejected": -3.587538719177246, "loss": 4.428, "rewards/accuracies": 0.25, "rewards/chosen": -40.703407287597656, "rewards/margins": -4.828020095825195, "rewards/rejected": -35.87538528442383, "step": 2983 }, { "epoch": 0.40631808278867104, "grad_norm": 37.49145396906605, "learning_rate": 5.923781426377059e-07, "logits/chosen": 11.918017387390137, "logits/rejected": 12.58167839050293, "logps/chosen": -3.7833735942840576, "logps/rejected": -3.9573137760162354, "loss": 3.629, "rewards/accuracies": 0.75, "rewards/chosen": -37.83373260498047, "rewards/margins": 1.7394018173217773, "rewards/rejected": -39.57313537597656, "step": 2984 }, { "epoch": 0.40645424836601307, "grad_norm": 40.47590529132398, "learning_rate": 5.922114153166956e-07, "logits/chosen": 12.170158386230469, "logits/rejected": 12.71706771850586, "logps/chosen": -3.945040702819824, "logps/rejected": -4.110439300537109, "loss": 4.2836, "rewards/accuracies": 0.75, "rewards/chosen": -39.450408935546875, "rewards/margins": 1.6539878845214844, "rewards/rejected": -41.104393005371094, "step": 2985 }, { "epoch": 0.4065904139433551, "grad_norm": 37.27677638387367, "learning_rate": 5.920446445639272e-07, "logits/chosen": 12.527685165405273, "logits/rejected": 12.515868186950684, "logps/chosen": -3.7354025840759277, "logps/rejected": -4.242862224578857, "loss": 3.8677, "rewards/accuracies": 0.75, "rewards/chosen": -37.354026794433594, "rewards/margins": 5.0746002197265625, "rewards/rejected": -42.42862319946289, "step": 2986 }, { "epoch": 0.4067265795206972, "grad_norm": 36.867578058372196, "learning_rate": 5.918778304170838e-07, "logits/chosen": 12.169968605041504, "logits/rejected": 12.687080383300781, "logps/chosen": -3.714658498764038, "logps/rejected": -3.7875165939331055, "loss": 4.0491, "rewards/accuracies": 0.75, "rewards/chosen": -37.146583557128906, "rewards/margins": 0.728581428527832, "rewards/rejected": -37.87516403198242, "step": 2987 }, { "epoch": 0.4068627450980392, "grad_norm": 44.537524922601875, "learning_rate": 5.917109729138586e-07, "logits/chosen": 12.11403751373291, "logits/rejected": 11.317998886108398, "logps/chosen": -4.028419494628906, "logps/rejected": -3.3891587257385254, "loss": 4.4635, "rewards/accuracies": 0.0, "rewards/chosen": -40.28419494628906, "rewards/margins": -6.392609596252441, "rewards/rejected": -33.89158630371094, "step": 2988 }, { "epoch": 0.40699891067538124, "grad_norm": 40.980822561691326, "learning_rate": 5.915440720919545e-07, "logits/chosen": 12.81559944152832, "logits/rejected": 13.123876571655273, "logps/chosen": -4.0709004402160645, "logps/rejected": -4.117004871368408, "loss": 4.4323, "rewards/accuracies": 0.75, "rewards/chosen": -40.70900344848633, "rewards/margins": 0.4610462188720703, "rewards/rejected": -41.17005157470703, "step": 2989 }, { "epoch": 0.4071350762527233, "grad_norm": 40.53022403591372, "learning_rate": 5.913771279890838e-07, "logits/chosen": 11.093855857849121, "logits/rejected": 11.849079132080078, "logps/chosen": -3.7544829845428467, "logps/rejected": -4.079785346984863, "loss": 3.6899, "rewards/accuracies": 1.0, "rewards/chosen": -37.544830322265625, "rewards/margins": 3.253024101257324, "rewards/rejected": -40.797855377197266, "step": 2990 }, { "epoch": 0.40727124183006536, "grad_norm": 38.10393546978769, "learning_rate": 5.912101406429691e-07, "logits/chosen": 12.016830444335938, "logits/rejected": 12.377187728881836, "logps/chosen": -3.661759853363037, "logps/rejected": -3.7169065475463867, "loss": 3.5976, "rewards/accuracies": 0.5, "rewards/chosen": -36.61759948730469, "rewards/margins": 0.5514693260192871, "rewards/rejected": -37.1690673828125, "step": 2991 }, { "epoch": 0.4074074074074074, "grad_norm": 42.040338805159514, "learning_rate": 5.910431100913427e-07, "logits/chosen": 11.431777954101562, "logits/rejected": 11.252723693847656, "logps/chosen": -3.7608089447021484, "logps/rejected": -3.659886360168457, "loss": 4.3675, "rewards/accuracies": 0.25, "rewards/chosen": -37.60808563232422, "rewards/margins": -1.009225845336914, "rewards/rejected": -36.59886169433594, "step": 2992 }, { "epoch": 0.40754357298474947, "grad_norm": 38.8908644087204, "learning_rate": 5.908760363719463e-07, "logits/chosen": 10.585556030273438, "logits/rejected": 11.583932876586914, "logps/chosen": -3.4548707008361816, "logps/rejected": -3.6635632514953613, "loss": 4.2357, "rewards/accuracies": 0.75, "rewards/chosen": -34.5487060546875, "rewards/margins": 2.0869264602661133, "rewards/rejected": -36.6356315612793, "step": 2993 }, { "epoch": 0.4076797385620915, "grad_norm": 41.871214402376324, "learning_rate": 5.907089195225316e-07, "logits/chosen": 12.186311721801758, "logits/rejected": 12.010946273803711, "logps/chosen": -4.117388725280762, "logps/rejected": -4.166601181030273, "loss": 4.7196, "rewards/accuracies": 0.25, "rewards/chosen": -41.17388916015625, "rewards/margins": 0.4921245574951172, "rewards/rejected": -41.666011810302734, "step": 2994 }, { "epoch": 0.4078159041394335, "grad_norm": 37.91697472306761, "learning_rate": 5.905417595808603e-07, "logits/chosen": 11.510688781738281, "logits/rejected": 11.387348175048828, "logps/chosen": -3.826775074005127, "logps/rejected": -3.860144853591919, "loss": 4.3129, "rewards/accuracies": 0.5, "rewards/chosen": -38.26774978637695, "rewards/margins": 0.3336982727050781, "rewards/rejected": -38.60144805908203, "step": 2995 }, { "epoch": 0.4079520697167756, "grad_norm": 39.37978348956718, "learning_rate": 5.903745565847033e-07, "logits/chosen": 11.286006927490234, "logits/rejected": 11.276928901672363, "logps/chosen": -3.44307804107666, "logps/rejected": -3.9229390621185303, "loss": 3.6388, "rewards/accuracies": 0.75, "rewards/chosen": -34.43077850341797, "rewards/margins": 4.798609733581543, "rewards/rejected": -39.229393005371094, "step": 2996 }, { "epoch": 0.40808823529411764, "grad_norm": 42.212968474048864, "learning_rate": 5.902073105718416e-07, "logits/chosen": 11.46088981628418, "logits/rejected": 11.501407623291016, "logps/chosen": -3.430562973022461, "logps/rejected": -3.574728488922119, "loss": 3.7437, "rewards/accuracies": 0.75, "rewards/chosen": -34.30562973022461, "rewards/margins": 1.441655158996582, "rewards/rejected": -35.747283935546875, "step": 2997 }, { "epoch": 0.40822440087145967, "grad_norm": 40.15182702517928, "learning_rate": 5.900400215800658e-07, "logits/chosen": 11.712324142456055, "logits/rejected": 12.219257354736328, "logps/chosen": -3.7278988361358643, "logps/rejected": -4.171760082244873, "loss": 3.8238, "rewards/accuracies": 1.0, "rewards/chosen": -37.278987884521484, "rewards/margins": 4.438613414764404, "rewards/rejected": -41.71760177612305, "step": 2998 }, { "epoch": 0.40836056644880175, "grad_norm": 40.713983361269676, "learning_rate": 5.898726896471763e-07, "logits/chosen": 12.630241394042969, "logits/rejected": 12.319063186645508, "logps/chosen": -3.7633306980133057, "logps/rejected": -3.7623419761657715, "loss": 4.1619, "rewards/accuracies": 0.5, "rewards/chosen": -37.63330841064453, "rewards/margins": -0.009885787963867188, "rewards/rejected": -37.62342071533203, "step": 2999 }, { "epoch": 0.4084967320261438, "grad_norm": 42.251828034196684, "learning_rate": 5.89705314810983e-07, "logits/chosen": 11.981451988220215, "logits/rejected": 11.699769973754883, "logps/chosen": -3.797236919403076, "logps/rejected": -3.7481002807617188, "loss": 3.426, "rewards/accuracies": 0.75, "rewards/chosen": -37.97236633300781, "rewards/margins": -0.4913649559020996, "rewards/rejected": -37.48100280761719, "step": 3000 }, { "epoch": 0.4086328976034858, "grad_norm": 44.7631595187961, "learning_rate": 5.895378971093056e-07, "logits/chosen": 11.77853012084961, "logits/rejected": 11.819169044494629, "logps/chosen": -3.3401474952697754, "logps/rejected": -4.039588928222656, "loss": 4.1605, "rewards/accuracies": 1.0, "rewards/chosen": -33.40147399902344, "rewards/margins": 6.994412422180176, "rewards/rejected": -40.39588928222656, "step": 3001 }, { "epoch": 0.4087690631808279, "grad_norm": 56.88216080094172, "learning_rate": 5.893704365799738e-07, "logits/chosen": 11.020462036132812, "logits/rejected": 12.365694046020508, "logps/chosen": -3.7141880989074707, "logps/rejected": -4.2217254638671875, "loss": 4.2447, "rewards/accuracies": 1.0, "rewards/chosen": -37.141883850097656, "rewards/margins": 5.075369834899902, "rewards/rejected": -42.217254638671875, "step": 3002 }, { "epoch": 0.4089052287581699, "grad_norm": 42.522575971469216, "learning_rate": 5.892029332608263e-07, "logits/chosen": 11.1091947555542, "logits/rejected": 10.725496292114258, "logps/chosen": -3.566262722015381, "logps/rejected": -3.61685848236084, "loss": 4.1534, "rewards/accuracies": 0.5, "rewards/chosen": -35.662628173828125, "rewards/margins": 0.505958080291748, "rewards/rejected": -36.16858673095703, "step": 3003 }, { "epoch": 0.409041394335512, "grad_norm": 51.55201718696897, "learning_rate": 5.890353871897122e-07, "logits/chosen": 12.222354888916016, "logits/rejected": 12.415159225463867, "logps/chosen": -3.999553680419922, "logps/rejected": -3.9746408462524414, "loss": 3.907, "rewards/accuracies": 0.25, "rewards/chosen": -39.995540618896484, "rewards/margins": -0.2491302490234375, "rewards/rejected": -39.74640655517578, "step": 3004 }, { "epoch": 0.40917755991285404, "grad_norm": 38.97554638049427, "learning_rate": 5.888677984044898e-07, "logits/chosen": 10.097330093383789, "logits/rejected": 10.422469139099121, "logps/chosen": -3.525014877319336, "logps/rejected": -3.510996103286743, "loss": 4.0233, "rewards/accuracies": 0.75, "rewards/chosen": -35.250144958496094, "rewards/margins": -0.14018726348876953, "rewards/rejected": -35.109962463378906, "step": 3005 }, { "epoch": 0.40931372549019607, "grad_norm": 44.3634250930736, "learning_rate": 5.887001669430271e-07, "logits/chosen": 11.321417808532715, "logits/rejected": 10.981134414672852, "logps/chosen": -3.807454824447632, "logps/rejected": -3.6045775413513184, "loss": 4.8009, "rewards/accuracies": 0.5, "rewards/chosen": -38.074546813964844, "rewards/margins": -2.0287704467773438, "rewards/rejected": -36.0457763671875, "step": 3006 }, { "epoch": 0.40944989106753815, "grad_norm": 42.20378872928597, "learning_rate": 5.88532492843202e-07, "logits/chosen": 11.465641021728516, "logits/rejected": 11.972526550292969, "logps/chosen": -3.488889694213867, "logps/rejected": -3.517935037612915, "loss": 4.1674, "rewards/accuracies": 0.75, "rewards/chosen": -34.88889694213867, "rewards/margins": 0.2904520034790039, "rewards/rejected": -35.179351806640625, "step": 3007 }, { "epoch": 0.4095860566448802, "grad_norm": 48.20407661779447, "learning_rate": 5.883647761429015e-07, "logits/chosen": 11.835187911987305, "logits/rejected": 12.233688354492188, "logps/chosen": -3.895045042037964, "logps/rejected": -3.9968955516815186, "loss": 4.3806, "rewards/accuracies": 0.75, "rewards/chosen": -38.95044708251953, "rewards/margins": 1.0185070037841797, "rewards/rejected": -39.968955993652344, "step": 3008 }, { "epoch": 0.4097222222222222, "grad_norm": 35.88288345077489, "learning_rate": 5.88197016880023e-07, "logits/chosen": 11.262094497680664, "logits/rejected": 12.686464309692383, "logps/chosen": -3.4743504524230957, "logps/rejected": -4.174921035766602, "loss": 3.9517, "rewards/accuracies": 1.0, "rewards/chosen": -34.74350357055664, "rewards/margins": 7.005707740783691, "rewards/rejected": -41.749210357666016, "step": 3009 }, { "epoch": 0.4098583877995643, "grad_norm": 38.1819682150993, "learning_rate": 5.880292150924726e-07, "logits/chosen": 10.943988800048828, "logits/rejected": 12.18339729309082, "logps/chosen": -4.084486484527588, "logps/rejected": -4.323209762573242, "loss": 3.5115, "rewards/accuracies": 0.5, "rewards/chosen": -40.84486389160156, "rewards/margins": 2.387234687805176, "rewards/rejected": -43.23210144042969, "step": 3010 }, { "epoch": 0.4099945533769063, "grad_norm": 36.50997875376225, "learning_rate": 5.878613708181671e-07, "logits/chosen": 11.210012435913086, "logits/rejected": 12.402524948120117, "logps/chosen": -3.692554473876953, "logps/rejected": -4.17850399017334, "loss": 3.9684, "rewards/accuracies": 0.75, "rewards/chosen": -36.92554473876953, "rewards/margins": 4.859495162963867, "rewards/rejected": -41.78504180908203, "step": 3011 }, { "epoch": 0.41013071895424835, "grad_norm": 42.183771974095436, "learning_rate": 5.876934840950319e-07, "logits/chosen": 11.818717956542969, "logits/rejected": 12.106520652770996, "logps/chosen": -3.503732204437256, "logps/rejected": -3.6479365825653076, "loss": 3.6777, "rewards/accuracies": 0.75, "rewards/chosen": -35.037322998046875, "rewards/margins": 1.4420413970947266, "rewards/rejected": -36.47936248779297, "step": 3012 }, { "epoch": 0.41026688453159044, "grad_norm": 45.76455543949519, "learning_rate": 5.875255549610023e-07, "logits/chosen": 11.511672019958496, "logits/rejected": 11.764707565307617, "logps/chosen": -3.6507585048675537, "logps/rejected": -3.9249937534332275, "loss": 4.3689, "rewards/accuracies": 0.5, "rewards/chosen": -36.50758743286133, "rewards/margins": 2.7423524856567383, "rewards/rejected": -39.24993896484375, "step": 3013 }, { "epoch": 0.41040305010893247, "grad_norm": 38.31083376975645, "learning_rate": 5.873575834540236e-07, "logits/chosen": 11.265876770019531, "logits/rejected": 12.052803039550781, "logps/chosen": -3.391695499420166, "logps/rejected": -3.6110880374908447, "loss": 3.2843, "rewards/accuracies": 0.75, "rewards/chosen": -33.916954040527344, "rewards/margins": 2.1939239501953125, "rewards/rejected": -36.110877990722656, "step": 3014 }, { "epoch": 0.4105392156862745, "grad_norm": 41.92743620611266, "learning_rate": 5.871895696120502e-07, "logits/chosen": 10.219159126281738, "logits/rejected": 11.46277904510498, "logps/chosen": -3.6992852687835693, "logps/rejected": -3.8999557495117188, "loss": 3.9225, "rewards/accuracies": 0.75, "rewards/chosen": -36.99285125732422, "rewards/margins": 2.0067033767700195, "rewards/rejected": -38.99955749511719, "step": 3015 }, { "epoch": 0.4106753812636166, "grad_norm": 39.66820410792051, "learning_rate": 5.870215134730463e-07, "logits/chosen": 11.029792785644531, "logits/rejected": 11.563834190368652, "logps/chosen": -3.829519271850586, "logps/rejected": -3.877387762069702, "loss": 3.386, "rewards/accuracies": 0.5, "rewards/chosen": -38.29519271850586, "rewards/margins": 0.4786872863769531, "rewards/rejected": -38.77388000488281, "step": 3016 }, { "epoch": 0.4108115468409586, "grad_norm": 38.98975304769738, "learning_rate": 5.868534150749852e-07, "logits/chosen": 12.27197265625, "logits/rejected": 12.50662899017334, "logps/chosen": -4.199267864227295, "logps/rejected": -4.574864387512207, "loss": 3.4135, "rewards/accuracies": 1.0, "rewards/chosen": -41.992679595947266, "rewards/margins": 3.755965232849121, "rewards/rejected": -45.74864196777344, "step": 3017 }, { "epoch": 0.41094771241830064, "grad_norm": 75.88002804279226, "learning_rate": 5.866852744558507e-07, "logits/chosen": 10.901683807373047, "logits/rejected": 11.067262649536133, "logps/chosen": -3.5733935832977295, "logps/rejected": -3.799595355987549, "loss": 3.9921, "rewards/accuracies": 0.75, "rewards/chosen": -35.73393249511719, "rewards/margins": 2.2620177268981934, "rewards/rejected": -37.99595260620117, "step": 3018 }, { "epoch": 0.4110838779956427, "grad_norm": 39.479235194016105, "learning_rate": 5.865170916536353e-07, "logits/chosen": 11.791587829589844, "logits/rejected": 11.802108764648438, "logps/chosen": -3.993971109390259, "logps/rejected": -3.703181743621826, "loss": 3.7811, "rewards/accuracies": 0.25, "rewards/chosen": -39.93971252441406, "rewards/margins": -2.907895088195801, "rewards/rejected": -37.03181457519531, "step": 3019 }, { "epoch": 0.41122004357298475, "grad_norm": 39.12698559941547, "learning_rate": 5.863488667063411e-07, "logits/chosen": 11.585409164428711, "logits/rejected": 11.053792953491211, "logps/chosen": -3.475471019744873, "logps/rejected": -3.652120351791382, "loss": 4.1945, "rewards/accuracies": 0.5, "rewards/chosen": -34.75471115112305, "rewards/margins": 1.7664942741394043, "rewards/rejected": -36.52120590209961, "step": 3020 }, { "epoch": 0.4113562091503268, "grad_norm": 38.43899538956215, "learning_rate": 5.861805996519801e-07, "logits/chosen": 11.123777389526367, "logits/rejected": 11.674376487731934, "logps/chosen": -3.3488807678222656, "logps/rejected": -3.7625417709350586, "loss": 3.8683, "rewards/accuracies": 0.75, "rewards/chosen": -33.488807678222656, "rewards/margins": 4.136610507965088, "rewards/rejected": -37.62541580200195, "step": 3021 }, { "epoch": 0.41149237472766886, "grad_norm": 42.829631668374596, "learning_rate": 5.860122905285737e-07, "logits/chosen": 11.450756072998047, "logits/rejected": 12.515913009643555, "logps/chosen": -3.3261144161224365, "logps/rejected": -3.723869800567627, "loss": 4.4932, "rewards/accuracies": 0.75, "rewards/chosen": -33.261146545410156, "rewards/margins": 3.977555274963379, "rewards/rejected": -37.23870086669922, "step": 3022 }, { "epoch": 0.4116285403050109, "grad_norm": 43.30060521179201, "learning_rate": 5.858439393741527e-07, "logits/chosen": 11.629413604736328, "logits/rejected": 12.890735626220703, "logps/chosen": -3.35201096534729, "logps/rejected": -3.9244155883789062, "loss": 4.1295, "rewards/accuracies": 1.0, "rewards/chosen": -33.520111083984375, "rewards/margins": 5.724045753479004, "rewards/rejected": -39.24415588378906, "step": 3023 }, { "epoch": 0.4117647058823529, "grad_norm": 45.49884139977954, "learning_rate": 5.856755462267573e-07, "logits/chosen": 12.022590637207031, "logits/rejected": 12.723024368286133, "logps/chosen": -3.8531856536865234, "logps/rejected": -4.146700382232666, "loss": 3.5504, "rewards/accuracies": 0.75, "rewards/chosen": -38.531856536865234, "rewards/margins": 2.935148239135742, "rewards/rejected": -41.467002868652344, "step": 3024 }, { "epoch": 0.411900871459695, "grad_norm": 40.70627243559358, "learning_rate": 5.855071111244376e-07, "logits/chosen": 11.922331809997559, "logits/rejected": 12.367023468017578, "logps/chosen": -3.948105812072754, "logps/rejected": -4.192493915557861, "loss": 3.7339, "rewards/accuracies": 0.75, "rewards/chosen": -39.481056213378906, "rewards/margins": 2.443881034851074, "rewards/rejected": -41.92493438720703, "step": 3025 }, { "epoch": 0.41203703703703703, "grad_norm": 42.14375413236662, "learning_rate": 5.853386341052525e-07, "logits/chosen": 10.954471588134766, "logits/rejected": 11.269472122192383, "logps/chosen": -3.5724282264709473, "logps/rejected": -3.7500507831573486, "loss": 4.0137, "rewards/accuracies": 0.75, "rewards/chosen": -35.724281311035156, "rewards/margins": 1.7762227058410645, "rewards/rejected": -37.50050354003906, "step": 3026 }, { "epoch": 0.41217320261437906, "grad_norm": 40.21095744370335, "learning_rate": 5.851701152072711e-07, "logits/chosen": 11.052026748657227, "logits/rejected": 11.529369354248047, "logps/chosen": -3.9030303955078125, "logps/rejected": -4.186338424682617, "loss": 4.1081, "rewards/accuracies": 0.75, "rewards/chosen": -39.030303955078125, "rewards/margins": 2.8330774307250977, "rewards/rejected": -41.863380432128906, "step": 3027 }, { "epoch": 0.41230936819172115, "grad_norm": 37.14767868477843, "learning_rate": 5.850015544685716e-07, "logits/chosen": 12.782548904418945, "logits/rejected": 11.510734558105469, "logps/chosen": -4.037545204162598, "logps/rejected": -3.7957115173339844, "loss": 3.7612, "rewards/accuracies": 0.25, "rewards/chosen": -40.37545394897461, "rewards/margins": -2.418336868286133, "rewards/rejected": -37.957115173339844, "step": 3028 }, { "epoch": 0.4124455337690632, "grad_norm": 38.71008184499003, "learning_rate": 5.848329519272414e-07, "logits/chosen": 12.3867826461792, "logits/rejected": 13.201811790466309, "logps/chosen": -4.018135070800781, "logps/rejected": -4.218866348266602, "loss": 3.8211, "rewards/accuracies": 0.5, "rewards/chosen": -40.18135452270508, "rewards/margins": 2.0073070526123047, "rewards/rejected": -42.18865966796875, "step": 3029 }, { "epoch": 0.4125816993464052, "grad_norm": 44.66535356299792, "learning_rate": 5.846643076213781e-07, "logits/chosen": 11.595441818237305, "logits/rejected": 11.818754196166992, "logps/chosen": -3.7671453952789307, "logps/rejected": -3.9366085529327393, "loss": 4.068, "rewards/accuracies": 0.5, "rewards/chosen": -37.67145538330078, "rewards/margins": 1.694629192352295, "rewards/rejected": -39.366085052490234, "step": 3030 }, { "epoch": 0.4127178649237473, "grad_norm": 41.6707525067542, "learning_rate": 5.84495621589088e-07, "logits/chosen": 11.72240924835205, "logits/rejected": 12.405269622802734, "logps/chosen": -3.534954786300659, "logps/rejected": -4.151716709136963, "loss": 3.8235, "rewards/accuracies": 1.0, "rewards/chosen": -35.34954833984375, "rewards/margins": 6.1676177978515625, "rewards/rejected": -41.51716613769531, "step": 3031 }, { "epoch": 0.4128540305010893, "grad_norm": 44.48002443805005, "learning_rate": 5.843268938684871e-07, "logits/chosen": 12.489194869995117, "logits/rejected": 12.102404594421387, "logps/chosen": -3.8895373344421387, "logps/rejected": -4.017647743225098, "loss": 4.3557, "rewards/accuracies": 0.75, "rewards/chosen": -38.8953742980957, "rewards/margins": 1.2811012268066406, "rewards/rejected": -40.176475524902344, "step": 3032 }, { "epoch": 0.41299019607843135, "grad_norm": 36.706625142240476, "learning_rate": 5.841581244977009e-07, "logits/chosen": 12.194648742675781, "logits/rejected": 12.559186935424805, "logps/chosen": -3.878648519515991, "logps/rejected": -3.9035449028015137, "loss": 4.0596, "rewards/accuracies": 0.5, "rewards/chosen": -38.78648376464844, "rewards/margins": 0.2489643096923828, "rewards/rejected": -39.03544998168945, "step": 3033 }, { "epoch": 0.41312636165577343, "grad_norm": 43.07442510957419, "learning_rate": 5.839893135148642e-07, "logits/chosen": 12.33547592163086, "logits/rejected": 12.454740524291992, "logps/chosen": -3.788878917694092, "logps/rejected": -4.038830757141113, "loss": 3.4228, "rewards/accuracies": 0.75, "rewards/chosen": -37.888790130615234, "rewards/margins": 2.4995193481445312, "rewards/rejected": -40.3883056640625, "step": 3034 }, { "epoch": 0.41326252723311546, "grad_norm": 38.858371874481264, "learning_rate": 5.838204609581212e-07, "logits/chosen": 12.636924743652344, "logits/rejected": 12.759271621704102, "logps/chosen": -3.575437307357788, "logps/rejected": -3.748034954071045, "loss": 3.5149, "rewards/accuracies": 0.5, "rewards/chosen": -35.754371643066406, "rewards/margins": 1.7259759902954102, "rewards/rejected": -37.4803466796875, "step": 3035 }, { "epoch": 0.4133986928104575, "grad_norm": 39.13037290343174, "learning_rate": 5.836515668656256e-07, "logits/chosen": 12.846170425415039, "logits/rejected": 13.758893966674805, "logps/chosen": -4.12081241607666, "logps/rejected": -4.385051727294922, "loss": 3.6499, "rewards/accuracies": 0.75, "rewards/chosen": -41.208126068115234, "rewards/margins": 2.642388343811035, "rewards/rejected": -43.85051345825195, "step": 3036 }, { "epoch": 0.4135348583877996, "grad_norm": 38.88841376995989, "learning_rate": 5.834826312755404e-07, "logits/chosen": 10.869498252868652, "logits/rejected": 12.386873245239258, "logps/chosen": -3.384186267852783, "logps/rejected": -3.9208884239196777, "loss": 4.0075, "rewards/accuracies": 1.0, "rewards/chosen": -33.841861724853516, "rewards/margins": 5.367022514343262, "rewards/rejected": -39.208885192871094, "step": 3037 }, { "epoch": 0.4136710239651416, "grad_norm": 42.95274612723899, "learning_rate": 5.83313654226038e-07, "logits/chosen": 12.768632888793945, "logits/rejected": 12.023064613342285, "logps/chosen": -4.105066299438477, "logps/rejected": -4.140658855438232, "loss": 4.3162, "rewards/accuracies": 0.25, "rewards/chosen": -41.0506591796875, "rewards/margins": 0.3559274673461914, "rewards/rejected": -41.40658950805664, "step": 3038 }, { "epoch": 0.41380718954248363, "grad_norm": 43.81982671023233, "learning_rate": 5.831446357553001e-07, "logits/chosen": 11.553131103515625, "logits/rejected": 12.215961456298828, "logps/chosen": -3.7618489265441895, "logps/rejected": -4.012207984924316, "loss": 4.2623, "rewards/accuracies": 0.5, "rewards/chosen": -37.618492126464844, "rewards/margins": 2.503591537475586, "rewards/rejected": -40.1220817565918, "step": 3039 }, { "epoch": 0.4139433551198257, "grad_norm": 42.01378898177441, "learning_rate": 5.829755759015179e-07, "logits/chosen": 11.530574798583984, "logits/rejected": 12.412267684936523, "logps/chosen": -3.984473943710327, "logps/rejected": -4.310772895812988, "loss": 4.1157, "rewards/accuracies": 0.75, "rewards/chosen": -39.8447380065918, "rewards/margins": 3.2629880905151367, "rewards/rejected": -43.10772705078125, "step": 3040 }, { "epoch": 0.41407952069716775, "grad_norm": 42.97953702992001, "learning_rate": 5.828064747028918e-07, "logits/chosen": 12.582487106323242, "logits/rejected": 12.711697578430176, "logps/chosen": -4.455732822418213, "logps/rejected": -4.394222736358643, "loss": 3.5662, "rewards/accuracies": 0.75, "rewards/chosen": -44.55732727050781, "rewards/margins": -0.6151018142700195, "rewards/rejected": -43.94222640991211, "step": 3041 }, { "epoch": 0.41421568627450983, "grad_norm": 36.79055105180995, "learning_rate": 5.826373321976316e-07, "logits/chosen": 12.05628776550293, "logits/rejected": 12.968263626098633, "logps/chosen": -4.080760955810547, "logps/rejected": -4.394286632537842, "loss": 4.1099, "rewards/accuracies": 0.75, "rewards/chosen": -40.8076057434082, "rewards/margins": 3.1352615356445312, "rewards/rejected": -43.94286346435547, "step": 3042 }, { "epoch": 0.41435185185185186, "grad_norm": 43.97691898596567, "learning_rate": 5.824681484239565e-07, "logits/chosen": 12.857152938842773, "logits/rejected": 13.029394149780273, "logps/chosen": -4.629711151123047, "logps/rejected": -4.557461261749268, "loss": 4.0354, "rewards/accuracies": 0.5, "rewards/chosen": -46.29711151123047, "rewards/margins": -0.7224998474121094, "rewards/rejected": -45.57461166381836, "step": 3043 }, { "epoch": 0.4144880174291939, "grad_norm": 42.79341574433245, "learning_rate": 5.82298923420095e-07, "logits/chosen": 12.47092056274414, "logits/rejected": 11.555463790893555, "logps/chosen": -4.104700088500977, "logps/rejected": -3.735642433166504, "loss": 4.1387, "rewards/accuracies": 0.0, "rewards/chosen": -41.047000885009766, "rewards/margins": -3.69057559967041, "rewards/rejected": -37.356422424316406, "step": 3044 }, { "epoch": 0.414624183006536, "grad_norm": 39.212022634007695, "learning_rate": 5.821296572242849e-07, "logits/chosen": 11.761796951293945, "logits/rejected": 12.703424453735352, "logps/chosen": -3.8044285774230957, "logps/rejected": -4.17343807220459, "loss": 3.861, "rewards/accuracies": 1.0, "rewards/chosen": -38.04428482055664, "rewards/margins": 3.690093994140625, "rewards/rejected": -41.73438262939453, "step": 3045 }, { "epoch": 0.414760348583878, "grad_norm": 41.82197259211361, "learning_rate": 5.819603498747733e-07, "logits/chosen": 12.577357292175293, "logits/rejected": 12.750123977661133, "logps/chosen": -4.2071661949157715, "logps/rejected": -4.2650604248046875, "loss": 3.5378, "rewards/accuracies": 0.5, "rewards/chosen": -42.07166290283203, "rewards/margins": 0.5789432525634766, "rewards/rejected": -42.650604248046875, "step": 3046 }, { "epoch": 0.41489651416122003, "grad_norm": 50.13669653869709, "learning_rate": 5.817910014098164e-07, "logits/chosen": 12.76482105255127, "logits/rejected": 13.342477798461914, "logps/chosen": -3.8915412425994873, "logps/rejected": -4.358480453491211, "loss": 3.8977, "rewards/accuracies": 0.75, "rewards/chosen": -38.91541290283203, "rewards/margins": 4.669394016265869, "rewards/rejected": -43.58480453491211, "step": 3047 }, { "epoch": 0.4150326797385621, "grad_norm": 39.33910419100949, "learning_rate": 5.816216118676801e-07, "logits/chosen": 11.202037811279297, "logits/rejected": 12.000896453857422, "logps/chosen": -3.6484174728393555, "logps/rejected": -4.167641639709473, "loss": 4.2511, "rewards/accuracies": 0.75, "rewards/chosen": -36.48417663574219, "rewards/margins": 5.192239761352539, "rewards/rejected": -41.676414489746094, "step": 3048 }, { "epoch": 0.41516884531590414, "grad_norm": 39.87907036635376, "learning_rate": 5.814521812866394e-07, "logits/chosen": 12.251523971557617, "logits/rejected": 12.605337142944336, "logps/chosen": -4.135918617248535, "logps/rejected": -4.299619674682617, "loss": 3.7958, "rewards/accuracies": 0.75, "rewards/chosen": -41.35918426513672, "rewards/margins": 1.6370105743408203, "rewards/rejected": -42.99619674682617, "step": 3049 }, { "epoch": 0.4153050108932462, "grad_norm": 42.57666453110247, "learning_rate": 5.812827097049782e-07, "logits/chosen": 12.142147064208984, "logits/rejected": 12.146503448486328, "logps/chosen": -3.6693997383117676, "logps/rejected": -4.430869102478027, "loss": 4.3791, "rewards/accuracies": 1.0, "rewards/chosen": -36.69399642944336, "rewards/margins": 7.614693641662598, "rewards/rejected": -44.30868911743164, "step": 3050 }, { "epoch": 0.41544117647058826, "grad_norm": 43.81165659935157, "learning_rate": 5.811131971609905e-07, "logits/chosen": 12.356489181518555, "logits/rejected": 12.162662506103516, "logps/chosen": -3.967745780944824, "logps/rejected": -4.217538833618164, "loss": 3.6829, "rewards/accuracies": 0.75, "rewards/chosen": -39.677459716796875, "rewards/margins": 2.4979286193847656, "rewards/rejected": -42.17538833618164, "step": 3051 }, { "epoch": 0.4155773420479303, "grad_norm": 45.81111956669284, "learning_rate": 5.809436436929787e-07, "logits/chosen": 11.196893692016602, "logits/rejected": 12.8974027633667, "logps/chosen": -3.748736619949341, "logps/rejected": -4.1433024406433105, "loss": 4.438, "rewards/accuracies": 1.0, "rewards/chosen": -37.48736572265625, "rewards/margins": 3.9456558227539062, "rewards/rejected": -41.433021545410156, "step": 3052 }, { "epoch": 0.4157135076252723, "grad_norm": 41.490011868020254, "learning_rate": 5.807740493392549e-07, "logits/chosen": 12.532933235168457, "logits/rejected": 12.617569923400879, "logps/chosen": -4.106143474578857, "logps/rejected": -4.254050254821777, "loss": 3.235, "rewards/accuracies": 0.75, "rewards/chosen": -41.061431884765625, "rewards/margins": 1.4790668487548828, "rewards/rejected": -42.540504455566406, "step": 3053 }, { "epoch": 0.4158496732026144, "grad_norm": 40.02437471172265, "learning_rate": 5.806044141381403e-07, "logits/chosen": 12.61100959777832, "logits/rejected": 13.320572853088379, "logps/chosen": -3.7934341430664062, "logps/rejected": -4.040341377258301, "loss": 3.6907, "rewards/accuracies": 1.0, "rewards/chosen": -37.93434143066406, "rewards/margins": 2.4690723419189453, "rewards/rejected": -40.403411865234375, "step": 3054 }, { "epoch": 0.41598583877995643, "grad_norm": 37.80375987565714, "learning_rate": 5.804347381279655e-07, "logits/chosen": 12.226755142211914, "logits/rejected": 12.407978057861328, "logps/chosen": -3.7969038486480713, "logps/rejected": -3.9295713901519775, "loss": 3.783, "rewards/accuracies": 0.5, "rewards/chosen": -37.96903991699219, "rewards/margins": 1.3266735076904297, "rewards/rejected": -39.29571533203125, "step": 3055 }, { "epoch": 0.41612200435729846, "grad_norm": 37.91757269585103, "learning_rate": 5.802650213470701e-07, "logits/chosen": 12.645380973815918, "logits/rejected": 13.346168518066406, "logps/chosen": -4.187013626098633, "logps/rejected": -4.304723739624023, "loss": 3.8156, "rewards/accuracies": 0.5, "rewards/chosen": -41.870140075683594, "rewards/margins": 1.1770973205566406, "rewards/rejected": -43.047237396240234, "step": 3056 }, { "epoch": 0.41625816993464054, "grad_norm": 37.96483209394132, "learning_rate": 5.800952638338031e-07, "logits/chosen": 11.711210250854492, "logits/rejected": 12.794912338256836, "logps/chosen": -3.617072582244873, "logps/rejected": -4.2120232582092285, "loss": 4.0418, "rewards/accuracies": 0.75, "rewards/chosen": -36.17072677612305, "rewards/margins": 5.949504852294922, "rewards/rejected": -42.12023162841797, "step": 3057 }, { "epoch": 0.41639433551198257, "grad_norm": 40.49702356649646, "learning_rate": 5.799254656265225e-07, "logits/chosen": 11.948028564453125, "logits/rejected": 11.851611137390137, "logps/chosen": -3.852937936782837, "logps/rejected": -4.0968499183654785, "loss": 3.9667, "rewards/accuracies": 0.75, "rewards/chosen": -38.529380798339844, "rewards/margins": 2.4391212463378906, "rewards/rejected": -40.968502044677734, "step": 3058 }, { "epoch": 0.4165305010893246, "grad_norm": 38.59905786994046, "learning_rate": 5.797556267635957e-07, "logits/chosen": 13.392792701721191, "logits/rejected": 14.222639083862305, "logps/chosen": -4.044855117797852, "logps/rejected": -4.251181125640869, "loss": 3.7685, "rewards/accuracies": 0.75, "rewards/chosen": -40.44854736328125, "rewards/margins": 2.063261032104492, "rewards/rejected": -42.511810302734375, "step": 3059 }, { "epoch": 0.4166666666666667, "grad_norm": 41.39844484656719, "learning_rate": 5.795857472833991e-07, "logits/chosen": 11.509119033813477, "logits/rejected": 11.65583610534668, "logps/chosen": -3.680696964263916, "logps/rejected": -3.717386484146118, "loss": 4.1768, "rewards/accuracies": 0.5, "rewards/chosen": -36.806968688964844, "rewards/margins": 0.3668961524963379, "rewards/rejected": -37.173866271972656, "step": 3060 }, { "epoch": 0.4168028322440087, "grad_norm": 37.09688719845634, "learning_rate": 5.794158272243185e-07, "logits/chosen": 12.822087287902832, "logits/rejected": 12.55759048461914, "logps/chosen": -4.022273063659668, "logps/rejected": -3.7083334922790527, "loss": 3.6599, "rewards/accuracies": 0.0, "rewards/chosen": -40.22273254394531, "rewards/margins": -3.139397621154785, "rewards/rejected": -37.083335876464844, "step": 3061 }, { "epoch": 0.41693899782135074, "grad_norm": 44.31310049525761, "learning_rate": 5.792458666247486e-07, "logits/chosen": 12.171656608581543, "logits/rejected": 12.460123062133789, "logps/chosen": -4.088510513305664, "logps/rejected": -4.064488887786865, "loss": 4.6048, "rewards/accuracies": 0.5, "rewards/chosen": -40.88510513305664, "rewards/margins": -0.24021530151367188, "rewards/rejected": -40.64488983154297, "step": 3062 }, { "epoch": 0.4170751633986928, "grad_norm": 36.18694965836731, "learning_rate": 5.790758655230935e-07, "logits/chosen": 11.476258277893066, "logits/rejected": 12.831153869628906, "logps/chosen": -3.7258448600769043, "logps/rejected": -4.234068870544434, "loss": 3.1056, "rewards/accuracies": 0.75, "rewards/chosen": -37.258445739746094, "rewards/margins": 5.082242965698242, "rewards/rejected": -42.34069061279297, "step": 3063 }, { "epoch": 0.41721132897603486, "grad_norm": 39.0168941802689, "learning_rate": 5.789058239577663e-07, "logits/chosen": 11.467581748962402, "logits/rejected": 12.30819320678711, "logps/chosen": -3.6768784523010254, "logps/rejected": -3.980104684829712, "loss": 4.0197, "rewards/accuracies": 1.0, "rewards/chosen": -36.76878356933594, "rewards/margins": 3.0322608947753906, "rewards/rejected": -39.80104446411133, "step": 3064 }, { "epoch": 0.4173474945533769, "grad_norm": 35.797665063924306, "learning_rate": 5.787357419671895e-07, "logits/chosen": 11.802301406860352, "logits/rejected": 13.30801773071289, "logps/chosen": -3.5966830253601074, "logps/rejected": -4.136909008026123, "loss": 3.7065, "rewards/accuracies": 1.0, "rewards/chosen": -35.966827392578125, "rewards/margins": 5.40225887298584, "rewards/rejected": -41.36908721923828, "step": 3065 }, { "epoch": 0.41748366013071897, "grad_norm": 45.5074969922113, "learning_rate": 5.785656195897942e-07, "logits/chosen": 12.142709732055664, "logits/rejected": 11.848848342895508, "logps/chosen": -3.686002731323242, "logps/rejected": -3.6772913932800293, "loss": 4.0501, "rewards/accuracies": 0.5, "rewards/chosen": -36.86002731323242, "rewards/margins": -0.0871129035949707, "rewards/rejected": -36.772911071777344, "step": 3066 }, { "epoch": 0.417619825708061, "grad_norm": 38.23789387729004, "learning_rate": 5.783954568640211e-07, "logits/chosen": 10.753557205200195, "logits/rejected": 11.37175178527832, "logps/chosen": -3.4459035396575928, "logps/rejected": -3.7317986488342285, "loss": 3.21, "rewards/accuracies": 0.5, "rewards/chosen": -34.45903396606445, "rewards/margins": 2.8589515686035156, "rewards/rejected": -37.31798553466797, "step": 3067 }, { "epoch": 0.417755991285403, "grad_norm": 35.7806242772673, "learning_rate": 5.782252538283199e-07, "logits/chosen": 12.143621444702148, "logits/rejected": 12.18479061126709, "logps/chosen": -4.030364036560059, "logps/rejected": -3.8520350456237793, "loss": 3.5302, "rewards/accuracies": 0.25, "rewards/chosen": -40.30364227294922, "rewards/margins": -1.7832908630371094, "rewards/rejected": -38.520347595214844, "step": 3068 }, { "epoch": 0.4178921568627451, "grad_norm": 36.33727464404223, "learning_rate": 5.780550105211494e-07, "logits/chosen": 12.364269256591797, "logits/rejected": 11.642581939697266, "logps/chosen": -3.831718921661377, "logps/rejected": -3.547168254852295, "loss": 4.2671, "rewards/accuracies": 0.0, "rewards/chosen": -38.31719207763672, "rewards/margins": -2.845508575439453, "rewards/rejected": -35.471683502197266, "step": 3069 }, { "epoch": 0.41802832244008714, "grad_norm": 38.09211025204371, "learning_rate": 5.778847269809775e-07, "logits/chosen": 12.46829605102539, "logits/rejected": 12.559405326843262, "logps/chosen": -3.92124342918396, "logps/rejected": -4.140068054199219, "loss": 3.7859, "rewards/accuracies": 0.75, "rewards/chosen": -39.212432861328125, "rewards/margins": 2.188243865966797, "rewards/rejected": -41.40068054199219, "step": 3070 }, { "epoch": 0.41816448801742917, "grad_norm": 37.956981613521904, "learning_rate": 5.777144032462811e-07, "logits/chosen": 12.286200523376465, "logits/rejected": 12.633331298828125, "logps/chosen": -3.697255849838257, "logps/rejected": -4.192605495452881, "loss": 4.2358, "rewards/accuracies": 0.75, "rewards/chosen": -36.97256088256836, "rewards/margins": 4.953495025634766, "rewards/rejected": -41.926055908203125, "step": 3071 }, { "epoch": 0.41830065359477125, "grad_norm": 37.697969075535745, "learning_rate": 5.775440393555463e-07, "logits/chosen": 11.828506469726562, "logits/rejected": 12.942147254943848, "logps/chosen": -3.556123971939087, "logps/rejected": -3.866588592529297, "loss": 3.971, "rewards/accuracies": 1.0, "rewards/chosen": -35.56123733520508, "rewards/margins": 3.104644775390625, "rewards/rejected": -38.66588592529297, "step": 3072 }, { "epoch": 0.4184368191721133, "grad_norm": 39.02310408521701, "learning_rate": 5.773736353472682e-07, "logits/chosen": 10.633273124694824, "logits/rejected": 12.099739074707031, "logps/chosen": -3.286298990249634, "logps/rejected": -3.8850245475769043, "loss": 4.0775, "rewards/accuracies": 0.75, "rewards/chosen": -32.86299133300781, "rewards/margins": 5.9872565269470215, "rewards/rejected": -38.85024642944336, "step": 3073 }, { "epoch": 0.4185729847494553, "grad_norm": 49.02488342243944, "learning_rate": 5.772031912599509e-07, "logits/chosen": 11.824926376342773, "logits/rejected": 11.465744018554688, "logps/chosen": -3.441260814666748, "logps/rejected": -3.535238265991211, "loss": 4.3034, "rewards/accuracies": 0.75, "rewards/chosen": -34.4126091003418, "rewards/margins": 0.9397745132446289, "rewards/rejected": -35.352386474609375, "step": 3074 }, { "epoch": 0.4187091503267974, "grad_norm": 38.70703880400412, "learning_rate": 5.770327071321078e-07, "logits/chosen": 12.120637893676758, "logits/rejected": 12.394302368164062, "logps/chosen": -3.8287465572357178, "logps/rejected": -4.190361499786377, "loss": 3.9829, "rewards/accuracies": 0.5, "rewards/chosen": -38.28746795654297, "rewards/margins": 3.6161489486694336, "rewards/rejected": -41.90361404418945, "step": 3075 }, { "epoch": 0.4188453159041394, "grad_norm": 41.682349004672794, "learning_rate": 5.768621830022613e-07, "logits/chosen": 11.411420822143555, "logits/rejected": 11.711082458496094, "logps/chosen": -3.5410072803497314, "logps/rejected": -3.7455437183380127, "loss": 4.0475, "rewards/accuracies": 0.75, "rewards/chosen": -35.410072326660156, "rewards/margins": 2.045365333557129, "rewards/rejected": -37.45543670654297, "step": 3076 }, { "epoch": 0.41898148148148145, "grad_norm": 43.42143456247324, "learning_rate": 5.766916189089425e-07, "logits/chosen": 12.042232513427734, "logits/rejected": 11.6242094039917, "logps/chosen": -3.859487533569336, "logps/rejected": -4.154213905334473, "loss": 4.0828, "rewards/accuracies": 0.75, "rewards/chosen": -38.594879150390625, "rewards/margins": 2.947265625, "rewards/rejected": -41.54214096069336, "step": 3077 }, { "epoch": 0.41911764705882354, "grad_norm": 40.17640358969308, "learning_rate": 5.765210148906918e-07, "logits/chosen": 12.0645751953125, "logits/rejected": 11.451311111450195, "logps/chosen": -3.5477333068847656, "logps/rejected": -3.7984561920166016, "loss": 4.2708, "rewards/accuracies": 0.75, "rewards/chosen": -35.47732925415039, "rewards/margins": 2.5072317123413086, "rewards/rejected": -37.98456573486328, "step": 3078 }, { "epoch": 0.41925381263616557, "grad_norm": 47.71994492092464, "learning_rate": 5.763503709860588e-07, "logits/chosen": 11.840360641479492, "logits/rejected": 11.807920455932617, "logps/chosen": -3.98407244682312, "logps/rejected": -4.22608757019043, "loss": 4.2869, "rewards/accuracies": 1.0, "rewards/chosen": -39.84072494506836, "rewards/margins": 2.4201502799987793, "rewards/rejected": -42.2608757019043, "step": 3079 }, { "epoch": 0.41938997821350765, "grad_norm": 57.83672864427387, "learning_rate": 5.761796872336016e-07, "logits/chosen": 11.39269733428955, "logits/rejected": 11.99753189086914, "logps/chosen": -3.651224136352539, "logps/rejected": -3.7806549072265625, "loss": 4.3282, "rewards/accuracies": 0.5, "rewards/chosen": -36.51224136352539, "rewards/margins": 1.2943062782287598, "rewards/rejected": -37.806549072265625, "step": 3080 }, { "epoch": 0.4195261437908497, "grad_norm": 38.26144399476582, "learning_rate": 5.760089636718878e-07, "logits/chosen": 11.661569595336914, "logits/rejected": 12.449951171875, "logps/chosen": -3.651470422744751, "logps/rejected": -4.086700439453125, "loss": 3.5673, "rewards/accuracies": 1.0, "rewards/chosen": -36.51470184326172, "rewards/margins": 4.3523054122924805, "rewards/rejected": -40.867008209228516, "step": 3081 }, { "epoch": 0.4196623093681917, "grad_norm": 38.240868848498266, "learning_rate": 5.758382003394938e-07, "logits/chosen": 12.061209678649902, "logits/rejected": 12.108498573303223, "logps/chosen": -3.7589352130889893, "logps/rejected": -3.61120867729187, "loss": 4.0565, "rewards/accuracies": 0.25, "rewards/chosen": -37.589351654052734, "rewards/margins": -1.4772682189941406, "rewards/rejected": -36.112083435058594, "step": 3082 }, { "epoch": 0.4197984749455338, "grad_norm": 36.16891222693901, "learning_rate": 5.756673972750049e-07, "logits/chosen": 12.357715606689453, "logits/rejected": 13.221220016479492, "logps/chosen": -3.837660312652588, "logps/rejected": -4.338088035583496, "loss": 3.797, "rewards/accuracies": 0.75, "rewards/chosen": -38.37660217285156, "rewards/margins": 5.004273414611816, "rewards/rejected": -43.38087463378906, "step": 3083 }, { "epoch": 0.4199346405228758, "grad_norm": 48.004690399774674, "learning_rate": 5.754965545170155e-07, "logits/chosen": 12.090176582336426, "logits/rejected": 13.141158103942871, "logps/chosen": -4.070225715637207, "logps/rejected": -4.4483137130737305, "loss": 4.1024, "rewards/accuracies": 0.75, "rewards/chosen": -40.7022590637207, "rewards/margins": 3.780880928039551, "rewards/rejected": -44.48313903808594, "step": 3084 }, { "epoch": 0.42007080610021785, "grad_norm": 43.666828399334776, "learning_rate": 5.75325672104129e-07, "logits/chosen": 12.733268737792969, "logits/rejected": 13.106929779052734, "logps/chosen": -3.5613491535186768, "logps/rejected": -4.051868438720703, "loss": 3.5435, "rewards/accuracies": 1.0, "rewards/chosen": -35.61349105834961, "rewards/margins": 4.90518856048584, "rewards/rejected": -40.518680572509766, "step": 3085 }, { "epoch": 0.42020697167755994, "grad_norm": 45.31019589719259, "learning_rate": 5.751547500749575e-07, "logits/chosen": 12.154019355773926, "logits/rejected": 12.491337776184082, "logps/chosen": -3.7915220260620117, "logps/rejected": -4.2081475257873535, "loss": 4.0523, "rewards/accuracies": 1.0, "rewards/chosen": -37.91522216796875, "rewards/margins": 4.166253089904785, "rewards/rejected": -42.08147430419922, "step": 3086 }, { "epoch": 0.42034313725490197, "grad_norm": 39.436066777915485, "learning_rate": 5.749837884681226e-07, "logits/chosen": 12.35940170288086, "logits/rejected": 12.793342590332031, "logps/chosen": -4.3347673416137695, "logps/rejected": -4.246380805969238, "loss": 4.2217, "rewards/accuracies": 0.25, "rewards/chosen": -43.34767532348633, "rewards/margins": -0.8838710784912109, "rewards/rejected": -42.46380615234375, "step": 3087 }, { "epoch": 0.420479302832244, "grad_norm": 44.733533941214894, "learning_rate": 5.74812787322254e-07, "logits/chosen": 11.986083984375, "logits/rejected": 12.22741985321045, "logps/chosen": -3.824920177459717, "logps/rejected": -3.665139675140381, "loss": 4.5031, "rewards/accuracies": 0.25, "rewards/chosen": -38.249202728271484, "rewards/margins": -1.5978050231933594, "rewards/rejected": -36.651397705078125, "step": 3088 }, { "epoch": 0.4206154684095861, "grad_norm": 40.56425994916374, "learning_rate": 5.746417466759913e-07, "logits/chosen": 12.527332305908203, "logits/rejected": 12.747430801391602, "logps/chosen": -4.078184127807617, "logps/rejected": -3.9666748046875, "loss": 4.6987, "rewards/accuracies": 0.0, "rewards/chosen": -40.781837463378906, "rewards/margins": -1.115091323852539, "rewards/rejected": -39.666748046875, "step": 3089 }, { "epoch": 0.4207516339869281, "grad_norm": 41.600730440846114, "learning_rate": 5.744706665679822e-07, "logits/chosen": 12.412979125976562, "logits/rejected": 12.584431648254395, "logps/chosen": -3.7407002449035645, "logps/rejected": -3.7576303482055664, "loss": 3.8149, "rewards/accuracies": 0.5, "rewards/chosen": -37.40700149536133, "rewards/margins": 0.16930294036865234, "rewards/rejected": -37.57630157470703, "step": 3090 }, { "epoch": 0.42088779956427014, "grad_norm": 38.10906069714481, "learning_rate": 5.742995470368838e-07, "logits/chosen": 12.365049362182617, "logits/rejected": 12.585173606872559, "logps/chosen": -3.8592991828918457, "logps/rejected": -4.141702175140381, "loss": 3.9182, "rewards/accuracies": 0.75, "rewards/chosen": -38.592994689941406, "rewards/margins": 2.8240270614624023, "rewards/rejected": -41.41701889038086, "step": 3091 }, { "epoch": 0.4210239651416122, "grad_norm": 42.906839827025145, "learning_rate": 5.74128388121362e-07, "logits/chosen": 12.12506103515625, "logits/rejected": 12.521600723266602, "logps/chosen": -4.02778434753418, "logps/rejected": -4.291440486907959, "loss": 3.8395, "rewards/accuracies": 1.0, "rewards/chosen": -40.27784729003906, "rewards/margins": 2.6365585327148438, "rewards/rejected": -42.914405822753906, "step": 3092 }, { "epoch": 0.42116013071895425, "grad_norm": 40.74285689414022, "learning_rate": 5.739571898600916e-07, "logits/chosen": 10.93268871307373, "logits/rejected": 11.622491836547852, "logps/chosen": -3.641502618789673, "logps/rejected": -3.698334217071533, "loss": 3.769, "rewards/accuracies": 0.75, "rewards/chosen": -36.41502380371094, "rewards/margins": 0.5683155059814453, "rewards/rejected": -36.983341217041016, "step": 3093 }, { "epoch": 0.4212962962962963, "grad_norm": 42.27133183336934, "learning_rate": 5.737859522917561e-07, "logits/chosen": 11.711580276489258, "logits/rejected": 11.182666778564453, "logps/chosen": -3.66690731048584, "logps/rejected": -3.7073941230773926, "loss": 4.1105, "rewards/accuracies": 0.75, "rewards/chosen": -36.66907501220703, "rewards/margins": 0.40486812591552734, "rewards/rejected": -37.07394027709961, "step": 3094 }, { "epoch": 0.42143246187363836, "grad_norm": 39.6662356257656, "learning_rate": 5.736146754550482e-07, "logits/chosen": 10.997373580932617, "logits/rejected": 11.195301055908203, "logps/chosen": -3.3318467140197754, "logps/rejected": -3.6100456714630127, "loss": 4.078, "rewards/accuracies": 0.5, "rewards/chosen": -33.31846618652344, "rewards/margins": 2.7819924354553223, "rewards/rejected": -36.10045623779297, "step": 3095 }, { "epoch": 0.4215686274509804, "grad_norm": 86.36659385969243, "learning_rate": 5.734433593886694e-07, "logits/chosen": 12.698573112487793, "logits/rejected": 12.706899642944336, "logps/chosen": -4.036070346832275, "logps/rejected": -4.09814977645874, "loss": 4.1989, "rewards/accuracies": 0.5, "rewards/chosen": -40.36070251464844, "rewards/margins": 0.620793342590332, "rewards/rejected": -40.98149871826172, "step": 3096 }, { "epoch": 0.4217047930283224, "grad_norm": 41.69973340974226, "learning_rate": 5.732720041313297e-07, "logits/chosen": 12.216909408569336, "logits/rejected": 12.28902816772461, "logps/chosen": -3.9806947708129883, "logps/rejected": -3.715052366256714, "loss": 4.2278, "rewards/accuracies": 0.75, "rewards/chosen": -39.80694580078125, "rewards/margins": -2.6564197540283203, "rewards/rejected": -37.1505241394043, "step": 3097 }, { "epoch": 0.4218409586056645, "grad_norm": 37.84712752152663, "learning_rate": 5.731006097217485e-07, "logits/chosen": 12.24038314819336, "logits/rejected": 11.750844955444336, "logps/chosen": -3.6102755069732666, "logps/rejected": -3.432436466217041, "loss": 3.7324, "rewards/accuracies": 0.25, "rewards/chosen": -36.10275650024414, "rewards/margins": -1.778390884399414, "rewards/rejected": -34.324363708496094, "step": 3098 }, { "epoch": 0.42197712418300654, "grad_norm": 41.083085353128176, "learning_rate": 5.729291761986535e-07, "logits/chosen": 12.06904411315918, "logits/rejected": 12.296491622924805, "logps/chosen": -3.426621913909912, "logps/rejected": -3.5198049545288086, "loss": 3.6315, "rewards/accuracies": 0.5, "rewards/chosen": -34.26622009277344, "rewards/margins": 0.9318313598632812, "rewards/rejected": -35.19805145263672, "step": 3099 }, { "epoch": 0.42211328976034856, "grad_norm": 39.14873284812296, "learning_rate": 5.727577036007818e-07, "logits/chosen": 11.59874153137207, "logits/rejected": 12.798004150390625, "logps/chosen": -3.8059816360473633, "logps/rejected": -3.992363214492798, "loss": 3.5703, "rewards/accuracies": 0.75, "rewards/chosen": -38.05982208251953, "rewards/margins": 1.8638114929199219, "rewards/rejected": -39.92362976074219, "step": 3100 }, { "epoch": 0.42224945533769065, "grad_norm": 38.696635323671146, "learning_rate": 5.725861919668789e-07, "logits/chosen": 11.462247848510742, "logits/rejected": 11.134176254272461, "logps/chosen": -3.470365285873413, "logps/rejected": -3.5811972618103027, "loss": 4.3871, "rewards/accuracies": 0.5, "rewards/chosen": -34.70365524291992, "rewards/margins": 1.10831880569458, "rewards/rejected": -35.811973571777344, "step": 3101 }, { "epoch": 0.4223856209150327, "grad_norm": 38.49861165820583, "learning_rate": 5.724146413356994e-07, "logits/chosen": 12.306413650512695, "logits/rejected": 12.50238037109375, "logps/chosen": -3.6050314903259277, "logps/rejected": -3.7292795181274414, "loss": 4.3133, "rewards/accuracies": 0.25, "rewards/chosen": -36.050315856933594, "rewards/margins": 1.242478847503662, "rewards/rejected": -37.29279327392578, "step": 3102 }, { "epoch": 0.4225217864923747, "grad_norm": 46.687735971005104, "learning_rate": 5.722430517460064e-07, "logits/chosen": 11.374545097351074, "logits/rejected": 12.253162384033203, "logps/chosen": -3.715664863586426, "logps/rejected": -4.029405117034912, "loss": 3.2745, "rewards/accuracies": 0.75, "rewards/chosen": -37.156646728515625, "rewards/margins": 3.1374025344848633, "rewards/rejected": -40.29405212402344, "step": 3103 }, { "epoch": 0.4226579520697168, "grad_norm": 42.94372035689253, "learning_rate": 5.720714232365721e-07, "logits/chosen": 12.64082145690918, "logits/rejected": 12.980429649353027, "logps/chosen": -3.7995550632476807, "logps/rejected": -4.001802921295166, "loss": 4.438, "rewards/accuracies": 0.5, "rewards/chosen": -37.995548248291016, "rewards/margins": 2.0224790573120117, "rewards/rejected": -40.018028259277344, "step": 3104 }, { "epoch": 0.4227941176470588, "grad_norm": 39.06158036153856, "learning_rate": 5.718997558461774e-07, "logits/chosen": 11.63559627532959, "logits/rejected": 11.934978485107422, "logps/chosen": -3.937347650527954, "logps/rejected": -4.373166084289551, "loss": 3.8604, "rewards/accuracies": 0.75, "rewards/chosen": -39.37347412109375, "rewards/margins": 4.358181953430176, "rewards/rejected": -43.73165512084961, "step": 3105 }, { "epoch": 0.42293028322440085, "grad_norm": 40.33480043236246, "learning_rate": 5.717280496136119e-07, "logits/chosen": 12.255440711975098, "logits/rejected": 11.659651756286621, "logps/chosen": -3.819037675857544, "logps/rejected": -3.820863962173462, "loss": 4.12, "rewards/accuracies": 0.5, "rewards/chosen": -38.19037628173828, "rewards/margins": 0.018263816833496094, "rewards/rejected": -38.208641052246094, "step": 3106 }, { "epoch": 0.42306644880174293, "grad_norm": 43.831182294896436, "learning_rate": 5.71556304577674e-07, "logits/chosen": 11.941883087158203, "logits/rejected": 12.073867797851562, "logps/chosen": -3.8116037845611572, "logps/rejected": -3.5008633136749268, "loss": 4.1496, "rewards/accuracies": 0.25, "rewards/chosen": -38.11603546142578, "rewards/margins": -3.107402801513672, "rewards/rejected": -35.00863265991211, "step": 3107 }, { "epoch": 0.42320261437908496, "grad_norm": 38.346591740460454, "learning_rate": 5.713845207771711e-07, "logits/chosen": 11.383456230163574, "logits/rejected": 11.525535583496094, "logps/chosen": -3.885784864425659, "logps/rejected": -4.058061599731445, "loss": 3.9283, "rewards/accuracies": 0.75, "rewards/chosen": -38.85784912109375, "rewards/margins": 1.722768783569336, "rewards/rejected": -40.58061981201172, "step": 3108 }, { "epoch": 0.423338779956427, "grad_norm": 38.40183522361937, "learning_rate": 5.712126982509189e-07, "logits/chosen": 12.185680389404297, "logits/rejected": 11.997518539428711, "logps/chosen": -3.645256757736206, "logps/rejected": -3.7440121173858643, "loss": 4.2211, "rewards/accuracies": 0.75, "rewards/chosen": -36.45256805419922, "rewards/margins": 0.9875540733337402, "rewards/rejected": -37.440120697021484, "step": 3109 }, { "epoch": 0.4234749455337691, "grad_norm": 40.16943084426009, "learning_rate": 5.710408370377424e-07, "logits/chosen": 11.39767074584961, "logits/rejected": 11.374432563781738, "logps/chosen": -3.8743600845336914, "logps/rejected": -4.105737209320068, "loss": 3.721, "rewards/accuracies": 0.75, "rewards/chosen": -38.74360275268555, "rewards/margins": 2.3137712478637695, "rewards/rejected": -41.057373046875, "step": 3110 }, { "epoch": 0.4236111111111111, "grad_norm": 36.3421451371603, "learning_rate": 5.70868937176475e-07, "logits/chosen": 11.562369346618652, "logits/rejected": 12.158670425415039, "logps/chosen": -3.268378973007202, "logps/rejected": -3.895320415496826, "loss": 3.7996, "rewards/accuracies": 1.0, "rewards/chosen": -32.68379211425781, "rewards/margins": 6.269415855407715, "rewards/rejected": -38.95320510864258, "step": 3111 }, { "epoch": 0.42374727668845313, "grad_norm": 41.99052137247572, "learning_rate": 5.706969987059587e-07, "logits/chosen": 10.224414825439453, "logits/rejected": 12.131175994873047, "logps/chosen": -3.394834518432617, "logps/rejected": -3.841571807861328, "loss": 4.1508, "rewards/accuracies": 0.75, "rewards/chosen": -33.94834518432617, "rewards/margins": 4.467373847961426, "rewards/rejected": -38.41571807861328, "step": 3112 }, { "epoch": 0.4238834422657952, "grad_norm": 40.70208214974222, "learning_rate": 5.705250216650446e-07, "logits/chosen": 11.175348281860352, "logits/rejected": 11.292112350463867, "logps/chosen": -3.808016300201416, "logps/rejected": -3.997020721435547, "loss": 3.8954, "rewards/accuracies": 1.0, "rewards/chosen": -38.080162048339844, "rewards/margins": 1.8900432586669922, "rewards/rejected": -39.97020721435547, "step": 3113 }, { "epoch": 0.42401960784313725, "grad_norm": 41.90167160137692, "learning_rate": 5.703530060925922e-07, "logits/chosen": 11.51321792602539, "logits/rejected": 12.357267379760742, "logps/chosen": -3.61655855178833, "logps/rejected": -4.147712707519531, "loss": 4.6728, "rewards/accuracies": 0.75, "rewards/chosen": -36.165584564208984, "rewards/margins": 5.31154727935791, "rewards/rejected": -41.47713088989258, "step": 3114 }, { "epoch": 0.4241557734204793, "grad_norm": 44.765524784974815, "learning_rate": 5.7018095202747e-07, "logits/chosen": 12.019693374633789, "logits/rejected": 12.874530792236328, "logps/chosen": -4.156140327453613, "logps/rejected": -4.179282188415527, "loss": 4.1522, "rewards/accuracies": 0.5, "rewards/chosen": -41.561405181884766, "rewards/margins": 0.2314167022705078, "rewards/rejected": -41.792823791503906, "step": 3115 }, { "epoch": 0.42429193899782136, "grad_norm": 40.15595263004808, "learning_rate": 5.70008859508555e-07, "logits/chosen": 10.696731567382812, "logits/rejected": 12.117056846618652, "logps/chosen": -3.7057080268859863, "logps/rejected": -3.844412088394165, "loss": 4.1663, "rewards/accuracies": 0.75, "rewards/chosen": -37.05707931518555, "rewards/margins": 1.3870391845703125, "rewards/rejected": -38.444122314453125, "step": 3116 }, { "epoch": 0.4244281045751634, "grad_norm": 40.43644439061638, "learning_rate": 5.698367285747328e-07, "logits/chosen": 11.268644332885742, "logits/rejected": 13.019780158996582, "logps/chosen": -3.8585896492004395, "logps/rejected": -4.095440864562988, "loss": 3.6263, "rewards/accuracies": 0.75, "rewards/chosen": -38.585899353027344, "rewards/margins": 2.3685121536254883, "rewards/rejected": -40.954410552978516, "step": 3117 }, { "epoch": 0.4245642701525055, "grad_norm": 44.08463482332553, "learning_rate": 5.696645592648979e-07, "logits/chosen": 11.258289337158203, "logits/rejected": 12.129728317260742, "logps/chosen": -3.80403470993042, "logps/rejected": -4.072978973388672, "loss": 4.2804, "rewards/accuracies": 1.0, "rewards/chosen": -38.04034423828125, "rewards/margins": 2.6894445419311523, "rewards/rejected": -40.72978973388672, "step": 3118 }, { "epoch": 0.4247004357298475, "grad_norm": 40.415559679044385, "learning_rate": 5.694923516179534e-07, "logits/chosen": 11.157215118408203, "logits/rejected": 11.781304359436035, "logps/chosen": -3.844326972961426, "logps/rejected": -4.155804634094238, "loss": 3.4763, "rewards/accuracies": 0.75, "rewards/chosen": -38.44327163696289, "rewards/margins": 3.114774703979492, "rewards/rejected": -41.55804443359375, "step": 3119 }, { "epoch": 0.42483660130718953, "grad_norm": 55.272262465515574, "learning_rate": 5.693201056728111e-07, "logits/chosen": 12.387372016906738, "logits/rejected": 12.27186107635498, "logps/chosen": -4.062991619110107, "logps/rejected": -3.9536190032958984, "loss": 3.88, "rewards/accuracies": 0.25, "rewards/chosen": -40.629913330078125, "rewards/margins": -1.093724250793457, "rewards/rejected": -39.536190032958984, "step": 3120 }, { "epoch": 0.4249727668845316, "grad_norm": 42.88700324441656, "learning_rate": 5.691478214683912e-07, "logits/chosen": 12.134913444519043, "logits/rejected": 11.9365234375, "logps/chosen": -3.9567618370056152, "logps/rejected": -3.836759090423584, "loss": 4.228, "rewards/accuracies": 0.5, "rewards/chosen": -39.56761932373047, "rewards/margins": -1.200026512145996, "rewards/rejected": -38.367591857910156, "step": 3121 }, { "epoch": 0.42510893246187365, "grad_norm": 36.48526787924526, "learning_rate": 5.689754990436229e-07, "logits/chosen": 11.716650009155273, "logits/rejected": 12.238007545471191, "logps/chosen": -3.438253402709961, "logps/rejected": -3.682389736175537, "loss": 3.8351, "rewards/accuracies": 1.0, "rewards/chosen": -34.38253402709961, "rewards/margins": 2.441361904144287, "rewards/rejected": -36.82389831542969, "step": 3122 }, { "epoch": 0.4252450980392157, "grad_norm": 39.958945704333566, "learning_rate": 5.688031384374437e-07, "logits/chosen": 10.947786331176758, "logits/rejected": 11.413291931152344, "logps/chosen": -3.549025058746338, "logps/rejected": -3.8086447715759277, "loss": 4.3014, "rewards/accuracies": 0.75, "rewards/chosen": -35.49024963378906, "rewards/margins": 2.596198081970215, "rewards/rejected": -38.086448669433594, "step": 3123 }, { "epoch": 0.42538126361655776, "grad_norm": 43.14544079213811, "learning_rate": 5.686307396888002e-07, "logits/chosen": 11.413030624389648, "logits/rejected": 11.905428886413574, "logps/chosen": -3.7993106842041016, "logps/rejected": -3.9850077629089355, "loss": 4.5665, "rewards/accuracies": 0.75, "rewards/chosen": -37.993106842041016, "rewards/margins": 1.8569707870483398, "rewards/rejected": -39.85007858276367, "step": 3124 }, { "epoch": 0.4255174291938998, "grad_norm": 41.49816242686203, "learning_rate": 5.68458302836647e-07, "logits/chosen": 11.293702125549316, "logits/rejected": 12.303756713867188, "logps/chosen": -3.68422532081604, "logps/rejected": -4.070578575134277, "loss": 4.0539, "rewards/accuracies": 1.0, "rewards/chosen": -36.842254638671875, "rewards/margins": 3.863530158996582, "rewards/rejected": -40.70578384399414, "step": 3125 }, { "epoch": 0.4256535947712418, "grad_norm": 38.76198012750256, "learning_rate": 5.682858279199478e-07, "logits/chosen": 11.89757251739502, "logits/rejected": 12.019577980041504, "logps/chosen": -3.8435301780700684, "logps/rejected": -3.9576520919799805, "loss": 4.3613, "rewards/accuracies": 0.5, "rewards/chosen": -38.435302734375, "rewards/margins": 1.1412181854248047, "rewards/rejected": -39.57652282714844, "step": 3126 }, { "epoch": 0.4257897603485839, "grad_norm": 41.84645027372333, "learning_rate": 5.681133149776748e-07, "logits/chosen": 12.386510848999023, "logits/rejected": 12.244189262390137, "logps/chosen": -3.874221086502075, "logps/rejected": -3.9240808486938477, "loss": 3.7777, "rewards/accuracies": 0.5, "rewards/chosen": -38.74221420288086, "rewards/margins": 0.4985980987548828, "rewards/rejected": -39.24081039428711, "step": 3127 }, { "epoch": 0.42592592592592593, "grad_norm": 37.47520905210058, "learning_rate": 5.679407640488086e-07, "logits/chosen": 12.656503677368164, "logits/rejected": 12.303586959838867, "logps/chosen": -3.89646577835083, "logps/rejected": -4.019355773925781, "loss": 3.9624, "rewards/accuracies": 0.5, "rewards/chosen": -38.964656829833984, "rewards/margins": 1.2288970947265625, "rewards/rejected": -40.19355392456055, "step": 3128 }, { "epoch": 0.42606209150326796, "grad_norm": 39.886488282185674, "learning_rate": 5.677681751723387e-07, "logits/chosen": 11.865591049194336, "logits/rejected": 12.185395240783691, "logps/chosen": -3.7984354496002197, "logps/rejected": -3.8907992839813232, "loss": 4.1775, "rewards/accuracies": 0.75, "rewards/chosen": -37.98435592651367, "rewards/margins": 0.9236383438110352, "rewards/rejected": -38.90799331665039, "step": 3129 }, { "epoch": 0.42619825708061004, "grad_norm": 38.62291452852921, "learning_rate": 5.675955483872627e-07, "logits/chosen": 11.960807800292969, "logits/rejected": 12.224054336547852, "logps/chosen": -4.077714920043945, "logps/rejected": -4.365161895751953, "loss": 4.2713, "rewards/accuracies": 0.5, "rewards/chosen": -40.77715301513672, "rewards/margins": 2.8744659423828125, "rewards/rejected": -43.651615142822266, "step": 3130 }, { "epoch": 0.4263344226579521, "grad_norm": 37.02813059283484, "learning_rate": 5.674228837325872e-07, "logits/chosen": 12.770109176635742, "logits/rejected": 12.389015197753906, "logps/chosen": -3.8167238235473633, "logps/rejected": -3.84318208694458, "loss": 4.2325, "rewards/accuracies": 0.5, "rewards/chosen": -38.167236328125, "rewards/margins": 0.26458263397216797, "rewards/rejected": -38.43182373046875, "step": 3131 }, { "epoch": 0.4264705882352941, "grad_norm": 49.89109307416174, "learning_rate": 5.672501812473272e-07, "logits/chosen": 11.454483032226562, "logits/rejected": 12.909341812133789, "logps/chosen": -3.874398946762085, "logps/rejected": -4.144242286682129, "loss": 4.3815, "rewards/accuracies": 0.75, "rewards/chosen": -38.743988037109375, "rewards/margins": 2.6984357833862305, "rewards/rejected": -41.44242477416992, "step": 3132 }, { "epoch": 0.4266067538126362, "grad_norm": 37.6069499608342, "learning_rate": 5.670774409705062e-07, "logits/chosen": 13.018607139587402, "logits/rejected": 12.618267059326172, "logps/chosen": -3.7605791091918945, "logps/rejected": -4.134089946746826, "loss": 4.0363, "rewards/accuracies": 0.75, "rewards/chosen": -37.60578918457031, "rewards/margins": 3.735107421875, "rewards/rejected": -41.34089660644531, "step": 3133 }, { "epoch": 0.4267429193899782, "grad_norm": 42.89057107272854, "learning_rate": 5.669046629411563e-07, "logits/chosen": 11.799982070922852, "logits/rejected": 11.47799301147461, "logps/chosen": -3.8280997276306152, "logps/rejected": -3.8119866847991943, "loss": 4.0035, "rewards/accuracies": 0.5, "rewards/chosen": -38.28099822998047, "rewards/margins": -0.1611309051513672, "rewards/rejected": -38.11986541748047, "step": 3134 }, { "epoch": 0.42687908496732024, "grad_norm": 42.97705451678273, "learning_rate": 5.667318471983183e-07, "logits/chosen": 11.230417251586914, "logits/rejected": 11.8739013671875, "logps/chosen": -3.796410083770752, "logps/rejected": -3.975161075592041, "loss": 3.9453, "rewards/accuracies": 0.75, "rewards/chosen": -37.9640998840332, "rewards/margins": 1.787510871887207, "rewards/rejected": -39.751609802246094, "step": 3135 }, { "epoch": 0.42701525054466233, "grad_norm": 40.220261229167335, "learning_rate": 5.665589937810412e-07, "logits/chosen": 11.36567497253418, "logits/rejected": 12.188365936279297, "logps/chosen": -3.7657394409179688, "logps/rejected": -3.9952821731567383, "loss": 4.0149, "rewards/accuracies": 0.5, "rewards/chosen": -37.65739440917969, "rewards/margins": 2.295426368713379, "rewards/rejected": -39.95281982421875, "step": 3136 }, { "epoch": 0.42715141612200436, "grad_norm": 46.286442009786924, "learning_rate": 5.663861027283826e-07, "logits/chosen": 11.659025192260742, "logits/rejected": 12.377462387084961, "logps/chosen": -3.8602230548858643, "logps/rejected": -4.302262306213379, "loss": 4.3219, "rewards/accuracies": 1.0, "rewards/chosen": -38.602230072021484, "rewards/margins": 4.420395851135254, "rewards/rejected": -43.02262496948242, "step": 3137 }, { "epoch": 0.4272875816993464, "grad_norm": 40.310872496640506, "learning_rate": 5.662131740794086e-07, "logits/chosen": 12.426773071289062, "logits/rejected": 12.043742179870605, "logps/chosen": -4.063864231109619, "logps/rejected": -3.9616858959198, "loss": 4.3249, "rewards/accuracies": 0.25, "rewards/chosen": -40.638641357421875, "rewards/margins": -1.0217809677124023, "rewards/rejected": -39.616859436035156, "step": 3138 }, { "epoch": 0.42742374727668847, "grad_norm": 38.2251463893645, "learning_rate": 5.660402078731941e-07, "logits/chosen": 11.635553359985352, "logits/rejected": 11.641992568969727, "logps/chosen": -3.7086479663848877, "logps/rejected": -3.8577659130096436, "loss": 4.3221, "rewards/accuracies": 0.75, "rewards/chosen": -37.08647918701172, "rewards/margins": 1.4911775588989258, "rewards/rejected": -38.577659606933594, "step": 3139 }, { "epoch": 0.4275599128540305, "grad_norm": 40.70186619483433, "learning_rate": 5.658672041488222e-07, "logits/chosen": 12.697978973388672, "logits/rejected": 13.136749267578125, "logps/chosen": -4.058009147644043, "logps/rejected": -4.274540424346924, "loss": 4.4127, "rewards/accuracies": 0.75, "rewards/chosen": -40.58009338378906, "rewards/margins": 2.1653127670288086, "rewards/rejected": -42.74540710449219, "step": 3140 }, { "epoch": 0.42769607843137253, "grad_norm": 36.61737385636438, "learning_rate": 5.656941629453843e-07, "logits/chosen": 11.496110916137695, "logits/rejected": 11.83479118347168, "logps/chosen": -3.894925832748413, "logps/rejected": -4.034534931182861, "loss": 4.1528, "rewards/accuracies": 0.75, "rewards/chosen": -38.949256896972656, "rewards/margins": 1.3960905075073242, "rewards/rejected": -40.3453483581543, "step": 3141 }, { "epoch": 0.4278322440087146, "grad_norm": 40.57723792274433, "learning_rate": 5.655210843019807e-07, "logits/chosen": 11.144014358520508, "logits/rejected": 11.8974027633667, "logps/chosen": -3.8367297649383545, "logps/rejected": -4.004003524780273, "loss": 3.3916, "rewards/accuracies": 0.75, "rewards/chosen": -38.36729431152344, "rewards/margins": 1.6727371215820312, "rewards/rejected": -40.04003143310547, "step": 3142 }, { "epoch": 0.42796840958605664, "grad_norm": 37.33393270072115, "learning_rate": 5.6534796825772e-07, "logits/chosen": 11.372087478637695, "logits/rejected": 11.710563659667969, "logps/chosen": -3.805360794067383, "logps/rejected": -4.1116557121276855, "loss": 4.1543, "rewards/accuracies": 1.0, "rewards/chosen": -38.05360794067383, "rewards/margins": 3.06295108795166, "rewards/rejected": -41.11656188964844, "step": 3143 }, { "epoch": 0.42810457516339867, "grad_norm": 40.65798213957327, "learning_rate": 5.65174814851719e-07, "logits/chosen": 12.292753219604492, "logits/rejected": 12.750198364257812, "logps/chosen": -3.656838893890381, "logps/rejected": -4.128571510314941, "loss": 4.2098, "rewards/accuracies": 1.0, "rewards/chosen": -36.568389892578125, "rewards/margins": 4.717327117919922, "rewards/rejected": -41.28571319580078, "step": 3144 }, { "epoch": 0.42824074074074076, "grad_norm": 47.88068144501903, "learning_rate": 5.650016241231032e-07, "logits/chosen": 11.963532447814941, "logits/rejected": 12.39923095703125, "logps/chosen": -3.9695003032684326, "logps/rejected": -4.110554218292236, "loss": 3.9375, "rewards/accuracies": 0.5, "rewards/chosen": -39.69499969482422, "rewards/margins": 1.4105396270751953, "rewards/rejected": -41.10554504394531, "step": 3145 }, { "epoch": 0.4283769063180828, "grad_norm": 40.197662258526336, "learning_rate": 5.648283961110065e-07, "logits/chosen": 11.818511962890625, "logits/rejected": 12.786093711853027, "logps/chosen": -3.9628031253814697, "logps/rejected": -4.353971481323242, "loss": 4.5609, "rewards/accuracies": 1.0, "rewards/chosen": -39.628028869628906, "rewards/margins": 3.9116811752319336, "rewards/rejected": -43.539710998535156, "step": 3146 }, { "epoch": 0.4285130718954248, "grad_norm": 58.68702502124539, "learning_rate": 5.646551308545714e-07, "logits/chosen": 12.542749404907227, "logits/rejected": 12.337149620056152, "logps/chosen": -4.129722595214844, "logps/rejected": -4.346099853515625, "loss": 4.0609, "rewards/accuracies": 0.75, "rewards/chosen": -41.29722595214844, "rewards/margins": 2.1637744903564453, "rewards/rejected": -43.461002349853516, "step": 3147 }, { "epoch": 0.4286492374727669, "grad_norm": 41.80124227046689, "learning_rate": 5.644818283929482e-07, "logits/chosen": 12.249340057373047, "logits/rejected": 13.008942604064941, "logps/chosen": -3.5954883098602295, "logps/rejected": -3.810661554336548, "loss": 3.8386, "rewards/accuracies": 0.5, "rewards/chosen": -35.95488357543945, "rewards/margins": 2.151732921600342, "rewards/rejected": -38.10661697387695, "step": 3148 }, { "epoch": 0.4287854030501089, "grad_norm": 48.7784214266746, "learning_rate": 5.643084887652964e-07, "logits/chosen": 11.988858222961426, "logits/rejected": 11.827491760253906, "logps/chosen": -3.677943229675293, "logps/rejected": -4.069235801696777, "loss": 4.2441, "rewards/accuracies": 0.75, "rewards/chosen": -36.77943420410156, "rewards/margins": 3.9129281044006348, "rewards/rejected": -40.692359924316406, "step": 3149 }, { "epoch": 0.42892156862745096, "grad_norm": 49.65682701001798, "learning_rate": 5.641351120107833e-07, "logits/chosen": 11.756301879882812, "logits/rejected": 11.620145797729492, "logps/chosen": -3.7057926654815674, "logps/rejected": -3.728545665740967, "loss": 3.9085, "rewards/accuracies": 0.5, "rewards/chosen": -37.057926177978516, "rewards/margins": 0.22752952575683594, "rewards/rejected": -37.28545379638672, "step": 3150 }, { "epoch": 0.42905773420479304, "grad_norm": 39.7349858659888, "learning_rate": 5.639616981685849e-07, "logits/chosen": 12.217755317687988, "logits/rejected": 12.831567764282227, "logps/chosen": -3.676248550415039, "logps/rejected": -4.18857479095459, "loss": 3.9568, "rewards/accuracies": 0.75, "rewards/chosen": -36.762489318847656, "rewards/margins": 5.123261451721191, "rewards/rejected": -41.88574981689453, "step": 3151 }, { "epoch": 0.42919389978213507, "grad_norm": 50.56980614886758, "learning_rate": 5.637882472778855e-07, "logits/chosen": 11.465376853942871, "logits/rejected": 11.939348220825195, "logps/chosen": -3.847430467605591, "logps/rejected": -4.108889579772949, "loss": 3.5524, "rewards/accuracies": 1.0, "rewards/chosen": -38.47430419921875, "rewards/margins": 2.6145944595336914, "rewards/rejected": -41.088897705078125, "step": 3152 }, { "epoch": 0.4293300653594771, "grad_norm": 46.20500065505677, "learning_rate": 5.636147593778778e-07, "logits/chosen": 12.078116416931152, "logits/rejected": 12.150960922241211, "logps/chosen": -3.856053352355957, "logps/rejected": -3.8446478843688965, "loss": 4.4013, "rewards/accuracies": 0.25, "rewards/chosen": -38.56053161621094, "rewards/margins": -0.11405324935913086, "rewards/rejected": -38.44647979736328, "step": 3153 }, { "epoch": 0.4294662309368192, "grad_norm": 41.59612374978635, "learning_rate": 5.634412345077626e-07, "logits/chosen": 11.934480667114258, "logits/rejected": 12.597431182861328, "logps/chosen": -4.154121398925781, "logps/rejected": -3.815877676010132, "loss": 3.2606, "rewards/accuracies": 0.25, "rewards/chosen": -41.54121398925781, "rewards/margins": -3.3824377059936523, "rewards/rejected": -38.158775329589844, "step": 3154 }, { "epoch": 0.4296023965141612, "grad_norm": 38.539089500155235, "learning_rate": 5.632676727067496e-07, "logits/chosen": 11.697608947753906, "logits/rejected": 12.64629077911377, "logps/chosen": -3.9340333938598633, "logps/rejected": -4.448782920837402, "loss": 3.3191, "rewards/accuracies": 1.0, "rewards/chosen": -39.34033203125, "rewards/margins": 5.147500038146973, "rewards/rejected": -44.487831115722656, "step": 3155 }, { "epoch": 0.4297385620915033, "grad_norm": 39.17514103971655, "learning_rate": 5.630940740140563e-07, "logits/chosen": 12.906379699707031, "logits/rejected": 12.025999069213867, "logps/chosen": -3.9345219135284424, "logps/rejected": -3.765906810760498, "loss": 3.7657, "rewards/accuracies": 0.0, "rewards/chosen": -39.345218658447266, "rewards/margins": -1.686152458190918, "rewards/rejected": -37.6590690612793, "step": 3156 }, { "epoch": 0.4298747276688453, "grad_norm": 38.41747115709624, "learning_rate": 5.629204384689088e-07, "logits/chosen": 11.956939697265625, "logits/rejected": 11.635881423950195, "logps/chosen": -4.198006629943848, "logps/rejected": -3.9726619720458984, "loss": 3.7669, "rewards/accuracies": 0.5, "rewards/chosen": -41.980064392089844, "rewards/margins": -2.2534427642822266, "rewards/rejected": -39.726619720458984, "step": 3157 }, { "epoch": 0.43001089324618735, "grad_norm": 45.87023605101941, "learning_rate": 5.627467661105416e-07, "logits/chosen": 12.25685977935791, "logits/rejected": 12.709155082702637, "logps/chosen": -4.081093788146973, "logps/rejected": -3.7403223514556885, "loss": 4.1728, "rewards/accuracies": 0.25, "rewards/chosen": -40.810935974121094, "rewards/margins": -3.407711982727051, "rewards/rejected": -37.40322494506836, "step": 3158 }, { "epoch": 0.43014705882352944, "grad_norm": 44.08687834656198, "learning_rate": 5.625730569781973e-07, "logits/chosen": 12.310195922851562, "logits/rejected": 12.91861343383789, "logps/chosen": -4.104334354400635, "logps/rejected": -4.344840049743652, "loss": 4.225, "rewards/accuracies": 0.75, "rewards/chosen": -41.04334259033203, "rewards/margins": 2.4050559997558594, "rewards/rejected": -43.44839859008789, "step": 3159 }, { "epoch": 0.43028322440087147, "grad_norm": 49.2235193813853, "learning_rate": 5.623993111111267e-07, "logits/chosen": 12.2904634475708, "logits/rejected": 13.619648933410645, "logps/chosen": -3.690838098526001, "logps/rejected": -4.259744167327881, "loss": 4.5127, "rewards/accuracies": 0.75, "rewards/chosen": -36.90837860107422, "rewards/margins": 5.689062118530273, "rewards/rejected": -42.597442626953125, "step": 3160 }, { "epoch": 0.4304193899782135, "grad_norm": 41.974045156857564, "learning_rate": 5.622255285485897e-07, "logits/chosen": 11.468992233276367, "logits/rejected": 11.96429443359375, "logps/chosen": -3.7813358306884766, "logps/rejected": -3.875288963317871, "loss": 4.0587, "rewards/accuracies": 0.5, "rewards/chosen": -37.813358306884766, "rewards/margins": 0.9395318031311035, "rewards/rejected": -38.752891540527344, "step": 3161 }, { "epoch": 0.4305555555555556, "grad_norm": 43.08165861114111, "learning_rate": 5.620517093298533e-07, "logits/chosen": 11.816537857055664, "logits/rejected": 11.77784252166748, "logps/chosen": -3.705648183822632, "logps/rejected": -3.6025593280792236, "loss": 3.7371, "rewards/accuracies": 0.5, "rewards/chosen": -37.056480407714844, "rewards/margins": -1.0308866500854492, "rewards/rejected": -36.025596618652344, "step": 3162 }, { "epoch": 0.4306917211328976, "grad_norm": 40.903544260915545, "learning_rate": 5.618778534941938e-07, "logits/chosen": 11.678600311279297, "logits/rejected": 12.319208145141602, "logps/chosen": -3.647169828414917, "logps/rejected": -3.736758232116699, "loss": 3.8278, "rewards/accuracies": 0.5, "rewards/chosen": -36.47169494628906, "rewards/margins": 0.8958864212036133, "rewards/rejected": -37.367584228515625, "step": 3163 }, { "epoch": 0.43082788671023964, "grad_norm": 41.059085390155055, "learning_rate": 5.617039610808952e-07, "logits/chosen": 11.436870574951172, "logits/rejected": 12.472452163696289, "logps/chosen": -3.2988574504852295, "logps/rejected": -4.019051551818848, "loss": 3.8711, "rewards/accuracies": 1.0, "rewards/chosen": -32.98857116699219, "rewards/margins": 7.201947212219238, "rewards/rejected": -40.190521240234375, "step": 3164 }, { "epoch": 0.4309640522875817, "grad_norm": 38.90956286706887, "learning_rate": 5.615300321292499e-07, "logits/chosen": 12.619927406311035, "logits/rejected": 12.766603469848633, "logps/chosen": -4.087168216705322, "logps/rejected": -3.9870858192443848, "loss": 3.7492, "rewards/accuracies": 0.25, "rewards/chosen": -40.871681213378906, "rewards/margins": -1.0008230209350586, "rewards/rejected": -39.87085723876953, "step": 3165 }, { "epoch": 0.43110021786492375, "grad_norm": 38.32849743324476, "learning_rate": 5.613560666785585e-07, "logits/chosen": 12.173542022705078, "logits/rejected": 13.325947761535645, "logps/chosen": -3.596982955932617, "logps/rejected": -4.504859447479248, "loss": 4.207, "rewards/accuracies": 1.0, "rewards/chosen": -35.96983337402344, "rewards/margins": 9.078763008117676, "rewards/rejected": -45.04859161376953, "step": 3166 }, { "epoch": 0.4312363834422658, "grad_norm": 45.035578165594444, "learning_rate": 5.611820647681302e-07, "logits/chosen": 11.722101211547852, "logits/rejected": 13.247137069702148, "logps/chosen": -3.8612449169158936, "logps/rejected": -4.375763416290283, "loss": 4.2071, "rewards/accuracies": 0.75, "rewards/chosen": -38.612449645996094, "rewards/margins": 5.145184516906738, "rewards/rejected": -43.757633209228516, "step": 3167 }, { "epoch": 0.43137254901960786, "grad_norm": 40.91730447515633, "learning_rate": 5.61008026437282e-07, "logits/chosen": 12.375114440917969, "logits/rejected": 12.208595275878906, "logps/chosen": -3.9230637550354004, "logps/rejected": -3.8166303634643555, "loss": 4.0479, "rewards/accuracies": 0.25, "rewards/chosen": -39.23063659667969, "rewards/margins": -1.0643329620361328, "rewards/rejected": -38.16630554199219, "step": 3168 }, { "epoch": 0.4315087145969499, "grad_norm": 39.55818460098867, "learning_rate": 5.608339517253393e-07, "logits/chosen": 13.043008804321289, "logits/rejected": 13.032159805297852, "logps/chosen": -4.291587829589844, "logps/rejected": -4.457042694091797, "loss": 3.4407, "rewards/accuracies": 0.75, "rewards/chosen": -42.91587829589844, "rewards/margins": 1.654550552368164, "rewards/rejected": -44.570430755615234, "step": 3169 }, { "epoch": 0.4316448801742919, "grad_norm": 48.551655528122126, "learning_rate": 5.606598406716357e-07, "logits/chosen": 11.014930725097656, "logits/rejected": 12.722232818603516, "logps/chosen": -3.6568055152893066, "logps/rejected": -4.150633335113525, "loss": 3.5508, "rewards/accuracies": 1.0, "rewards/chosen": -36.56805419921875, "rewards/margins": 4.938281059265137, "rewards/rejected": -41.5063362121582, "step": 3170 }, { "epoch": 0.431781045751634, "grad_norm": 43.49345742169074, "learning_rate": 5.604856933155132e-07, "logits/chosen": 12.269377708435059, "logits/rejected": 12.383035659790039, "logps/chosen": -3.7899575233459473, "logps/rejected": -3.8052804470062256, "loss": 4.0789, "rewards/accuracies": 0.5, "rewards/chosen": -37.899574279785156, "rewards/margins": 0.153228759765625, "rewards/rejected": -38.05280303955078, "step": 3171 }, { "epoch": 0.43191721132897604, "grad_norm": 40.6415228277368, "learning_rate": 5.603115096963215e-07, "logits/chosen": 12.44809627532959, "logits/rejected": 12.818455696105957, "logps/chosen": -4.233869552612305, "logps/rejected": -4.265568733215332, "loss": 3.8075, "rewards/accuracies": 0.5, "rewards/chosen": -42.33869934082031, "rewards/margins": 0.316986083984375, "rewards/rejected": -42.65568542480469, "step": 3172 }, { "epoch": 0.43205337690631807, "grad_norm": 41.23886183104253, "learning_rate": 5.601372898534193e-07, "logits/chosen": 12.75173282623291, "logits/rejected": 14.0225191116333, "logps/chosen": -3.977638006210327, "logps/rejected": -4.212616920471191, "loss": 3.8355, "rewards/accuracies": 0.5, "rewards/chosen": -39.7763786315918, "rewards/margins": 2.349789619445801, "rewards/rejected": -42.12616729736328, "step": 3173 }, { "epoch": 0.43218954248366015, "grad_norm": 49.848830159496714, "learning_rate": 5.599630338261725e-07, "logits/chosen": 12.8964262008667, "logits/rejected": 11.157098770141602, "logps/chosen": -4.066320419311523, "logps/rejected": -3.5726535320281982, "loss": 3.8808, "rewards/accuracies": 0.25, "rewards/chosen": -40.6632080078125, "rewards/margins": -4.936671733856201, "rewards/rejected": -35.726531982421875, "step": 3174 }, { "epoch": 0.4323257080610022, "grad_norm": 44.21616425288203, "learning_rate": 5.59788741653956e-07, "logits/chosen": 12.02682113647461, "logits/rejected": 13.107017517089844, "logps/chosen": -4.015034198760986, "logps/rejected": -4.0447821617126465, "loss": 3.7823, "rewards/accuracies": 0.5, "rewards/chosen": -40.15034103393555, "rewards/margins": 0.2974815368652344, "rewards/rejected": -40.44782257080078, "step": 3175 }, { "epoch": 0.4324618736383442, "grad_norm": 43.14659874784827, "learning_rate": 5.596144133761526e-07, "logits/chosen": 11.36790657043457, "logits/rejected": 12.350198745727539, "logps/chosen": -3.913874626159668, "logps/rejected": -4.1866774559021, "loss": 3.81, "rewards/accuracies": 1.0, "rewards/chosen": -39.13874816894531, "rewards/margins": 2.7280311584472656, "rewards/rejected": -41.86677551269531, "step": 3176 }, { "epoch": 0.4325980392156863, "grad_norm": 44.223660057999396, "learning_rate": 5.594400490321531e-07, "logits/chosen": 13.086326599121094, "logits/rejected": 12.754122734069824, "logps/chosen": -4.143862724304199, "logps/rejected": -4.338270664215088, "loss": 4.489, "rewards/accuracies": 0.75, "rewards/chosen": -41.438621520996094, "rewards/margins": 1.9440841674804688, "rewards/rejected": -43.38270568847656, "step": 3177 }, { "epoch": 0.4327342047930283, "grad_norm": 39.56965392927083, "learning_rate": 5.592656486613564e-07, "logits/chosen": 12.903120040893555, "logits/rejected": 13.122699737548828, "logps/chosen": -4.163357257843018, "logps/rejected": -4.210850715637207, "loss": 3.5513, "rewards/accuracies": 0.5, "rewards/chosen": -41.633575439453125, "rewards/margins": 0.4749317169189453, "rewards/rejected": -42.10850524902344, "step": 3178 }, { "epoch": 0.43287037037037035, "grad_norm": 39.857248954038035, "learning_rate": 5.590912123031701e-07, "logits/chosen": 12.346386909484863, "logits/rejected": 11.2637939453125, "logps/chosen": -3.922102212905884, "logps/rejected": -3.811971664428711, "loss": 3.7924, "rewards/accuracies": 0.5, "rewards/chosen": -39.22102355957031, "rewards/margins": -1.1013059616088867, "rewards/rejected": -38.11971664428711, "step": 3179 }, { "epoch": 0.43300653594771243, "grad_norm": 39.995643349873156, "learning_rate": 5.589167399970092e-07, "logits/chosen": 12.090885162353516, "logits/rejected": 12.073671340942383, "logps/chosen": -3.8441596031188965, "logps/rejected": -4.098310947418213, "loss": 3.8802, "rewards/accuracies": 0.75, "rewards/chosen": -38.44159698486328, "rewards/margins": 2.541513442993164, "rewards/rejected": -40.98310852050781, "step": 3180 }, { "epoch": 0.43314270152505446, "grad_norm": 49.84433073158464, "learning_rate": 5.587422317822973e-07, "logits/chosen": 12.360076904296875, "logits/rejected": 13.218759536743164, "logps/chosen": -3.992140769958496, "logps/rejected": -4.3610005378723145, "loss": 3.802, "rewards/accuracies": 0.75, "rewards/chosen": -39.92140579223633, "rewards/margins": 3.6885995864868164, "rewards/rejected": -43.610008239746094, "step": 3181 }, { "epoch": 0.4332788671023965, "grad_norm": 40.24374095556551, "learning_rate": 5.58567687698466e-07, "logits/chosen": 12.380329132080078, "logits/rejected": 12.621365547180176, "logps/chosen": -3.6451454162597656, "logps/rejected": -4.2435221672058105, "loss": 4.2439, "rewards/accuracies": 1.0, "rewards/chosen": -36.451454162597656, "rewards/margins": 5.983767509460449, "rewards/rejected": -42.43522262573242, "step": 3182 }, { "epoch": 0.4334150326797386, "grad_norm": 42.03338564292449, "learning_rate": 5.58393107784955e-07, "logits/chosen": 11.249837875366211, "logits/rejected": 12.019424438476562, "logps/chosen": -3.239976406097412, "logps/rejected": -3.544811487197876, "loss": 4.0973, "rewards/accuracies": 0.75, "rewards/chosen": -32.39976501464844, "rewards/margins": 3.0483527183532715, "rewards/rejected": -35.448116302490234, "step": 3183 }, { "epoch": 0.4335511982570806, "grad_norm": 39.64842434147947, "learning_rate": 5.582184920812118e-07, "logits/chosen": 11.184170722961426, "logits/rejected": 12.322643280029297, "logps/chosen": -3.2444450855255127, "logps/rejected": -3.6595373153686523, "loss": 3.6693, "rewards/accuracies": 0.75, "rewards/chosen": -32.44445037841797, "rewards/margins": 4.1509222984313965, "rewards/rejected": -36.595375061035156, "step": 3184 }, { "epoch": 0.43368736383442263, "grad_norm": 51.24380900237356, "learning_rate": 5.580438406266926e-07, "logits/chosen": 10.409223556518555, "logits/rejected": 12.161114692687988, "logps/chosen": -3.285156011581421, "logps/rejected": -3.8086931705474854, "loss": 4.58, "rewards/accuracies": 0.75, "rewards/chosen": -32.8515625, "rewards/margins": 5.235370635986328, "rewards/rejected": -38.08693313598633, "step": 3185 }, { "epoch": 0.4338235294117647, "grad_norm": 41.309985232284724, "learning_rate": 5.578691534608611e-07, "logits/chosen": 11.56147575378418, "logits/rejected": 12.030355453491211, "logps/chosen": -3.5048229694366455, "logps/rejected": -3.7801759243011475, "loss": 4.1905, "rewards/accuracies": 0.75, "rewards/chosen": -35.04822540283203, "rewards/margins": 2.753530979156494, "rewards/rejected": -37.8017578125, "step": 3186 }, { "epoch": 0.43395969498910675, "grad_norm": 38.729178809920505, "learning_rate": 5.576944306231894e-07, "logits/chosen": 12.043106079101562, "logits/rejected": 13.18715763092041, "logps/chosen": -4.119102954864502, "logps/rejected": -4.388625144958496, "loss": 3.4212, "rewards/accuracies": 0.75, "rewards/chosen": -41.19103240966797, "rewards/margins": 2.695222854614258, "rewards/rejected": -43.886253356933594, "step": 3187 }, { "epoch": 0.4340958605664488, "grad_norm": 38.9838605299192, "learning_rate": 5.575196721531577e-07, "logits/chosen": 11.457067489624023, "logits/rejected": 12.28393840789795, "logps/chosen": -3.40043568611145, "logps/rejected": -3.703293800354004, "loss": 3.9355, "rewards/accuracies": 0.75, "rewards/chosen": -34.004356384277344, "rewards/margins": 3.028580665588379, "rewards/rejected": -37.032936096191406, "step": 3188 }, { "epoch": 0.43423202614379086, "grad_norm": 43.45585724539347, "learning_rate": 5.57344878090254e-07, "logits/chosen": 11.085250854492188, "logits/rejected": 11.153234481811523, "logps/chosen": -3.984536647796631, "logps/rejected": -3.7959628105163574, "loss": 4.3081, "rewards/accuracies": 0.5, "rewards/chosen": -39.845367431640625, "rewards/margins": -1.8857388496398926, "rewards/rejected": -37.959625244140625, "step": 3189 }, { "epoch": 0.4343681917211329, "grad_norm": 38.199194782175525, "learning_rate": 5.571700484739745e-07, "logits/chosen": 11.977529525756836, "logits/rejected": 13.229782104492188, "logps/chosen": -3.491318702697754, "logps/rejected": -3.9898810386657715, "loss": 3.7762, "rewards/accuracies": 1.0, "rewards/chosen": -34.913185119628906, "rewards/margins": 4.985622882843018, "rewards/rejected": -39.89881134033203, "step": 3190 }, { "epoch": 0.4345043572984749, "grad_norm": 35.12747441340351, "learning_rate": 5.569951833438233e-07, "logits/chosen": 12.07293701171875, "logits/rejected": 12.35754108428955, "logps/chosen": -3.771188259124756, "logps/rejected": -3.727719306945801, "loss": 3.9422, "rewards/accuracies": 0.5, "rewards/chosen": -37.711883544921875, "rewards/margins": -0.4346923828125, "rewards/rejected": -37.277191162109375, "step": 3191 }, { "epoch": 0.434640522875817, "grad_norm": 40.382272065462075, "learning_rate": 5.568202827393127e-07, "logits/chosen": 11.90827465057373, "logits/rejected": 12.788351058959961, "logps/chosen": -3.6414661407470703, "logps/rejected": -3.9808268547058105, "loss": 4.1108, "rewards/accuracies": 0.75, "rewards/chosen": -36.4146614074707, "rewards/margins": 3.393606185913086, "rewards/rejected": -39.808265686035156, "step": 3192 }, { "epoch": 0.43477668845315903, "grad_norm": 41.22897308974122, "learning_rate": 5.566453466999629e-07, "logits/chosen": 12.704418182373047, "logits/rejected": 13.233779907226562, "logps/chosen": -4.160934925079346, "logps/rejected": -4.652900695800781, "loss": 4.4288, "rewards/accuracies": 1.0, "rewards/chosen": -41.609352111816406, "rewards/margins": 4.9196577072143555, "rewards/rejected": -46.52900695800781, "step": 3193 }, { "epoch": 0.4349128540305011, "grad_norm": 37.378055569131774, "learning_rate": 5.564703752653022e-07, "logits/chosen": 11.361855506896973, "logits/rejected": 11.750387191772461, "logps/chosen": -3.4658937454223633, "logps/rejected": -3.6091513633728027, "loss": 3.7149, "rewards/accuracies": 0.75, "rewards/chosen": -34.658935546875, "rewards/margins": 1.4325799942016602, "rewards/rejected": -36.091514587402344, "step": 3194 }, { "epoch": 0.43504901960784315, "grad_norm": 32.830937149168506, "learning_rate": 5.562953684748668e-07, "logits/chosen": 11.867204666137695, "logits/rejected": 12.117298126220703, "logps/chosen": -3.5221924781799316, "logps/rejected": -3.939929962158203, "loss": 3.7532, "rewards/accuracies": 1.0, "rewards/chosen": -35.221923828125, "rewards/margins": 4.177375316619873, "rewards/rejected": -39.39929962158203, "step": 3195 }, { "epoch": 0.4351851851851852, "grad_norm": 35.980655221029615, "learning_rate": 5.56120326368201e-07, "logits/chosen": 11.745372772216797, "logits/rejected": 12.026036262512207, "logps/chosen": -3.7462852001190186, "logps/rejected": -3.972357988357544, "loss": 3.78, "rewards/accuracies": 0.75, "rewards/chosen": -37.462852478027344, "rewards/margins": 2.2607288360595703, "rewards/rejected": -39.72357940673828, "step": 3196 }, { "epoch": 0.43532135076252726, "grad_norm": 40.72996020128795, "learning_rate": 5.559452489848569e-07, "logits/chosen": 11.06637954711914, "logits/rejected": 11.333122253417969, "logps/chosen": -3.843423843383789, "logps/rejected": -3.971665859222412, "loss": 4.1887, "rewards/accuracies": 0.75, "rewards/chosen": -38.43423843383789, "rewards/margins": 1.2824201583862305, "rewards/rejected": -39.71665954589844, "step": 3197 }, { "epoch": 0.4354575163398693, "grad_norm": 35.19094168893825, "learning_rate": 5.557701363643949e-07, "logits/chosen": 10.75561237335205, "logits/rejected": 10.619928359985352, "logps/chosen": -3.634544849395752, "logps/rejected": -3.6410484313964844, "loss": 4.1231, "rewards/accuracies": 0.25, "rewards/chosen": -36.34545135498047, "rewards/margins": 0.06503438949584961, "rewards/rejected": -36.410484313964844, "step": 3198 }, { "epoch": 0.4355936819172113, "grad_norm": 34.45673420927352, "learning_rate": 5.555949885463827e-07, "logits/chosen": 11.632461547851562, "logits/rejected": 10.772743225097656, "logps/chosen": -3.734013557434082, "logps/rejected": -3.490525484085083, "loss": 4.2592, "rewards/accuracies": 0.25, "rewards/chosen": -37.34013366699219, "rewards/margins": -2.434880256652832, "rewards/rejected": -34.90525817871094, "step": 3199 }, { "epoch": 0.4357298474945534, "grad_norm": 39.40902698443768, "learning_rate": 5.554198055703968e-07, "logits/chosen": 11.458414077758789, "logits/rejected": 12.362261772155762, "logps/chosen": -3.720398187637329, "logps/rejected": -4.087915897369385, "loss": 4.1693, "rewards/accuracies": 0.75, "rewards/chosen": -37.203983306884766, "rewards/margins": 3.675175666809082, "rewards/rejected": -40.87915802001953, "step": 3200 }, { "epoch": 0.43586601307189543, "grad_norm": 36.22413638740887, "learning_rate": 5.552445874760211e-07, "logits/chosen": 12.22789192199707, "logits/rejected": 13.030954360961914, "logps/chosen": -3.9399869441986084, "logps/rejected": -4.222661018371582, "loss": 4.1022, "rewards/accuracies": 1.0, "rewards/chosen": -39.399871826171875, "rewards/margins": 2.826737403869629, "rewards/rejected": -42.22660827636719, "step": 3201 }, { "epoch": 0.43600217864923746, "grad_norm": 35.02098493213138, "learning_rate": 5.550693343028476e-07, "logits/chosen": 12.518735885620117, "logits/rejected": 12.707712173461914, "logps/chosen": -3.9942431449890137, "logps/rejected": -4.238611221313477, "loss": 4.0255, "rewards/accuracies": 1.0, "rewards/chosen": -39.94242858886719, "rewards/margins": 2.4436798095703125, "rewards/rejected": -42.3861083984375, "step": 3202 }, { "epoch": 0.43613834422657954, "grad_norm": 33.652422368432106, "learning_rate": 5.548940460904762e-07, "logits/chosen": 12.055095672607422, "logits/rejected": 11.759449005126953, "logps/chosen": -3.857823133468628, "logps/rejected": -3.821331024169922, "loss": 4.1657, "rewards/accuracies": 0.5, "rewards/chosen": -38.57823181152344, "rewards/margins": -0.36492061614990234, "rewards/rejected": -38.21331024169922, "step": 3203 }, { "epoch": 0.4362745098039216, "grad_norm": 38.32760796719647, "learning_rate": 5.547187228785148e-07, "logits/chosen": 11.924899101257324, "logits/rejected": 12.436056137084961, "logps/chosen": -3.781233310699463, "logps/rejected": -4.013284683227539, "loss": 4.4275, "rewards/accuracies": 0.75, "rewards/chosen": -37.81233215332031, "rewards/margins": 2.3205137252807617, "rewards/rejected": -40.132843017578125, "step": 3204 }, { "epoch": 0.4364106753812636, "grad_norm": 36.07125255058184, "learning_rate": 5.545433647065789e-07, "logits/chosen": 11.405632972717285, "logits/rejected": 12.226517677307129, "logps/chosen": -3.9311022758483887, "logps/rejected": -4.100127220153809, "loss": 3.5726, "rewards/accuracies": 0.5, "rewards/chosen": -39.31101989746094, "rewards/margins": 1.6902503967285156, "rewards/rejected": -41.00127410888672, "step": 3205 }, { "epoch": 0.4365468409586057, "grad_norm": 33.63056647657931, "learning_rate": 5.543679716142923e-07, "logits/chosen": 11.782814025878906, "logits/rejected": 11.888648986816406, "logps/chosen": -4.046395301818848, "logps/rejected": -3.979273557662964, "loss": 4.0759, "rewards/accuracies": 0.5, "rewards/chosen": -40.46395492553711, "rewards/margins": -0.6712198257446289, "rewards/rejected": -39.7927360534668, "step": 3206 }, { "epoch": 0.4366830065359477, "grad_norm": 41.55198631253579, "learning_rate": 5.541925436412866e-07, "logits/chosen": 12.162927627563477, "logits/rejected": 12.104082107543945, "logps/chosen": -3.8442935943603516, "logps/rejected": -4.066549301147461, "loss": 3.9431, "rewards/accuracies": 0.75, "rewards/chosen": -38.44293212890625, "rewards/margins": 2.222559928894043, "rewards/rejected": -40.66549301147461, "step": 3207 }, { "epoch": 0.43681917211328974, "grad_norm": 36.28677580235391, "learning_rate": 5.540170808272011e-07, "logits/chosen": 11.46811294555664, "logits/rejected": 11.505929946899414, "logps/chosen": -3.93599271774292, "logps/rejected": -3.8775293827056885, "loss": 4.0674, "rewards/accuracies": 0.5, "rewards/chosen": -39.359928131103516, "rewards/margins": -0.5846338272094727, "rewards/rejected": -38.77529525756836, "step": 3208 }, { "epoch": 0.43695533769063183, "grad_norm": 52.999159453678445, "learning_rate": 5.53841583211683e-07, "logits/chosen": 11.880074501037598, "logits/rejected": 12.599343299865723, "logps/chosen": -3.966614007949829, "logps/rejected": -4.059569358825684, "loss": 3.4501, "rewards/accuracies": 0.5, "rewards/chosen": -39.6661376953125, "rewards/margins": 0.9295530319213867, "rewards/rejected": -40.5956916809082, "step": 3209 }, { "epoch": 0.43709150326797386, "grad_norm": 33.83445636989339, "learning_rate": 5.536660508343875e-07, "logits/chosen": 11.175413131713867, "logits/rejected": 12.889725685119629, "logps/chosen": -3.5806093215942383, "logps/rejected": -4.243712425231934, "loss": 3.4932, "rewards/accuracies": 1.0, "rewards/chosen": -35.80609130859375, "rewards/margins": 6.631036758422852, "rewards/rejected": -42.43712615966797, "step": 3210 }, { "epoch": 0.4372276688453159, "grad_norm": 38.48729641651635, "learning_rate": 5.534904837349775e-07, "logits/chosen": 12.250015258789062, "logits/rejected": 11.996723175048828, "logps/chosen": -3.9895691871643066, "logps/rejected": -3.6851823329925537, "loss": 4.2356, "rewards/accuracies": 0.25, "rewards/chosen": -39.89569091796875, "rewards/margins": -3.0438690185546875, "rewards/rejected": -36.85182189941406, "step": 3211 }, { "epoch": 0.43736383442265797, "grad_norm": 38.31915490358579, "learning_rate": 5.533148819531242e-07, "logits/chosen": 12.463484764099121, "logits/rejected": 12.68955135345459, "logps/chosen": -3.9973301887512207, "logps/rejected": -3.9109835624694824, "loss": 4.2757, "rewards/accuracies": 0.5, "rewards/chosen": -39.97330093383789, "rewards/margins": -0.8634662628173828, "rewards/rejected": -39.109832763671875, "step": 3212 }, { "epoch": 0.4375, "grad_norm": 36.649109987782175, "learning_rate": 5.531392455285058e-07, "logits/chosen": 11.234199523925781, "logits/rejected": 12.299139976501465, "logps/chosen": -3.5426363945007324, "logps/rejected": -3.878347873687744, "loss": 3.7502, "rewards/accuracies": 0.75, "rewards/chosen": -35.42636489868164, "rewards/margins": 3.3571152687072754, "rewards/rejected": -38.783477783203125, "step": 3213 }, { "epoch": 0.43763616557734203, "grad_norm": 37.0793121431482, "learning_rate": 5.52963574500809e-07, "logits/chosen": 12.043649673461914, "logits/rejected": 11.625602722167969, "logps/chosen": -3.907452344894409, "logps/rejected": -3.9267754554748535, "loss": 3.9521, "rewards/accuracies": 0.25, "rewards/chosen": -39.07452392578125, "rewards/margins": 0.19323062896728516, "rewards/rejected": -39.26775360107422, "step": 3214 }, { "epoch": 0.4377723311546841, "grad_norm": 38.371766750123896, "learning_rate": 5.527878689097282e-07, "logits/chosen": 11.160941123962402, "logits/rejected": 11.281084060668945, "logps/chosen": -3.7916412353515625, "logps/rejected": -3.786898374557495, "loss": 4.1033, "rewards/accuracies": 0.75, "rewards/chosen": -37.916412353515625, "rewards/margins": -0.047428131103515625, "rewards/rejected": -37.868980407714844, "step": 3215 }, { "epoch": 0.43790849673202614, "grad_norm": 36.79443433064636, "learning_rate": 5.526121287949655e-07, "logits/chosen": 11.808553695678711, "logits/rejected": 12.045890808105469, "logps/chosen": -4.07291316986084, "logps/rejected": -4.266541481018066, "loss": 4.2559, "rewards/accuracies": 0.75, "rewards/chosen": -40.729129791259766, "rewards/margins": 1.936284065246582, "rewards/rejected": -42.66541290283203, "step": 3216 }, { "epoch": 0.43804466230936817, "grad_norm": 40.324936212261264, "learning_rate": 5.524363541962308e-07, "logits/chosen": 11.19810962677002, "logits/rejected": 11.294114112854004, "logps/chosen": -3.8853344917297363, "logps/rejected": -3.9338431358337402, "loss": 4.5278, "rewards/accuracies": 0.5, "rewards/chosen": -38.85334396362305, "rewards/margins": 0.4850893020629883, "rewards/rejected": -39.33843231201172, "step": 3217 }, { "epoch": 0.43818082788671026, "grad_norm": 41.05956201712704, "learning_rate": 5.522605451532417e-07, "logits/chosen": 12.059431076049805, "logits/rejected": 11.732460021972656, "logps/chosen": -3.7875735759735107, "logps/rejected": -3.8969950675964355, "loss": 4.0454, "rewards/accuracies": 0.75, "rewards/chosen": -37.875736236572266, "rewards/margins": 1.0942144393920898, "rewards/rejected": -38.96995162963867, "step": 3218 }, { "epoch": 0.4383169934640523, "grad_norm": 38.85020264901555, "learning_rate": 5.520847017057239e-07, "logits/chosen": 12.246234893798828, "logits/rejected": 11.61854362487793, "logps/chosen": -4.159596920013428, "logps/rejected": -4.087589740753174, "loss": 4.0006, "rewards/accuracies": 0.25, "rewards/chosen": -41.595970153808594, "rewards/margins": -0.7200708389282227, "rewards/rejected": -40.87590026855469, "step": 3219 }, { "epoch": 0.4384531590413943, "grad_norm": 42.66233420370706, "learning_rate": 5.519088238934106e-07, "logits/chosen": 13.008281707763672, "logits/rejected": 11.825399398803711, "logps/chosen": -4.032777786254883, "logps/rejected": -3.871753692626953, "loss": 4.2168, "rewards/accuracies": 0.5, "rewards/chosen": -40.32777404785156, "rewards/margins": -1.6102361679077148, "rewards/rejected": -38.71753692626953, "step": 3220 }, { "epoch": 0.4385893246187364, "grad_norm": 34.3993899959247, "learning_rate": 5.517329117560429e-07, "logits/chosen": 12.208457946777344, "logits/rejected": 12.513916015625, "logps/chosen": -3.773857831954956, "logps/rejected": -3.6928064823150635, "loss": 3.7486, "rewards/accuracies": 0.5, "rewards/chosen": -37.73857879638672, "rewards/margins": -0.8105134963989258, "rewards/rejected": -36.928062438964844, "step": 3221 }, { "epoch": 0.4387254901960784, "grad_norm": 40.99857128154412, "learning_rate": 5.515569653333695e-07, "logits/chosen": 12.12248420715332, "logits/rejected": 12.213305473327637, "logps/chosen": -3.8754560947418213, "logps/rejected": -3.757538318634033, "loss": 3.7754, "rewards/accuracies": 0.5, "rewards/chosen": -38.75456237792969, "rewards/margins": -1.1791763305664062, "rewards/rejected": -37.57538604736328, "step": 3222 }, { "epoch": 0.43886165577342046, "grad_norm": 38.92123695105742, "learning_rate": 5.513809846651469e-07, "logits/chosen": 12.23536491394043, "logits/rejected": 12.147298812866211, "logps/chosen": -3.920941114425659, "logps/rejected": -3.8001043796539307, "loss": 3.9419, "rewards/accuracies": 0.25, "rewards/chosen": -39.209407806396484, "rewards/margins": -1.2083635330200195, "rewards/rejected": -38.00104522705078, "step": 3223 }, { "epoch": 0.43899782135076254, "grad_norm": 40.459141511469646, "learning_rate": 5.512049697911397e-07, "logits/chosen": 11.788764953613281, "logits/rejected": 11.328129768371582, "logps/chosen": -3.5590662956237793, "logps/rejected": -3.707589626312256, "loss": 3.8982, "rewards/accuracies": 0.5, "rewards/chosen": -35.590660095214844, "rewards/margins": 1.4852337837219238, "rewards/rejected": -37.07589340209961, "step": 3224 }, { "epoch": 0.43913398692810457, "grad_norm": 38.28471096953677, "learning_rate": 5.510289207511196e-07, "logits/chosen": 12.151426315307617, "logits/rejected": 13.280052185058594, "logps/chosen": -3.871687412261963, "logps/rejected": -4.2561821937561035, "loss": 3.7439, "rewards/accuracies": 1.0, "rewards/chosen": -38.71687316894531, "rewards/margins": 3.8449487686157227, "rewards/rejected": -42.56182098388672, "step": 3225 }, { "epoch": 0.4392701525054466, "grad_norm": 38.55487790534751, "learning_rate": 5.508528375848664e-07, "logits/chosen": 10.880008697509766, "logits/rejected": 11.951347351074219, "logps/chosen": -3.439450979232788, "logps/rejected": -3.7096903324127197, "loss": 3.9135, "rewards/accuracies": 1.0, "rewards/chosen": -34.394508361816406, "rewards/margins": 2.7023916244506836, "rewards/rejected": -37.096900939941406, "step": 3226 }, { "epoch": 0.4394063180827887, "grad_norm": 39.19440321263993, "learning_rate": 5.506767203321676e-07, "logits/chosen": 12.279537200927734, "logits/rejected": 12.872782707214355, "logps/chosen": -4.371120452880859, "logps/rejected": -4.665647029876709, "loss": 3.9926, "rewards/accuracies": 0.75, "rewards/chosen": -43.711204528808594, "rewards/margins": 2.9452686309814453, "rewards/rejected": -46.656471252441406, "step": 3227 }, { "epoch": 0.4395424836601307, "grad_norm": 38.658364498797724, "learning_rate": 5.505005690328184e-07, "logits/chosen": 11.736188888549805, "logits/rejected": 12.393709182739258, "logps/chosen": -3.3609747886657715, "logps/rejected": -3.763542652130127, "loss": 3.8525, "rewards/accuracies": 0.75, "rewards/chosen": -33.60974884033203, "rewards/margins": 4.025680065155029, "rewards/rejected": -37.63542938232422, "step": 3228 }, { "epoch": 0.43967864923747274, "grad_norm": 40.92671081502822, "learning_rate": 5.503243837266215e-07, "logits/chosen": 10.897333145141602, "logits/rejected": 11.460978507995605, "logps/chosen": -3.476824998855591, "logps/rejected": -3.437324285507202, "loss": 3.8613, "rewards/accuracies": 0.5, "rewards/chosen": -34.76824951171875, "rewards/margins": -0.3950061798095703, "rewards/rejected": -34.37324142456055, "step": 3229 }, { "epoch": 0.4398148148148148, "grad_norm": 36.31882074458522, "learning_rate": 5.501481644533875e-07, "logits/chosen": 12.37747573852539, "logits/rejected": 11.963003158569336, "logps/chosen": -3.3649444580078125, "logps/rejected": -3.397458553314209, "loss": 4.1589, "rewards/accuracies": 0.5, "rewards/chosen": -33.649444580078125, "rewards/margins": 0.32514286041259766, "rewards/rejected": -33.974586486816406, "step": 3230 }, { "epoch": 0.43995098039215685, "grad_norm": 42.34026597077155, "learning_rate": 5.499719112529347e-07, "logits/chosen": 10.925374984741211, "logits/rejected": 12.230415344238281, "logps/chosen": -3.380471706390381, "logps/rejected": -3.7170541286468506, "loss": 4.0034, "rewards/accuracies": 0.75, "rewards/chosen": -33.804718017578125, "rewards/margins": 3.36582612991333, "rewards/rejected": -37.17053985595703, "step": 3231 }, { "epoch": 0.4400871459694989, "grad_norm": 34.92466818851421, "learning_rate": 5.497956241650888e-07, "logits/chosen": 11.068578720092773, "logits/rejected": 11.682771682739258, "logps/chosen": -3.4768893718719482, "logps/rejected": -3.899890184402466, "loss": 3.9664, "rewards/accuracies": 1.0, "rewards/chosen": -34.768890380859375, "rewards/margins": 4.230009078979492, "rewards/rejected": -38.9989013671875, "step": 3232 }, { "epoch": 0.44022331154684097, "grad_norm": 46.735371677122906, "learning_rate": 5.496193032296834e-07, "logits/chosen": 11.635061264038086, "logits/rejected": 11.810774803161621, "logps/chosen": -3.7187652587890625, "logps/rejected": -3.6586365699768066, "loss": 4.7985, "rewards/accuracies": 0.25, "rewards/chosen": -37.187652587890625, "rewards/margins": -0.6012859344482422, "rewards/rejected": -36.58636474609375, "step": 3233 }, { "epoch": 0.440359477124183, "grad_norm": 38.56349830472745, "learning_rate": 5.494429484865597e-07, "logits/chosen": 11.478261947631836, "logits/rejected": 12.976373672485352, "logps/chosen": -3.7964978218078613, "logps/rejected": -4.072526931762695, "loss": 4.0688, "rewards/accuracies": 1.0, "rewards/chosen": -37.96498107910156, "rewards/margins": 2.760289192199707, "rewards/rejected": -40.72526931762695, "step": 3234 }, { "epoch": 0.4404956427015251, "grad_norm": 34.463461615046185, "learning_rate": 5.492665599755664e-07, "logits/chosen": 12.449563980102539, "logits/rejected": 11.788078308105469, "logps/chosen": -3.6088826656341553, "logps/rejected": -3.44447660446167, "loss": 3.8421, "rewards/accuracies": 0.5, "rewards/chosen": -36.08882522583008, "rewards/margins": -1.6440606117248535, "rewards/rejected": -34.44476318359375, "step": 3235 }, { "epoch": 0.4406318082788671, "grad_norm": 39.83151207309871, "learning_rate": 5.490901377365601e-07, "logits/chosen": 10.916337966918945, "logits/rejected": 11.637337684631348, "logps/chosen": -3.77518892288208, "logps/rejected": -3.765937328338623, "loss": 4.5632, "rewards/accuracies": 0.25, "rewards/chosen": -37.751888275146484, "rewards/margins": -0.09251689910888672, "rewards/rejected": -37.65937042236328, "step": 3236 }, { "epoch": 0.44076797385620914, "grad_norm": 37.61756120479775, "learning_rate": 5.489136818094048e-07, "logits/chosen": 10.924880981445312, "logits/rejected": 11.111919403076172, "logps/chosen": -3.440277576446533, "logps/rejected": -3.494828701019287, "loss": 3.8268, "rewards/accuracies": 0.75, "rewards/chosen": -34.40277862548828, "rewards/margins": 0.5455093383789062, "rewards/rejected": -34.94828796386719, "step": 3237 }, { "epoch": 0.4409041394335512, "grad_norm": 41.04081635218829, "learning_rate": 5.487371922339721e-07, "logits/chosen": 11.163801193237305, "logits/rejected": 11.505487442016602, "logps/chosen": -3.539612293243408, "logps/rejected": -3.7543230056762695, "loss": 3.6711, "rewards/accuracies": 0.5, "rewards/chosen": -35.39612579345703, "rewards/margins": 2.1471071243286133, "rewards/rejected": -37.54322814941406, "step": 3238 }, { "epoch": 0.44104030501089325, "grad_norm": 36.849705918366816, "learning_rate": 5.485606690501414e-07, "logits/chosen": 11.92220687866211, "logits/rejected": 11.859567642211914, "logps/chosen": -3.7641730308532715, "logps/rejected": -3.8679800033569336, "loss": 4.1628, "rewards/accuracies": 0.5, "rewards/chosen": -37.641727447509766, "rewards/margins": 1.0380678176879883, "rewards/rejected": -38.6797981262207, "step": 3239 }, { "epoch": 0.4411764705882353, "grad_norm": 39.353334847396454, "learning_rate": 5.483841122977995e-07, "logits/chosen": 12.345632553100586, "logits/rejected": 13.059555053710938, "logps/chosen": -3.9770095348358154, "logps/rejected": -4.198576927185059, "loss": 4.0196, "rewards/accuracies": 1.0, "rewards/chosen": -39.77009582519531, "rewards/margins": 2.2156715393066406, "rewards/rejected": -41.98576736450195, "step": 3240 }, { "epoch": 0.44131263616557737, "grad_norm": 39.697258756866745, "learning_rate": 5.482075220168408e-07, "logits/chosen": 10.744839668273926, "logits/rejected": 11.49673843383789, "logps/chosen": -3.6166281700134277, "logps/rejected": -3.920207977294922, "loss": 3.7181, "rewards/accuracies": 0.75, "rewards/chosen": -36.166282653808594, "rewards/margins": 3.0357985496520996, "rewards/rejected": -39.20207977294922, "step": 3241 }, { "epoch": 0.4414488017429194, "grad_norm": 39.325361620167456, "learning_rate": 5.480308982471674e-07, "logits/chosen": 12.293728828430176, "logits/rejected": 11.694188117980957, "logps/chosen": -4.045181751251221, "logps/rejected": -3.8152143955230713, "loss": 3.7397, "rewards/accuracies": 0.25, "rewards/chosen": -40.45181655883789, "rewards/margins": -2.299673557281494, "rewards/rejected": -38.15214538574219, "step": 3242 }, { "epoch": 0.4415849673202614, "grad_norm": 37.951664139954126, "learning_rate": 5.47854241028689e-07, "logits/chosen": 11.72745132446289, "logits/rejected": 11.425086975097656, "logps/chosen": -3.9281742572784424, "logps/rejected": -4.181679725646973, "loss": 4.1979, "rewards/accuracies": 0.75, "rewards/chosen": -39.28173828125, "rewards/margins": 2.535055160522461, "rewards/rejected": -41.816795349121094, "step": 3243 }, { "epoch": 0.4417211328976035, "grad_norm": 49.96474639772408, "learning_rate": 5.476775504013227e-07, "logits/chosen": 12.087179183959961, "logits/rejected": 12.25469970703125, "logps/chosen": -3.6503844261169434, "logps/rejected": -4.031989097595215, "loss": 3.4081, "rewards/accuracies": 0.75, "rewards/chosen": -36.50384521484375, "rewards/margins": 3.8160476684570312, "rewards/rejected": -40.31989288330078, "step": 3244 }, { "epoch": 0.44185729847494554, "grad_norm": 40.71434098798254, "learning_rate": 5.475008264049931e-07, "logits/chosen": 11.488204002380371, "logits/rejected": 11.967775344848633, "logps/chosen": -3.999756097793579, "logps/rejected": -3.963322162628174, "loss": 4.345, "rewards/accuracies": 0.5, "rewards/chosen": -39.99755859375, "rewards/margins": -0.3643379211425781, "rewards/rejected": -39.63322067260742, "step": 3245 }, { "epoch": 0.44199346405228757, "grad_norm": 40.92017730273368, "learning_rate": 5.473240690796325e-07, "logits/chosen": 10.663015365600586, "logits/rejected": 10.361815452575684, "logps/chosen": -3.515512466430664, "logps/rejected": -3.370027542114258, "loss": 4.129, "rewards/accuracies": 0.5, "rewards/chosen": -35.155120849609375, "rewards/margins": -1.4548454284667969, "rewards/rejected": -33.70027542114258, "step": 3246 }, { "epoch": 0.44212962962962965, "grad_norm": 35.29346602045164, "learning_rate": 5.471472784651806e-07, "logits/chosen": 11.451906204223633, "logits/rejected": 12.214154243469238, "logps/chosen": -3.6623330116271973, "logps/rejected": -3.660876750946045, "loss": 4.2167, "rewards/accuracies": 0.5, "rewards/chosen": -36.623329162597656, "rewards/margins": -0.014564037322998047, "rewards/rejected": -36.6087646484375, "step": 3247 }, { "epoch": 0.4422657952069717, "grad_norm": 74.33084311132949, "learning_rate": 5.46970454601585e-07, "logits/chosen": 10.742486953735352, "logits/rejected": 10.95315933227539, "logps/chosen": -3.0863656997680664, "logps/rejected": -3.5716350078582764, "loss": 3.2693, "rewards/accuracies": 1.0, "rewards/chosen": -30.863658905029297, "rewards/margins": 4.852692127227783, "rewards/rejected": -35.71635055541992, "step": 3248 }, { "epoch": 0.4424019607843137, "grad_norm": 41.12588305817883, "learning_rate": 5.467935975288003e-07, "logits/chosen": 11.999774932861328, "logits/rejected": 12.13387393951416, "logps/chosen": -3.8632121086120605, "logps/rejected": -3.890097141265869, "loss": 3.732, "rewards/accuracies": 0.25, "rewards/chosen": -38.632118225097656, "rewards/margins": 0.26885318756103516, "rewards/rejected": -38.900970458984375, "step": 3249 }, { "epoch": 0.4425381263616558, "grad_norm": 37.17292318393855, "learning_rate": 5.466167072867887e-07, "logits/chosen": 11.050010681152344, "logits/rejected": 12.344913482666016, "logps/chosen": -3.6354403495788574, "logps/rejected": -4.261615753173828, "loss": 3.1877, "rewards/accuracies": 1.0, "rewards/chosen": -36.35440444946289, "rewards/margins": 6.261754035949707, "rewards/rejected": -42.61615753173828, "step": 3250 }, { "epoch": 0.4426742919389978, "grad_norm": 39.891580448237065, "learning_rate": 5.464397839155202e-07, "logits/chosen": 11.824045181274414, "logits/rejected": 12.257619857788086, "logps/chosen": -4.072114944458008, "logps/rejected": -4.175973892211914, "loss": 3.5811, "rewards/accuracies": 0.75, "rewards/chosen": -40.721153259277344, "rewards/margins": 1.0385856628417969, "rewards/rejected": -41.759735107421875, "step": 3251 }, { "epoch": 0.44281045751633985, "grad_norm": 35.08042310354571, "learning_rate": 5.462628274549721e-07, "logits/chosen": 12.083833694458008, "logits/rejected": 12.345844268798828, "logps/chosen": -3.877180814743042, "logps/rejected": -3.9226231575012207, "loss": 3.8136, "rewards/accuracies": 0.25, "rewards/chosen": -38.77180862426758, "rewards/margins": 0.4544248580932617, "rewards/rejected": -39.22623062133789, "step": 3252 }, { "epoch": 0.44294662309368193, "grad_norm": 37.22466803822161, "learning_rate": 5.460858379451289e-07, "logits/chosen": 11.283184051513672, "logits/rejected": 10.798736572265625, "logps/chosen": -3.7460546493530273, "logps/rejected": -3.5948023796081543, "loss": 3.827, "rewards/accuracies": 0.25, "rewards/chosen": -37.460548400878906, "rewards/margins": -1.512521743774414, "rewards/rejected": -35.94802474975586, "step": 3253 }, { "epoch": 0.44308278867102396, "grad_norm": 39.60884124392977, "learning_rate": 5.459088154259834e-07, "logits/chosen": 11.470341682434082, "logits/rejected": 11.350871086120605, "logps/chosen": -3.5546464920043945, "logps/rejected": -3.6133761405944824, "loss": 4.2472, "rewards/accuracies": 0.5, "rewards/chosen": -35.54646301269531, "rewards/margins": 0.5872950553894043, "rewards/rejected": -36.133758544921875, "step": 3254 }, { "epoch": 0.443218954248366, "grad_norm": 36.14940836193359, "learning_rate": 5.457317599375347e-07, "logits/chosen": 11.432000160217285, "logits/rejected": 11.979790687561035, "logps/chosen": -3.7103166580200195, "logps/rejected": -3.8502399921417236, "loss": 3.7529, "rewards/accuracies": 0.75, "rewards/chosen": -37.10316467285156, "rewards/margins": 1.39923095703125, "rewards/rejected": -38.50239944458008, "step": 3255 }, { "epoch": 0.4433551198257081, "grad_norm": 39.25113605923627, "learning_rate": 5.455546715197903e-07, "logits/chosen": 10.530162811279297, "logits/rejected": 11.881233215332031, "logps/chosen": -3.278445243835449, "logps/rejected": -3.7356104850769043, "loss": 3.9479, "rewards/accuracies": 0.75, "rewards/chosen": -32.78445053100586, "rewards/margins": 4.571652889251709, "rewards/rejected": -37.35610580444336, "step": 3256 }, { "epoch": 0.4434912854030501, "grad_norm": 38.15951134302627, "learning_rate": 5.453775502127647e-07, "logits/chosen": 12.135993957519531, "logits/rejected": 11.569784164428711, "logps/chosen": -3.6765971183776855, "logps/rejected": -3.7489733695983887, "loss": 4.0297, "rewards/accuracies": 0.75, "rewards/chosen": -36.76597213745117, "rewards/margins": 0.7237615585327148, "rewards/rejected": -37.48973083496094, "step": 3257 }, { "epoch": 0.44362745098039214, "grad_norm": 37.76989376014642, "learning_rate": 5.4520039605648e-07, "logits/chosen": 13.28124713897705, "logits/rejected": 12.17224407196045, "logps/chosen": -4.0133891105651855, "logps/rejected": -3.881960391998291, "loss": 3.7159, "rewards/accuracies": 0.25, "rewards/chosen": -40.13389205932617, "rewards/margins": -1.314286231994629, "rewards/rejected": -38.81960678100586, "step": 3258 }, { "epoch": 0.4437636165577342, "grad_norm": 41.005010037079764, "learning_rate": 5.450232090909654e-07, "logits/chosen": 12.055760383605957, "logits/rejected": 12.069138526916504, "logps/chosen": -3.764671564102173, "logps/rejected": -3.8308725357055664, "loss": 4.2631, "rewards/accuracies": 0.5, "rewards/chosen": -37.6467170715332, "rewards/margins": 0.6620101928710938, "rewards/rejected": -38.30872344970703, "step": 3259 }, { "epoch": 0.44389978213507625, "grad_norm": 38.106539938476566, "learning_rate": 5.448459893562581e-07, "logits/chosen": 11.531427383422852, "logits/rejected": 11.518911361694336, "logps/chosen": -3.343801975250244, "logps/rejected": -3.6302428245544434, "loss": 3.6461, "rewards/accuracies": 0.75, "rewards/chosen": -33.438018798828125, "rewards/margins": 2.864408493041992, "rewards/rejected": -36.30242919921875, "step": 3260 }, { "epoch": 0.4440359477124183, "grad_norm": 39.464402321407384, "learning_rate": 5.44668736892402e-07, "logits/chosen": 11.369827270507812, "logits/rejected": 12.508686065673828, "logps/chosen": -3.5127639770507812, "logps/rejected": -3.9572877883911133, "loss": 3.8108, "rewards/accuracies": 0.75, "rewards/chosen": -35.12763977050781, "rewards/margins": 4.445239067077637, "rewards/rejected": -39.5728759765625, "step": 3261 }, { "epoch": 0.44417211328976036, "grad_norm": 36.76623797284722, "learning_rate": 5.444914517394491e-07, "logits/chosen": 12.010293960571289, "logits/rejected": 11.770343780517578, "logps/chosen": -3.7497103214263916, "logps/rejected": -3.8150923252105713, "loss": 4.0552, "rewards/accuracies": 0.5, "rewards/chosen": -37.49710464477539, "rewards/margins": 0.6538190841674805, "rewards/rejected": -38.15092468261719, "step": 3262 }, { "epoch": 0.4443082788671024, "grad_norm": 41.54749927411097, "learning_rate": 5.443141339374583e-07, "logits/chosen": 11.37820053100586, "logits/rejected": 11.130424499511719, "logps/chosen": -3.550693988800049, "logps/rejected": -3.411129951477051, "loss": 4.3784, "rewards/accuracies": 0.5, "rewards/chosen": -35.50693893432617, "rewards/margins": -1.3956375122070312, "rewards/rejected": -34.11130142211914, "step": 3263 }, { "epoch": 0.4444444444444444, "grad_norm": 36.483333638080886, "learning_rate": 5.44136783526496e-07, "logits/chosen": 10.532764434814453, "logits/rejected": 10.530414581298828, "logps/chosen": -3.5922982692718506, "logps/rejected": -3.441696882247925, "loss": 3.6495, "rewards/accuracies": 0.25, "rewards/chosen": -35.92298126220703, "rewards/margins": -1.5060148239135742, "rewards/rejected": -34.416969299316406, "step": 3264 }, { "epoch": 0.4445806100217865, "grad_norm": 37.5800052898471, "learning_rate": 5.43959400546636e-07, "logits/chosen": 11.771403312683105, "logits/rejected": 12.965950012207031, "logps/chosen": -3.8607070446014404, "logps/rejected": -4.074131965637207, "loss": 4.1983, "rewards/accuracies": 0.5, "rewards/chosen": -38.60707092285156, "rewards/margins": 2.1342506408691406, "rewards/rejected": -40.7413215637207, "step": 3265 }, { "epoch": 0.44471677559912853, "grad_norm": 42.0223962485271, "learning_rate": 5.437819850379594e-07, "logits/chosen": 11.87813663482666, "logits/rejected": 11.576492309570312, "logps/chosen": -3.9209914207458496, "logps/rejected": -3.761873245239258, "loss": 4.2911, "rewards/accuracies": 0.0, "rewards/chosen": -39.20991134643555, "rewards/margins": -1.5911827087402344, "rewards/rejected": -37.61872863769531, "step": 3266 }, { "epoch": 0.44485294117647056, "grad_norm": 36.09419505959109, "learning_rate": 5.43604537040555e-07, "logits/chosen": 11.164222717285156, "logits/rejected": 11.373188018798828, "logps/chosen": -3.6825332641601562, "logps/rejected": -3.483250617980957, "loss": 3.7926, "rewards/accuracies": 0.0, "rewards/chosen": -36.82533264160156, "rewards/margins": -1.9928274154663086, "rewards/rejected": -34.83250427246094, "step": 3267 }, { "epoch": 0.44498910675381265, "grad_norm": 35.470001379223746, "learning_rate": 5.434270565945181e-07, "logits/chosen": 11.948236465454102, "logits/rejected": 12.014951705932617, "logps/chosen": -3.5940475463867188, "logps/rejected": -3.6907501220703125, "loss": 3.9937, "rewards/accuracies": 0.5, "rewards/chosen": -35.94047546386719, "rewards/margins": 0.9670219421386719, "rewards/rejected": -36.907501220703125, "step": 3268 }, { "epoch": 0.4451252723311547, "grad_norm": 40.78681219672551, "learning_rate": 5.432495437399524e-07, "logits/chosen": 11.42115592956543, "logits/rejected": 11.130300521850586, "logps/chosen": -3.997623920440674, "logps/rejected": -3.77935791015625, "loss": 3.8858, "rewards/accuracies": 0.5, "rewards/chosen": -39.97623825073242, "rewards/margins": -2.1826610565185547, "rewards/rejected": -37.7935791015625, "step": 3269 }, { "epoch": 0.4452614379084967, "grad_norm": 40.15164691478828, "learning_rate": 5.430719985169681e-07, "logits/chosen": 12.148855209350586, "logits/rejected": 12.080944061279297, "logps/chosen": -3.7321882247924805, "logps/rejected": -3.8377318382263184, "loss": 4.0299, "rewards/accuracies": 0.5, "rewards/chosen": -37.32188415527344, "rewards/margins": 1.0554356575012207, "rewards/rejected": -38.3773193359375, "step": 3270 }, { "epoch": 0.4453976034858388, "grad_norm": 45.28401398053762, "learning_rate": 5.428944209656831e-07, "logits/chosen": 11.373117446899414, "logits/rejected": 11.228034019470215, "logps/chosen": -3.6426737308502197, "logps/rejected": -3.6115119457244873, "loss": 4.326, "rewards/accuracies": 0.5, "rewards/chosen": -36.426734924316406, "rewards/margins": -0.311617374420166, "rewards/rejected": -36.11511993408203, "step": 3271 }, { "epoch": 0.4455337690631808, "grad_norm": 40.13887580455122, "learning_rate": 5.427168111262225e-07, "logits/chosen": 12.540363311767578, "logits/rejected": 12.123756408691406, "logps/chosen": -3.633854389190674, "logps/rejected": -3.504788875579834, "loss": 4.2836, "rewards/accuracies": 0.25, "rewards/chosen": -36.33854675292969, "rewards/margins": -1.2906560897827148, "rewards/rejected": -35.047889709472656, "step": 3272 }, { "epoch": 0.4456699346405229, "grad_norm": 37.69392538080502, "learning_rate": 5.425391690387187e-07, "logits/chosen": 11.864337921142578, "logits/rejected": 12.102800369262695, "logps/chosen": -3.9351603984832764, "logps/rejected": -3.9162073135375977, "loss": 3.8739, "rewards/accuracies": 0.5, "rewards/chosen": -39.35160446166992, "rewards/margins": -0.18953180313110352, "rewards/rejected": -39.162071228027344, "step": 3273 }, { "epoch": 0.44580610021786493, "grad_norm": 40.96246294512555, "learning_rate": 5.423614947433115e-07, "logits/chosen": 11.22329330444336, "logits/rejected": 12.20919418334961, "logps/chosen": -3.2316346168518066, "logps/rejected": -4.077542304992676, "loss": 4.4503, "rewards/accuracies": 1.0, "rewards/chosen": -32.31634521484375, "rewards/margins": 8.459075927734375, "rewards/rejected": -40.775421142578125, "step": 3274 }, { "epoch": 0.44594226579520696, "grad_norm": 41.49951597547335, "learning_rate": 5.421837882801477e-07, "logits/chosen": 12.247444152832031, "logits/rejected": 11.997325897216797, "logps/chosen": -3.7415339946746826, "logps/rejected": -4.009550094604492, "loss": 3.9236, "rewards/accuracies": 0.75, "rewards/chosen": -37.415340423583984, "rewards/margins": 2.6801586151123047, "rewards/rejected": -40.095497131347656, "step": 3275 }, { "epoch": 0.44607843137254904, "grad_norm": 40.69336163081854, "learning_rate": 5.420060496893818e-07, "logits/chosen": 12.588397979736328, "logits/rejected": 12.747430801391602, "logps/chosen": -3.9838814735412598, "logps/rejected": -4.059414386749268, "loss": 4.0801, "rewards/accuracies": 0.75, "rewards/chosen": -39.83881378173828, "rewards/margins": 0.7553319931030273, "rewards/rejected": -40.594146728515625, "step": 3276 }, { "epoch": 0.4462145969498911, "grad_norm": 44.20018713222836, "learning_rate": 5.418282790111748e-07, "logits/chosen": 12.065804481506348, "logits/rejected": 12.495270729064941, "logps/chosen": -3.424401044845581, "logps/rejected": -3.584141731262207, "loss": 3.7884, "rewards/accuracies": 0.75, "rewards/chosen": -34.24401092529297, "rewards/margins": 1.5974078178405762, "rewards/rejected": -35.84142303466797, "step": 3277 }, { "epoch": 0.4463507625272331, "grad_norm": 36.922397290045296, "learning_rate": 5.416504762856961e-07, "logits/chosen": 11.358802795410156, "logits/rejected": 12.10151195526123, "logps/chosen": -3.2570552825927734, "logps/rejected": -3.944626808166504, "loss": 3.3957, "rewards/accuracies": 1.0, "rewards/chosen": -32.570552825927734, "rewards/margins": 6.875715732574463, "rewards/rejected": -39.446266174316406, "step": 3278 }, { "epoch": 0.4464869281045752, "grad_norm": 42.74464830446091, "learning_rate": 5.414726415531213e-07, "logits/chosen": 11.714860916137695, "logits/rejected": 12.172752380371094, "logps/chosen": -3.601594924926758, "logps/rejected": -3.9338631629943848, "loss": 4.0603, "rewards/accuracies": 0.75, "rewards/chosen": -36.01594924926758, "rewards/margins": 3.322683334350586, "rewards/rejected": -39.33863067626953, "step": 3279 }, { "epoch": 0.4466230936819172, "grad_norm": 39.681404296457, "learning_rate": 5.412947748536337e-07, "logits/chosen": 11.776082038879395, "logits/rejected": 11.828287124633789, "logps/chosen": -3.61360239982605, "logps/rejected": -3.9573073387145996, "loss": 3.7712, "rewards/accuracies": 0.5, "rewards/chosen": -36.136024475097656, "rewards/margins": 3.4370498657226562, "rewards/rejected": -39.57307434082031, "step": 3280 }, { "epoch": 0.44675925925925924, "grad_norm": 44.05948803514238, "learning_rate": 5.411168762274238e-07, "logits/chosen": 12.349634170532227, "logits/rejected": 11.926965713500977, "logps/chosen": -3.6368885040283203, "logps/rejected": -3.6759300231933594, "loss": 3.6816, "rewards/accuracies": 0.5, "rewards/chosen": -36.3688850402832, "rewards/margins": 0.3904147148132324, "rewards/rejected": -36.759300231933594, "step": 3281 }, { "epoch": 0.44689542483660133, "grad_norm": 41.89697857883722, "learning_rate": 5.409389457146891e-07, "logits/chosen": 12.461149215698242, "logits/rejected": 12.724020004272461, "logps/chosen": -3.618285655975342, "logps/rejected": -3.7865939140319824, "loss": 3.3477, "rewards/accuracies": 0.75, "rewards/chosen": -36.18285369873047, "rewards/margins": 1.6830830574035645, "rewards/rejected": -37.865936279296875, "step": 3282 }, { "epoch": 0.44703159041394336, "grad_norm": 39.16428586873445, "learning_rate": 5.407609833556348e-07, "logits/chosen": 11.697291374206543, "logits/rejected": 11.741724014282227, "logps/chosen": -3.790177345275879, "logps/rejected": -3.8626437187194824, "loss": 4.3934, "rewards/accuracies": 0.25, "rewards/chosen": -37.90177536010742, "rewards/margins": 0.7246618270874023, "rewards/rejected": -38.626434326171875, "step": 3283 }, { "epoch": 0.4471677559912854, "grad_norm": 38.04691044548525, "learning_rate": 5.405829891904727e-07, "logits/chosen": 12.854375839233398, "logits/rejected": 12.256002426147461, "logps/chosen": -3.992274045944214, "logps/rejected": -4.0291948318481445, "loss": 3.926, "rewards/accuracies": 0.25, "rewards/chosen": -39.92274475097656, "rewards/margins": 0.3692054748535156, "rewards/rejected": -40.29194641113281, "step": 3284 }, { "epoch": 0.44730392156862747, "grad_norm": 42.69843436722535, "learning_rate": 5.404049632594221e-07, "logits/chosen": 12.085505485534668, "logits/rejected": 12.50748062133789, "logps/chosen": -3.6440505981445312, "logps/rejected": -3.863748073577881, "loss": 3.9508, "rewards/accuracies": 0.75, "rewards/chosen": -36.44050216674805, "rewards/margins": 2.1969785690307617, "rewards/rejected": -38.637481689453125, "step": 3285 }, { "epoch": 0.4474400871459695, "grad_norm": 39.84636854076285, "learning_rate": 5.402269056027094e-07, "logits/chosen": 12.567756652832031, "logits/rejected": 12.4308443069458, "logps/chosen": -3.7365942001342773, "logps/rejected": -3.8261356353759766, "loss": 4.0402, "rewards/accuracies": 0.75, "rewards/chosen": -37.365943908691406, "rewards/margins": 0.8954095840454102, "rewards/rejected": -38.2613525390625, "step": 3286 }, { "epoch": 0.44757625272331153, "grad_norm": 44.234172396109116, "learning_rate": 5.400488162605684e-07, "logits/chosen": 10.743537902832031, "logits/rejected": 12.422530174255371, "logps/chosen": -3.5642995834350586, "logps/rejected": -4.160566806793213, "loss": 4.1, "rewards/accuracies": 1.0, "rewards/chosen": -35.64299774169922, "rewards/margins": 5.962672233581543, "rewards/rejected": -41.60566711425781, "step": 3287 }, { "epoch": 0.4477124183006536, "grad_norm": 47.20494056170881, "learning_rate": 5.398706952732396e-07, "logits/chosen": 11.186273574829102, "logits/rejected": 11.932823181152344, "logps/chosen": -3.4732398986816406, "logps/rejected": -3.595489501953125, "loss": 4.2961, "rewards/accuracies": 0.75, "rewards/chosen": -34.732398986816406, "rewards/margins": 1.22249174118042, "rewards/rejected": -35.954891204833984, "step": 3288 }, { "epoch": 0.44784858387799564, "grad_norm": 35.84873984103441, "learning_rate": 5.396925426809709e-07, "logits/chosen": 11.068567276000977, "logits/rejected": 12.533863067626953, "logps/chosen": -3.488008737564087, "logps/rejected": -3.7908833026885986, "loss": 3.0079, "rewards/accuracies": 0.75, "rewards/chosen": -34.880088806152344, "rewards/margins": 3.028747081756592, "rewards/rejected": -37.908836364746094, "step": 3289 }, { "epoch": 0.44798474945533767, "grad_norm": 43.453958427276035, "learning_rate": 5.395143585240178e-07, "logits/chosen": 12.153711318969727, "logits/rejected": 11.905328750610352, "logps/chosen": -3.5245249271392822, "logps/rejected": -3.6412878036499023, "loss": 4.6209, "rewards/accuracies": 0.5, "rewards/chosen": -35.2452507019043, "rewards/margins": 1.1676273345947266, "rewards/rejected": -36.412879943847656, "step": 3290 }, { "epoch": 0.44812091503267976, "grad_norm": 41.99113265835911, "learning_rate": 5.393361428426419e-07, "logits/chosen": 11.561407089233398, "logits/rejected": 12.51330280303955, "logps/chosen": -3.6926965713500977, "logps/rejected": -3.654447555541992, "loss": 3.832, "rewards/accuracies": 0.25, "rewards/chosen": -36.926963806152344, "rewards/margins": -0.3824901580810547, "rewards/rejected": -36.544471740722656, "step": 3291 }, { "epoch": 0.4482570806100218, "grad_norm": 41.31127803617468, "learning_rate": 5.391578956771127e-07, "logits/chosen": 11.989978790283203, "logits/rejected": 12.718694686889648, "logps/chosen": -3.5455989837646484, "logps/rejected": -3.9013512134552, "loss": 4.4542, "rewards/accuracies": 0.75, "rewards/chosen": -35.455989837646484, "rewards/margins": 3.5575222969055176, "rewards/rejected": -39.013511657714844, "step": 3292 }, { "epoch": 0.4483932461873638, "grad_norm": 41.24697901798344, "learning_rate": 5.38979617067707e-07, "logits/chosen": 11.346770286560059, "logits/rejected": 11.706418991088867, "logps/chosen": -3.468456745147705, "logps/rejected": -3.759107828140259, "loss": 3.9824, "rewards/accuracies": 0.75, "rewards/chosen": -34.6845703125, "rewards/margins": 2.9065113067626953, "rewards/rejected": -37.59107971191406, "step": 3293 }, { "epoch": 0.4485294117647059, "grad_norm": 40.4858104266322, "learning_rate": 5.388013070547078e-07, "logits/chosen": 11.942121505737305, "logits/rejected": 13.447731018066406, "logps/chosen": -3.588726758956909, "logps/rejected": -3.8430352210998535, "loss": 3.8494, "rewards/accuracies": 1.0, "rewards/chosen": -35.88726806640625, "rewards/margins": 2.5430831909179688, "rewards/rejected": -38.43035125732422, "step": 3294 }, { "epoch": 0.4486655773420479, "grad_norm": 77.56423994333588, "learning_rate": 5.386229656784058e-07, "logits/chosen": 13.094593048095703, "logits/rejected": 12.745155334472656, "logps/chosen": -3.928434133529663, "logps/rejected": -4.21701717376709, "loss": 4.5557, "rewards/accuracies": 0.75, "rewards/chosen": -39.284339904785156, "rewards/margins": 2.885833740234375, "rewards/rejected": -42.17017364501953, "step": 3295 }, { "epoch": 0.44880174291938996, "grad_norm": 38.110591753856525, "learning_rate": 5.38444592979099e-07, "logits/chosen": 12.96596908569336, "logits/rejected": 13.33859634399414, "logps/chosen": -3.9565155506134033, "logps/rejected": -3.966933488845825, "loss": 4.0474, "rewards/accuracies": 0.5, "rewards/chosen": -39.565155029296875, "rewards/margins": 0.10417652130126953, "rewards/rejected": -39.669334411621094, "step": 3296 }, { "epoch": 0.44893790849673204, "grad_norm": 42.55699162020232, "learning_rate": 5.382661889970921e-07, "logits/chosen": 13.060148239135742, "logits/rejected": 11.970538139343262, "logps/chosen": -4.197159767150879, "logps/rejected": -3.9409594535827637, "loss": 4.1429, "rewards/accuracies": 0.25, "rewards/chosen": -41.971595764160156, "rewards/margins": -2.562000274658203, "rewards/rejected": -39.40959548950195, "step": 3297 }, { "epoch": 0.44907407407407407, "grad_norm": 37.010958248899385, "learning_rate": 5.380877537726966e-07, "logits/chosen": 12.402996063232422, "logits/rejected": 12.705452919006348, "logps/chosen": -3.672504186630249, "logps/rejected": -3.9030463695526123, "loss": 3.902, "rewards/accuracies": 0.75, "rewards/chosen": -36.725040435791016, "rewards/margins": 2.305422782897949, "rewards/rejected": -39.03046417236328, "step": 3298 }, { "epoch": 0.4492102396514161, "grad_norm": 37.7361330039901, "learning_rate": 5.379092873462319e-07, "logits/chosen": 11.498924255371094, "logits/rejected": 11.77310562133789, "logps/chosen": -3.426849842071533, "logps/rejected": -3.572352886199951, "loss": 3.8463, "rewards/accuracies": 0.5, "rewards/chosen": -34.268497467041016, "rewards/margins": 1.4550299644470215, "rewards/rejected": -35.72352600097656, "step": 3299 }, { "epoch": 0.4493464052287582, "grad_norm": 41.55092680104198, "learning_rate": 5.377307897580238e-07, "logits/chosen": 12.379135131835938, "logits/rejected": 12.530109405517578, "logps/chosen": -4.003700256347656, "logps/rejected": -4.062005519866943, "loss": 3.9527, "rewards/accuracies": 0.25, "rewards/chosen": -40.03700256347656, "rewards/margins": 0.5830554962158203, "rewards/rejected": -40.62005615234375, "step": 3300 }, { "epoch": 0.4494825708061002, "grad_norm": 38.61018244146849, "learning_rate": 5.375522610484051e-07, "logits/chosen": 12.003164291381836, "logits/rejected": 12.501471519470215, "logps/chosen": -3.43400239944458, "logps/rejected": -3.8983545303344727, "loss": 3.9577, "rewards/accuracies": 1.0, "rewards/chosen": -34.340023040771484, "rewards/margins": 4.6435227394104, "rewards/rejected": -38.98354721069336, "step": 3301 }, { "epoch": 0.44961873638344224, "grad_norm": 39.7617869844273, "learning_rate": 5.373737012577161e-07, "logits/chosen": 12.139656066894531, "logits/rejected": 12.456979751586914, "logps/chosen": -4.170103073120117, "logps/rejected": -3.9027369022369385, "loss": 4.1009, "rewards/accuracies": 0.5, "rewards/chosen": -41.70103454589844, "rewards/margins": -2.6736631393432617, "rewards/rejected": -39.027366638183594, "step": 3302 }, { "epoch": 0.4497549019607843, "grad_norm": 42.1693459446027, "learning_rate": 5.371951104263037e-07, "logits/chosen": 11.602224349975586, "logits/rejected": 12.31930160522461, "logps/chosen": -3.521634101867676, "logps/rejected": -3.6467437744140625, "loss": 4.2895, "rewards/accuracies": 0.5, "rewards/chosen": -35.216339111328125, "rewards/margins": 1.2510957717895508, "rewards/rejected": -36.467437744140625, "step": 3303 }, { "epoch": 0.44989106753812635, "grad_norm": 36.34024216393864, "learning_rate": 5.37016488594522e-07, "logits/chosen": 11.601889610290527, "logits/rejected": 11.97048282623291, "logps/chosen": -3.815375804901123, "logps/rejected": -3.9429259300231934, "loss": 3.6621, "rewards/accuracies": 0.5, "rewards/chosen": -38.15375900268555, "rewards/margins": 1.2755002975463867, "rewards/rejected": -39.429256439208984, "step": 3304 }, { "epoch": 0.4500272331154684, "grad_norm": 38.26069057845701, "learning_rate": 5.368378358027322e-07, "logits/chosen": 12.304603576660156, "logits/rejected": 12.080732345581055, "logps/chosen": -3.845482587814331, "logps/rejected": -3.8728063106536865, "loss": 3.7185, "rewards/accuracies": 0.75, "rewards/chosen": -38.45482635498047, "rewards/margins": 0.2732362747192383, "rewards/rejected": -38.728065490722656, "step": 3305 }, { "epoch": 0.45016339869281047, "grad_norm": 36.38398404774797, "learning_rate": 5.366591520913024e-07, "logits/chosen": 12.097146987915039, "logits/rejected": 12.615715980529785, "logps/chosen": -3.4687066078186035, "logps/rejected": -3.858232021331787, "loss": 4.1247, "rewards/accuracies": 0.75, "rewards/chosen": -34.68706512451172, "rewards/margins": 3.8952560424804688, "rewards/rejected": -38.58232116699219, "step": 3306 }, { "epoch": 0.4502995642701525, "grad_norm": 38.17913398213345, "learning_rate": 5.364804375006072e-07, "logits/chosen": 11.455253601074219, "logits/rejected": 12.119285583496094, "logps/chosen": -3.5673928260803223, "logps/rejected": -3.969658136367798, "loss": 4.159, "rewards/accuracies": 1.0, "rewards/chosen": -35.673927307128906, "rewards/margins": 4.0226545333862305, "rewards/rejected": -39.69657897949219, "step": 3307 }, { "epoch": 0.4504357298474945, "grad_norm": 39.944718663375504, "learning_rate": 5.363016920710294e-07, "logits/chosen": 12.066252708435059, "logits/rejected": 12.645662307739258, "logps/chosen": -3.721001148223877, "logps/rejected": -4.121884346008301, "loss": 4.4116, "rewards/accuracies": 0.75, "rewards/chosen": -37.21001052856445, "rewards/margins": 4.008832931518555, "rewards/rejected": -41.218841552734375, "step": 3308 }, { "epoch": 0.4505718954248366, "grad_norm": 36.31604445785596, "learning_rate": 5.361229158429573e-07, "logits/chosen": 11.890754699707031, "logits/rejected": 12.143624305725098, "logps/chosen": -3.4953176975250244, "logps/rejected": -3.7917912006378174, "loss": 3.8478, "rewards/accuracies": 1.0, "rewards/chosen": -34.95317840576172, "rewards/margins": 2.9647340774536133, "rewards/rejected": -37.917911529541016, "step": 3309 }, { "epoch": 0.45070806100217864, "grad_norm": 36.0181279549373, "learning_rate": 5.359441088567872e-07, "logits/chosen": 11.83292007446289, "logits/rejected": 12.720412254333496, "logps/chosen": -3.80894136428833, "logps/rejected": -4.037179470062256, "loss": 3.8295, "rewards/accuracies": 0.75, "rewards/chosen": -38.08941650390625, "rewards/margins": 2.2823801040649414, "rewards/rejected": -40.371795654296875, "step": 3310 }, { "epoch": 0.4508442265795207, "grad_norm": 38.32622315762584, "learning_rate": 5.357652711529221e-07, "logits/chosen": 11.598539352416992, "logits/rejected": 12.237245559692383, "logps/chosen": -3.4086124897003174, "logps/rejected": -3.6221537590026855, "loss": 3.6527, "rewards/accuracies": 0.75, "rewards/chosen": -34.086124420166016, "rewards/margins": 2.1354126930236816, "rewards/rejected": -36.221534729003906, "step": 3311 }, { "epoch": 0.45098039215686275, "grad_norm": 40.889944755943915, "learning_rate": 5.355864027717717e-07, "logits/chosen": 12.213380813598633, "logits/rejected": 11.949777603149414, "logps/chosen": -3.8910622596740723, "logps/rejected": -4.005939483642578, "loss": 3.8373, "rewards/accuracies": 0.75, "rewards/chosen": -38.910621643066406, "rewards/margins": 1.1487693786621094, "rewards/rejected": -40.059391021728516, "step": 3312 }, { "epoch": 0.4511165577342048, "grad_norm": 36.47036158939459, "learning_rate": 5.354075037537527e-07, "logits/chosen": 11.593414306640625, "logits/rejected": 12.461621284484863, "logps/chosen": -3.7621850967407227, "logps/rejected": -3.954007625579834, "loss": 4.0461, "rewards/accuracies": 0.5, "rewards/chosen": -37.621849060058594, "rewards/margins": 1.918227195739746, "rewards/rejected": -39.540077209472656, "step": 3313 }, { "epoch": 0.45125272331154687, "grad_norm": 36.773822196982934, "learning_rate": 5.35228574139289e-07, "logits/chosen": 12.284904479980469, "logits/rejected": 12.443674087524414, "logps/chosen": -3.3613202571868896, "logps/rejected": -3.580252170562744, "loss": 4.3076, "rewards/accuracies": 0.75, "rewards/chosen": -33.61320495605469, "rewards/margins": 2.189319610595703, "rewards/rejected": -35.802520751953125, "step": 3314 }, { "epoch": 0.4513888888888889, "grad_norm": 38.52835663825085, "learning_rate": 5.350496139688112e-07, "logits/chosen": 11.57632064819336, "logits/rejected": 12.167556762695312, "logps/chosen": -3.237107992172241, "logps/rejected": -3.712221384048462, "loss": 3.4935, "rewards/accuracies": 0.75, "rewards/chosen": -32.37107849121094, "rewards/margins": 4.751132965087891, "rewards/rejected": -37.122215270996094, "step": 3315 }, { "epoch": 0.4515250544662309, "grad_norm": 37.93851032040757, "learning_rate": 5.348706232827569e-07, "logits/chosen": 12.108863830566406, "logits/rejected": 12.872499465942383, "logps/chosen": -4.101088523864746, "logps/rejected": -4.078638076782227, "loss": 4.22, "rewards/accuracies": 0.5, "rewards/chosen": -41.01088333129883, "rewards/margins": -0.2245044708251953, "rewards/rejected": -40.786376953125, "step": 3316 }, { "epoch": 0.451661220043573, "grad_norm": 38.76726297006319, "learning_rate": 5.346916021215702e-07, "logits/chosen": 11.800992965698242, "logits/rejected": 12.866360664367676, "logps/chosen": -3.710289478302002, "logps/rejected": -3.961003541946411, "loss": 4.2387, "rewards/accuracies": 0.5, "rewards/chosen": -37.1028938293457, "rewards/margins": 2.50714111328125, "rewards/rejected": -39.61003875732422, "step": 3317 }, { "epoch": 0.45179738562091504, "grad_norm": 37.92834316749869, "learning_rate": 5.345125505257028e-07, "logits/chosen": 11.945817947387695, "logits/rejected": 13.290358543395996, "logps/chosen": -3.431204080581665, "logps/rejected": -3.955742359161377, "loss": 3.906, "rewards/accuracies": 1.0, "rewards/chosen": -34.312042236328125, "rewards/margins": 5.245382308959961, "rewards/rejected": -39.55742263793945, "step": 3318 }, { "epoch": 0.45193355119825707, "grad_norm": 35.81812115115046, "learning_rate": 5.343334685356126e-07, "logits/chosen": 12.73102855682373, "logits/rejected": 12.381462097167969, "logps/chosen": -3.640500783920288, "logps/rejected": -4.074888229370117, "loss": 4.1452, "rewards/accuracies": 0.75, "rewards/chosen": -36.405006408691406, "rewards/margins": 4.343875885009766, "rewards/rejected": -40.74888229370117, "step": 3319 }, { "epoch": 0.45206971677559915, "grad_norm": 42.24153747498972, "learning_rate": 5.341543561917648e-07, "logits/chosen": 11.975400924682617, "logits/rejected": 12.482149124145508, "logps/chosen": -3.729689121246338, "logps/rejected": -3.9492247104644775, "loss": 3.5703, "rewards/accuracies": 0.75, "rewards/chosen": -37.29689025878906, "rewards/margins": 2.1953563690185547, "rewards/rejected": -39.49224853515625, "step": 3320 }, { "epoch": 0.4522058823529412, "grad_norm": 36.31595697391843, "learning_rate": 5.339752135346313e-07, "logits/chosen": 12.502815246582031, "logits/rejected": 12.8310546875, "logps/chosen": -3.7434396743774414, "logps/rejected": -3.9623451232910156, "loss": 4.3169, "rewards/accuracies": 0.75, "rewards/chosen": -37.43439483642578, "rewards/margins": 2.1890554428100586, "rewards/rejected": -39.623451232910156, "step": 3321 }, { "epoch": 0.4523420479302832, "grad_norm": 37.7564457091804, "learning_rate": 5.337960406046909e-07, "logits/chosen": 12.254692077636719, "logits/rejected": 12.991600036621094, "logps/chosen": -3.790745496749878, "logps/rejected": -4.14121150970459, "loss": 4.139, "rewards/accuracies": 0.75, "rewards/chosen": -37.90745544433594, "rewards/margins": 3.50466251373291, "rewards/rejected": -41.41211700439453, "step": 3322 }, { "epoch": 0.4524782135076253, "grad_norm": 42.26346403920681, "learning_rate": 5.336168374424291e-07, "logits/chosen": 11.666510581970215, "logits/rejected": 11.80253791809082, "logps/chosen": -3.643324613571167, "logps/rejected": -3.701864242553711, "loss": 4.3708, "rewards/accuracies": 0.5, "rewards/chosen": -36.43324661254883, "rewards/margins": 0.5853958129882812, "rewards/rejected": -37.01864242553711, "step": 3323 }, { "epoch": 0.4526143790849673, "grad_norm": 42.06317533812086, "learning_rate": 5.334376040883384e-07, "logits/chosen": 12.370884895324707, "logits/rejected": 12.6347074508667, "logps/chosen": -3.911015748977661, "logps/rejected": -3.9026055335998535, "loss": 4.4783, "rewards/accuracies": 0.5, "rewards/chosen": -39.11015701293945, "rewards/margins": -0.08409976959228516, "rewards/rejected": -39.02605438232422, "step": 3324 }, { "epoch": 0.45275054466230935, "grad_norm": 35.07878455193444, "learning_rate": 5.33258340582918e-07, "logits/chosen": 12.045772552490234, "logits/rejected": 12.247079849243164, "logps/chosen": -3.75840425491333, "logps/rejected": -3.890383005142212, "loss": 3.7319, "rewards/accuracies": 0.75, "rewards/chosen": -37.584041595458984, "rewards/margins": 1.319788932800293, "rewards/rejected": -38.903831481933594, "step": 3325 }, { "epoch": 0.45288671023965144, "grad_norm": 36.194866012114964, "learning_rate": 5.330790469666742e-07, "logits/chosen": 12.346214294433594, "logits/rejected": 12.442887306213379, "logps/chosen": -3.7528231143951416, "logps/rejected": -4.228278636932373, "loss": 3.6863, "rewards/accuracies": 1.0, "rewards/chosen": -37.528228759765625, "rewards/margins": 4.754555702209473, "rewards/rejected": -42.28278732299805, "step": 3326 }, { "epoch": 0.45302287581699346, "grad_norm": 39.43501055219488, "learning_rate": 5.328997232801195e-07, "logits/chosen": 12.146745681762695, "logits/rejected": 12.798917770385742, "logps/chosen": -3.763213872909546, "logps/rejected": -4.194401741027832, "loss": 4.2578, "rewards/accuracies": 1.0, "rewards/chosen": -37.63214111328125, "rewards/margins": 4.311878204345703, "rewards/rejected": -41.94401931762695, "step": 3327 }, { "epoch": 0.4531590413943355, "grad_norm": 35.6417215993154, "learning_rate": 5.327203695637738e-07, "logits/chosen": 12.047430038452148, "logits/rejected": 12.567821502685547, "logps/chosen": -3.6114068031311035, "logps/rejected": -3.9466023445129395, "loss": 3.9944, "rewards/accuracies": 0.5, "rewards/chosen": -36.11406707763672, "rewards/margins": 3.3519554138183594, "rewards/rejected": -39.46602249145508, "step": 3328 }, { "epoch": 0.4532952069716776, "grad_norm": 35.928580350169916, "learning_rate": 5.325409858581636e-07, "logits/chosen": 12.604101181030273, "logits/rejected": 12.913108825683594, "logps/chosen": -3.743166923522949, "logps/rejected": -3.765160083770752, "loss": 3.3845, "rewards/accuracies": 0.75, "rewards/chosen": -37.431671142578125, "rewards/margins": 0.21993350982666016, "rewards/rejected": -37.65160369873047, "step": 3329 }, { "epoch": 0.4534313725490196, "grad_norm": 35.12147416982466, "learning_rate": 5.32361572203822e-07, "logits/chosen": 12.241331100463867, "logits/rejected": 12.796341896057129, "logps/chosen": -3.801494598388672, "logps/rejected": -4.438753128051758, "loss": 3.6596, "rewards/accuracies": 1.0, "rewards/chosen": -38.01494598388672, "rewards/margins": 6.372585296630859, "rewards/rejected": -44.38753128051758, "step": 3330 }, { "epoch": 0.45356753812636164, "grad_norm": 38.374944232962456, "learning_rate": 5.32182128641289e-07, "logits/chosen": 10.43712329864502, "logits/rejected": 11.332530975341797, "logps/chosen": -3.3741612434387207, "logps/rejected": -3.849918842315674, "loss": 4.2227, "rewards/accuracies": 1.0, "rewards/chosen": -33.741615295410156, "rewards/margins": 4.757577419281006, "rewards/rejected": -38.49919128417969, "step": 3331 }, { "epoch": 0.4537037037037037, "grad_norm": 36.29098351723768, "learning_rate": 5.320026552111115e-07, "logits/chosen": 11.857465744018555, "logits/rejected": 13.12142276763916, "logps/chosen": -3.9025959968566895, "logps/rejected": -4.1588134765625, "loss": 4.0614, "rewards/accuracies": 0.75, "rewards/chosen": -39.025962829589844, "rewards/margins": 2.562171459197998, "rewards/rejected": -41.588134765625, "step": 3332 }, { "epoch": 0.45383986928104575, "grad_norm": 35.98081585333017, "learning_rate": 5.318231519538429e-07, "logits/chosen": 11.701360702514648, "logits/rejected": 12.524327278137207, "logps/chosen": -3.7922022342681885, "logps/rejected": -4.0464396476745605, "loss": 4.0326, "rewards/accuracies": 0.5, "rewards/chosen": -37.92202377319336, "rewards/margins": 2.5423736572265625, "rewards/rejected": -40.46439743041992, "step": 3333 }, { "epoch": 0.4539760348583878, "grad_norm": 40.00177140626093, "learning_rate": 5.316436189100434e-07, "logits/chosen": 12.416358947753906, "logits/rejected": 12.31313419342041, "logps/chosen": -3.8662869930267334, "logps/rejected": -4.075576305389404, "loss": 4.5018, "rewards/accuracies": 0.5, "rewards/chosen": -38.662864685058594, "rewards/margins": 2.0928964614868164, "rewards/rejected": -40.75576400756836, "step": 3334 }, { "epoch": 0.45411220043572986, "grad_norm": 37.40503520808464, "learning_rate": 5.314640561202801e-07, "logits/chosen": 12.101251602172852, "logits/rejected": 12.909265518188477, "logps/chosen": -3.9440300464630127, "logps/rejected": -4.201555252075195, "loss": 4.5209, "rewards/accuracies": 0.75, "rewards/chosen": -39.44029998779297, "rewards/margins": 2.5752487182617188, "rewards/rejected": -42.01554870605469, "step": 3335 }, { "epoch": 0.4542483660130719, "grad_norm": 36.87817844104728, "learning_rate": 5.312844636251266e-07, "logits/chosen": 12.206721305847168, "logits/rejected": 12.15034294128418, "logps/chosen": -4.063632965087891, "logps/rejected": -3.851745843887329, "loss": 3.5349, "rewards/accuracies": 0.25, "rewards/chosen": -40.636329650878906, "rewards/margins": -2.11887264251709, "rewards/rejected": -38.5174560546875, "step": 3336 }, { "epoch": 0.4543845315904139, "grad_norm": 39.14555417328917, "learning_rate": 5.311048414651634e-07, "logits/chosen": 11.622442245483398, "logits/rejected": 11.971765518188477, "logps/chosen": -3.6321396827697754, "logps/rejected": -4.039432048797607, "loss": 4.4232, "rewards/accuracies": 0.75, "rewards/chosen": -36.32139587402344, "rewards/margins": 4.072925567626953, "rewards/rejected": -40.39432144165039, "step": 3337 }, { "epoch": 0.454520697167756, "grad_norm": 39.3119479742185, "learning_rate": 5.309251896809774e-07, "logits/chosen": 12.473751068115234, "logits/rejected": 12.252876281738281, "logps/chosen": -3.8146629333496094, "logps/rejected": -4.025819778442383, "loss": 4.1246, "rewards/accuracies": 0.75, "rewards/chosen": -38.146629333496094, "rewards/margins": 2.111570358276367, "rewards/rejected": -40.258201599121094, "step": 3338 }, { "epoch": 0.45465686274509803, "grad_norm": 37.517765625866744, "learning_rate": 5.307455083131627e-07, "logits/chosen": 12.493183135986328, "logits/rejected": 12.084400177001953, "logps/chosen": -3.9670963287353516, "logps/rejected": -3.9644205570220947, "loss": 4.2506, "rewards/accuracies": 0.5, "rewards/chosen": -39.670963287353516, "rewards/margins": -0.026758193969726562, "rewards/rejected": -39.644203186035156, "step": 3339 }, { "epoch": 0.45479302832244006, "grad_norm": 40.53641353399364, "learning_rate": 5.305657974023194e-07, "logits/chosen": 12.017127990722656, "logits/rejected": 12.216179847717285, "logps/chosen": -4.011625289916992, "logps/rejected": -3.803725481033325, "loss": 4.4599, "rewards/accuracies": 0.25, "rewards/chosen": -40.11625671386719, "rewards/margins": -2.079000473022461, "rewards/rejected": -38.037254333496094, "step": 3340 }, { "epoch": 0.45492919389978215, "grad_norm": 38.66764012224617, "learning_rate": 5.30386056989055e-07, "logits/chosen": 12.597347259521484, "logits/rejected": 12.713722229003906, "logps/chosen": -3.876161575317383, "logps/rejected": -4.2449951171875, "loss": 4.1389, "rewards/accuracies": 0.75, "rewards/chosen": -38.76161193847656, "rewards/margins": 3.6883363723754883, "rewards/rejected": -42.449951171875, "step": 3341 }, { "epoch": 0.4550653594771242, "grad_norm": 37.369838638568325, "learning_rate": 5.302062871139835e-07, "logits/chosen": 12.162322044372559, "logits/rejected": 12.233315467834473, "logps/chosen": -4.046571731567383, "logps/rejected": -4.10675048828125, "loss": 4.1124, "rewards/accuracies": 0.75, "rewards/chosen": -40.46571731567383, "rewards/margins": 0.6017856597900391, "rewards/rejected": -41.0675048828125, "step": 3342 }, { "epoch": 0.4552015250544662, "grad_norm": 38.45605162038616, "learning_rate": 5.300264878177248e-07, "logits/chosen": 12.09745979309082, "logits/rejected": 12.144347190856934, "logps/chosen": -3.9759955406188965, "logps/rejected": -4.165124893188477, "loss": 3.8938, "rewards/accuracies": 0.5, "rewards/chosen": -39.75995635986328, "rewards/margins": 1.8912878036499023, "rewards/rejected": -41.6512451171875, "step": 3343 }, { "epoch": 0.4553376906318083, "grad_norm": 37.39686795170065, "learning_rate": 5.298466591409066e-07, "logits/chosen": 11.687782287597656, "logits/rejected": 11.791732788085938, "logps/chosen": -3.9717133045196533, "logps/rejected": -4.270559310913086, "loss": 3.5161, "rewards/accuracies": 0.75, "rewards/chosen": -39.717132568359375, "rewards/margins": 2.9884567260742188, "rewards/rejected": -42.705589294433594, "step": 3344 }, { "epoch": 0.4554738562091503, "grad_norm": 39.72859708198203, "learning_rate": 5.296668011241624e-07, "logits/chosen": 13.075252532958984, "logits/rejected": 13.225347518920898, "logps/chosen": -4.331249237060547, "logps/rejected": -4.179360389709473, "loss": 4.392, "rewards/accuracies": 0.25, "rewards/chosen": -43.31249237060547, "rewards/margins": -1.5188894271850586, "rewards/rejected": -41.793601989746094, "step": 3345 }, { "epoch": 0.45561002178649235, "grad_norm": 38.33218353339268, "learning_rate": 5.294869138081325e-07, "logits/chosen": 12.5794677734375, "logits/rejected": 12.046989440917969, "logps/chosen": -3.8985114097595215, "logps/rejected": -3.820930004119873, "loss": 3.8994, "rewards/accuracies": 0.5, "rewards/chosen": -38.98511505126953, "rewards/margins": -0.7758169174194336, "rewards/rejected": -38.20929718017578, "step": 3346 }, { "epoch": 0.45574618736383443, "grad_norm": 37.877271398297204, "learning_rate": 5.293069972334642e-07, "logits/chosen": 12.792657852172852, "logits/rejected": 12.644369125366211, "logps/chosen": -4.063812255859375, "logps/rejected": -4.0628461837768555, "loss": 3.7252, "rewards/accuracies": 0.75, "rewards/chosen": -40.63812255859375, "rewards/margins": -0.009663581848144531, "rewards/rejected": -40.62846374511719, "step": 3347 }, { "epoch": 0.45588235294117646, "grad_norm": 38.07840003973478, "learning_rate": 5.29127051440811e-07, "logits/chosen": 11.415719032287598, "logits/rejected": 12.37607479095459, "logps/chosen": -3.575399398803711, "logps/rejected": -3.9316234588623047, "loss": 4.0179, "rewards/accuracies": 0.75, "rewards/chosen": -35.753990173339844, "rewards/margins": 3.5622406005859375, "rewards/rejected": -39.31623077392578, "step": 3348 }, { "epoch": 0.45601851851851855, "grad_norm": 48.45724140751413, "learning_rate": 5.289470764708331e-07, "logits/chosen": 12.325244903564453, "logits/rejected": 12.122507095336914, "logps/chosen": -3.6063737869262695, "logps/rejected": -4.109358787536621, "loss": 4.1425, "rewards/accuracies": 0.75, "rewards/chosen": -36.06373977661133, "rewards/margins": 5.029849052429199, "rewards/rejected": -41.093589782714844, "step": 3349 }, { "epoch": 0.4561546840958606, "grad_norm": 38.032897999788844, "learning_rate": 5.287670723641975e-07, "logits/chosen": 12.111470222473145, "logits/rejected": 12.126707077026367, "logps/chosen": -3.7403268814086914, "logps/rejected": -3.867966413497925, "loss": 3.575, "rewards/accuracies": 0.75, "rewards/chosen": -37.40326690673828, "rewards/margins": 1.2763957977294922, "rewards/rejected": -38.679664611816406, "step": 3350 }, { "epoch": 0.4562908496732026, "grad_norm": 38.26130543201247, "learning_rate": 5.285870391615775e-07, "logits/chosen": 11.827083587646484, "logits/rejected": 13.00027084350586, "logps/chosen": -3.847161293029785, "logps/rejected": -4.199100494384766, "loss": 3.9024, "rewards/accuracies": 0.75, "rewards/chosen": -38.47161102294922, "rewards/margins": 3.519394874572754, "rewards/rejected": -41.991004943847656, "step": 3351 }, { "epoch": 0.4564270152505447, "grad_norm": 40.259096142088836, "learning_rate": 5.284069769036529e-07, "logits/chosen": 11.333169937133789, "logits/rejected": 12.257438659667969, "logps/chosen": -3.793649673461914, "logps/rejected": -4.117369651794434, "loss": 4.0663, "rewards/accuracies": 1.0, "rewards/chosen": -37.936492919921875, "rewards/margins": 3.237201690673828, "rewards/rejected": -41.17369842529297, "step": 3352 }, { "epoch": 0.4565631808278867, "grad_norm": 41.15069581974097, "learning_rate": 5.282268856311107e-07, "logits/chosen": 12.53016471862793, "logits/rejected": 12.264461517333984, "logps/chosen": -4.108241558074951, "logps/rejected": -4.274688243865967, "loss": 4.0585, "rewards/accuracies": 0.5, "rewards/chosen": -41.08241271972656, "rewards/margins": 1.6644678115844727, "rewards/rejected": -42.74687957763672, "step": 3353 }, { "epoch": 0.45669934640522875, "grad_norm": 41.00178113271773, "learning_rate": 5.280467653846436e-07, "logits/chosen": 11.778076171875, "logits/rejected": 12.84032917022705, "logps/chosen": -3.9571805000305176, "logps/rejected": -4.445751190185547, "loss": 4.0141, "rewards/accuracies": 0.75, "rewards/chosen": -39.571800231933594, "rewards/margins": 4.885707855224609, "rewards/rejected": -44.45751190185547, "step": 3354 }, { "epoch": 0.45683551198257083, "grad_norm": 38.85333765502687, "learning_rate": 5.278666162049514e-07, "logits/chosen": 12.380701065063477, "logits/rejected": 12.879483222961426, "logps/chosen": -3.6287357807159424, "logps/rejected": -4.125843048095703, "loss": 3.8655, "rewards/accuracies": 0.75, "rewards/chosen": -36.287357330322266, "rewards/margins": 4.971076965332031, "rewards/rejected": -41.2584342956543, "step": 3355 }, { "epoch": 0.45697167755991286, "grad_norm": 41.54595470365345, "learning_rate": 5.276864381327403e-07, "logits/chosen": 11.139667510986328, "logits/rejected": 12.091569900512695, "logps/chosen": -3.707720994949341, "logps/rejected": -4.145609378814697, "loss": 3.8446, "rewards/accuracies": 0.75, "rewards/chosen": -37.07720947265625, "rewards/margins": 4.378883361816406, "rewards/rejected": -41.456092834472656, "step": 3356 }, { "epoch": 0.4571078431372549, "grad_norm": 42.26771582081698, "learning_rate": 5.275062312087232e-07, "logits/chosen": 11.871047019958496, "logits/rejected": 12.696971893310547, "logps/chosen": -3.6541333198547363, "logps/rejected": -3.9740283489227295, "loss": 4.6607, "rewards/accuracies": 1.0, "rewards/chosen": -36.54133605957031, "rewards/margins": 3.1989498138427734, "rewards/rejected": -39.74028396606445, "step": 3357 }, { "epoch": 0.457244008714597, "grad_norm": 36.142353990584816, "learning_rate": 5.27325995473619e-07, "logits/chosen": 12.270586013793945, "logits/rejected": 12.585163116455078, "logps/chosen": -3.7481536865234375, "logps/rejected": -3.7155368328094482, "loss": 3.6027, "rewards/accuracies": 0.5, "rewards/chosen": -37.481536865234375, "rewards/margins": -0.3261680603027344, "rewards/rejected": -37.15536880493164, "step": 3358 }, { "epoch": 0.457380174291939, "grad_norm": 42.21490781139609, "learning_rate": 5.271457309681537e-07, "logits/chosen": 11.87169075012207, "logits/rejected": 12.02869987487793, "logps/chosen": -3.9324936866760254, "logps/rejected": -4.038047790527344, "loss": 3.4925, "rewards/accuracies": 0.75, "rewards/chosen": -39.32493591308594, "rewards/margins": 1.0555429458618164, "rewards/rejected": -40.38047790527344, "step": 3359 }, { "epoch": 0.45751633986928103, "grad_norm": 43.304658877199614, "learning_rate": 5.269654377330595e-07, "logits/chosen": 11.703628540039062, "logits/rejected": 11.813096046447754, "logps/chosen": -3.771101474761963, "logps/rejected": -3.9408645629882812, "loss": 3.7219, "rewards/accuracies": 1.0, "rewards/chosen": -37.71101379394531, "rewards/margins": 1.6976318359375, "rewards/rejected": -39.40864562988281, "step": 3360 }, { "epoch": 0.4576525054466231, "grad_norm": 43.706381613813306, "learning_rate": 5.267851158090752e-07, "logits/chosen": 12.688543319702148, "logits/rejected": 12.512475967407227, "logps/chosen": -3.9583234786987305, "logps/rejected": -4.177737236022949, "loss": 3.7708, "rewards/accuracies": 0.75, "rewards/chosen": -39.58323287963867, "rewards/margins": 2.1941375732421875, "rewards/rejected": -41.777374267578125, "step": 3361 }, { "epoch": 0.45778867102396514, "grad_norm": 39.45238968543922, "learning_rate": 5.266047652369458e-07, "logits/chosen": 10.722551345825195, "logits/rejected": 11.375049591064453, "logps/chosen": -3.3026533126831055, "logps/rejected": -3.7009546756744385, "loss": 3.87, "rewards/accuracies": 1.0, "rewards/chosen": -33.02653121948242, "rewards/margins": 3.9830150604248047, "rewards/rejected": -37.009544372558594, "step": 3362 }, { "epoch": 0.4579248366013072, "grad_norm": 49.82936059869755, "learning_rate": 5.264243860574232e-07, "logits/chosen": 12.11532974243164, "logits/rejected": 12.438955307006836, "logps/chosen": -3.3912272453308105, "logps/rejected": -3.8494091033935547, "loss": 3.4973, "rewards/accuracies": 0.75, "rewards/chosen": -33.912269592285156, "rewards/margins": 4.581818580627441, "rewards/rejected": -38.49408721923828, "step": 3363 }, { "epoch": 0.45806100217864926, "grad_norm": 40.602755767661826, "learning_rate": 5.262439783112657e-07, "logits/chosen": 11.707119941711426, "logits/rejected": 12.87202262878418, "logps/chosen": -3.4497733116149902, "logps/rejected": -3.5072357654571533, "loss": 3.9191, "rewards/accuracies": 0.5, "rewards/chosen": -34.49773406982422, "rewards/margins": 0.5746231079101562, "rewards/rejected": -35.072357177734375, "step": 3364 }, { "epoch": 0.4581971677559913, "grad_norm": 39.77352520870597, "learning_rate": 5.260635420392376e-07, "logits/chosen": 11.754257202148438, "logits/rejected": 11.69224739074707, "logps/chosen": -3.6723217964172363, "logps/rejected": -3.9249367713928223, "loss": 3.6723, "rewards/accuracies": 0.75, "rewards/chosen": -36.72321701049805, "rewards/margins": 2.5261502265930176, "rewards/rejected": -39.249366760253906, "step": 3365 }, { "epoch": 0.4583333333333333, "grad_norm": 40.74349658884968, "learning_rate": 5.258830772821102e-07, "logits/chosen": 12.07811450958252, "logits/rejected": 12.160654067993164, "logps/chosen": -3.879148006439209, "logps/rejected": -4.346587181091309, "loss": 3.9958, "rewards/accuracies": 1.0, "rewards/chosen": -38.791481018066406, "rewards/margins": 4.674391746520996, "rewards/rejected": -43.46586990356445, "step": 3366 }, { "epoch": 0.4584694989106754, "grad_norm": 42.026755646614475, "learning_rate": 5.257025840806609e-07, "logits/chosen": 11.454984664916992, "logits/rejected": 12.03636360168457, "logps/chosen": -3.172619342803955, "logps/rejected": -3.522402048110962, "loss": 3.9435, "rewards/accuracies": 1.0, "rewards/chosen": -31.726192474365234, "rewards/margins": 3.497828483581543, "rewards/rejected": -35.224021911621094, "step": 3367 }, { "epoch": 0.45860566448801743, "grad_norm": 50.059488668938414, "learning_rate": 5.255220624756737e-07, "logits/chosen": 12.581283569335938, "logits/rejected": 12.669267654418945, "logps/chosen": -4.2494683265686035, "logps/rejected": -3.985543727874756, "loss": 4.9536, "rewards/accuracies": 0.75, "rewards/chosen": -42.49468231201172, "rewards/margins": -2.639248847961426, "rewards/rejected": -39.855438232421875, "step": 3368 }, { "epoch": 0.45874183006535946, "grad_norm": 41.99906421344219, "learning_rate": 5.253415125079389e-07, "logits/chosen": 11.794981002807617, "logits/rejected": 12.04897689819336, "logps/chosen": -3.8900251388549805, "logps/rejected": -3.875393867492676, "loss": 3.7788, "rewards/accuracies": 0.5, "rewards/chosen": -38.90025329589844, "rewards/margins": -0.14631319046020508, "rewards/rejected": -38.753936767578125, "step": 3369 }, { "epoch": 0.45887799564270154, "grad_norm": 38.26411536864038, "learning_rate": 5.251609342182531e-07, "logits/chosen": 11.105611801147461, "logits/rejected": 11.458083152770996, "logps/chosen": -3.1897668838500977, "logps/rejected": -3.338815689086914, "loss": 3.555, "rewards/accuracies": 0.5, "rewards/chosen": -31.897668838500977, "rewards/margins": 1.490488052368164, "rewards/rejected": -33.388153076171875, "step": 3370 }, { "epoch": 0.45901416122004357, "grad_norm": 41.40144236278634, "learning_rate": 5.249803276474198e-07, "logits/chosen": 12.036321640014648, "logits/rejected": 10.853992462158203, "logps/chosen": -3.6430270671844482, "logps/rejected": -3.6209588050842285, "loss": 4.1256, "rewards/accuracies": 0.5, "rewards/chosen": -36.430267333984375, "rewards/margins": -0.22068166732788086, "rewards/rejected": -36.20958709716797, "step": 3371 }, { "epoch": 0.4591503267973856, "grad_norm": 49.4543041750999, "learning_rate": 5.247996928362484e-07, "logits/chosen": 10.753986358642578, "logits/rejected": 10.931497573852539, "logps/chosen": -2.9776711463928223, "logps/rejected": -3.098722457885742, "loss": 4.7423, "rewards/accuracies": 0.5, "rewards/chosen": -29.776710510253906, "rewards/margins": 1.2105145454406738, "rewards/rejected": -30.987224578857422, "step": 3372 }, { "epoch": 0.4592864923747277, "grad_norm": 39.554084895848284, "learning_rate": 5.246190298255546e-07, "logits/chosen": 11.663999557495117, "logits/rejected": 11.50425910949707, "logps/chosen": -3.430605411529541, "logps/rejected": -3.6948728561401367, "loss": 4.2427, "rewards/accuracies": 0.75, "rewards/chosen": -34.306053161621094, "rewards/margins": 2.64267635345459, "rewards/rejected": -36.94873046875, "step": 3373 }, { "epoch": 0.4594226579520697, "grad_norm": 39.0114812580351, "learning_rate": 5.244383386561612e-07, "logits/chosen": 10.702156066894531, "logits/rejected": 12.232032775878906, "logps/chosen": -3.698395252227783, "logps/rejected": -4.3206586837768555, "loss": 3.56, "rewards/accuracies": 0.75, "rewards/chosen": -36.98395538330078, "rewards/margins": 6.222635269165039, "rewards/rejected": -43.20658874511719, "step": 3374 }, { "epoch": 0.45955882352941174, "grad_norm": 35.248119588620696, "learning_rate": 5.242576193688964e-07, "logits/chosen": 11.403656005859375, "logits/rejected": 11.475940704345703, "logps/chosen": -3.264094352722168, "logps/rejected": -3.4479174613952637, "loss": 3.6637, "rewards/accuracies": 0.75, "rewards/chosen": -32.64094543457031, "rewards/margins": 1.8382301330566406, "rewards/rejected": -34.47917556762695, "step": 3375 }, { "epoch": 0.4596949891067538, "grad_norm": 65.41982536700168, "learning_rate": 5.240768720045952e-07, "logits/chosen": 11.786365509033203, "logits/rejected": 10.870962142944336, "logps/chosen": -3.1709041595458984, "logps/rejected": -3.174288749694824, "loss": 3.7206, "rewards/accuracies": 0.5, "rewards/chosen": -31.709041595458984, "rewards/margins": 0.03384542465209961, "rewards/rejected": -31.74288558959961, "step": 3376 }, { "epoch": 0.45983115468409586, "grad_norm": 38.83555399704476, "learning_rate": 5.238960966040995e-07, "logits/chosen": 12.036901473999023, "logits/rejected": 12.19553279876709, "logps/chosen": -3.391791820526123, "logps/rejected": -3.868478775024414, "loss": 3.7399, "rewards/accuracies": 1.0, "rewards/chosen": -33.91791534423828, "rewards/margins": 4.766871929168701, "rewards/rejected": -38.684791564941406, "step": 3377 }, { "epoch": 0.4599673202614379, "grad_norm": 36.34524895922515, "learning_rate": 5.237152932082563e-07, "logits/chosen": 10.551877975463867, "logits/rejected": 10.71563720703125, "logps/chosen": -3.120486259460449, "logps/rejected": -3.495871067047119, "loss": 3.797, "rewards/accuracies": 0.75, "rewards/chosen": -31.20486068725586, "rewards/margins": 3.7538490295410156, "rewards/rejected": -34.958709716796875, "step": 3378 }, { "epoch": 0.46010348583877997, "grad_norm": 39.858792902591055, "learning_rate": 5.235344618579202e-07, "logits/chosen": 10.800068855285645, "logits/rejected": 11.438467025756836, "logps/chosen": -3.4388012886047363, "logps/rejected": -3.508607864379883, "loss": 4.2846, "rewards/accuracies": 0.5, "rewards/chosen": -34.38801574707031, "rewards/margins": 0.6980657577514648, "rewards/rejected": -35.08607864379883, "step": 3379 }, { "epoch": 0.460239651416122, "grad_norm": 39.71683908026445, "learning_rate": 5.233536025939512e-07, "logits/chosen": 11.68277645111084, "logits/rejected": 12.093684196472168, "logps/chosen": -3.5067949295043945, "logps/rejected": -3.719691038131714, "loss": 4.089, "rewards/accuracies": 0.75, "rewards/chosen": -35.06794738769531, "rewards/margins": 2.1289615631103516, "rewards/rejected": -37.1969108581543, "step": 3380 }, { "epoch": 0.460375816993464, "grad_norm": 36.18342568525525, "learning_rate": 5.231727154572162e-07, "logits/chosen": 11.310245513916016, "logits/rejected": 11.716602325439453, "logps/chosen": -3.3525259494781494, "logps/rejected": -3.6370692253112793, "loss": 3.5529, "rewards/accuracies": 1.0, "rewards/chosen": -33.52526092529297, "rewards/margins": 2.8454322814941406, "rewards/rejected": -36.37069320678711, "step": 3381 }, { "epoch": 0.4605119825708061, "grad_norm": 40.572835137447406, "learning_rate": 5.229918004885877e-07, "logits/chosen": 11.743730545043945, "logits/rejected": 12.245667457580566, "logps/chosen": -3.657042980194092, "logps/rejected": -3.7026331424713135, "loss": 4.2769, "rewards/accuracies": 0.75, "rewards/chosen": -36.570430755615234, "rewards/margins": 0.4559001922607422, "rewards/rejected": -37.026329040527344, "step": 3382 }, { "epoch": 0.46064814814814814, "grad_norm": 35.771444804748626, "learning_rate": 5.228108577289454e-07, "logits/chosen": 11.506607055664062, "logits/rejected": 12.29423999786377, "logps/chosen": -3.2567477226257324, "logps/rejected": -3.438328266143799, "loss": 3.5809, "rewards/accuracies": 0.75, "rewards/chosen": -32.567474365234375, "rewards/margins": 1.815805435180664, "rewards/rejected": -34.38328170776367, "step": 3383 }, { "epoch": 0.46078431372549017, "grad_norm": 38.17293655067862, "learning_rate": 5.226298872191746e-07, "logits/chosen": 10.761306762695312, "logits/rejected": 11.023253440856934, "logps/chosen": -3.4393539428710938, "logps/rejected": -3.6589386463165283, "loss": 3.9912, "rewards/accuracies": 0.75, "rewards/chosen": -34.39353942871094, "rewards/margins": 2.1958484649658203, "rewards/rejected": -36.589385986328125, "step": 3384 }, { "epoch": 0.46092047930283225, "grad_norm": 38.86598264246339, "learning_rate": 5.22448889000167e-07, "logits/chosen": 11.603860855102539, "logits/rejected": 11.890595436096191, "logps/chosen": -3.6937167644500732, "logps/rejected": -3.779386043548584, "loss": 4.2049, "rewards/accuracies": 0.5, "rewards/chosen": -36.93716812133789, "rewards/margins": 0.8566927909851074, "rewards/rejected": -37.793861389160156, "step": 3385 }, { "epoch": 0.4610566448801743, "grad_norm": 42.80134844064054, "learning_rate": 5.222678631128209e-07, "logits/chosen": 11.838138580322266, "logits/rejected": 11.846585273742676, "logps/chosen": -3.5017309188842773, "logps/rejected": -3.4894001483917236, "loss": 4.2539, "rewards/accuracies": 0.5, "rewards/chosen": -35.017311096191406, "rewards/margins": -0.1233062744140625, "rewards/rejected": -34.894004821777344, "step": 3386 }, { "epoch": 0.46119281045751637, "grad_norm": 38.58869427357408, "learning_rate": 5.220868095980405e-07, "logits/chosen": 10.723989486694336, "logits/rejected": 11.778564453125, "logps/chosen": -3.684833526611328, "logps/rejected": -3.9307165145874023, "loss": 3.1539, "rewards/accuracies": 1.0, "rewards/chosen": -36.84833526611328, "rewards/margins": 2.4588308334350586, "rewards/rejected": -39.307167053222656, "step": 3387 }, { "epoch": 0.4613289760348584, "grad_norm": 36.57444443026398, "learning_rate": 5.219057284967362e-07, "logits/chosen": 10.784086227416992, "logits/rejected": 11.334808349609375, "logps/chosen": -3.2383973598480225, "logps/rejected": -3.4571170806884766, "loss": 4.0918, "rewards/accuracies": 0.5, "rewards/chosen": -32.38397216796875, "rewards/margins": 2.1871981620788574, "rewards/rejected": -34.571170806884766, "step": 3388 }, { "epoch": 0.4614651416122004, "grad_norm": 37.94190668508823, "learning_rate": 5.217246198498248e-07, "logits/chosen": 11.523286819458008, "logits/rejected": 12.106121063232422, "logps/chosen": -3.1892144680023193, "logps/rejected": -3.8403525352478027, "loss": 3.8859, "rewards/accuracies": 0.75, "rewards/chosen": -31.89214324951172, "rewards/margins": 6.511380195617676, "rewards/rejected": -38.403526306152344, "step": 3389 }, { "epoch": 0.4616013071895425, "grad_norm": 40.52042934455021, "learning_rate": 5.215434836982295e-07, "logits/chosen": 10.714215278625488, "logits/rejected": 10.78197193145752, "logps/chosen": -2.85435152053833, "logps/rejected": -3.2615280151367188, "loss": 4.2083, "rewards/accuracies": 1.0, "rewards/chosen": -28.543514251708984, "rewards/margins": 4.071766376495361, "rewards/rejected": -32.61528015136719, "step": 3390 }, { "epoch": 0.46173747276688454, "grad_norm": 38.41446648239824, "learning_rate": 5.213623200828792e-07, "logits/chosen": 11.600716590881348, "logits/rejected": 10.77104377746582, "logps/chosen": -3.8814682960510254, "logps/rejected": -3.826845169067383, "loss": 4.0722, "rewards/accuracies": 0.5, "rewards/chosen": -38.81468200683594, "rewards/margins": -0.546229362487793, "rewards/rejected": -38.26845169067383, "step": 3391 }, { "epoch": 0.46187363834422657, "grad_norm": 38.62358290939809, "learning_rate": 5.211811290447096e-07, "logits/chosen": 11.476410865783691, "logits/rejected": 12.409832000732422, "logps/chosen": -3.318311929702759, "logps/rejected": -3.716689109802246, "loss": 4.2346, "rewards/accuracies": 0.75, "rewards/chosen": -33.18312072753906, "rewards/margins": 3.9837727546691895, "rewards/rejected": -37.166893005371094, "step": 3392 }, { "epoch": 0.46200980392156865, "grad_norm": 39.974110446836434, "learning_rate": 5.209999106246623e-07, "logits/chosen": 11.387587547302246, "logits/rejected": 12.18652629852295, "logps/chosen": -3.6131339073181152, "logps/rejected": -3.8918163776397705, "loss": 4.4037, "rewards/accuracies": 0.75, "rewards/chosen": -36.13134002685547, "rewards/margins": 2.786825656890869, "rewards/rejected": -38.91816329956055, "step": 3393 }, { "epoch": 0.4621459694989107, "grad_norm": 36.752399160478305, "learning_rate": 5.208186648636849e-07, "logits/chosen": 10.645038604736328, "logits/rejected": 10.869131088256836, "logps/chosen": -3.217984199523926, "logps/rejected": -3.34543514251709, "loss": 3.867, "rewards/accuracies": 0.75, "rewards/chosen": -32.17984390258789, "rewards/margins": 1.274507999420166, "rewards/rejected": -33.45435333251953, "step": 3394 }, { "epoch": 0.4622821350762527, "grad_norm": 36.82780110032058, "learning_rate": 5.206373918027314e-07, "logits/chosen": 11.789216995239258, "logits/rejected": 12.392006874084473, "logps/chosen": -3.690728187561035, "logps/rejected": -4.053511619567871, "loss": 3.6332, "rewards/accuracies": 0.75, "rewards/chosen": -36.907283782958984, "rewards/margins": 3.6278343200683594, "rewards/rejected": -40.535118103027344, "step": 3395 }, { "epoch": 0.4624183006535948, "grad_norm": 36.5137228062659, "learning_rate": 5.204560914827621e-07, "logits/chosen": 12.447465896606445, "logits/rejected": 12.38280200958252, "logps/chosen": -3.6511034965515137, "logps/rejected": -3.819995641708374, "loss": 3.7855, "rewards/accuracies": 0.75, "rewards/chosen": -36.51103210449219, "rewards/margins": 1.6889219284057617, "rewards/rejected": -38.19995880126953, "step": 3396 }, { "epoch": 0.4625544662309368, "grad_norm": 37.567782971924316, "learning_rate": 5.202747639447432e-07, "logits/chosen": 11.648046493530273, "logits/rejected": 12.389719009399414, "logps/chosen": -3.3733630180358887, "logps/rejected": -3.839207172393799, "loss": 4.1052, "rewards/accuracies": 0.75, "rewards/chosen": -33.7336311340332, "rewards/margins": 4.658438205718994, "rewards/rejected": -38.392066955566406, "step": 3397 }, { "epoch": 0.46269063180827885, "grad_norm": 45.37798488181957, "learning_rate": 5.200934092296472e-07, "logits/chosen": 11.997018814086914, "logits/rejected": 12.428670883178711, "logps/chosen": -3.721073627471924, "logps/rejected": -3.952158212661743, "loss": 4.7425, "rewards/accuracies": 0.75, "rewards/chosen": -37.21073532104492, "rewards/margins": 2.310847759246826, "rewards/rejected": -39.521583557128906, "step": 3398 }, { "epoch": 0.46282679738562094, "grad_norm": 39.12377355766183, "learning_rate": 5.199120273784527e-07, "logits/chosen": 12.411081314086914, "logits/rejected": 12.837690353393555, "logps/chosen": -3.8294644355773926, "logps/rejected": -4.229500770568848, "loss": 3.7707, "rewards/accuracies": 1.0, "rewards/chosen": -38.29464340209961, "rewards/margins": 4.000363349914551, "rewards/rejected": -42.295005798339844, "step": 3399 }, { "epoch": 0.46296296296296297, "grad_norm": 36.48788088295561, "learning_rate": 5.197306184321443e-07, "logits/chosen": 11.292978286743164, "logits/rejected": 12.440971374511719, "logps/chosen": -3.6050972938537598, "logps/rejected": -4.010915756225586, "loss": 4.172, "rewards/accuracies": 1.0, "rewards/chosen": -36.05097198486328, "rewards/margins": 4.058189392089844, "rewards/rejected": -40.10916519165039, "step": 3400 }, { "epoch": 0.463099128540305, "grad_norm": 40.46883226558095, "learning_rate": 5.195491824317132e-07, "logits/chosen": 11.597152709960938, "logits/rejected": 11.879691123962402, "logps/chosen": -3.855585813522339, "logps/rejected": -3.79835844039917, "loss": 3.6815, "rewards/accuracies": 0.25, "rewards/chosen": -38.55585861206055, "rewards/margins": -0.5722732543945312, "rewards/rejected": -37.98358154296875, "step": 3401 }, { "epoch": 0.4632352941176471, "grad_norm": 36.31683818015897, "learning_rate": 5.19367719418156e-07, "logits/chosen": 12.139673233032227, "logits/rejected": 11.46563720703125, "logps/chosen": -3.749725818634033, "logps/rejected": -3.610914468765259, "loss": 4.2577, "rewards/accuracies": 0.25, "rewards/chosen": -37.497257232666016, "rewards/margins": -1.3881134986877441, "rewards/rejected": -36.1091423034668, "step": 3402 }, { "epoch": 0.4633714596949891, "grad_norm": 39.53313337651499, "learning_rate": 5.191862294324758e-07, "logits/chosen": 11.794517517089844, "logits/rejected": 12.100848197937012, "logps/chosen": -3.6281886100769043, "logps/rejected": -3.767310619354248, "loss": 3.4506, "rewards/accuracies": 0.75, "rewards/chosen": -36.28188705444336, "rewards/margins": 1.3912181854248047, "rewards/rejected": -37.6731071472168, "step": 3403 }, { "epoch": 0.46350762527233114, "grad_norm": 76.99317742929594, "learning_rate": 5.190047125156819e-07, "logits/chosen": 11.003969192504883, "logits/rejected": 11.818716049194336, "logps/chosen": -3.651541233062744, "logps/rejected": -3.934675455093384, "loss": 4.0873, "rewards/accuracies": 0.75, "rewards/chosen": -36.51541519165039, "rewards/margins": 2.8313417434692383, "rewards/rejected": -39.34675598144531, "step": 3404 }, { "epoch": 0.4636437908496732, "grad_norm": 36.64117072379803, "learning_rate": 5.188231687087895e-07, "logits/chosen": 12.640317916870117, "logits/rejected": 12.679566383361816, "logps/chosen": -4.159733772277832, "logps/rejected": -4.146771430969238, "loss": 4.0705, "rewards/accuracies": 0.5, "rewards/chosen": -41.59733581542969, "rewards/margins": -0.12961864471435547, "rewards/rejected": -41.46772003173828, "step": 3405 }, { "epoch": 0.46377995642701525, "grad_norm": 42.20666790151722, "learning_rate": 5.1864159805282e-07, "logits/chosen": 12.650639533996582, "logits/rejected": 12.14413833618164, "logps/chosen": -4.194233417510986, "logps/rejected": -3.56929349899292, "loss": 3.8974, "rewards/accuracies": 0.0, "rewards/chosen": -41.94233703613281, "rewards/margins": -6.249399185180664, "rewards/rejected": -35.69293975830078, "step": 3406 }, { "epoch": 0.4639161220043573, "grad_norm": 36.732458568667255, "learning_rate": 5.184600005888007e-07, "logits/chosen": 12.799802780151367, "logits/rejected": 11.834172248840332, "logps/chosen": -3.8408620357513428, "logps/rejected": -3.6153852939605713, "loss": 4.0421, "rewards/accuracies": 0.25, "rewards/chosen": -38.40862274169922, "rewards/margins": -2.2547688484191895, "rewards/rejected": -36.15385055541992, "step": 3407 }, { "epoch": 0.46405228758169936, "grad_norm": 36.42079547641898, "learning_rate": 5.18278376357765e-07, "logits/chosen": 12.424880981445312, "logits/rejected": 12.142635345458984, "logps/chosen": -4.07460355758667, "logps/rejected": -3.9838831424713135, "loss": 3.8865, "rewards/accuracies": 0.25, "rewards/chosen": -40.746036529541016, "rewards/margins": -0.9072046279907227, "rewards/rejected": -39.838829040527344, "step": 3408 }, { "epoch": 0.4641884531590414, "grad_norm": 39.41514552937165, "learning_rate": 5.180967254007525e-07, "logits/chosen": 12.102592468261719, "logits/rejected": 12.098226547241211, "logps/chosen": -4.211996078491211, "logps/rejected": -4.265163421630859, "loss": 4.0036, "rewards/accuracies": 0.5, "rewards/chosen": -42.11996078491211, "rewards/margins": 0.5316743850708008, "rewards/rejected": -42.651634216308594, "step": 3409 }, { "epoch": 0.4643246187363834, "grad_norm": 38.88755936319588, "learning_rate": 5.179150477588087e-07, "logits/chosen": 12.009773254394531, "logits/rejected": 13.16856575012207, "logps/chosen": -3.599545955657959, "logps/rejected": -4.182463645935059, "loss": 4.0095, "rewards/accuracies": 1.0, "rewards/chosen": -35.99545669555664, "rewards/margins": 5.8291778564453125, "rewards/rejected": -41.82463836669922, "step": 3410 }, { "epoch": 0.4644607843137255, "grad_norm": 38.81259049090704, "learning_rate": 5.177333434729852e-07, "logits/chosen": 11.493223190307617, "logits/rejected": 12.35910701751709, "logps/chosen": -3.790658473968506, "logps/rejected": -4.07231330871582, "loss": 4.4492, "rewards/accuracies": 1.0, "rewards/chosen": -37.906585693359375, "rewards/margins": 2.8165502548217773, "rewards/rejected": -40.7231330871582, "step": 3411 }, { "epoch": 0.46459694989106753, "grad_norm": 37.00368988353976, "learning_rate": 5.175516125843395e-07, "logits/chosen": 12.861649513244629, "logits/rejected": 12.878231048583984, "logps/chosen": -4.000954627990723, "logps/rejected": -3.8629024028778076, "loss": 4.1916, "rewards/accuracies": 0.25, "rewards/chosen": -40.00954818725586, "rewards/margins": -1.380523681640625, "rewards/rejected": -38.629024505615234, "step": 3412 }, { "epoch": 0.46473311546840956, "grad_norm": 36.97886428220212, "learning_rate": 5.173698551339352e-07, "logits/chosen": 11.966255187988281, "logits/rejected": 11.539133071899414, "logps/chosen": -3.639042615890503, "logps/rejected": -3.723952054977417, "loss": 3.6294, "rewards/accuracies": 0.75, "rewards/chosen": -36.39042663574219, "rewards/margins": 0.8490934371948242, "rewards/rejected": -37.23952102661133, "step": 3413 }, { "epoch": 0.46486928104575165, "grad_norm": 38.02584258984951, "learning_rate": 5.171880711628421e-07, "logits/chosen": 12.272811889648438, "logits/rejected": 13.476083755493164, "logps/chosen": -3.864474058151245, "logps/rejected": -4.400935173034668, "loss": 4.04, "rewards/accuracies": 0.75, "rewards/chosen": -38.644737243652344, "rewards/margins": 5.364609718322754, "rewards/rejected": -44.00934982299805, "step": 3414 }, { "epoch": 0.4650054466230937, "grad_norm": 39.43244189821348, "learning_rate": 5.170062607121356e-07, "logits/chosen": 12.298408508300781, "logits/rejected": 12.493568420410156, "logps/chosen": -3.691265821456909, "logps/rejected": -3.7683897018432617, "loss": 4.273, "rewards/accuracies": 0.25, "rewards/chosen": -36.91265869140625, "rewards/margins": 0.7712373733520508, "rewards/rejected": -37.68389892578125, "step": 3415 }, { "epoch": 0.4651416122004357, "grad_norm": 37.939427955823625, "learning_rate": 5.168244238228971e-07, "logits/chosen": 12.177590370178223, "logits/rejected": 12.459135055541992, "logps/chosen": -3.700334072113037, "logps/rejected": -3.659942865371704, "loss": 3.9901, "rewards/accuracies": 0.5, "rewards/chosen": -37.00334167480469, "rewards/margins": -0.40391063690185547, "rewards/rejected": -36.599430084228516, "step": 3416 }, { "epoch": 0.4652777777777778, "grad_norm": 38.112014225796905, "learning_rate": 5.166425605362145e-07, "logits/chosen": 11.82651138305664, "logits/rejected": 12.491564750671387, "logps/chosen": -3.5766751766204834, "logps/rejected": -4.284326553344727, "loss": 3.8602, "rewards/accuracies": 1.0, "rewards/chosen": -35.76675033569336, "rewards/margins": 7.076516151428223, "rewards/rejected": -42.84326934814453, "step": 3417 }, { "epoch": 0.4654139433551198, "grad_norm": 41.70314520296261, "learning_rate": 5.164606708931812e-07, "logits/chosen": 12.44919204711914, "logits/rejected": 12.029123306274414, "logps/chosen": -3.8105664253234863, "logps/rejected": -3.659397602081299, "loss": 4.8389, "rewards/accuracies": 0.25, "rewards/chosen": -38.10566329956055, "rewards/margins": -1.5116872787475586, "rewards/rejected": -36.59397506713867, "step": 3418 }, { "epoch": 0.46555010893246185, "grad_norm": 38.95892763036975, "learning_rate": 5.162787549348966e-07, "logits/chosen": 12.598525047302246, "logits/rejected": 12.870597839355469, "logps/chosen": -4.031386852264404, "logps/rejected": -4.287858009338379, "loss": 4.3947, "rewards/accuracies": 0.75, "rewards/chosen": -40.313865661621094, "rewards/margins": 2.5647149085998535, "rewards/rejected": -42.87858200073242, "step": 3419 }, { "epoch": 0.46568627450980393, "grad_norm": 36.17608665767633, "learning_rate": 5.160968127024662e-07, "logits/chosen": 12.005692481994629, "logits/rejected": 12.355764389038086, "logps/chosen": -3.6683497428894043, "logps/rejected": -3.8855929374694824, "loss": 3.49, "rewards/accuracies": 0.75, "rewards/chosen": -36.683494567871094, "rewards/margins": 2.1724324226379395, "rewards/rejected": -38.85593032836914, "step": 3420 }, { "epoch": 0.46582244008714596, "grad_norm": 37.76754463556373, "learning_rate": 5.159148442370013e-07, "logits/chosen": 13.24570369720459, "logits/rejected": 13.15243911743164, "logps/chosen": -4.195868015289307, "logps/rejected": -3.9371728897094727, "loss": 3.5438, "rewards/accuracies": 0.25, "rewards/chosen": -41.95867919921875, "rewards/margins": -2.586949348449707, "rewards/rejected": -39.371726989746094, "step": 3421 }, { "epoch": 0.465958605664488, "grad_norm": 41.64267871757524, "learning_rate": 5.157328495796191e-07, "logits/chosen": 12.369033813476562, "logits/rejected": 12.981531143188477, "logps/chosen": -3.6283118724823, "logps/rejected": -3.7985939979553223, "loss": 4.5993, "rewards/accuracies": 0.75, "rewards/chosen": -36.283119201660156, "rewards/margins": 1.7028203010559082, "rewards/rejected": -37.985939025878906, "step": 3422 }, { "epoch": 0.4660947712418301, "grad_norm": 33.87101915805425, "learning_rate": 5.15550828771443e-07, "logits/chosen": 12.500885009765625, "logits/rejected": 12.456771850585938, "logps/chosen": -4.064285755157471, "logps/rejected": -3.953986644744873, "loss": 3.8788, "rewards/accuracies": 0.5, "rewards/chosen": -40.64285659790039, "rewards/margins": -1.1029891967773438, "rewards/rejected": -39.53986740112305, "step": 3423 }, { "epoch": 0.4662309368191721, "grad_norm": 35.554751370088056, "learning_rate": 5.153687818536019e-07, "logits/chosen": 11.834177017211914, "logits/rejected": 12.539724349975586, "logps/chosen": -3.4819154739379883, "logps/rejected": -3.848191976547241, "loss": 3.7727, "rewards/accuracies": 0.5, "rewards/chosen": -34.81915283203125, "rewards/margins": 3.6627655029296875, "rewards/rejected": -38.48191833496094, "step": 3424 }, { "epoch": 0.4663671023965142, "grad_norm": 42.855652984353576, "learning_rate": 5.15186708867231e-07, "logits/chosen": 12.476832389831543, "logits/rejected": 12.720358848571777, "logps/chosen": -3.5778861045837402, "logps/rejected": -3.919095277786255, "loss": 4.1433, "rewards/accuracies": 1.0, "rewards/chosen": -35.77886199951172, "rewards/margins": 3.4120936393737793, "rewards/rejected": -39.190956115722656, "step": 3425 }, { "epoch": 0.4665032679738562, "grad_norm": 111.84817997685103, "learning_rate": 5.15004609853471e-07, "logits/chosen": 12.378070831298828, "logits/rejected": 11.952346801757812, "logps/chosen": -3.86432147026062, "logps/rejected": -3.908536434173584, "loss": 4.1422, "rewards/accuracies": 0.5, "rewards/chosen": -38.64321517944336, "rewards/margins": 0.44214963912963867, "rewards/rejected": -39.085365295410156, "step": 3426 }, { "epoch": 0.46663943355119825, "grad_norm": 32.79068042320649, "learning_rate": 5.148224848534687e-07, "logits/chosen": 11.53887939453125, "logits/rejected": 12.001776695251465, "logps/chosen": -3.849635601043701, "logps/rejected": -3.757950782775879, "loss": 3.8023, "rewards/accuracies": 0.5, "rewards/chosen": -38.49635314941406, "rewards/margins": -0.9168453216552734, "rewards/rejected": -37.579505920410156, "step": 3427 }, { "epoch": 0.46677559912854033, "grad_norm": 38.12075975361773, "learning_rate": 5.146403339083769e-07, "logits/chosen": 12.183935165405273, "logits/rejected": 12.620503425598145, "logps/chosen": -3.893388509750366, "logps/rejected": -3.86759090423584, "loss": 4.2938, "rewards/accuracies": 0.25, "rewards/chosen": -38.93388366699219, "rewards/margins": -0.25797462463378906, "rewards/rejected": -38.67591094970703, "step": 3428 }, { "epoch": 0.46691176470588236, "grad_norm": 43.296616173550134, "learning_rate": 5.14458157059354e-07, "logits/chosen": 11.916898727416992, "logits/rejected": 12.468170166015625, "logps/chosen": -3.548691987991333, "logps/rejected": -4.084959983825684, "loss": 4.5753, "rewards/accuracies": 0.75, "rewards/chosen": -35.48691940307617, "rewards/margins": 5.362679481506348, "rewards/rejected": -40.8495979309082, "step": 3429 }, { "epoch": 0.4670479302832244, "grad_norm": 43.24166420617121, "learning_rate": 5.142759543475644e-07, "logits/chosen": 12.667684555053711, "logits/rejected": 12.451934814453125, "logps/chosen": -3.9574742317199707, "logps/rejected": -3.9683237075805664, "loss": 4.4515, "rewards/accuracies": 0.5, "rewards/chosen": -39.57474136352539, "rewards/margins": 0.10849380493164062, "rewards/rejected": -39.68323516845703, "step": 3430 }, { "epoch": 0.4671840958605665, "grad_norm": 44.21609530718694, "learning_rate": 5.140937258141782e-07, "logits/chosen": 12.421005249023438, "logits/rejected": 12.243656158447266, "logps/chosen": -3.787130355834961, "logps/rejected": -4.196527481079102, "loss": 4.5191, "rewards/accuracies": 1.0, "rewards/chosen": -37.87130355834961, "rewards/margins": 4.093975067138672, "rewards/rejected": -41.96527862548828, "step": 3431 }, { "epoch": 0.4673202614379085, "grad_norm": 34.17474994873925, "learning_rate": 5.139114715003718e-07, "logits/chosen": 13.3308744430542, "logits/rejected": 12.389900207519531, "logps/chosen": -3.7469582557678223, "logps/rejected": -4.043329238891602, "loss": 3.9329, "rewards/accuracies": 0.75, "rewards/chosen": -37.469581604003906, "rewards/margins": 2.9637088775634766, "rewards/rejected": -40.43328857421875, "step": 3432 }, { "epoch": 0.46745642701525053, "grad_norm": 37.371143613861896, "learning_rate": 5.137291914473266e-07, "logits/chosen": 12.2658109664917, "logits/rejected": 11.640941619873047, "logps/chosen": -3.7047412395477295, "logps/rejected": -3.593245029449463, "loss": 4.3977, "rewards/accuracies": 0.5, "rewards/chosen": -37.04741287231445, "rewards/margins": -1.114964485168457, "rewards/rejected": -35.93244934082031, "step": 3433 }, { "epoch": 0.4675925925925926, "grad_norm": 36.341085963139165, "learning_rate": 5.135468856962304e-07, "logits/chosen": 11.52314281463623, "logits/rejected": 12.788263320922852, "logps/chosen": -3.4469966888427734, "logps/rejected": -3.8363900184631348, "loss": 4.0752, "rewards/accuracies": 0.5, "rewards/chosen": -34.469966888427734, "rewards/margins": 3.893932342529297, "rewards/rejected": -38.36389923095703, "step": 3434 }, { "epoch": 0.46772875816993464, "grad_norm": 38.67792275759129, "learning_rate": 5.133645542882771e-07, "logits/chosen": 12.40998649597168, "logits/rejected": 12.976938247680664, "logps/chosen": -3.7708740234375, "logps/rejected": -3.8216376304626465, "loss": 3.8211, "rewards/accuracies": 0.75, "rewards/chosen": -37.708740234375, "rewards/margins": 0.5076346397399902, "rewards/rejected": -38.216373443603516, "step": 3435 }, { "epoch": 0.4678649237472767, "grad_norm": 35.64867724384758, "learning_rate": 5.131821972646655e-07, "logits/chosen": 13.380290985107422, "logits/rejected": 13.2086181640625, "logps/chosen": -4.203785419464111, "logps/rejected": -4.338973045349121, "loss": 3.8028, "rewards/accuracies": 0.75, "rewards/chosen": -42.03784942626953, "rewards/margins": 1.351877212524414, "rewards/rejected": -43.389732360839844, "step": 3436 }, { "epoch": 0.46800108932461876, "grad_norm": 36.89602053926493, "learning_rate": 5.129998146666008e-07, "logits/chosen": 11.727581977844238, "logits/rejected": 12.033374786376953, "logps/chosen": -3.6103873252868652, "logps/rejected": -3.99503231048584, "loss": 4.2097, "rewards/accuracies": 1.0, "rewards/chosen": -36.10387420654297, "rewards/margins": 3.8464488983154297, "rewards/rejected": -39.95032501220703, "step": 3437 }, { "epoch": 0.4681372549019608, "grad_norm": 37.394159227362934, "learning_rate": 5.128174065352941e-07, "logits/chosen": 12.853706359863281, "logits/rejected": 12.853317260742188, "logps/chosen": -4.188096046447754, "logps/rejected": -3.9061245918273926, "loss": 3.7654, "rewards/accuracies": 0.25, "rewards/chosen": -41.880958557128906, "rewards/margins": -2.8197097778320312, "rewards/rejected": -39.061248779296875, "step": 3438 }, { "epoch": 0.4682734204793028, "grad_norm": 35.84467556242272, "learning_rate": 5.126349729119617e-07, "logits/chosen": 11.337577819824219, "logits/rejected": 12.073266983032227, "logps/chosen": -3.763390302658081, "logps/rejected": -4.130611896514893, "loss": 4.0144, "rewards/accuracies": 0.75, "rewards/chosen": -37.63390350341797, "rewards/margins": 3.6722145080566406, "rewards/rejected": -41.306121826171875, "step": 3439 }, { "epoch": 0.4684095860566449, "grad_norm": 37.7648209809996, "learning_rate": 5.124525138378262e-07, "logits/chosen": 12.08851432800293, "logits/rejected": 12.056760787963867, "logps/chosen": -3.92862606048584, "logps/rejected": -4.019611835479736, "loss": 4.0486, "rewards/accuracies": 0.5, "rewards/chosen": -39.28626251220703, "rewards/margins": 0.9098567962646484, "rewards/rejected": -40.19611740112305, "step": 3440 }, { "epoch": 0.46854575163398693, "grad_norm": 38.67627339827026, "learning_rate": 5.122700293541155e-07, "logits/chosen": 12.426191329956055, "logits/rejected": 12.397468566894531, "logps/chosen": -3.9063565731048584, "logps/rejected": -3.9731838703155518, "loss": 4.3071, "rewards/accuracies": 0.5, "rewards/chosen": -39.063568115234375, "rewards/margins": 0.6682720184326172, "rewards/rejected": -39.73183822631836, "step": 3441 }, { "epoch": 0.46868191721132896, "grad_norm": 35.36538895102198, "learning_rate": 5.120875195020637e-07, "logits/chosen": 12.02057933807373, "logits/rejected": 14.02595329284668, "logps/chosen": -3.7593166828155518, "logps/rejected": -4.485318183898926, "loss": 3.7214, "rewards/accuracies": 0.75, "rewards/chosen": -37.593170166015625, "rewards/margins": 7.260015487670898, "rewards/rejected": -44.85318374633789, "step": 3442 }, { "epoch": 0.46881808278867104, "grad_norm": 43.6694108911437, "learning_rate": 5.119049843229105e-07, "logits/chosen": 12.249865531921387, "logits/rejected": 12.335212707519531, "logps/chosen": -3.9137730598449707, "logps/rejected": -3.9880001544952393, "loss": 3.4227, "rewards/accuracies": 0.5, "rewards/chosen": -39.13772964477539, "rewards/margins": 0.7422723770141602, "rewards/rejected": -39.880001068115234, "step": 3443 }, { "epoch": 0.46895424836601307, "grad_norm": 36.177594398930616, "learning_rate": 5.117224238579009e-07, "logits/chosen": 12.356634140014648, "logits/rejected": 12.284156799316406, "logps/chosen": -3.8227880001068115, "logps/rejected": -3.8786096572875977, "loss": 4.0277, "rewards/accuracies": 0.5, "rewards/chosen": -38.227882385253906, "rewards/margins": 0.5582132339477539, "rewards/rejected": -38.786094665527344, "step": 3444 }, { "epoch": 0.4690904139433551, "grad_norm": 36.89785061659844, "learning_rate": 5.115398381482862e-07, "logits/chosen": 12.075096130371094, "logits/rejected": 12.195819854736328, "logps/chosen": -3.8372466564178467, "logps/rejected": -3.6543445587158203, "loss": 3.6959, "rewards/accuracies": 0.25, "rewards/chosen": -38.372467041015625, "rewards/margins": -1.8290224075317383, "rewards/rejected": -36.54344177246094, "step": 3445 }, { "epoch": 0.4692265795206972, "grad_norm": 34.64916489388771, "learning_rate": 5.11357227235323e-07, "logits/chosen": 12.722013473510742, "logits/rejected": 12.426916122436523, "logps/chosen": -3.9316606521606445, "logps/rejected": -4.092292785644531, "loss": 4.1782, "rewards/accuracies": 0.75, "rewards/chosen": -39.31660461425781, "rewards/margins": 1.6063222885131836, "rewards/rejected": -40.92292785644531, "step": 3446 }, { "epoch": 0.4693627450980392, "grad_norm": 37.45273946593007, "learning_rate": 5.111745911602739e-07, "logits/chosen": 11.819729804992676, "logits/rejected": 12.53192138671875, "logps/chosen": -3.7602529525756836, "logps/rejected": -3.9428086280822754, "loss": 3.7047, "rewards/accuracies": 0.75, "rewards/chosen": -37.6025276184082, "rewards/margins": 1.8255577087402344, "rewards/rejected": -39.42808532714844, "step": 3447 }, { "epoch": 0.46949891067538124, "grad_norm": 44.80450127332605, "learning_rate": 5.109919299644069e-07, "logits/chosen": 12.343244552612305, "logits/rejected": 12.982351303100586, "logps/chosen": -3.7764134407043457, "logps/rejected": -4.1684393882751465, "loss": 4.2109, "rewards/accuracies": 1.0, "rewards/chosen": -37.764137268066406, "rewards/margins": 3.9202566146850586, "rewards/rejected": -41.68439483642578, "step": 3448 }, { "epoch": 0.4696350762527233, "grad_norm": 38.92303038319554, "learning_rate": 5.108092436889959e-07, "logits/chosen": 12.31433391571045, "logits/rejected": 12.148645401000977, "logps/chosen": -3.7790603637695312, "logps/rejected": -4.308530807495117, "loss": 3.3743, "rewards/accuracies": 1.0, "rewards/chosen": -37.79060363769531, "rewards/margins": 5.294704437255859, "rewards/rejected": -43.08531188964844, "step": 3449 }, { "epoch": 0.46977124183006536, "grad_norm": 39.51200021304764, "learning_rate": 5.106265323753203e-07, "logits/chosen": 12.00582218170166, "logits/rejected": 12.503408432006836, "logps/chosen": -3.623427152633667, "logps/rejected": -3.6229939460754395, "loss": 4.0673, "rewards/accuracies": 0.5, "rewards/chosen": -36.23426818847656, "rewards/margins": -0.004330635070800781, "rewards/rejected": -36.229942321777344, "step": 3450 }, { "epoch": 0.4699074074074074, "grad_norm": 38.799882041315726, "learning_rate": 5.104437960646652e-07, "logits/chosen": 12.050601959228516, "logits/rejected": 12.869199752807617, "logps/chosen": -3.911059617996216, "logps/rejected": -4.240085124969482, "loss": 3.8328, "rewards/accuracies": 1.0, "rewards/chosen": -39.110595703125, "rewards/margins": 3.2902517318725586, "rewards/rejected": -42.400848388671875, "step": 3451 }, { "epoch": 0.47004357298474947, "grad_norm": 39.93689489885376, "learning_rate": 5.102610347983216e-07, "logits/chosen": 12.313678741455078, "logits/rejected": 12.709739685058594, "logps/chosen": -3.9201202392578125, "logps/rejected": -4.088850021362305, "loss": 4.5235, "rewards/accuracies": 0.75, "rewards/chosen": -39.201202392578125, "rewards/margins": 1.687300682067871, "rewards/rejected": -40.88850402832031, "step": 3452 }, { "epoch": 0.4701797385620915, "grad_norm": 35.16741454055885, "learning_rate": 5.100782486175857e-07, "logits/chosen": 12.516857147216797, "logits/rejected": 12.696966171264648, "logps/chosen": -3.6559486389160156, "logps/rejected": -4.234362602233887, "loss": 4.1932, "rewards/accuracies": 0.75, "rewards/chosen": -36.559486389160156, "rewards/margins": 5.784141540527344, "rewards/rejected": -42.3436279296875, "step": 3453 }, { "epoch": 0.4703159041394335, "grad_norm": 40.782008500894435, "learning_rate": 5.098954375637595e-07, "logits/chosen": 11.549524307250977, "logits/rejected": 11.844877243041992, "logps/chosen": -3.732903480529785, "logps/rejected": -3.9204330444335938, "loss": 3.8722, "rewards/accuracies": 0.75, "rewards/chosen": -37.329036712646484, "rewards/margins": 1.875295639038086, "rewards/rejected": -39.20433044433594, "step": 3454 }, { "epoch": 0.4704520697167756, "grad_norm": 39.18057934388827, "learning_rate": 5.097126016781508e-07, "logits/chosen": 12.248590469360352, "logits/rejected": 12.93143081665039, "logps/chosen": -3.6913318634033203, "logps/rejected": -4.04771614074707, "loss": 3.9345, "rewards/accuracies": 1.0, "rewards/chosen": -36.9133186340332, "rewards/margins": 3.5638389587402344, "rewards/rejected": -40.47715759277344, "step": 3455 }, { "epoch": 0.47058823529411764, "grad_norm": 40.64588039662755, "learning_rate": 5.09529741002073e-07, "logits/chosen": 11.895610809326172, "logits/rejected": 12.895282745361328, "logps/chosen": -3.312282085418701, "logps/rejected": -3.902947187423706, "loss": 3.9556, "rewards/accuracies": 1.0, "rewards/chosen": -33.12282180786133, "rewards/margins": 5.9066481590271, "rewards/rejected": -39.02946853637695, "step": 3456 }, { "epoch": 0.47072440087145967, "grad_norm": 41.8435358424492, "learning_rate": 5.093468555768446e-07, "logits/chosen": 11.645581245422363, "logits/rejected": 11.907361030578613, "logps/chosen": -3.670616865158081, "logps/rejected": -3.8082995414733887, "loss": 3.7828, "rewards/accuracies": 0.75, "rewards/chosen": -36.70616912841797, "rewards/margins": 1.3768248558044434, "rewards/rejected": -38.08299255371094, "step": 3457 }, { "epoch": 0.47086056644880175, "grad_norm": 36.30446747881619, "learning_rate": 5.091639454437905e-07, "logits/chosen": 12.450292587280273, "logits/rejected": 12.301746368408203, "logps/chosen": -3.8192553520202637, "logps/rejected": -4.201648712158203, "loss": 3.6863, "rewards/accuracies": 1.0, "rewards/chosen": -38.19255065917969, "rewards/margins": 3.8239316940307617, "rewards/rejected": -42.01648712158203, "step": 3458 }, { "epoch": 0.4709967320261438, "grad_norm": 40.77266964223582, "learning_rate": 5.089810106442405e-07, "logits/chosen": 12.738683700561523, "logits/rejected": 13.837105751037598, "logps/chosen": -3.9050838947296143, "logps/rejected": -4.1841278076171875, "loss": 3.9435, "rewards/accuracies": 0.5, "rewards/chosen": -39.050838470458984, "rewards/margins": 2.79044246673584, "rewards/rejected": -41.841278076171875, "step": 3459 }, { "epoch": 0.4711328976034858, "grad_norm": 39.61860766873645, "learning_rate": 5.087980512195303e-07, "logits/chosen": 13.052399635314941, "logits/rejected": 12.63260269165039, "logps/chosen": -3.9796247482299805, "logps/rejected": -4.002251625061035, "loss": 3.6497, "rewards/accuracies": 0.75, "rewards/chosen": -39.79624938964844, "rewards/margins": 0.22626686096191406, "rewards/rejected": -40.02251434326172, "step": 3460 }, { "epoch": 0.4712690631808279, "grad_norm": 46.60566274600885, "learning_rate": 5.086150672110012e-07, "logits/chosen": 11.978216171264648, "logits/rejected": 12.721328735351562, "logps/chosen": -3.652860164642334, "logps/rejected": -3.825747013092041, "loss": 4.4823, "rewards/accuracies": 0.75, "rewards/chosen": -36.52859878540039, "rewards/margins": 1.7288694381713867, "rewards/rejected": -38.257469177246094, "step": 3461 }, { "epoch": 0.4714052287581699, "grad_norm": 40.19484930829026, "learning_rate": 5.084320586599997e-07, "logits/chosen": 11.63119125366211, "logits/rejected": 11.820565223693848, "logps/chosen": -3.5673370361328125, "logps/rejected": -3.974036693572998, "loss": 3.566, "rewards/accuracies": 1.0, "rewards/chosen": -35.673370361328125, "rewards/margins": 4.066995620727539, "rewards/rejected": -39.74036407470703, "step": 3462 }, { "epoch": 0.471541394335512, "grad_norm": 41.12996305449867, "learning_rate": 5.082490256078784e-07, "logits/chosen": 11.862434387207031, "logits/rejected": 12.710051536560059, "logps/chosen": -3.8197031021118164, "logps/rejected": -4.213901042938232, "loss": 3.9058, "rewards/accuracies": 0.75, "rewards/chosen": -38.19702911376953, "rewards/margins": 3.94197940826416, "rewards/rejected": -42.13901138305664, "step": 3463 }, { "epoch": 0.47167755991285404, "grad_norm": 41.41955098065617, "learning_rate": 5.080659680959947e-07, "logits/chosen": 10.89098072052002, "logits/rejected": 11.674561500549316, "logps/chosen": -3.2527456283569336, "logps/rejected": -3.8242807388305664, "loss": 4.3196, "rewards/accuracies": 1.0, "rewards/chosen": -32.52745819091797, "rewards/margins": 5.7153496742248535, "rewards/rejected": -38.24280548095703, "step": 3464 }, { "epoch": 0.47181372549019607, "grad_norm": 47.42630760810606, "learning_rate": 5.078828861657125e-07, "logits/chosen": 11.371444702148438, "logits/rejected": 11.810405731201172, "logps/chosen": -3.5233006477355957, "logps/rejected": -3.872683048248291, "loss": 4.5427, "rewards/accuracies": 0.75, "rewards/chosen": -35.233009338378906, "rewards/margins": 3.4938225746154785, "rewards/rejected": -38.726829528808594, "step": 3465 }, { "epoch": 0.47194989106753815, "grad_norm": 37.54201659518873, "learning_rate": 5.076997798584003e-07, "logits/chosen": 11.879953384399414, "logits/rejected": 12.470720291137695, "logps/chosen": -3.5439417362213135, "logps/rejected": -3.860273838043213, "loss": 3.9902, "rewards/accuracies": 0.75, "rewards/chosen": -35.439414978027344, "rewards/margins": 3.163320541381836, "rewards/rejected": -38.60273742675781, "step": 3466 }, { "epoch": 0.4720860566448802, "grad_norm": 42.54097305288933, "learning_rate": 5.075166492154325e-07, "logits/chosen": 12.509862899780273, "logits/rejected": 12.281126022338867, "logps/chosen": -3.8168983459472656, "logps/rejected": -3.646629571914673, "loss": 4.2313, "rewards/accuracies": 0.0, "rewards/chosen": -38.168983459472656, "rewards/margins": -1.7026901245117188, "rewards/rejected": -36.46629333496094, "step": 3467 }, { "epoch": 0.4722222222222222, "grad_norm": 45.19796722923867, "learning_rate": 5.073334942781893e-07, "logits/chosen": 12.20610523223877, "logits/rejected": 12.574386596679688, "logps/chosen": -3.6947953701019287, "logps/rejected": -3.842686653137207, "loss": 3.8873, "rewards/accuracies": 0.5, "rewards/chosen": -36.94795227050781, "rewards/margins": 1.4789142608642578, "rewards/rejected": -38.42686462402344, "step": 3468 }, { "epoch": 0.4723583877995643, "grad_norm": 37.86723913898076, "learning_rate": 5.071503150880556e-07, "logits/chosen": 11.225770950317383, "logits/rejected": 11.564740180969238, "logps/chosen": -3.361736297607422, "logps/rejected": -3.5291695594787598, "loss": 3.7291, "rewards/accuracies": 0.5, "rewards/chosen": -33.61736297607422, "rewards/margins": 1.6743335723876953, "rewards/rejected": -35.29169464111328, "step": 3469 }, { "epoch": 0.4724945533769063, "grad_norm": 44.930213829448334, "learning_rate": 5.069671116864226e-07, "logits/chosen": 11.971406936645508, "logits/rejected": 12.699945449829102, "logps/chosen": -3.4753403663635254, "logps/rejected": -4.051022529602051, "loss": 4.1136, "rewards/accuracies": 1.0, "rewards/chosen": -34.75340270996094, "rewards/margins": 5.756823539733887, "rewards/rejected": -40.51022720336914, "step": 3470 }, { "epoch": 0.47263071895424835, "grad_norm": 42.10277823350386, "learning_rate": 5.067838841146865e-07, "logits/chosen": 12.335308074951172, "logits/rejected": 13.208130836486816, "logps/chosen": -4.225555419921875, "logps/rejected": -4.276361465454102, "loss": 3.9007, "rewards/accuracies": 0.5, "rewards/chosen": -42.25555419921875, "rewards/margins": 0.5080604553222656, "rewards/rejected": -42.76361846923828, "step": 3471 }, { "epoch": 0.47276688453159044, "grad_norm": 42.020472147274845, "learning_rate": 5.06600632414249e-07, "logits/chosen": 12.21044921875, "logits/rejected": 12.26645278930664, "logps/chosen": -3.562072992324829, "logps/rejected": -3.538632869720459, "loss": 4.1441, "rewards/accuracies": 0.5, "rewards/chosen": -35.620731353759766, "rewards/margins": -0.23440265655517578, "rewards/rejected": -35.386329650878906, "step": 3472 }, { "epoch": 0.47290305010893247, "grad_norm": 45.21606606916179, "learning_rate": 5.064173566265177e-07, "logits/chosen": 11.510866165161133, "logits/rejected": 11.76384162902832, "logps/chosen": -3.630255937576294, "logps/rejected": -3.6889586448669434, "loss": 4.5102, "rewards/accuracies": 0.5, "rewards/chosen": -36.30255889892578, "rewards/margins": 0.5870246887207031, "rewards/rejected": -36.88958740234375, "step": 3473 }, { "epoch": 0.4730392156862745, "grad_norm": 42.46079241813652, "learning_rate": 5.062340567929048e-07, "logits/chosen": 11.358606338500977, "logits/rejected": 12.952411651611328, "logps/chosen": -3.6439208984375, "logps/rejected": -3.9061522483825684, "loss": 4.0746, "rewards/accuracies": 0.75, "rewards/chosen": -36.439208984375, "rewards/margins": 2.622311592102051, "rewards/rejected": -39.061519622802734, "step": 3474 }, { "epoch": 0.4731753812636166, "grad_norm": 41.72360052202471, "learning_rate": 5.060507329548286e-07, "logits/chosen": 12.746675491333008, "logits/rejected": 12.059465408325195, "logps/chosen": -3.9668376445770264, "logps/rejected": -3.9344022274017334, "loss": 4.2708, "rewards/accuracies": 0.5, "rewards/chosen": -39.66837692260742, "rewards/margins": -0.3243570327758789, "rewards/rejected": -39.34402084350586, "step": 3475 }, { "epoch": 0.4733115468409586, "grad_norm": 39.19705206829824, "learning_rate": 5.058673851537127e-07, "logits/chosen": 11.439704895019531, "logits/rejected": 11.922822952270508, "logps/chosen": -3.7337846755981445, "logps/rejected": -3.92537784576416, "loss": 3.7436, "rewards/accuracies": 0.75, "rewards/chosen": -37.33784484863281, "rewards/margins": 1.9159317016601562, "rewards/rejected": -39.25377655029297, "step": 3476 }, { "epoch": 0.47344771241830064, "grad_norm": 49.60804071973147, "learning_rate": 5.056840134309862e-07, "logits/chosen": 11.478439331054688, "logits/rejected": 12.530519485473633, "logps/chosen": -3.2952795028686523, "logps/rejected": -3.697934150695801, "loss": 3.6391, "rewards/accuracies": 1.0, "rewards/chosen": -32.952796936035156, "rewards/margins": 4.026546955108643, "rewards/rejected": -36.979339599609375, "step": 3477 }, { "epoch": 0.4735838779956427, "grad_norm": 39.57896397282145, "learning_rate": 5.05500617828083e-07, "logits/chosen": 11.828336715698242, "logits/rejected": 12.478372573852539, "logps/chosen": -3.9435012340545654, "logps/rejected": -4.221234321594238, "loss": 3.6764, "rewards/accuracies": 1.0, "rewards/chosen": -39.43501281738281, "rewards/margins": 2.777332305908203, "rewards/rejected": -42.21234893798828, "step": 3478 }, { "epoch": 0.47372004357298475, "grad_norm": 58.93326185676813, "learning_rate": 5.053171983864433e-07, "logits/chosen": 11.972352981567383, "logits/rejected": 12.374734878540039, "logps/chosen": -3.8191745281219482, "logps/rejected": -3.9574475288391113, "loss": 4.6883, "rewards/accuracies": 0.75, "rewards/chosen": -38.19174575805664, "rewards/margins": 1.3827285766601562, "rewards/rejected": -39.5744743347168, "step": 3479 }, { "epoch": 0.4738562091503268, "grad_norm": 43.888296754226985, "learning_rate": 5.05133755147512e-07, "logits/chosen": 12.131919860839844, "logits/rejected": 12.15034294128418, "logps/chosen": -3.8155674934387207, "logps/rejected": -4.094702243804932, "loss": 4.172, "rewards/accuracies": 0.75, "rewards/chosen": -38.155677795410156, "rewards/margins": 2.7913475036621094, "rewards/rejected": -40.947021484375, "step": 3480 }, { "epoch": 0.47399237472766886, "grad_norm": 71.73713181967223, "learning_rate": 5.049502881527398e-07, "logits/chosen": 12.222299575805664, "logits/rejected": 11.931405067443848, "logps/chosen": -3.6266558170318604, "logps/rejected": -3.9701547622680664, "loss": 3.904, "rewards/accuracies": 0.75, "rewards/chosen": -36.26655578613281, "rewards/margins": 3.4349870681762695, "rewards/rejected": -39.70154571533203, "step": 3481 }, { "epoch": 0.4741285403050109, "grad_norm": 42.98180749921479, "learning_rate": 5.047667974435823e-07, "logits/chosen": 11.585394859313965, "logits/rejected": 12.051946640014648, "logps/chosen": -3.9319210052490234, "logps/rejected": -4.0822343826293945, "loss": 4.1721, "rewards/accuracies": 0.75, "rewards/chosen": -39.319210052490234, "rewards/margins": 1.503133773803711, "rewards/rejected": -40.82234191894531, "step": 3482 }, { "epoch": 0.4742647058823529, "grad_norm": 38.58213123843918, "learning_rate": 5.04583283061501e-07, "logits/chosen": 12.141987800598145, "logits/rejected": 11.717309951782227, "logps/chosen": -3.5014517307281494, "logps/rejected": -3.8760876655578613, "loss": 4.4459, "rewards/accuracies": 1.0, "rewards/chosen": -35.01451873779297, "rewards/margins": 3.7463579177856445, "rewards/rejected": -38.7608757019043, "step": 3483 }, { "epoch": 0.474400871459695, "grad_norm": 41.40100381556139, "learning_rate": 5.043997450479622e-07, "logits/chosen": 11.191793441772461, "logits/rejected": 11.985594749450684, "logps/chosen": -3.4412879943847656, "logps/rejected": -3.612579345703125, "loss": 3.3491, "rewards/accuracies": 0.75, "rewards/chosen": -34.412879943847656, "rewards/margins": 1.7129144668579102, "rewards/rejected": -36.12579345703125, "step": 3484 }, { "epoch": 0.47453703703703703, "grad_norm": 38.74659384347768, "learning_rate": 5.042161834444383e-07, "logits/chosen": 12.44677734375, "logits/rejected": 12.774826049804688, "logps/chosen": -3.773613214492798, "logps/rejected": -3.8290603160858154, "loss": 3.7157, "rewards/accuracies": 0.5, "rewards/chosen": -37.73613357543945, "rewards/margins": 0.554469108581543, "rewards/rejected": -38.29060363769531, "step": 3485 }, { "epoch": 0.47467320261437906, "grad_norm": 37.14021747753097, "learning_rate": 5.040325982924062e-07, "logits/chosen": 11.704120635986328, "logits/rejected": 11.77543830871582, "logps/chosen": -3.611314058303833, "logps/rejected": -3.782932758331299, "loss": 3.4404, "rewards/accuracies": 1.0, "rewards/chosen": -36.11314010620117, "rewards/margins": 1.7161865234375, "rewards/rejected": -37.82932662963867, "step": 3486 }, { "epoch": 0.47480936819172115, "grad_norm": 39.76764258268582, "learning_rate": 5.038489896333485e-07, "logits/chosen": 11.572366714477539, "logits/rejected": 11.648672103881836, "logps/chosen": -3.998452663421631, "logps/rejected": -3.8632473945617676, "loss": 4.2828, "rewards/accuracies": 0.25, "rewards/chosen": -39.984527587890625, "rewards/margins": -1.3520526885986328, "rewards/rejected": -38.63247299194336, "step": 3487 }, { "epoch": 0.4749455337690632, "grad_norm": 45.913905132841165, "learning_rate": 5.036653575087533e-07, "logits/chosen": 11.812649726867676, "logits/rejected": 10.447025299072266, "logps/chosen": -3.7109856605529785, "logps/rejected": -3.6673827171325684, "loss": 4.4753, "rewards/accuracies": 0.5, "rewards/chosen": -37.10985565185547, "rewards/margins": -0.43602800369262695, "rewards/rejected": -36.673828125, "step": 3488 }, { "epoch": 0.4750816993464052, "grad_norm": 41.36652819674711, "learning_rate": 5.034817019601135e-07, "logits/chosen": 11.935245513916016, "logits/rejected": 12.272092819213867, "logps/chosen": -4.028438091278076, "logps/rejected": -3.9867608547210693, "loss": 3.7948, "rewards/accuracies": 0.5, "rewards/chosen": -40.28437805175781, "rewards/margins": -0.41677284240722656, "rewards/rejected": -39.86760711669922, "step": 3489 }, { "epoch": 0.4752178649237473, "grad_norm": 45.634604168246675, "learning_rate": 5.032980230289279e-07, "logits/chosen": 12.15899658203125, "logits/rejected": 12.324483871459961, "logps/chosen": -3.7860665321350098, "logps/rejected": -3.927882671356201, "loss": 4.8787, "rewards/accuracies": 0.75, "rewards/chosen": -37.86066436767578, "rewards/margins": 1.4181604385375977, "rewards/rejected": -39.27882766723633, "step": 3490 }, { "epoch": 0.4753540305010893, "grad_norm": 38.27980173091141, "learning_rate": 5.031143207567001e-07, "logits/chosen": 11.33129596710205, "logits/rejected": 11.758951187133789, "logps/chosen": -3.5595645904541016, "logps/rejected": -3.9050846099853516, "loss": 3.861, "rewards/accuracies": 0.75, "rewards/chosen": -35.59564971923828, "rewards/margins": 3.455197334289551, "rewards/rejected": -39.050846099853516, "step": 3491 }, { "epoch": 0.47549019607843135, "grad_norm": 39.56801752670169, "learning_rate": 5.029305951849391e-07, "logits/chosen": 11.435375213623047, "logits/rejected": 11.565065383911133, "logps/chosen": -3.7125802040100098, "logps/rejected": -3.9889674186706543, "loss": 3.9497, "rewards/accuracies": 0.75, "rewards/chosen": -37.12580108642578, "rewards/margins": 2.7638721466064453, "rewards/rejected": -39.889671325683594, "step": 3492 }, { "epoch": 0.47562636165577343, "grad_norm": 36.32685558514663, "learning_rate": 5.027468463551594e-07, "logits/chosen": 11.817644119262695, "logits/rejected": 12.124775886535645, "logps/chosen": -3.8272128105163574, "logps/rejected": -3.8123490810394287, "loss": 3.8168, "rewards/accuracies": 0.5, "rewards/chosen": -38.27212905883789, "rewards/margins": -0.14864063262939453, "rewards/rejected": -38.12348937988281, "step": 3493 }, { "epoch": 0.47576252723311546, "grad_norm": 40.424900926065185, "learning_rate": 5.025630743088804e-07, "logits/chosen": 11.569253921508789, "logits/rejected": 11.546173095703125, "logps/chosen": -3.66904354095459, "logps/rejected": -3.6252338886260986, "loss": 4.4875, "rewards/accuracies": 0.5, "rewards/chosen": -36.69043731689453, "rewards/margins": -0.4380941390991211, "rewards/rejected": -36.252342224121094, "step": 3494 }, { "epoch": 0.4758986928104575, "grad_norm": 39.68191809208189, "learning_rate": 5.023792790876269e-07, "logits/chosen": 11.746826171875, "logits/rejected": 12.523330688476562, "logps/chosen": -3.536752700805664, "logps/rejected": -4.037148475646973, "loss": 4.0612, "rewards/accuracies": 1.0, "rewards/chosen": -35.367530822753906, "rewards/margins": 5.003955841064453, "rewards/rejected": -40.371482849121094, "step": 3495 }, { "epoch": 0.4760348583877996, "grad_norm": 35.36398357189209, "learning_rate": 5.021954607329291e-07, "logits/chosen": 12.809257507324219, "logits/rejected": 12.532520294189453, "logps/chosen": -3.896799325942993, "logps/rejected": -4.062444686889648, "loss": 3.707, "rewards/accuracies": 0.5, "rewards/chosen": -38.967994689941406, "rewards/margins": 1.656454086303711, "rewards/rejected": -40.624446868896484, "step": 3496 }, { "epoch": 0.4761710239651416, "grad_norm": 41.95335685914283, "learning_rate": 5.02011619286322e-07, "logits/chosen": 12.668167114257812, "logits/rejected": 12.61221981048584, "logps/chosen": -4.074456691741943, "logps/rejected": -4.0377631187438965, "loss": 4.5238, "rewards/accuracies": 0.5, "rewards/chosen": -40.74456787109375, "rewards/margins": -0.36693668365478516, "rewards/rejected": -40.37763214111328, "step": 3497 }, { "epoch": 0.47630718954248363, "grad_norm": 37.751796067294215, "learning_rate": 5.018277547893465e-07, "logits/chosen": 12.871786117553711, "logits/rejected": 13.208709716796875, "logps/chosen": -3.835543394088745, "logps/rejected": -4.419190883636475, "loss": 4.0058, "rewards/accuracies": 1.0, "rewards/chosen": -38.355430603027344, "rewards/margins": 5.836475372314453, "rewards/rejected": -44.19190979003906, "step": 3498 }, { "epoch": 0.4764433551198257, "grad_norm": 35.2979800417841, "learning_rate": 5.016438672835481e-07, "logits/chosen": 12.385981559753418, "logits/rejected": 12.183572769165039, "logps/chosen": -3.986156463623047, "logps/rejected": -4.388952255249023, "loss": 3.8235, "rewards/accuracies": 0.75, "rewards/chosen": -39.86156463623047, "rewards/margins": 4.027958869934082, "rewards/rejected": -43.889522552490234, "step": 3499 }, { "epoch": 0.47657952069716775, "grad_norm": 40.66814693158597, "learning_rate": 5.014599568104776e-07, "logits/chosen": 12.839675903320312, "logits/rejected": 12.673471450805664, "logps/chosen": -3.66860294342041, "logps/rejected": -4.050067901611328, "loss": 4.1118, "rewards/accuracies": 0.75, "rewards/chosen": -36.68602752685547, "rewards/margins": 3.8146467208862305, "rewards/rejected": -40.50067901611328, "step": 3500 }, { "epoch": 0.47671568627450983, "grad_norm": 41.0409708254771, "learning_rate": 5.012760234116912e-07, "logits/chosen": 12.289636611938477, "logits/rejected": 12.34727668762207, "logps/chosen": -3.9257702827453613, "logps/rejected": -4.004403114318848, "loss": 3.8866, "rewards/accuracies": 0.5, "rewards/chosen": -39.2577018737793, "rewards/margins": 0.7863292694091797, "rewards/rejected": -40.04403305053711, "step": 3501 }, { "epoch": 0.47685185185185186, "grad_norm": 38.33890385550245, "learning_rate": 5.010920671287501e-07, "logits/chosen": 11.974674224853516, "logits/rejected": 12.507078170776367, "logps/chosen": -3.850208282470703, "logps/rejected": -3.984550952911377, "loss": 3.9414, "rewards/accuracies": 0.5, "rewards/chosen": -38.50208282470703, "rewards/margins": 1.343428611755371, "rewards/rejected": -39.84551239013672, "step": 3502 }, { "epoch": 0.4769880174291939, "grad_norm": 39.21501340127492, "learning_rate": 5.00908088003221e-07, "logits/chosen": 13.037240028381348, "logits/rejected": 13.128662109375, "logps/chosen": -4.10344123840332, "logps/rejected": -4.149472713470459, "loss": 3.7687, "rewards/accuracies": 0.5, "rewards/chosen": -41.03440856933594, "rewards/margins": 0.4603147506713867, "rewards/rejected": -41.494728088378906, "step": 3503 }, { "epoch": 0.477124183006536, "grad_norm": 48.63150106660592, "learning_rate": 5.007240860766751e-07, "logits/chosen": 12.227212905883789, "logits/rejected": 12.605988502502441, "logps/chosen": -4.2047119140625, "logps/rejected": -4.463476181030273, "loss": 3.845, "rewards/accuracies": 0.75, "rewards/chosen": -42.047119140625, "rewards/margins": 2.5876407623291016, "rewards/rejected": -44.63475799560547, "step": 3504 }, { "epoch": 0.477260348583878, "grad_norm": 40.60781254544986, "learning_rate": 5.005400613906894e-07, "logits/chosen": 12.01440143585205, "logits/rejected": 13.409481048583984, "logps/chosen": -3.599169969558716, "logps/rejected": -3.9019381999969482, "loss": 4.4844, "rewards/accuracies": 0.75, "rewards/chosen": -35.99169921875, "rewards/margins": 3.027682304382324, "rewards/rejected": -39.01938247680664, "step": 3505 }, { "epoch": 0.47739651416122003, "grad_norm": 55.31571920890174, "learning_rate": 5.003560139868457e-07, "logits/chosen": 12.844975471496582, "logits/rejected": 13.088146209716797, "logps/chosen": -4.172163963317871, "logps/rejected": -4.247830867767334, "loss": 4.5237, "rewards/accuracies": 0.75, "rewards/chosen": -41.72163772583008, "rewards/margins": 0.7566690444946289, "rewards/rejected": -42.478309631347656, "step": 3506 }, { "epoch": 0.4775326797385621, "grad_norm": 42.847913386831124, "learning_rate": 5.001719439067312e-07, "logits/chosen": 12.302297592163086, "logits/rejected": 12.728625297546387, "logps/chosen": -3.6514365673065186, "logps/rejected": -3.9075441360473633, "loss": 4.7926, "rewards/accuracies": 0.75, "rewards/chosen": -36.514366149902344, "rewards/margins": 2.561075210571289, "rewards/rejected": -39.075443267822266, "step": 3507 }, { "epoch": 0.47766884531590414, "grad_norm": 40.35378931126287, "learning_rate": 4.999878511919378e-07, "logits/chosen": 11.850746154785156, "logits/rejected": 13.79525375366211, "logps/chosen": -3.8961174488067627, "logps/rejected": -4.645395278930664, "loss": 3.6449, "rewards/accuracies": 1.0, "rewards/chosen": -38.96117401123047, "rewards/margins": 7.4927778244018555, "rewards/rejected": -46.45395278930664, "step": 3508 }, { "epoch": 0.4778050108932462, "grad_norm": 42.78050485053529, "learning_rate": 4.998037358840632e-07, "logits/chosen": 11.651148796081543, "logits/rejected": 13.481695175170898, "logps/chosen": -3.9598660469055176, "logps/rejected": -4.080420970916748, "loss": 4.2027, "rewards/accuracies": 0.75, "rewards/chosen": -39.59865951538086, "rewards/margins": 1.2055492401123047, "rewards/rejected": -40.80420684814453, "step": 3509 }, { "epoch": 0.47794117647058826, "grad_norm": 42.41172579293135, "learning_rate": 4.996195980247091e-07, "logits/chosen": 12.238384246826172, "logits/rejected": 12.06022834777832, "logps/chosen": -3.715287208557129, "logps/rejected": -3.9540929794311523, "loss": 3.7361, "rewards/accuracies": 0.75, "rewards/chosen": -37.152870178222656, "rewards/margins": 2.388059616088867, "rewards/rejected": -39.540931701660156, "step": 3510 }, { "epoch": 0.4780773420479303, "grad_norm": 38.04691182674281, "learning_rate": 4.994354376554836e-07, "logits/chosen": 13.107817649841309, "logits/rejected": 13.088651657104492, "logps/chosen": -4.088644981384277, "logps/rejected": -4.661755084991455, "loss": 3.7597, "rewards/accuracies": 1.0, "rewards/chosen": -40.886451721191406, "rewards/margins": 5.731098175048828, "rewards/rejected": -46.617549896240234, "step": 3511 }, { "epoch": 0.4782135076252723, "grad_norm": 43.5268029268446, "learning_rate": 4.99251254817999e-07, "logits/chosen": 12.927136421203613, "logits/rejected": 13.818674087524414, "logps/chosen": -4.177899360656738, "logps/rejected": -4.106654167175293, "loss": 3.9369, "rewards/accuracies": 0.5, "rewards/chosen": -41.77899169921875, "rewards/margins": -0.7124500274658203, "rewards/rejected": -41.06654357910156, "step": 3512 }, { "epoch": 0.4783496732026144, "grad_norm": 44.4233824152579, "learning_rate": 4.99067049553873e-07, "logits/chosen": 12.840009689331055, "logits/rejected": 13.162504196166992, "logps/chosen": -4.113978385925293, "logps/rejected": -4.108767509460449, "loss": 4.3269, "rewards/accuracies": 0.75, "rewards/chosen": -41.13978576660156, "rewards/margins": -0.052109718322753906, "rewards/rejected": -41.087677001953125, "step": 3513 }, { "epoch": 0.47848583877995643, "grad_norm": 41.96739848695939, "learning_rate": 4.988828219047282e-07, "logits/chosen": 12.517669677734375, "logits/rejected": 12.469911575317383, "logps/chosen": -4.3294525146484375, "logps/rejected": -4.236301422119141, "loss": 3.9704, "rewards/accuracies": 0.25, "rewards/chosen": -43.294525146484375, "rewards/margins": -0.9315128326416016, "rewards/rejected": -42.363014221191406, "step": 3514 }, { "epoch": 0.47862200435729846, "grad_norm": 40.77515020726763, "learning_rate": 4.986985719121923e-07, "logits/chosen": 11.900520324707031, "logits/rejected": 13.335115432739258, "logps/chosen": -4.071834564208984, "logps/rejected": -4.459545612335205, "loss": 3.4864, "rewards/accuracies": 1.0, "rewards/chosen": -40.71834945678711, "rewards/margins": 3.877108573913574, "rewards/rejected": -44.595458984375, "step": 3515 }, { "epoch": 0.47875816993464054, "grad_norm": 44.838630577691895, "learning_rate": 4.985142996178984e-07, "logits/chosen": 13.367433547973633, "logits/rejected": 13.735034942626953, "logps/chosen": -4.091340065002441, "logps/rejected": -4.322702407836914, "loss": 4.0809, "rewards/accuracies": 0.5, "rewards/chosen": -40.91339874267578, "rewards/margins": 2.3136253356933594, "rewards/rejected": -43.227020263671875, "step": 3516 }, { "epoch": 0.47889433551198257, "grad_norm": 40.27168046833205, "learning_rate": 4.983300050634841e-07, "logits/chosen": 13.089839935302734, "logits/rejected": 12.719974517822266, "logps/chosen": -4.428625106811523, "logps/rejected": -4.419182777404785, "loss": 3.6937, "rewards/accuracies": 0.75, "rewards/chosen": -44.28624725341797, "rewards/margins": -0.09442329406738281, "rewards/rejected": -44.19182586669922, "step": 3517 }, { "epoch": 0.4790305010893246, "grad_norm": 44.82094337216361, "learning_rate": 4.981456882905924e-07, "logits/chosen": 12.683393478393555, "logits/rejected": 13.501692771911621, "logps/chosen": -4.174333572387695, "logps/rejected": -4.179903984069824, "loss": 4.1122, "rewards/accuracies": 0.5, "rewards/chosen": -41.74333572387695, "rewards/margins": 0.055706024169921875, "rewards/rejected": -41.799041748046875, "step": 3518 }, { "epoch": 0.4791666666666667, "grad_norm": 43.973501205119504, "learning_rate": 4.979613493408711e-07, "logits/chosen": 11.395611763000488, "logits/rejected": 11.853755950927734, "logps/chosen": -3.4689369201660156, "logps/rejected": -3.7948896884918213, "loss": 4.2585, "rewards/accuracies": 0.75, "rewards/chosen": -34.68936538696289, "rewards/margins": 3.2595300674438477, "rewards/rejected": -37.94889831542969, "step": 3519 }, { "epoch": 0.4793028322440087, "grad_norm": 44.27985973394065, "learning_rate": 4.977769882559731e-07, "logits/chosen": 12.684640884399414, "logits/rejected": 12.384490966796875, "logps/chosen": -3.8263132572174072, "logps/rejected": -3.9179186820983887, "loss": 4.5108, "rewards/accuracies": 0.75, "rewards/chosen": -38.26313018798828, "rewards/margins": 0.9160547256469727, "rewards/rejected": -39.1791877746582, "step": 3520 }, { "epoch": 0.47943899782135074, "grad_norm": 47.0023330690674, "learning_rate": 4.975926050775565e-07, "logits/chosen": 11.576650619506836, "logits/rejected": 11.907279014587402, "logps/chosen": -4.142636299133301, "logps/rejected": -4.234591484069824, "loss": 3.8089, "rewards/accuracies": 0.75, "rewards/chosen": -41.426368713378906, "rewards/margins": 0.9195489883422852, "rewards/rejected": -42.345916748046875, "step": 3521 }, { "epoch": 0.4795751633986928, "grad_norm": 44.793968288116446, "learning_rate": 4.97408199847284e-07, "logits/chosen": 12.38614559173584, "logits/rejected": 12.927023887634277, "logps/chosen": -3.955366611480713, "logps/rejected": -4.214727401733398, "loss": 4.2289, "rewards/accuracies": 0.5, "rewards/chosen": -39.55366516113281, "rewards/margins": 2.593611717224121, "rewards/rejected": -42.14727783203125, "step": 3522 }, { "epoch": 0.47971132897603486, "grad_norm": 40.6841597648468, "learning_rate": 4.972237726068236e-07, "logits/chosen": 12.02618408203125, "logits/rejected": 12.642339706420898, "logps/chosen": -3.846045970916748, "logps/rejected": -4.230076789855957, "loss": 4.2771, "rewards/accuracies": 1.0, "rewards/chosen": -38.4604606628418, "rewards/margins": 3.8403072357177734, "rewards/rejected": -42.30076599121094, "step": 3523 }, { "epoch": 0.4798474945533769, "grad_norm": 42.27178296990105, "learning_rate": 4.970393233978481e-07, "logits/chosen": 13.324297904968262, "logits/rejected": 13.23814582824707, "logps/chosen": -4.290493011474609, "logps/rejected": -4.490062236785889, "loss": 3.9366, "rewards/accuracies": 0.75, "rewards/chosen": -42.90493392944336, "rewards/margins": 1.9956884384155273, "rewards/rejected": -44.90061950683594, "step": 3524 }, { "epoch": 0.47998366013071897, "grad_norm": 42.16077338104376, "learning_rate": 4.968548522620353e-07, "logits/chosen": 11.984010696411133, "logits/rejected": 12.873748779296875, "logps/chosen": -3.9506049156188965, "logps/rejected": -4.458793640136719, "loss": 4.0947, "rewards/accuracies": 0.75, "rewards/chosen": -39.50605010986328, "rewards/margins": 5.081890106201172, "rewards/rejected": -44.58793640136719, "step": 3525 }, { "epoch": 0.480119825708061, "grad_norm": 41.66990618213027, "learning_rate": 4.966703592410681e-07, "logits/chosen": 12.194341659545898, "logits/rejected": 12.92635726928711, "logps/chosen": -3.7831850051879883, "logps/rejected": -3.8236827850341797, "loss": 3.9312, "rewards/accuracies": 0.75, "rewards/chosen": -37.831851959228516, "rewards/margins": 0.40497779846191406, "rewards/rejected": -38.2368278503418, "step": 3526 }, { "epoch": 0.480255991285403, "grad_norm": 43.16081660891028, "learning_rate": 4.964858443766341e-07, "logits/chosen": 12.780776023864746, "logits/rejected": 12.0275239944458, "logps/chosen": -4.068262100219727, "logps/rejected": -4.103792190551758, "loss": 4.5056, "rewards/accuracies": 0.5, "rewards/chosen": -40.682621002197266, "rewards/margins": 0.3552999496459961, "rewards/rejected": -41.03791809082031, "step": 3527 }, { "epoch": 0.4803921568627451, "grad_norm": 46.50812572308816, "learning_rate": 4.96301307710426e-07, "logits/chosen": 12.151615142822266, "logits/rejected": 12.432489395141602, "logps/chosen": -4.012656211853027, "logps/rejected": -4.090597629547119, "loss": 3.6942, "rewards/accuracies": 0.75, "rewards/chosen": -40.126564025878906, "rewards/margins": 0.7794132232666016, "rewards/rejected": -40.905975341796875, "step": 3528 }, { "epoch": 0.48052832244008714, "grad_norm": 39.937854150794976, "learning_rate": 4.961167492841414e-07, "logits/chosen": 12.817266464233398, "logits/rejected": 13.414749145507812, "logps/chosen": -4.000683784484863, "logps/rejected": -4.200709819793701, "loss": 3.9378, "rewards/accuracies": 0.5, "rewards/chosen": -40.0068359375, "rewards/margins": 2.0002613067626953, "rewards/rejected": -42.00709533691406, "step": 3529 }, { "epoch": 0.48066448801742917, "grad_norm": 37.80937140782278, "learning_rate": 4.959321691394828e-07, "logits/chosen": 12.082345962524414, "logits/rejected": 12.936559677124023, "logps/chosen": -4.20680046081543, "logps/rejected": -4.34435510635376, "loss": 3.7076, "rewards/accuracies": 0.75, "rewards/chosen": -42.0680046081543, "rewards/margins": 1.3755483627319336, "rewards/rejected": -43.44355010986328, "step": 3530 }, { "epoch": 0.48080065359477125, "grad_norm": 46.59422447813834, "learning_rate": 4.957475673181576e-07, "logits/chosen": 11.993078231811523, "logits/rejected": 12.73066520690918, "logps/chosen": -3.796083688735962, "logps/rejected": -3.9498472213745117, "loss": 4.5964, "rewards/accuracies": 0.75, "rewards/chosen": -37.960838317871094, "rewards/margins": 1.5376367568969727, "rewards/rejected": -39.49847412109375, "step": 3531 }, { "epoch": 0.4809368191721133, "grad_norm": 42.946941220742126, "learning_rate": 4.955629438618782e-07, "logits/chosen": 12.994609832763672, "logits/rejected": 12.66382122039795, "logps/chosen": -4.073089599609375, "logps/rejected": -3.8780815601348877, "loss": 3.5229, "rewards/accuracies": 0.25, "rewards/chosen": -40.73089599609375, "rewards/margins": -1.9500799179077148, "rewards/rejected": -38.78081512451172, "step": 3532 }, { "epoch": 0.4810729847494553, "grad_norm": 43.42636925985765, "learning_rate": 4.953782988123615e-07, "logits/chosen": 13.725841522216797, "logits/rejected": 13.378562927246094, "logps/chosen": -4.295477867126465, "logps/rejected": -4.056504249572754, "loss": 4.6403, "rewards/accuracies": 0.25, "rewards/chosen": -42.95477294921875, "rewards/margins": -2.38973331451416, "rewards/rejected": -40.565040588378906, "step": 3533 }, { "epoch": 0.4812091503267974, "grad_norm": 37.78051314557664, "learning_rate": 4.951936322113299e-07, "logits/chosen": 11.61916732788086, "logits/rejected": 12.333108901977539, "logps/chosen": -3.6875691413879395, "logps/rejected": -4.060360908508301, "loss": 4.2723, "rewards/accuracies": 1.0, "rewards/chosen": -36.875694274902344, "rewards/margins": 3.727916717529297, "rewards/rejected": -40.603607177734375, "step": 3534 }, { "epoch": 0.4813453159041394, "grad_norm": 39.11396749988366, "learning_rate": 4.950089441005102e-07, "logits/chosen": 13.115728378295898, "logits/rejected": 13.272234916687012, "logps/chosen": -4.460468769073486, "logps/rejected": -4.511542797088623, "loss": 3.882, "rewards/accuracies": 0.75, "rewards/chosen": -44.60468673706055, "rewards/margins": 0.5107440948486328, "rewards/rejected": -45.11542892456055, "step": 3535 }, { "epoch": 0.48148148148148145, "grad_norm": 36.20336109205725, "learning_rate": 4.948242345216343e-07, "logits/chosen": 11.997478485107422, "logits/rejected": 12.76453685760498, "logps/chosen": -3.756732940673828, "logps/rejected": -3.7511069774627686, "loss": 3.9857, "rewards/accuracies": 0.5, "rewards/chosen": -37.56732940673828, "rewards/margins": -0.056260108947753906, "rewards/rejected": -37.511070251464844, "step": 3536 }, { "epoch": 0.48161764705882354, "grad_norm": 36.95742703435459, "learning_rate": 4.946395035164387e-07, "logits/chosen": 12.922111511230469, "logits/rejected": 12.700788497924805, "logps/chosen": -4.033163547515869, "logps/rejected": -3.853625535964966, "loss": 4.0591, "rewards/accuracies": 0.5, "rewards/chosen": -40.331634521484375, "rewards/margins": -1.795379638671875, "rewards/rejected": -38.5362548828125, "step": 3537 }, { "epoch": 0.48175381263616557, "grad_norm": 36.71667302027801, "learning_rate": 4.94454751126665e-07, "logits/chosen": 12.226764678955078, "logits/rejected": 13.02337646484375, "logps/chosen": -3.9374544620513916, "logps/rejected": -4.471248626708984, "loss": 3.5146, "rewards/accuracies": 0.75, "rewards/chosen": -39.374542236328125, "rewards/margins": 5.337940216064453, "rewards/rejected": -44.71248245239258, "step": 3538 }, { "epoch": 0.48188997821350765, "grad_norm": 41.458914503664914, "learning_rate": 4.942699773940595e-07, "logits/chosen": 12.513525009155273, "logits/rejected": 13.1861572265625, "logps/chosen": -3.6648502349853516, "logps/rejected": -4.067343711853027, "loss": 4.4301, "rewards/accuracies": 0.75, "rewards/chosen": -36.648502349853516, "rewards/margins": 4.024937629699707, "rewards/rejected": -40.67344284057617, "step": 3539 }, { "epoch": 0.4820261437908497, "grad_norm": 39.451981145342934, "learning_rate": 4.940851823603733e-07, "logits/chosen": 13.657211303710938, "logits/rejected": 12.330310821533203, "logps/chosen": -4.625269889831543, "logps/rejected": -4.273927211761475, "loss": 4.3191, "rewards/accuracies": 0.25, "rewards/chosen": -46.2526969909668, "rewards/margins": -3.513422966003418, "rewards/rejected": -42.73927307128906, "step": 3540 }, { "epoch": 0.4821623093681917, "grad_norm": 38.86205520548418, "learning_rate": 4.939003660673625e-07, "logits/chosen": 11.483196258544922, "logits/rejected": 13.404415130615234, "logps/chosen": -3.657036781311035, "logps/rejected": -4.196098327636719, "loss": 3.8186, "rewards/accuracies": 1.0, "rewards/chosen": -36.570369720458984, "rewards/margins": 5.39061164855957, "rewards/rejected": -41.96098327636719, "step": 3541 }, { "epoch": 0.4822984749455338, "grad_norm": 39.44984329011206, "learning_rate": 4.937155285567879e-07, "logits/chosen": 12.890340805053711, "logits/rejected": 12.764983177185059, "logps/chosen": -3.9492568969726562, "logps/rejected": -4.25186824798584, "loss": 3.6927, "rewards/accuracies": 0.75, "rewards/chosen": -39.49256896972656, "rewards/margins": 3.0261144638061523, "rewards/rejected": -42.51868438720703, "step": 3542 }, { "epoch": 0.4824346405228758, "grad_norm": 42.12952917696519, "learning_rate": 4.935306698704148e-07, "logits/chosen": 13.869108200073242, "logits/rejected": 14.10722541809082, "logps/chosen": -4.656030654907227, "logps/rejected": -4.645984649658203, "loss": 4.2452, "rewards/accuracies": 0.25, "rewards/chosen": -46.560306549072266, "rewards/margins": -0.1004648208618164, "rewards/rejected": -46.459842681884766, "step": 3543 }, { "epoch": 0.48257080610021785, "grad_norm": 39.63150363951132, "learning_rate": 4.933457900500138e-07, "logits/chosen": 11.627429962158203, "logits/rejected": 11.949345588684082, "logps/chosen": -4.087477684020996, "logps/rejected": -4.080287456512451, "loss": 4.2733, "rewards/accuracies": 0.5, "rewards/chosen": -40.87477493286133, "rewards/margins": -0.0718994140625, "rewards/rejected": -40.80287551879883, "step": 3544 }, { "epoch": 0.48270697167755994, "grad_norm": 40.85914400817608, "learning_rate": 4.931608891373599e-07, "logits/chosen": 12.56886100769043, "logits/rejected": 13.648090362548828, "logps/chosen": -3.874332904815674, "logps/rejected": -4.213517189025879, "loss": 4.5516, "rewards/accuracies": 0.75, "rewards/chosen": -38.74332809448242, "rewards/margins": 3.3918447494506836, "rewards/rejected": -42.13517379760742, "step": 3545 }, { "epoch": 0.48284313725490197, "grad_norm": 74.29703432242157, "learning_rate": 4.92975967174233e-07, "logits/chosen": 12.312530517578125, "logits/rejected": 12.35788345336914, "logps/chosen": -4.406402587890625, "logps/rejected": -4.563023567199707, "loss": 3.7554, "rewards/accuracies": 1.0, "rewards/chosen": -44.06402587890625, "rewards/margins": 1.5662050247192383, "rewards/rejected": -45.63023376464844, "step": 3546 }, { "epoch": 0.482979302832244, "grad_norm": 36.455542969968846, "learning_rate": 4.927910242024178e-07, "logits/chosen": 12.795209884643555, "logits/rejected": 12.403400421142578, "logps/chosen": -4.039184093475342, "logps/rejected": -4.002196311950684, "loss": 3.8849, "rewards/accuracies": 0.5, "rewards/chosen": -40.39183807373047, "rewards/margins": -0.3698749542236328, "rewards/rejected": -40.02196502685547, "step": 3547 }, { "epoch": 0.4831154684095861, "grad_norm": 37.65485298790751, "learning_rate": 4.926060602637037e-07, "logits/chosen": 12.387609481811523, "logits/rejected": 12.486883163452148, "logps/chosen": -4.193027973175049, "logps/rejected": -4.181723594665527, "loss": 4.0454, "rewards/accuracies": 0.5, "rewards/chosen": -41.93027877807617, "rewards/margins": -0.11304378509521484, "rewards/rejected": -41.817237854003906, "step": 3548 }, { "epoch": 0.4832516339869281, "grad_norm": 38.32157060543743, "learning_rate": 4.924210753998847e-07, "logits/chosen": 13.401561737060547, "logits/rejected": 13.229455947875977, "logps/chosen": -4.507783889770508, "logps/rejected": -4.227444648742676, "loss": 4.2576, "rewards/accuracies": 0.25, "rewards/chosen": -45.07783889770508, "rewards/margins": -2.803389549255371, "rewards/rejected": -42.27444839477539, "step": 3549 }, { "epoch": 0.48338779956427014, "grad_norm": 37.17360846155118, "learning_rate": 4.922360696527599e-07, "logits/chosen": 12.392963409423828, "logits/rejected": 12.368165969848633, "logps/chosen": -3.985203504562378, "logps/rejected": -4.154623985290527, "loss": 4.2454, "rewards/accuracies": 0.5, "rewards/chosen": -39.85203170776367, "rewards/margins": 1.6942024230957031, "rewards/rejected": -41.546234130859375, "step": 3550 }, { "epoch": 0.4835239651416122, "grad_norm": 37.48568742917298, "learning_rate": 4.920510430641327e-07, "logits/chosen": 12.230779647827148, "logits/rejected": 13.122891426086426, "logps/chosen": -4.048563480377197, "logps/rejected": -4.626950740814209, "loss": 3.8951, "rewards/accuracies": 0.75, "rewards/chosen": -40.485633850097656, "rewards/margins": 5.783872604370117, "rewards/rejected": -46.269508361816406, "step": 3551 }, { "epoch": 0.48366013071895425, "grad_norm": 41.041769977835884, "learning_rate": 4.918659956758113e-07, "logits/chosen": 12.572083473205566, "logits/rejected": 13.2900390625, "logps/chosen": -3.9619574546813965, "logps/rejected": -4.278987884521484, "loss": 3.537, "rewards/accuracies": 0.5, "rewards/chosen": -39.61957550048828, "rewards/margins": 3.1703062057495117, "rewards/rejected": -42.789878845214844, "step": 3552 }, { "epoch": 0.4837962962962963, "grad_norm": 36.28812233538065, "learning_rate": 4.916809275296089e-07, "logits/chosen": 12.763724327087402, "logits/rejected": 13.263540267944336, "logps/chosen": -4.320869445800781, "logps/rejected": -4.392861366271973, "loss": 4.111, "rewards/accuracies": 0.5, "rewards/chosen": -43.20869445800781, "rewards/margins": 0.7199153900146484, "rewards/rejected": -43.928611755371094, "step": 3553 }, { "epoch": 0.48393246187363836, "grad_norm": 41.82216161836628, "learning_rate": 4.914958386673431e-07, "logits/chosen": 12.605721473693848, "logits/rejected": 13.549335479736328, "logps/chosen": -4.184422492980957, "logps/rejected": -4.381346702575684, "loss": 3.9737, "rewards/accuracies": 0.75, "rewards/chosen": -41.84422302246094, "rewards/margins": 1.969247817993164, "rewards/rejected": -43.81346893310547, "step": 3554 }, { "epoch": 0.4840686274509804, "grad_norm": 43.90536804302736, "learning_rate": 4.91310729130836e-07, "logits/chosen": 12.146350860595703, "logits/rejected": 12.587074279785156, "logps/chosen": -3.9850759506225586, "logps/rejected": -4.094308853149414, "loss": 3.8314, "rewards/accuracies": 0.5, "rewards/chosen": -39.85075759887695, "rewards/margins": 1.0923280715942383, "rewards/rejected": -40.94308853149414, "step": 3555 }, { "epoch": 0.4842047930283224, "grad_norm": 38.06421739266808, "learning_rate": 4.911255989619151e-07, "logits/chosen": 12.540470123291016, "logits/rejected": 13.091697692871094, "logps/chosen": -4.154980659484863, "logps/rejected": -4.29099178314209, "loss": 4.593, "rewards/accuracies": 0.5, "rewards/chosen": -41.549808502197266, "rewards/margins": 1.3601083755493164, "rewards/rejected": -42.90991973876953, "step": 3556 }, { "epoch": 0.4843409586056645, "grad_norm": 36.01902344698906, "learning_rate": 4.90940448202412e-07, "logits/chosen": 12.74655532836914, "logits/rejected": 12.506661415100098, "logps/chosen": -3.855943441390991, "logps/rejected": -4.270119667053223, "loss": 4.1497, "rewards/accuracies": 0.75, "rewards/chosen": -38.5594367980957, "rewards/margins": 4.141762733459473, "rewards/rejected": -42.70119857788086, "step": 3557 }, { "epoch": 0.48447712418300654, "grad_norm": 38.46348508960071, "learning_rate": 4.907552768941626e-07, "logits/chosen": 12.929475784301758, "logits/rejected": 12.643096923828125, "logps/chosen": -4.105971336364746, "logps/rejected": -4.279247283935547, "loss": 3.8699, "rewards/accuracies": 0.75, "rewards/chosen": -41.05970764160156, "rewards/margins": 1.7327632904052734, "rewards/rejected": -42.79247283935547, "step": 3558 }, { "epoch": 0.48461328976034856, "grad_norm": 36.934903829198404, "learning_rate": 4.905700850790083e-07, "logits/chosen": 13.005461692810059, "logits/rejected": 12.90733814239502, "logps/chosen": -3.963033437728882, "logps/rejected": -4.0861101150512695, "loss": 3.9045, "rewards/accuracies": 0.5, "rewards/chosen": -39.63033676147461, "rewards/margins": 1.2307682037353516, "rewards/rejected": -40.861106872558594, "step": 3559 }, { "epoch": 0.48474945533769065, "grad_norm": 41.08406345191675, "learning_rate": 4.903848727987947e-07, "logits/chosen": 13.561529159545898, "logits/rejected": 12.881622314453125, "logps/chosen": -3.928037643432617, "logps/rejected": -4.308867931365967, "loss": 3.9798, "rewards/accuracies": 0.75, "rewards/chosen": -39.28037643432617, "rewards/margins": 3.8083009719848633, "rewards/rejected": -43.08867645263672, "step": 3560 }, { "epoch": 0.4848856209150327, "grad_norm": 39.08031090836272, "learning_rate": 4.901996400953718e-07, "logits/chosen": 12.899541854858398, "logits/rejected": 12.937345504760742, "logps/chosen": -3.9251723289489746, "logps/rejected": -3.8740017414093018, "loss": 4.367, "rewards/accuracies": 0.25, "rewards/chosen": -39.25172424316406, "rewards/margins": -0.5117034912109375, "rewards/rejected": -38.740020751953125, "step": 3561 }, { "epoch": 0.4850217864923747, "grad_norm": 38.89052056234556, "learning_rate": 4.900143870105948e-07, "logits/chosen": 12.818470001220703, "logits/rejected": 13.553865432739258, "logps/chosen": -4.07243013381958, "logps/rejected": -4.394335746765137, "loss": 4.2934, "rewards/accuracies": 0.75, "rewards/chosen": -40.724300384521484, "rewards/margins": 3.219057083129883, "rewards/rejected": -43.943359375, "step": 3562 }, { "epoch": 0.4851579520697168, "grad_norm": 43.16543632896298, "learning_rate": 4.898291135863229e-07, "logits/chosen": 11.984537124633789, "logits/rejected": 13.195058822631836, "logps/chosen": -3.9161922931671143, "logps/rejected": -4.439152717590332, "loss": 3.8162, "rewards/accuracies": 1.0, "rewards/chosen": -39.16192626953125, "rewards/margins": 5.229602813720703, "rewards/rejected": -44.39152526855469, "step": 3563 }, { "epoch": 0.4852941176470588, "grad_norm": 39.271357140290675, "learning_rate": 4.896438198644203e-07, "logits/chosen": 12.606348037719727, "logits/rejected": 13.516042709350586, "logps/chosen": -4.112170696258545, "logps/rejected": -4.348393440246582, "loss": 4.1988, "rewards/accuracies": 0.5, "rewards/chosen": -41.1217041015625, "rewards/margins": 2.362227439880371, "rewards/rejected": -43.48393249511719, "step": 3564 }, { "epoch": 0.48543028322440085, "grad_norm": 43.098863923531106, "learning_rate": 4.894585058867555e-07, "logits/chosen": 12.237516403198242, "logits/rejected": 12.357104301452637, "logps/chosen": -3.934246301651001, "logps/rejected": -3.922574758529663, "loss": 4.3382, "rewards/accuracies": 0.5, "rewards/chosen": -39.34246063232422, "rewards/margins": -0.1167154312133789, "rewards/rejected": -39.225746154785156, "step": 3565 }, { "epoch": 0.48556644880174293, "grad_norm": 46.16156881022091, "learning_rate": 4.892731716952019e-07, "logits/chosen": 12.058890342712402, "logits/rejected": 12.959039688110352, "logps/chosen": -4.103556156158447, "logps/rejected": -4.58721923828125, "loss": 3.5017, "rewards/accuracies": 0.75, "rewards/chosen": -41.035560607910156, "rewards/margins": 4.836627006530762, "rewards/rejected": -45.8721923828125, "step": 3566 }, { "epoch": 0.48570261437908496, "grad_norm": 39.606858373203394, "learning_rate": 4.890878173316373e-07, "logits/chosen": 12.883285522460938, "logits/rejected": 13.367668151855469, "logps/chosen": -4.157710075378418, "logps/rejected": -4.272580146789551, "loss": 4.0969, "rewards/accuracies": 0.5, "rewards/chosen": -41.57710266113281, "rewards/margins": 1.1487007141113281, "rewards/rejected": -42.725799560546875, "step": 3567 }, { "epoch": 0.485838779956427, "grad_norm": 41.53284065644656, "learning_rate": 4.889024428379437e-07, "logits/chosen": 12.168083190917969, "logits/rejected": 13.116941452026367, "logps/chosen": -4.088008880615234, "logps/rejected": -4.448597431182861, "loss": 4.0428, "rewards/accuracies": 0.75, "rewards/chosen": -40.880088806152344, "rewards/margins": 3.605884552001953, "rewards/rejected": -44.4859733581543, "step": 3568 }, { "epoch": 0.4859749455337691, "grad_norm": 46.867903565907234, "learning_rate": 4.887170482560085e-07, "logits/chosen": 12.568159103393555, "logits/rejected": 12.887398719787598, "logps/chosen": -4.091701507568359, "logps/rejected": -4.288297653198242, "loss": 3.1414, "rewards/accuracies": 1.0, "rewards/chosen": -40.917015075683594, "rewards/margins": 1.9659624099731445, "rewards/rejected": -42.88297653198242, "step": 3569 }, { "epoch": 0.4861111111111111, "grad_norm": 41.95045776298461, "learning_rate": 4.885316336277227e-07, "logits/chosen": 13.380672454833984, "logits/rejected": 12.664026260375977, "logps/chosen": -3.9653306007385254, "logps/rejected": -4.173177242279053, "loss": 4.0094, "rewards/accuracies": 0.5, "rewards/chosen": -39.6533088684082, "rewards/margins": 2.078465461730957, "rewards/rejected": -41.731773376464844, "step": 3570 }, { "epoch": 0.48624727668845313, "grad_norm": 44.481650995259706, "learning_rate": 4.883461989949827e-07, "logits/chosen": 11.924662590026855, "logits/rejected": 12.12891960144043, "logps/chosen": -4.353227615356445, "logps/rejected": -4.198747158050537, "loss": 5.1049, "rewards/accuracies": 0.5, "rewards/chosen": -43.53227615356445, "rewards/margins": -1.544804573059082, "rewards/rejected": -41.98747253417969, "step": 3571 }, { "epoch": 0.4863834422657952, "grad_norm": 38.78630717118862, "learning_rate": 4.881607443996887e-07, "logits/chosen": 11.90323543548584, "logits/rejected": 12.429107666015625, "logps/chosen": -4.154402732849121, "logps/rejected": -4.41854190826416, "loss": 3.8101, "rewards/accuracies": 0.75, "rewards/chosen": -41.54402160644531, "rewards/margins": 2.6413917541503906, "rewards/rejected": -44.18541717529297, "step": 3572 }, { "epoch": 0.48651960784313725, "grad_norm": 52.34612909229408, "learning_rate": 4.879752698837457e-07, "logits/chosen": 11.808834075927734, "logits/rejected": 12.695991516113281, "logps/chosen": -4.003169059753418, "logps/rejected": -4.115882396697998, "loss": 3.681, "rewards/accuracies": 0.5, "rewards/chosen": -40.03169250488281, "rewards/margins": 1.1271324157714844, "rewards/rejected": -41.1588249206543, "step": 3573 }, { "epoch": 0.4866557734204793, "grad_norm": 37.23876822042177, "learning_rate": 4.877897754890634e-07, "logits/chosen": 12.543466567993164, "logits/rejected": 13.753400802612305, "logps/chosen": -3.8586325645446777, "logps/rejected": -4.63865852355957, "loss": 4.1641, "rewards/accuracies": 1.0, "rewards/chosen": -38.586326599121094, "rewards/margins": 7.80025577545166, "rewards/rejected": -46.38658142089844, "step": 3574 }, { "epoch": 0.48679193899782136, "grad_norm": 41.92575696415624, "learning_rate": 4.876042612575554e-07, "logits/chosen": 11.471776008605957, "logits/rejected": 12.538440704345703, "logps/chosen": -4.019668102264404, "logps/rejected": -4.188915252685547, "loss": 3.4378, "rewards/accuracies": 0.75, "rewards/chosen": -40.196678161621094, "rewards/margins": 1.6924705505371094, "rewards/rejected": -41.88915252685547, "step": 3575 }, { "epoch": 0.4869281045751634, "grad_norm": 44.32775878708228, "learning_rate": 4.874187272311406e-07, "logits/chosen": 12.398412704467773, "logits/rejected": 12.80031681060791, "logps/chosen": -3.9704790115356445, "logps/rejected": -4.084049224853516, "loss": 4.0012, "rewards/accuracies": 0.75, "rewards/chosen": -39.70479202270508, "rewards/margins": 1.135702133178711, "rewards/rejected": -40.840492248535156, "step": 3576 }, { "epoch": 0.4870642701525055, "grad_norm": 53.34816699675874, "learning_rate": 4.872331734517418e-07, "logits/chosen": 12.226953506469727, "logits/rejected": 12.4715576171875, "logps/chosen": -3.553518772125244, "logps/rejected": -3.8511223793029785, "loss": 3.6496, "rewards/accuracies": 0.75, "rewards/chosen": -35.535186767578125, "rewards/margins": 2.9760379791259766, "rewards/rejected": -38.51122283935547, "step": 3577 }, { "epoch": 0.4872004357298475, "grad_norm": 38.34360501670288, "learning_rate": 4.870475999612863e-07, "logits/chosen": 13.37405776977539, "logits/rejected": 13.913799285888672, "logps/chosen": -4.201713562011719, "logps/rejected": -4.284391403198242, "loss": 4.0994, "rewards/accuracies": 0.5, "rewards/chosen": -42.01713180541992, "rewards/margins": 0.8267822265625, "rewards/rejected": -42.84391403198242, "step": 3578 }, { "epoch": 0.48733660130718953, "grad_norm": 42.57667829840242, "learning_rate": 4.86862006801706e-07, "logits/chosen": 11.531045913696289, "logits/rejected": 12.580268859863281, "logps/chosen": -3.787868022918701, "logps/rejected": -4.194779396057129, "loss": 3.9728, "rewards/accuracies": 1.0, "rewards/chosen": -37.87868118286133, "rewards/margins": 4.069114685058594, "rewards/rejected": -41.94779586791992, "step": 3579 }, { "epoch": 0.4874727668845316, "grad_norm": 40.75987830340303, "learning_rate": 4.866763940149374e-07, "logits/chosen": 11.584573745727539, "logits/rejected": 13.135231018066406, "logps/chosen": -3.6035141944885254, "logps/rejected": -4.143404960632324, "loss": 3.7768, "rewards/accuracies": 1.0, "rewards/chosen": -36.03514099121094, "rewards/margins": 5.398906707763672, "rewards/rejected": -41.434043884277344, "step": 3580 }, { "epoch": 0.48760893246187365, "grad_norm": 42.209034412452134, "learning_rate": 4.864907616429211e-07, "logits/chosen": 11.70002555847168, "logits/rejected": 12.339742660522461, "logps/chosen": -3.437723159790039, "logps/rejected": -3.753657102584839, "loss": 4.1023, "rewards/accuracies": 0.75, "rewards/chosen": -34.37723159790039, "rewards/margins": 3.159339427947998, "rewards/rejected": -37.53657150268555, "step": 3581 }, { "epoch": 0.4877450980392157, "grad_norm": 40.024710611862496, "learning_rate": 4.863051097276021e-07, "logits/chosen": 12.918073654174805, "logits/rejected": 12.791023254394531, "logps/chosen": -4.0807719230651855, "logps/rejected": -4.127459526062012, "loss": 3.8545, "rewards/accuracies": 0.5, "rewards/chosen": -40.80772018432617, "rewards/margins": 0.46687889099121094, "rewards/rejected": -41.27459716796875, "step": 3582 }, { "epoch": 0.48788126361655776, "grad_norm": 38.525248032480974, "learning_rate": 4.861194383109301e-07, "logits/chosen": 11.500799179077148, "logits/rejected": 11.713043212890625, "logps/chosen": -3.597557544708252, "logps/rejected": -3.8920674324035645, "loss": 3.6381, "rewards/accuracies": 0.75, "rewards/chosen": -35.97557067871094, "rewards/margins": 2.945099353790283, "rewards/rejected": -38.92067337036133, "step": 3583 }, { "epoch": 0.4880174291938998, "grad_norm": 40.584069050071655, "learning_rate": 4.859337474348594e-07, "logits/chosen": 12.976825714111328, "logits/rejected": 11.872817993164062, "logps/chosen": -3.8657078742980957, "logps/rejected": -3.9489502906799316, "loss": 3.8097, "rewards/accuracies": 0.5, "rewards/chosen": -38.65707778930664, "rewards/margins": 0.8324265480041504, "rewards/rejected": -39.489501953125, "step": 3584 }, { "epoch": 0.4881535947712418, "grad_norm": 42.83842776724305, "learning_rate": 4.85748037141348e-07, "logits/chosen": 13.24940299987793, "logits/rejected": 13.003459930419922, "logps/chosen": -3.7960405349731445, "logps/rejected": -3.9007010459899902, "loss": 3.5937, "rewards/accuracies": 0.75, "rewards/chosen": -37.96040725708008, "rewards/margins": 1.0466022491455078, "rewards/rejected": -39.00701141357422, "step": 3585 }, { "epoch": 0.4882897603485839, "grad_norm": 37.38697845884151, "learning_rate": 4.855623074723588e-07, "logits/chosen": 12.335233688354492, "logits/rejected": 12.786413192749023, "logps/chosen": -4.00960111618042, "logps/rejected": -4.067215919494629, "loss": 4.322, "rewards/accuracies": 0.5, "rewards/chosen": -40.09600830078125, "rewards/margins": 0.5761489868164062, "rewards/rejected": -40.672157287597656, "step": 3586 }, { "epoch": 0.48842592592592593, "grad_norm": 35.82327271739774, "learning_rate": 4.85376558469859e-07, "logits/chosen": 12.497440338134766, "logits/rejected": 12.897656440734863, "logps/chosen": -3.880082130432129, "logps/rejected": -4.077726364135742, "loss": 3.8246, "rewards/accuracies": 0.75, "rewards/chosen": -38.800819396972656, "rewards/margins": 1.9764432907104492, "rewards/rejected": -40.777259826660156, "step": 3587 }, { "epoch": 0.48856209150326796, "grad_norm": 39.23461183095653, "learning_rate": 4.8519079017582e-07, "logits/chosen": 12.035564422607422, "logits/rejected": 12.63353157043457, "logps/chosen": -4.0058135986328125, "logps/rejected": -4.1617608070373535, "loss": 4.2564, "rewards/accuracies": 0.75, "rewards/chosen": -40.05813217163086, "rewards/margins": 1.5594758987426758, "rewards/rejected": -41.61760711669922, "step": 3588 }, { "epoch": 0.48869825708061004, "grad_norm": 44.57550730768102, "learning_rate": 4.850050026322179e-07, "logits/chosen": 11.469228744506836, "logits/rejected": 11.362447738647461, "logps/chosen": -3.787309169769287, "logps/rejected": -3.8319764137268066, "loss": 4.7604, "rewards/accuracies": 0.75, "rewards/chosen": -37.87309265136719, "rewards/margins": 0.44667530059814453, "rewards/rejected": -38.319766998291016, "step": 3589 }, { "epoch": 0.4888344226579521, "grad_norm": 36.325762208492364, "learning_rate": 4.848191958810328e-07, "logits/chosen": 12.380426406860352, "logits/rejected": 13.037147521972656, "logps/chosen": -3.9529972076416016, "logps/rejected": -3.9193825721740723, "loss": 3.7441, "rewards/accuracies": 0.5, "rewards/chosen": -39.529972076416016, "rewards/margins": -0.3361473083496094, "rewards/rejected": -39.193824768066406, "step": 3590 }, { "epoch": 0.4889705882352941, "grad_norm": 34.30852976708469, "learning_rate": 4.846333699642491e-07, "logits/chosen": 11.863256454467773, "logits/rejected": 12.651707649230957, "logps/chosen": -3.9241700172424316, "logps/rejected": -4.13424015045166, "loss": 3.92, "rewards/accuracies": 0.5, "rewards/chosen": -39.24169921875, "rewards/margins": 2.100705146789551, "rewards/rejected": -41.342403411865234, "step": 3591 }, { "epoch": 0.4891067538126362, "grad_norm": 42.00560288725657, "learning_rate": 4.84447524923856e-07, "logits/chosen": 11.798138618469238, "logits/rejected": 13.181156158447266, "logps/chosen": -3.8202033042907715, "logps/rejected": -4.10302209854126, "loss": 3.4663, "rewards/accuracies": 0.5, "rewards/chosen": -38.20203399658203, "rewards/margins": 2.82818603515625, "rewards/rejected": -41.03022003173828, "step": 3592 }, { "epoch": 0.4892429193899782, "grad_norm": 38.19850576588459, "learning_rate": 4.842616608018465e-07, "logits/chosen": 12.796770095825195, "logits/rejected": 12.268054962158203, "logps/chosen": -3.4534497261047363, "logps/rejected": -3.5055108070373535, "loss": 3.6185, "rewards/accuracies": 0.75, "rewards/chosen": -34.53449630737305, "rewards/margins": 0.5206103324890137, "rewards/rejected": -35.05510711669922, "step": 3593 }, { "epoch": 0.48937908496732024, "grad_norm": 34.225705254438125, "learning_rate": 4.840757776402183e-07, "logits/chosen": 12.466148376464844, "logits/rejected": 13.001041412353516, "logps/chosen": -3.8356263637542725, "logps/rejected": -4.215778350830078, "loss": 3.4932, "rewards/accuracies": 0.75, "rewards/chosen": -38.35626220703125, "rewards/margins": 3.8015213012695312, "rewards/rejected": -42.15778350830078, "step": 3594 }, { "epoch": 0.48951525054466233, "grad_norm": 40.367425466604615, "learning_rate": 4.838898754809731e-07, "logits/chosen": 12.377870559692383, "logits/rejected": 12.693172454833984, "logps/chosen": -3.7040762901306152, "logps/rejected": -4.0258588790893555, "loss": 3.597, "rewards/accuracies": 1.0, "rewards/chosen": -37.04076385498047, "rewards/margins": 3.217827320098877, "rewards/rejected": -40.25859069824219, "step": 3595 }, { "epoch": 0.48965141612200436, "grad_norm": 30.740993282210578, "learning_rate": 4.837039543661173e-07, "logits/chosen": 11.791378021240234, "logits/rejected": 12.04049301147461, "logps/chosen": -3.696167469024658, "logps/rejected": -3.846640110015869, "loss": 3.4087, "rewards/accuracies": 0.5, "rewards/chosen": -36.96167755126953, "rewards/margins": 1.5047264099121094, "rewards/rejected": -38.466400146484375, "step": 3596 }, { "epoch": 0.4897875816993464, "grad_norm": 41.70459228529939, "learning_rate": 4.835180143376608e-07, "logits/chosen": 12.292551040649414, "logits/rejected": 12.362369537353516, "logps/chosen": -3.6447534561157227, "logps/rejected": -3.994091033935547, "loss": 4.5548, "rewards/accuracies": 0.5, "rewards/chosen": -36.44753646850586, "rewards/margins": 3.4933767318725586, "rewards/rejected": -39.94091033935547, "step": 3597 }, { "epoch": 0.48992374727668847, "grad_norm": 39.2281291769379, "learning_rate": 4.833320554376187e-07, "logits/chosen": 11.67877197265625, "logits/rejected": 11.925610542297363, "logps/chosen": -3.500147819519043, "logps/rejected": -3.6405298709869385, "loss": 4.0102, "rewards/accuracies": 0.75, "rewards/chosen": -35.00148010253906, "rewards/margins": 1.4038200378417969, "rewards/rejected": -36.40530014038086, "step": 3598 }, { "epoch": 0.4900599128540305, "grad_norm": 39.19858985917419, "learning_rate": 4.8314607770801e-07, "logits/chosen": 12.904403686523438, "logits/rejected": 12.52107048034668, "logps/chosen": -4.018835544586182, "logps/rejected": -3.823225259780884, "loss": 4.1021, "rewards/accuracies": 0.25, "rewards/chosen": -40.1883544921875, "rewards/margins": -1.956101417541504, "rewards/rejected": -38.23225402832031, "step": 3599 }, { "epoch": 0.49019607843137253, "grad_norm": 37.904175501673166, "learning_rate": 4.829600811908576e-07, "logits/chosen": 12.928949356079102, "logits/rejected": 12.961057662963867, "logps/chosen": -3.954220771789551, "logps/rejected": -4.253276824951172, "loss": 3.873, "rewards/accuracies": 0.75, "rewards/chosen": -39.542205810546875, "rewards/margins": 2.9905614852905273, "rewards/rejected": -42.53276824951172, "step": 3600 }, { "epoch": 0.4903322440087146, "grad_norm": 35.34761459200561, "learning_rate": 4.827740659281892e-07, "logits/chosen": 13.298341751098633, "logits/rejected": 13.382745742797852, "logps/chosen": -4.273865222930908, "logps/rejected": -4.392755508422852, "loss": 3.7847, "rewards/accuracies": 0.5, "rewards/chosen": -42.73865509033203, "rewards/margins": 1.1889028549194336, "rewards/rejected": -43.927555084228516, "step": 3601 }, { "epoch": 0.49046840958605664, "grad_norm": 43.70860359409064, "learning_rate": 4.825880319620363e-07, "logits/chosen": 13.512980461120605, "logits/rejected": 13.09605598449707, "logps/chosen": -4.2798333168029785, "logps/rejected": -4.198111534118652, "loss": 4.673, "rewards/accuracies": 0.5, "rewards/chosen": -42.79833221435547, "rewards/margins": -0.8172130584716797, "rewards/rejected": -41.981117248535156, "step": 3602 }, { "epoch": 0.49060457516339867, "grad_norm": 37.1510572122308, "learning_rate": 4.824019793344349e-07, "logits/chosen": 13.55379867553711, "logits/rejected": 13.469990730285645, "logps/chosen": -4.244022369384766, "logps/rejected": -4.055622100830078, "loss": 4.0602, "rewards/accuracies": 0.25, "rewards/chosen": -42.440223693847656, "rewards/margins": -1.8840036392211914, "rewards/rejected": -40.55622100830078, "step": 3603 }, { "epoch": 0.49074074074074076, "grad_norm": 36.39717301078159, "learning_rate": 4.822159080874253e-07, "logits/chosen": 13.128488540649414, "logits/rejected": 13.766534805297852, "logps/chosen": -3.970191478729248, "logps/rejected": -4.239416599273682, "loss": 4.033, "rewards/accuracies": 0.75, "rewards/chosen": -39.7019157409668, "rewards/margins": 2.6922502517700195, "rewards/rejected": -42.3941650390625, "step": 3604 }, { "epoch": 0.4908769063180828, "grad_norm": 37.17516058805745, "learning_rate": 4.820298182630514e-07, "logits/chosen": 12.98597526550293, "logits/rejected": 13.22171401977539, "logps/chosen": -4.146430015563965, "logps/rejected": -4.167708396911621, "loss": 3.9482, "rewards/accuracies": 0.75, "rewards/chosen": -41.464298248291016, "rewards/margins": 0.21278667449951172, "rewards/rejected": -41.677085876464844, "step": 3605 }, { "epoch": 0.4910130718954248, "grad_norm": 39.154887803586284, "learning_rate": 4.818437099033621e-07, "logits/chosen": 12.979621887207031, "logits/rejected": 14.60462474822998, "logps/chosen": -4.240413665771484, "logps/rejected": -4.710874080657959, "loss": 4.2245, "rewards/accuracies": 1.0, "rewards/chosen": -42.404136657714844, "rewards/margins": 4.7046051025390625, "rewards/rejected": -47.108741760253906, "step": 3606 }, { "epoch": 0.4911492374727669, "grad_norm": 36.23596508175623, "learning_rate": 4.816575830504101e-07, "logits/chosen": 14.09699535369873, "logits/rejected": 13.330379486083984, "logps/chosen": -4.23530387878418, "logps/rejected": -4.257956504821777, "loss": 4.0326, "rewards/accuracies": 0.5, "rewards/chosen": -42.35303497314453, "rewards/margins": 0.2265300750732422, "rewards/rejected": -42.579566955566406, "step": 3607 }, { "epoch": 0.4912854030501089, "grad_norm": 46.222827247916854, "learning_rate": 4.814714377462521e-07, "logits/chosen": 13.32132339477539, "logits/rejected": 13.581039428710938, "logps/chosen": -3.8463733196258545, "logps/rejected": -4.092485427856445, "loss": 4.0993, "rewards/accuracies": 0.75, "rewards/chosen": -38.4637336730957, "rewards/margins": 2.4611244201660156, "rewards/rejected": -40.92485809326172, "step": 3608 }, { "epoch": 0.49142156862745096, "grad_norm": 38.33998392073615, "learning_rate": 4.812852740329493e-07, "logits/chosen": 13.11761474609375, "logits/rejected": 14.081390380859375, "logps/chosen": -4.223391532897949, "logps/rejected": -4.581096172332764, "loss": 4.1712, "rewards/accuracies": 0.75, "rewards/chosen": -42.23391342163086, "rewards/margins": 3.577045440673828, "rewards/rejected": -45.81095886230469, "step": 3609 }, { "epoch": 0.49155773420479304, "grad_norm": 36.93408607873091, "learning_rate": 4.81099091952567e-07, "logits/chosen": 13.72885513305664, "logits/rejected": 13.073368072509766, "logps/chosen": -4.45763635635376, "logps/rejected": -4.216984748840332, "loss": 3.5356, "rewards/accuracies": 0.25, "rewards/chosen": -44.57636642456055, "rewards/margins": -2.4065141677856445, "rewards/rejected": -42.16984939575195, "step": 3610 }, { "epoch": 0.49169389978213507, "grad_norm": 44.4060031635413, "learning_rate": 4.809128915471744e-07, "logits/chosen": 13.228617668151855, "logits/rejected": 13.256607055664062, "logps/chosen": -3.9418559074401855, "logps/rejected": -4.351458549499512, "loss": 3.5615, "rewards/accuracies": 0.75, "rewards/chosen": -39.418556213378906, "rewards/margins": 4.096026420593262, "rewards/rejected": -43.514583587646484, "step": 3611 }, { "epoch": 0.4918300653594771, "grad_norm": 37.18252451145692, "learning_rate": 4.807266728588452e-07, "logits/chosen": 13.297784805297852, "logits/rejected": 14.484947204589844, "logps/chosen": -4.231520652770996, "logps/rejected": -4.656661033630371, "loss": 4.2813, "rewards/accuracies": 0.75, "rewards/chosen": -42.31520080566406, "rewards/margins": 4.251409530639648, "rewards/rejected": -46.566612243652344, "step": 3612 }, { "epoch": 0.4919662309368192, "grad_norm": 40.69957887471733, "learning_rate": 4.80540435929657e-07, "logits/chosen": 12.249975204467773, "logits/rejected": 12.682157516479492, "logps/chosen": -3.9493589401245117, "logps/rejected": -3.7885613441467285, "loss": 4.4851, "rewards/accuracies": 0.25, "rewards/chosen": -39.493587493896484, "rewards/margins": -1.6079750061035156, "rewards/rejected": -37.88561248779297, "step": 3613 }, { "epoch": 0.4921023965141612, "grad_norm": 40.227949439742254, "learning_rate": 4.803541808016915e-07, "logits/chosen": 12.059356689453125, "logits/rejected": 12.709833145141602, "logps/chosen": -3.8051857948303223, "logps/rejected": -4.303292274475098, "loss": 4.3447, "rewards/accuracies": 1.0, "rewards/chosen": -38.05186080932617, "rewards/margins": 4.981060981750488, "rewards/rejected": -43.032920837402344, "step": 3614 }, { "epoch": 0.4922385620915033, "grad_norm": 41.519969427022396, "learning_rate": 4.801679075170347e-07, "logits/chosen": 13.2725830078125, "logits/rejected": 13.938445091247559, "logps/chosen": -4.115230560302734, "logps/rejected": -4.352078437805176, "loss": 3.7224, "rewards/accuracies": 0.75, "rewards/chosen": -41.15230178833008, "rewards/margins": 2.368480682373047, "rewards/rejected": -43.520782470703125, "step": 3615 }, { "epoch": 0.4923747276688453, "grad_norm": 43.630520176066305, "learning_rate": 4.799816161177763e-07, "logits/chosen": 12.5020751953125, "logits/rejected": 12.778156280517578, "logps/chosen": -4.079487323760986, "logps/rejected": -4.057460784912109, "loss": 3.6657, "rewards/accuracies": 0.25, "rewards/chosen": -40.79487228393555, "rewards/margins": -0.22026443481445312, "rewards/rejected": -40.574607849121094, "step": 3616 }, { "epoch": 0.49251089324618735, "grad_norm": 39.96686745174254, "learning_rate": 4.797953066460108e-07, "logits/chosen": 13.344732284545898, "logits/rejected": 13.144196510314941, "logps/chosen": -4.151296138763428, "logps/rejected": -4.192655563354492, "loss": 4.1443, "rewards/accuracies": 0.5, "rewards/chosen": -41.512962341308594, "rewards/margins": 0.41359424591064453, "rewards/rejected": -41.92655944824219, "step": 3617 }, { "epoch": 0.49264705882352944, "grad_norm": 37.40599202381337, "learning_rate": 4.796089791438362e-07, "logits/chosen": 11.924633026123047, "logits/rejected": 12.338258743286133, "logps/chosen": -3.839061737060547, "logps/rejected": -4.040268898010254, "loss": 3.7726, "rewards/accuracies": 0.5, "rewards/chosen": -38.39061737060547, "rewards/margins": 2.012075424194336, "rewards/rejected": -40.40269088745117, "step": 3618 }, { "epoch": 0.49278322440087147, "grad_norm": 55.93596267893272, "learning_rate": 4.794226336533546e-07, "logits/chosen": 12.437105178833008, "logits/rejected": 13.278068542480469, "logps/chosen": -3.8661296367645264, "logps/rejected": -4.2645792961120605, "loss": 3.537, "rewards/accuracies": 1.0, "rewards/chosen": -38.66129684448242, "rewards/margins": 3.9844970703125, "rewards/rejected": -42.64579391479492, "step": 3619 }, { "epoch": 0.4929193899782135, "grad_norm": 39.880265816583055, "learning_rate": 4.792362702166725e-07, "logits/chosen": 13.163738250732422, "logits/rejected": 14.154162406921387, "logps/chosen": -3.9939961433410645, "logps/rejected": -4.520412921905518, "loss": 3.8063, "rewards/accuracies": 0.75, "rewards/chosen": -39.939964294433594, "rewards/margins": 5.264167785644531, "rewards/rejected": -45.204132080078125, "step": 3620 }, { "epoch": 0.4930555555555556, "grad_norm": 43.99320390499561, "learning_rate": 4.790498888759e-07, "logits/chosen": 12.449993133544922, "logits/rejected": 12.42378044128418, "logps/chosen": -3.854907989501953, "logps/rejected": -3.8220324516296387, "loss": 3.6366, "rewards/accuracies": 0.75, "rewards/chosen": -38.54907989501953, "rewards/margins": -0.3287534713745117, "rewards/rejected": -38.2203254699707, "step": 3621 }, { "epoch": 0.4931917211328976, "grad_norm": 40.8599567026886, "learning_rate": 4.788634896731519e-07, "logits/chosen": 13.189640045166016, "logits/rejected": 12.93872356414795, "logps/chosen": -3.9808335304260254, "logps/rejected": -3.965547561645508, "loss": 4.3844, "rewards/accuracies": 0.25, "rewards/chosen": -39.80833435058594, "rewards/margins": -0.15285873413085938, "rewards/rejected": -39.655479431152344, "step": 3622 }, { "epoch": 0.49332788671023964, "grad_norm": 38.86454470743024, "learning_rate": 4.786770726505463e-07, "logits/chosen": 13.144927978515625, "logits/rejected": 13.112102508544922, "logps/chosen": -4.1089067459106445, "logps/rejected": -4.245024681091309, "loss": 3.8436, "rewards/accuracies": 0.5, "rewards/chosen": -41.08906555175781, "rewards/margins": 1.3611841201782227, "rewards/rejected": -42.45024871826172, "step": 3623 }, { "epoch": 0.4934640522875817, "grad_norm": 41.547386192164076, "learning_rate": 4.784906378502058e-07, "logits/chosen": 13.179327011108398, "logits/rejected": 12.247760772705078, "logps/chosen": -4.190430641174316, "logps/rejected": -3.888549327850342, "loss": 4.4918, "rewards/accuracies": 0.25, "rewards/chosen": -41.90430450439453, "rewards/margins": -3.018810272216797, "rewards/rejected": -38.885494232177734, "step": 3624 }, { "epoch": 0.49360021786492375, "grad_norm": 42.361208577957754, "learning_rate": 4.783041853142568e-07, "logits/chosen": 13.37232780456543, "logits/rejected": 13.498735427856445, "logps/chosen": -4.1453986167907715, "logps/rejected": -4.246349334716797, "loss": 4.0626, "rewards/accuracies": 0.5, "rewards/chosen": -41.45398712158203, "rewards/margins": 1.0095043182373047, "rewards/rejected": -42.4634895324707, "step": 3625 }, { "epoch": 0.4937363834422658, "grad_norm": 38.66392008243345, "learning_rate": 4.7811771508483e-07, "logits/chosen": 12.919862747192383, "logits/rejected": 12.429451942443848, "logps/chosen": -3.9428658485412598, "logps/rejected": -3.785095691680908, "loss": 3.7302, "rewards/accuracies": 0.5, "rewards/chosen": -39.42865753173828, "rewards/margins": -1.5776996612548828, "rewards/rejected": -37.85095977783203, "step": 3626 }, { "epoch": 0.49387254901960786, "grad_norm": 38.35449969926077, "learning_rate": 4.779312272040597e-07, "logits/chosen": 13.259773254394531, "logits/rejected": 13.077779769897461, "logps/chosen": -4.106165885925293, "logps/rejected": -4.160979747772217, "loss": 4.2935, "rewards/accuracies": 0.5, "rewards/chosen": -41.0616569519043, "rewards/margins": 0.5481405258178711, "rewards/rejected": -41.60979461669922, "step": 3627 }, { "epoch": 0.4940087145969499, "grad_norm": 41.15760198542907, "learning_rate": 4.777447217140845e-07, "logits/chosen": 13.232492446899414, "logits/rejected": 13.984720230102539, "logps/chosen": -4.17929220199585, "logps/rejected": -4.0972900390625, "loss": 4.2481, "rewards/accuracies": 0.25, "rewards/chosen": -41.79292297363281, "rewards/margins": -0.8200225830078125, "rewards/rejected": -40.972900390625, "step": 3628 }, { "epoch": 0.4941448801742919, "grad_norm": 46.00994971749938, "learning_rate": 4.775581986570467e-07, "logits/chosen": 13.654556274414062, "logits/rejected": 13.879899978637695, "logps/chosen": -4.179452419281006, "logps/rejected": -4.378298759460449, "loss": 4.5556, "rewards/accuracies": 0.5, "rewards/chosen": -41.794525146484375, "rewards/margins": 1.9884624481201172, "rewards/rejected": -43.782989501953125, "step": 3629 }, { "epoch": 0.494281045751634, "grad_norm": 36.3963939682242, "learning_rate": 4.773716580750926e-07, "logits/chosen": 12.922616958618164, "logits/rejected": 13.065725326538086, "logps/chosen": -3.8725883960723877, "logps/rejected": -4.092462062835693, "loss": 3.7275, "rewards/accuracies": 0.75, "rewards/chosen": -38.72588348388672, "rewards/margins": 2.1987366676330566, "rewards/rejected": -40.92462158203125, "step": 3630 }, { "epoch": 0.49441721132897604, "grad_norm": 38.06642346985399, "learning_rate": 4.771851000103731e-07, "logits/chosen": 13.066656112670898, "logits/rejected": 13.537275314331055, "logps/chosen": -4.091612815856934, "logps/rejected": -4.132593154907227, "loss": 3.8152, "rewards/accuracies": 0.75, "rewards/chosen": -40.91613006591797, "rewards/margins": 0.4098048210144043, "rewards/rejected": -41.32593536376953, "step": 3631 }, { "epoch": 0.49455337690631807, "grad_norm": 232.4461668144327, "learning_rate": 4.769985245050421e-07, "logits/chosen": 12.935486793518066, "logits/rejected": 13.738445281982422, "logps/chosen": -4.304910659790039, "logps/rejected": -4.759886741638184, "loss": 4.8133, "rewards/accuracies": 1.0, "rewards/chosen": -43.049110412597656, "rewards/margins": 4.5497589111328125, "rewards/rejected": -47.59886932373047, "step": 3632 }, { "epoch": 0.49468954248366015, "grad_norm": 41.41679861432525, "learning_rate": 4.768119316012581e-07, "logits/chosen": 13.641199111938477, "logits/rejected": 13.913251876831055, "logps/chosen": -3.9625673294067383, "logps/rejected": -4.384965896606445, "loss": 4.0204, "rewards/accuracies": 0.75, "rewards/chosen": -39.62567138671875, "rewards/margins": 4.2239837646484375, "rewards/rejected": -43.84965515136719, "step": 3633 }, { "epoch": 0.4948257080610022, "grad_norm": 40.099950929157465, "learning_rate": 4.766253213411832e-07, "logits/chosen": 13.366546630859375, "logits/rejected": 13.528812408447266, "logps/chosen": -4.198467254638672, "logps/rejected": -4.252815246582031, "loss": 4.2489, "rewards/accuracies": 0.5, "rewards/chosen": -41.98467254638672, "rewards/margins": 0.5434818267822266, "rewards/rejected": -42.52815628051758, "step": 3634 }, { "epoch": 0.4949618736383442, "grad_norm": 42.58363066128147, "learning_rate": 4.764386937669835e-07, "logits/chosen": 12.849503517150879, "logits/rejected": 12.678592681884766, "logps/chosen": -3.9425344467163086, "logps/rejected": -3.9517621994018555, "loss": 3.9632, "rewards/accuracies": 0.5, "rewards/chosen": -39.42534637451172, "rewards/margins": 0.09227752685546875, "rewards/rejected": -39.51762390136719, "step": 3635 }, { "epoch": 0.4950980392156863, "grad_norm": 40.096972582164334, "learning_rate": 4.7625204892082906e-07, "logits/chosen": 14.031900405883789, "logits/rejected": 13.452535629272461, "logps/chosen": -4.257076263427734, "logps/rejected": -4.120987892150879, "loss": 3.9869, "rewards/accuracies": 0.0, "rewards/chosen": -42.570762634277344, "rewards/margins": -1.3608779907226562, "rewards/rejected": -41.20988464355469, "step": 3636 }, { "epoch": 0.4952342047930283, "grad_norm": 36.7795443242074, "learning_rate": 4.7606538684489397e-07, "logits/chosen": 12.442925453186035, "logits/rejected": 13.792543411254883, "logps/chosen": -3.653866767883301, "logps/rejected": -4.1542253494262695, "loss": 4.1133, "rewards/accuracies": 0.5, "rewards/chosen": -36.538665771484375, "rewards/margins": 5.0035881996154785, "rewards/rejected": -41.54225540161133, "step": 3637 }, { "epoch": 0.49537037037037035, "grad_norm": 44.56170195453454, "learning_rate": 4.7587870758135595e-07, "logits/chosen": 13.976787567138672, "logits/rejected": 13.950471878051758, "logps/chosen": -4.444883346557617, "logps/rejected": -4.586634635925293, "loss": 3.8358, "rewards/accuracies": 0.5, "rewards/chosen": -44.44883346557617, "rewards/margins": 1.417510986328125, "rewards/rejected": -45.8663444519043, "step": 3638 }, { "epoch": 0.49550653594771243, "grad_norm": 40.282448889147325, "learning_rate": 4.756920111723966e-07, "logits/chosen": 13.41044807434082, "logits/rejected": 14.272226333618164, "logps/chosen": -4.1621413230896, "logps/rejected": -4.568035125732422, "loss": 4.2445, "rewards/accuracies": 0.75, "rewards/chosen": -41.62141418457031, "rewards/margins": 4.058941841125488, "rewards/rejected": -45.680355072021484, "step": 3639 }, { "epoch": 0.49564270152505446, "grad_norm": 40.960730901927676, "learning_rate": 4.7550529766020177e-07, "logits/chosen": 13.673004150390625, "logits/rejected": 13.488519668579102, "logps/chosen": -4.794220447540283, "logps/rejected": -4.499443054199219, "loss": 4.2949, "rewards/accuracies": 0.25, "rewards/chosen": -47.94220733642578, "rewards/margins": -2.947774887084961, "rewards/rejected": -44.99443054199219, "step": 3640 }, { "epoch": 0.4957788671023965, "grad_norm": 34.75110231238492, "learning_rate": 4.753185670869608e-07, "logits/chosen": 13.310321807861328, "logits/rejected": 14.294015884399414, "logps/chosen": -4.11656379699707, "logps/rejected": -4.550015449523926, "loss": 3.7341, "rewards/accuracies": 0.75, "rewards/chosen": -41.1656379699707, "rewards/margins": 4.334516525268555, "rewards/rejected": -45.500152587890625, "step": 3641 }, { "epoch": 0.4959150326797386, "grad_norm": 43.59758062216374, "learning_rate": 4.751318194948669e-07, "logits/chosen": 13.742830276489258, "logits/rejected": 14.156505584716797, "logps/chosen": -3.979057550430298, "logps/rejected": -4.280953407287598, "loss": 3.8943, "rewards/accuracies": 0.75, "rewards/chosen": -39.79057312011719, "rewards/margins": 3.0189571380615234, "rewards/rejected": -42.809532165527344, "step": 3642 }, { "epoch": 0.4960511982570806, "grad_norm": 37.52076162851101, "learning_rate": 4.7494505492611746e-07, "logits/chosen": 13.807052612304688, "logits/rejected": 13.30998420715332, "logps/chosen": -4.2184295654296875, "logps/rejected": -4.36328125, "loss": 3.6497, "rewards/accuracies": 0.5, "rewards/chosen": -42.18429946899414, "rewards/margins": 1.4485130310058594, "rewards/rejected": -43.6328125, "step": 3643 }, { "epoch": 0.49618736383442263, "grad_norm": 38.7388761192545, "learning_rate": 4.7475827342291337e-07, "logits/chosen": 13.837371826171875, "logits/rejected": 13.641773223876953, "logps/chosen": -4.3429741859436035, "logps/rejected": -4.445345878601074, "loss": 3.9421, "rewards/accuracies": 0.25, "rewards/chosen": -43.42974090576172, "rewards/margins": 1.0237188339233398, "rewards/rejected": -44.453460693359375, "step": 3644 }, { "epoch": 0.4963235294117647, "grad_norm": 36.56147909629426, "learning_rate": 4.7457147502745927e-07, "logits/chosen": 11.97408676147461, "logits/rejected": 13.674678802490234, "logps/chosen": -3.4935529232025146, "logps/rejected": -3.9083824157714844, "loss": 3.8968, "rewards/accuracies": 0.75, "rewards/chosen": -34.93553161621094, "rewards/margins": 4.148298263549805, "rewards/rejected": -39.083824157714844, "step": 3645 }, { "epoch": 0.49645969498910675, "grad_norm": 56.68918943277795, "learning_rate": 4.743846597819641e-07, "logits/chosen": 13.463370323181152, "logits/rejected": 13.472204208374023, "logps/chosen": -4.1720685958862305, "logps/rejected": -4.089010238647461, "loss": 4.037, "rewards/accuracies": 0.25, "rewards/chosen": -41.72068405151367, "rewards/margins": -0.8305788040161133, "rewards/rejected": -40.890106201171875, "step": 3646 }, { "epoch": 0.4965958605664488, "grad_norm": 37.42969287015927, "learning_rate": 4.741978277286402e-07, "logits/chosen": 13.677431106567383, "logits/rejected": 13.641142845153809, "logps/chosen": -4.0202178955078125, "logps/rejected": -3.92868709564209, "loss": 3.8352, "rewards/accuracies": 0.25, "rewards/chosen": -40.20217514038086, "rewards/margins": -0.9153060913085938, "rewards/rejected": -39.28687286376953, "step": 3647 }, { "epoch": 0.49673202614379086, "grad_norm": 37.53203083716781, "learning_rate": 4.7401097890970375e-07, "logits/chosen": 12.498404502868652, "logits/rejected": 13.175681114196777, "logps/chosen": -3.6167409420013428, "logps/rejected": -4.075913906097412, "loss": 4.0065, "rewards/accuracies": 1.0, "rewards/chosen": -36.16741180419922, "rewards/margins": 4.591729164123535, "rewards/rejected": -40.75914001464844, "step": 3648 }, { "epoch": 0.4968681917211329, "grad_norm": 40.69810657609497, "learning_rate": 4.7382411336737485e-07, "logits/chosen": 13.272210121154785, "logits/rejected": 13.486164093017578, "logps/chosen": -4.104726791381836, "logps/rejected": -4.060355186462402, "loss": 3.7779, "rewards/accuracies": 0.75, "rewards/chosen": -41.047271728515625, "rewards/margins": -0.44371986389160156, "rewards/rejected": -40.60354995727539, "step": 3649 }, { "epoch": 0.4970043572984749, "grad_norm": 39.64281058266952, "learning_rate": 4.7363723114387735e-07, "logits/chosen": 13.418156623840332, "logits/rejected": 14.047710418701172, "logps/chosen": -4.050667762756348, "logps/rejected": -4.451992988586426, "loss": 4.0954, "rewards/accuracies": 0.75, "rewards/chosen": -40.506675720214844, "rewards/margins": 4.013250350952148, "rewards/rejected": -44.519927978515625, "step": 3650 }, { "epoch": 0.497140522875817, "grad_norm": 39.68429313409828, "learning_rate": 4.734503322814387e-07, "logits/chosen": 13.377116203308105, "logits/rejected": 13.3546142578125, "logps/chosen": -3.820976734161377, "logps/rejected": -4.106471538543701, "loss": 3.3819, "rewards/accuracies": 0.5, "rewards/chosen": -38.20977020263672, "rewards/margins": 2.854947090148926, "rewards/rejected": -41.06471252441406, "step": 3651 }, { "epoch": 0.49727668845315903, "grad_norm": 48.0223028067471, "learning_rate": 4.732634168222903e-07, "logits/chosen": 13.77143669128418, "logits/rejected": 14.358739852905273, "logps/chosen": -4.01608419418335, "logps/rejected": -4.448814392089844, "loss": 3.4666, "rewards/accuracies": 1.0, "rewards/chosen": -40.16084289550781, "rewards/margins": 4.327296257019043, "rewards/rejected": -44.48814392089844, "step": 3652 }, { "epoch": 0.4974128540305011, "grad_norm": 41.106479436548526, "learning_rate": 4.7307648480866744e-07, "logits/chosen": 13.751184463500977, "logits/rejected": 14.002933502197266, "logps/chosen": -4.002070426940918, "logps/rejected": -4.313876152038574, "loss": 4.0339, "rewards/accuracies": 1.0, "rewards/chosen": -40.02070617675781, "rewards/margins": 3.118058204650879, "rewards/rejected": -43.138763427734375, "step": 3653 }, { "epoch": 0.49754901960784315, "grad_norm": 40.574093428662174, "learning_rate": 4.7288953628280853e-07, "logits/chosen": 13.344038009643555, "logits/rejected": 13.402053833007812, "logps/chosen": -3.920106887817383, "logps/rejected": -3.9591472148895264, "loss": 4.2518, "rewards/accuracies": 0.5, "rewards/chosen": -39.20106887817383, "rewards/margins": 0.39040040969848633, "rewards/rejected": -39.591468811035156, "step": 3654 }, { "epoch": 0.4976851851851852, "grad_norm": 37.627884541705896, "learning_rate": 4.727025712869566e-07, "logits/chosen": 12.15223503112793, "logits/rejected": 13.242902755737305, "logps/chosen": -3.7049036026000977, "logps/rejected": -4.025615692138672, "loss": 3.9216, "rewards/accuracies": 0.75, "rewards/chosen": -37.049034118652344, "rewards/margins": 3.207119941711426, "rewards/rejected": -40.25615692138672, "step": 3655 }, { "epoch": 0.49782135076252726, "grad_norm": 43.48773038072735, "learning_rate": 4.7251558986335764e-07, "logits/chosen": 12.735642433166504, "logits/rejected": 14.040369033813477, "logps/chosen": -3.922880172729492, "logps/rejected": -4.170514106750488, "loss": 4.456, "rewards/accuracies": 0.75, "rewards/chosen": -39.22880554199219, "rewards/margins": 2.476334571838379, "rewards/rejected": -41.70513916015625, "step": 3656 }, { "epoch": 0.4979575163398693, "grad_norm": 40.01079293718931, "learning_rate": 4.723285920542617e-07, "logits/chosen": 13.160079956054688, "logits/rejected": 13.277107238769531, "logps/chosen": -4.149521827697754, "logps/rejected": -4.543828964233398, "loss": 4.2495, "rewards/accuracies": 0.75, "rewards/chosen": -41.49522018432617, "rewards/margins": 3.943065643310547, "rewards/rejected": -45.43828582763672, "step": 3657 }, { "epoch": 0.4980936819172113, "grad_norm": 42.44171087596371, "learning_rate": 4.7214157790192253e-07, "logits/chosen": 13.422065734863281, "logits/rejected": 13.657733917236328, "logps/chosen": -4.278500080108643, "logps/rejected": -4.22097110748291, "loss": 4.3539, "rewards/accuracies": 0.5, "rewards/chosen": -42.785003662109375, "rewards/margins": -0.5752887725830078, "rewards/rejected": -42.209716796875, "step": 3658 }, { "epoch": 0.4982298474945534, "grad_norm": 48.23105103566173, "learning_rate": 4.7195454744859756e-07, "logits/chosen": 13.771947860717773, "logits/rejected": 13.843315124511719, "logps/chosen": -4.208096504211426, "logps/rejected": -4.223526954650879, "loss": 3.8601, "rewards/accuracies": 0.5, "rewards/chosen": -42.08096694946289, "rewards/margins": 0.15430259704589844, "rewards/rejected": -42.235267639160156, "step": 3659 }, { "epoch": 0.49836601307189543, "grad_norm": 37.44965670183754, "learning_rate": 4.717675007365477e-07, "logits/chosen": 13.156255722045898, "logits/rejected": 12.961647033691406, "logps/chosen": -4.0863037109375, "logps/rejected": -4.33465576171875, "loss": 3.7499, "rewards/accuracies": 0.75, "rewards/chosen": -40.863037109375, "rewards/margins": 2.483523368835449, "rewards/rejected": -43.3465576171875, "step": 3660 }, { "epoch": 0.49850217864923746, "grad_norm": 38.73397233606795, "learning_rate": 4.71580437808038e-07, "logits/chosen": 14.232583999633789, "logits/rejected": 13.492393493652344, "logps/chosen": -4.079538345336914, "logps/rejected": -4.340973377227783, "loss": 3.3908, "rewards/accuracies": 0.5, "rewards/chosen": -40.795387268066406, "rewards/margins": 2.614346504211426, "rewards/rejected": -43.409732818603516, "step": 3661 }, { "epoch": 0.49863834422657954, "grad_norm": 37.462976199447354, "learning_rate": 4.7139335870533645e-07, "logits/chosen": 12.085710525512695, "logits/rejected": 12.63180160522461, "logps/chosen": -3.711224317550659, "logps/rejected": -3.823837995529175, "loss": 3.7567, "rewards/accuracies": 0.5, "rewards/chosen": -37.11224365234375, "rewards/margins": 1.1261367797851562, "rewards/rejected": -38.238380432128906, "step": 3662 }, { "epoch": 0.4987745098039216, "grad_norm": 41.277439480606816, "learning_rate": 4.712062634707155e-07, "logits/chosen": 14.09056282043457, "logits/rejected": 14.382221221923828, "logps/chosen": -4.265441417694092, "logps/rejected": -4.496294975280762, "loss": 4.3547, "rewards/accuracies": 0.75, "rewards/chosen": -42.654415130615234, "rewards/margins": 2.308534622192383, "rewards/rejected": -44.96295166015625, "step": 3663 }, { "epoch": 0.4989106753812636, "grad_norm": 51.083037016662104, "learning_rate": 4.710191521464507e-07, "logits/chosen": 12.961610794067383, "logits/rejected": 13.396703720092773, "logps/chosen": -3.9175376892089844, "logps/rejected": -4.097409725189209, "loss": 4.1428, "rewards/accuracies": 0.5, "rewards/chosen": -39.175376892089844, "rewards/margins": 1.7987184524536133, "rewards/rejected": -40.97409439086914, "step": 3664 }, { "epoch": 0.4990468409586057, "grad_norm": 38.61875573052535, "learning_rate": 4.708320247748214e-07, "logits/chosen": 13.8048095703125, "logits/rejected": 12.829907417297363, "logps/chosen": -4.476765155792236, "logps/rejected": -4.132134437561035, "loss": 4.3661, "rewards/accuracies": 0.5, "rewards/chosen": -44.76765441894531, "rewards/margins": -3.4463043212890625, "rewards/rejected": -41.321346282958984, "step": 3665 }, { "epoch": 0.4991830065359477, "grad_norm": 38.889706224092656, "learning_rate": 4.7064488139811063e-07, "logits/chosen": 13.784374237060547, "logits/rejected": 13.8353910446167, "logps/chosen": -4.248200416564941, "logps/rejected": -4.326114654541016, "loss": 4.1803, "rewards/accuracies": 0.5, "rewards/chosen": -42.48200988769531, "rewards/margins": 0.7791366577148438, "rewards/rejected": -43.26114273071289, "step": 3666 }, { "epoch": 0.49931917211328974, "grad_norm": 37.06416838828442, "learning_rate": 4.704577220586049e-07, "logits/chosen": 13.02043342590332, "logits/rejected": 14.411537170410156, "logps/chosen": -3.8845486640930176, "logps/rejected": -4.269952774047852, "loss": 3.6524, "rewards/accuracies": 0.75, "rewards/chosen": -38.84548568725586, "rewards/margins": 3.8540420532226562, "rewards/rejected": -42.69953155517578, "step": 3667 }, { "epoch": 0.49945533769063183, "grad_norm": 43.54591083239908, "learning_rate": 4.702705467985945e-07, "logits/chosen": 13.353172302246094, "logits/rejected": 13.006284713745117, "logps/chosen": -4.232438087463379, "logps/rejected": -4.013671875, "loss": 4.1241, "rewards/accuracies": 0.25, "rewards/chosen": -42.324378967285156, "rewards/margins": -2.1876611709594727, "rewards/rejected": -40.13671875, "step": 3668 }, { "epoch": 0.49959150326797386, "grad_norm": 482.53472712191257, "learning_rate": 4.700833556603731e-07, "logits/chosen": 14.360843658447266, "logits/rejected": 13.354876518249512, "logps/chosen": -4.207189559936523, "logps/rejected": -4.426914215087891, "loss": 5.0787, "rewards/accuracies": 0.5, "rewards/chosen": -42.0718994140625, "rewards/margins": 2.1972475051879883, "rewards/rejected": -44.26914596557617, "step": 3669 }, { "epoch": 0.4997276688453159, "grad_norm": 37.03081556954978, "learning_rate": 4.6989614868623835e-07, "logits/chosen": 13.211884498596191, "logits/rejected": 13.302757263183594, "logps/chosen": -3.90328311920166, "logps/rejected": -4.04489803314209, "loss": 3.5297, "rewards/accuracies": 0.75, "rewards/chosen": -39.03282928466797, "rewards/margins": 1.4161500930786133, "rewards/rejected": -40.44898223876953, "step": 3670 }, { "epoch": 0.49986383442265797, "grad_norm": 36.58661485253715, "learning_rate": 4.69708925918491e-07, "logits/chosen": 13.346057891845703, "logits/rejected": 12.782184600830078, "logps/chosen": -4.316525459289551, "logps/rejected": -4.181373596191406, "loss": 4.3342, "rewards/accuracies": 0.25, "rewards/chosen": -43.165252685546875, "rewards/margins": -1.3515205383300781, "rewards/rejected": -41.81373596191406, "step": 3671 }, { "epoch": 0.5, "grad_norm": 43.261371065233895, "learning_rate": 4.695216873994355e-07, "logits/chosen": 13.976702690124512, "logits/rejected": 13.711198806762695, "logps/chosen": -4.187472820281982, "logps/rejected": -4.050103187561035, "loss": 4.3029, "rewards/accuracies": 0.5, "rewards/chosen": -41.874725341796875, "rewards/margins": -1.3736944198608398, "rewards/rejected": -40.501033782958984, "step": 3672 }, { "epoch": 0.5001361655773421, "grad_norm": 38.629073944803146, "learning_rate": 4.693344331713802e-07, "logits/chosen": 14.284387588500977, "logits/rejected": 14.25655460357666, "logps/chosen": -3.9816770553588867, "logps/rejected": -4.302992820739746, "loss": 3.8579, "rewards/accuracies": 0.75, "rewards/chosen": -39.816768646240234, "rewards/margins": 3.2131600379943848, "rewards/rejected": -43.029930114746094, "step": 3673 }, { "epoch": 0.5002723311546841, "grad_norm": 38.05388322738053, "learning_rate": 4.6914716327663653e-07, "logits/chosen": 13.358076095581055, "logits/rejected": 13.674903869628906, "logps/chosen": -3.7778854370117188, "logps/rejected": -3.9074978828430176, "loss": 3.8248, "rewards/accuracies": 0.5, "rewards/chosen": -37.77885437011719, "rewards/margins": 1.2961254119873047, "rewards/rejected": -39.074981689453125, "step": 3674 }, { "epoch": 0.5004084967320261, "grad_norm": 38.49682675509112, "learning_rate": 4.689598777575197e-07, "logits/chosen": 12.193557739257812, "logits/rejected": 12.777618408203125, "logps/chosen": -4.187429428100586, "logps/rejected": -4.252195358276367, "loss": 4.3743, "rewards/accuracies": 0.5, "rewards/chosen": -41.874298095703125, "rewards/margins": 0.6476554870605469, "rewards/rejected": -42.521949768066406, "step": 3675 }, { "epoch": 0.5005446623093682, "grad_norm": 36.48624788175915, "learning_rate": 4.687725766563485e-07, "logits/chosen": 12.543540954589844, "logits/rejected": 13.973368644714355, "logps/chosen": -4.1989665031433105, "logps/rejected": -4.652449607849121, "loss": 3.7484, "rewards/accuracies": 0.75, "rewards/chosen": -41.98966598510742, "rewards/margins": 4.53482723236084, "rewards/rejected": -46.52449035644531, "step": 3676 }, { "epoch": 0.5006808278867102, "grad_norm": 37.513260058076035, "learning_rate": 4.6858526001544517e-07, "logits/chosen": 12.727363586425781, "logits/rejected": 13.735908508300781, "logps/chosen": -4.08561897277832, "logps/rejected": -4.510705947875977, "loss": 3.811, "rewards/accuracies": 1.0, "rewards/chosen": -40.85618591308594, "rewards/margins": 4.2508697509765625, "rewards/rejected": -45.1070556640625, "step": 3677 }, { "epoch": 0.5008169934640523, "grad_norm": 45.820962985659385, "learning_rate": 4.683979278771353e-07, "logits/chosen": 13.409138679504395, "logits/rejected": 14.23011302947998, "logps/chosen": -4.231164932250977, "logps/rejected": -4.626114368438721, "loss": 3.8258, "rewards/accuracies": 0.75, "rewards/chosen": -42.311649322509766, "rewards/margins": 3.9494943618774414, "rewards/rejected": -46.261146545410156, "step": 3678 }, { "epoch": 0.5009531590413944, "grad_norm": 37.87292327168075, "learning_rate": 4.6821058028374833e-07, "logits/chosen": 13.543352127075195, "logits/rejected": 13.772480964660645, "logps/chosen": -3.999436855316162, "logps/rejected": -4.268976211547852, "loss": 3.9999, "rewards/accuracies": 0.75, "rewards/chosen": -39.99436950683594, "rewards/margins": 2.695392608642578, "rewards/rejected": -42.689762115478516, "step": 3679 }, { "epoch": 0.5010893246187363, "grad_norm": 37.45888285645932, "learning_rate": 4.6802321727761696e-07, "logits/chosen": 13.65077018737793, "logits/rejected": 13.437663078308105, "logps/chosen": -3.9827282428741455, "logps/rejected": -3.9388670921325684, "loss": 3.9254, "rewards/accuracies": 0.75, "rewards/chosen": -39.8272819519043, "rewards/margins": -0.4386138916015625, "rewards/rejected": -39.388671875, "step": 3680 }, { "epoch": 0.5012254901960784, "grad_norm": 40.210725779288694, "learning_rate": 4.678358389010772e-07, "logits/chosen": 13.464021682739258, "logits/rejected": 14.166145324707031, "logps/chosen": -4.186385154724121, "logps/rejected": -4.669926643371582, "loss": 3.6614, "rewards/accuracies": 0.75, "rewards/chosen": -41.86384582519531, "rewards/margins": 4.835418701171875, "rewards/rejected": -46.69926452636719, "step": 3681 }, { "epoch": 0.5013616557734205, "grad_norm": 87.8795345823922, "learning_rate": 4.67648445196469e-07, "logits/chosen": 12.515790939331055, "logits/rejected": 14.02469253540039, "logps/chosen": -3.6751415729522705, "logps/rejected": -4.046807289123535, "loss": 4.0137, "rewards/accuracies": 0.75, "rewards/chosen": -36.75141525268555, "rewards/margins": 3.7166547775268555, "rewards/rejected": -40.46807098388672, "step": 3682 }, { "epoch": 0.5014978213507625, "grad_norm": 42.57022982288637, "learning_rate": 4.6746103620613545e-07, "logits/chosen": 13.714639663696289, "logits/rejected": 14.09381103515625, "logps/chosen": -4.017249584197998, "logps/rejected": -4.199599266052246, "loss": 4.4822, "rewards/accuracies": 0.5, "rewards/chosen": -40.1724967956543, "rewards/margins": 1.8234939575195312, "rewards/rejected": -41.99599075317383, "step": 3683 }, { "epoch": 0.5016339869281046, "grad_norm": 41.544514739012996, "learning_rate": 4.672736119724231e-07, "logits/chosen": 12.783197402954102, "logits/rejected": 12.666183471679688, "logps/chosen": -4.018475532531738, "logps/rejected": -3.7548344135284424, "loss": 4.1419, "rewards/accuracies": 0.25, "rewards/chosen": -40.18475341796875, "rewards/margins": -2.6364078521728516, "rewards/rejected": -37.54834747314453, "step": 3684 }, { "epoch": 0.5017701525054467, "grad_norm": 43.67555473157722, "learning_rate": 4.6708617253768203e-07, "logits/chosen": 14.071325302124023, "logits/rejected": 13.946088790893555, "logps/chosen": -4.467836380004883, "logps/rejected": -4.485749244689941, "loss": 4.0995, "rewards/accuracies": 0.75, "rewards/chosen": -44.67835998535156, "rewards/margins": 0.17912769317626953, "rewards/rejected": -44.85749053955078, "step": 3685 }, { "epoch": 0.5019063180827886, "grad_norm": 37.45554355043911, "learning_rate": 4.6689871794426575e-07, "logits/chosen": 13.393749237060547, "logits/rejected": 13.455429077148438, "logps/chosen": -4.039381504058838, "logps/rejected": -3.8730111122131348, "loss": 4.0619, "rewards/accuracies": 0.5, "rewards/chosen": -40.39381408691406, "rewards/margins": -1.6637067794799805, "rewards/rejected": -38.730106353759766, "step": 3686 }, { "epoch": 0.5020424836601307, "grad_norm": 38.015669797484215, "learning_rate": 4.6671124823453114e-07, "logits/chosen": 13.526649475097656, "logits/rejected": 14.268695831298828, "logps/chosen": -3.7876803874969482, "logps/rejected": -4.387045860290527, "loss": 4.2063, "rewards/accuracies": 1.0, "rewards/chosen": -37.87680435180664, "rewards/margins": 5.993656158447266, "rewards/rejected": -43.870460510253906, "step": 3687 }, { "epoch": 0.5021786492374728, "grad_norm": 38.4760831055411, "learning_rate": 4.6652376345083854e-07, "logits/chosen": 13.340862274169922, "logits/rejected": 13.285551071166992, "logps/chosen": -4.092845439910889, "logps/rejected": -4.00330114364624, "loss": 4.0864, "rewards/accuracies": 0.25, "rewards/chosen": -40.9284553527832, "rewards/margins": -0.8954448699951172, "rewards/rejected": -40.03301239013672, "step": 3688 }, { "epoch": 0.5023148148148148, "grad_norm": 40.208695646815755, "learning_rate": 4.6633626363555177e-07, "logits/chosen": 12.836553573608398, "logits/rejected": 12.531976699829102, "logps/chosen": -3.9171526432037354, "logps/rejected": -4.1147050857543945, "loss": 3.1874, "rewards/accuracies": 1.0, "rewards/chosen": -39.17152404785156, "rewards/margins": 1.97552490234375, "rewards/rejected": -41.14704895019531, "step": 3689 }, { "epoch": 0.5024509803921569, "grad_norm": 45.75655963706031, "learning_rate": 4.661487488310378e-07, "logits/chosen": 12.420095443725586, "logits/rejected": 14.868179321289062, "logps/chosen": -3.7877917289733887, "logps/rejected": -4.478575706481934, "loss": 4.3125, "rewards/accuracies": 1.0, "rewards/chosen": -37.8779182434082, "rewards/margins": 6.9078369140625, "rewards/rejected": -44.78575897216797, "step": 3690 }, { "epoch": 0.5025871459694989, "grad_norm": 42.459885141787666, "learning_rate": 4.6596121907966726e-07, "logits/chosen": 12.910301208496094, "logits/rejected": 14.037810325622559, "logps/chosen": -3.8050174713134766, "logps/rejected": -4.220910549163818, "loss": 4.3816, "rewards/accuracies": 1.0, "rewards/chosen": -38.050174713134766, "rewards/margins": 4.158932685852051, "rewards/rejected": -42.2091064453125, "step": 3691 }, { "epoch": 0.5027233115468409, "grad_norm": 37.76601934746004, "learning_rate": 4.657736744238141e-07, "logits/chosen": 13.531388282775879, "logits/rejected": 14.885564804077148, "logps/chosen": -3.96443772315979, "logps/rejected": -4.240187644958496, "loss": 3.4953, "rewards/accuracies": 0.75, "rewards/chosen": -39.644378662109375, "rewards/margins": 2.7574996948242188, "rewards/rejected": -42.40187454223633, "step": 3692 }, { "epoch": 0.502859477124183, "grad_norm": 39.99104884779223, "learning_rate": 4.655861149058554e-07, "logits/chosen": 13.064823150634766, "logits/rejected": 13.201683044433594, "logps/chosen": -3.95554518699646, "logps/rejected": -4.005729675292969, "loss": 4.3836, "rewards/accuracies": 0.75, "rewards/chosen": -39.555450439453125, "rewards/margins": 0.5018482208251953, "rewards/rejected": -40.05729675292969, "step": 3693 }, { "epoch": 0.5029956427015251, "grad_norm": 34.38048313866216, "learning_rate": 4.6539854056817194e-07, "logits/chosen": 13.493356704711914, "logits/rejected": 14.09914779663086, "logps/chosen": -3.8066258430480957, "logps/rejected": -4.345772743225098, "loss": 3.7609, "rewards/accuracies": 0.75, "rewards/chosen": -38.06625747680664, "rewards/margins": 5.391472816467285, "rewards/rejected": -43.45772933959961, "step": 3694 }, { "epoch": 0.503131808278867, "grad_norm": 39.31744195138398, "learning_rate": 4.6521095145314773e-07, "logits/chosen": 12.802804946899414, "logits/rejected": 13.940014839172363, "logps/chosen": -4.016892433166504, "logps/rejected": -4.465973377227783, "loss": 4.0818, "rewards/accuracies": 1.0, "rewards/chosen": -40.168922424316406, "rewards/margins": 4.490812301635742, "rewards/rejected": -44.65973663330078, "step": 3695 }, { "epoch": 0.5032679738562091, "grad_norm": 38.59789873945545, "learning_rate": 4.650233476031698e-07, "logits/chosen": 13.689371109008789, "logits/rejected": 13.589783668518066, "logps/chosen": -4.035968780517578, "logps/rejected": -4.249928951263428, "loss": 4.0567, "rewards/accuracies": 0.75, "rewards/chosen": -40.359683990478516, "rewards/margins": 2.139603614807129, "rewards/rejected": -42.499290466308594, "step": 3696 }, { "epoch": 0.5034041394335512, "grad_norm": 35.10615042207916, "learning_rate": 4.648357290606292e-07, "logits/chosen": 14.008176803588867, "logits/rejected": 13.675275802612305, "logps/chosen": -4.155710220336914, "logps/rejected": -4.160484790802002, "loss": 3.8265, "rewards/accuracies": 0.25, "rewards/chosen": -41.557098388671875, "rewards/margins": 0.04774761199951172, "rewards/rejected": -41.6048469543457, "step": 3697 }, { "epoch": 0.5035403050108932, "grad_norm": 39.105123126254675, "learning_rate": 4.6464809586791966e-07, "logits/chosen": 12.524421691894531, "logits/rejected": 13.983268737792969, "logps/chosen": -3.4753775596618652, "logps/rejected": -4.023436546325684, "loss": 4.0217, "rewards/accuracies": 0.75, "rewards/chosen": -34.75377655029297, "rewards/margins": 5.480589866638184, "rewards/rejected": -40.2343635559082, "step": 3698 }, { "epoch": 0.5036764705882353, "grad_norm": 40.693801091431645, "learning_rate": 4.644604480674383e-07, "logits/chosen": 13.76347541809082, "logits/rejected": 13.915460586547852, "logps/chosen": -4.25579833984375, "logps/rejected": -4.433274745941162, "loss": 4.2391, "rewards/accuracies": 0.75, "rewards/chosen": -42.5579833984375, "rewards/margins": 1.774759292602539, "rewards/rejected": -44.33274459838867, "step": 3699 }, { "epoch": 0.5038126361655774, "grad_norm": 35.63628871974884, "learning_rate": 4.6427278570158607e-07, "logits/chosen": 12.832319259643555, "logits/rejected": 13.343103408813477, "logps/chosen": -3.6490120887756348, "logps/rejected": -3.855560779571533, "loss": 3.4201, "rewards/accuracies": 0.75, "rewards/chosen": -36.49011993408203, "rewards/margins": 2.065487861633301, "rewards/rejected": -38.55561065673828, "step": 3700 }, { "epoch": 0.5039488017429193, "grad_norm": 37.15440115304678, "learning_rate": 4.6408510881276656e-07, "logits/chosen": 13.270252227783203, "logits/rejected": 13.652961730957031, "logps/chosen": -4.146317958831787, "logps/rejected": -4.228779315948486, "loss": 4.1004, "rewards/accuracies": 0.75, "rewards/chosen": -41.46318054199219, "rewards/margins": 0.8246116638183594, "rewards/rejected": -42.28779220581055, "step": 3701 }, { "epoch": 0.5040849673202614, "grad_norm": 44.768814700613746, "learning_rate": 4.6389741744338693e-07, "logits/chosen": 12.946014404296875, "logits/rejected": 13.338356018066406, "logps/chosen": -4.009507179260254, "logps/rejected": -4.272617340087891, "loss": 4.4059, "rewards/accuracies": 0.75, "rewards/chosen": -40.095069885253906, "rewards/margins": 2.6310997009277344, "rewards/rejected": -42.726173400878906, "step": 3702 }, { "epoch": 0.5042211328976035, "grad_norm": 37.925670469091436, "learning_rate": 4.6370971163585765e-07, "logits/chosen": 13.863119125366211, "logits/rejected": 13.18960189819336, "logps/chosen": -4.355708122253418, "logps/rejected": -4.249809265136719, "loss": 4.2549, "rewards/accuracies": 0.5, "rewards/chosen": -43.55708312988281, "rewards/margins": -1.0589866638183594, "rewards/rejected": -42.49809265136719, "step": 3703 }, { "epoch": 0.5043572984749455, "grad_norm": 41.03546680391918, "learning_rate": 4.635219914325924e-07, "logits/chosen": 12.133167266845703, "logits/rejected": 12.539323806762695, "logps/chosen": -3.500620126724243, "logps/rejected": -3.8997340202331543, "loss": 3.9457, "rewards/accuracies": 0.75, "rewards/chosen": -35.006202697753906, "rewards/margins": 3.991138458251953, "rewards/rejected": -38.99734115600586, "step": 3704 }, { "epoch": 0.5044934640522876, "grad_norm": 35.33478947559178, "learning_rate": 4.6333425687600813e-07, "logits/chosen": 13.2701416015625, "logits/rejected": 13.336626052856445, "logps/chosen": -3.890298843383789, "logps/rejected": -4.231750965118408, "loss": 3.8633, "rewards/accuracies": 1.0, "rewards/chosen": -38.902992248535156, "rewards/margins": 3.414518356323242, "rewards/rejected": -42.317508697509766, "step": 3705 }, { "epoch": 0.5046296296296297, "grad_norm": 36.63156520592772, "learning_rate": 4.6314650800852496e-07, "logits/chosen": 13.02952766418457, "logits/rejected": 12.431659698486328, "logps/chosen": -3.43776535987854, "logps/rejected": -3.327106475830078, "loss": 4.132, "rewards/accuracies": 0.5, "rewards/chosen": -34.37765121459961, "rewards/margins": -1.1065893173217773, "rewards/rejected": -33.27106475830078, "step": 3706 }, { "epoch": 0.5047657952069716, "grad_norm": 35.86638824505477, "learning_rate": 4.6295874487256645e-07, "logits/chosen": 13.118314743041992, "logits/rejected": 13.879725456237793, "logps/chosen": -4.267186641693115, "logps/rejected": -4.532580852508545, "loss": 4.1595, "rewards/accuracies": 0.75, "rewards/chosen": -42.67186737060547, "rewards/margins": 2.6539411544799805, "rewards/rejected": -45.3258056640625, "step": 3707 }, { "epoch": 0.5049019607843137, "grad_norm": 40.7134684260331, "learning_rate": 4.627709675105589e-07, "logits/chosen": 14.153631210327148, "logits/rejected": 13.674840927124023, "logps/chosen": -4.237056732177734, "logps/rejected": -3.977210521697998, "loss": 4.0945, "rewards/accuracies": 0.25, "rewards/chosen": -42.37057113647461, "rewards/margins": -2.598465919494629, "rewards/rejected": -39.77210235595703, "step": 3708 }, { "epoch": 0.5050381263616558, "grad_norm": 38.353270607609645, "learning_rate": 4.625831759649326e-07, "logits/chosen": 13.352453231811523, "logits/rejected": 13.599990844726562, "logps/chosen": -4.179633140563965, "logps/rejected": -4.3026933670043945, "loss": 3.382, "rewards/accuracies": 0.75, "rewards/chosen": -41.79633331298828, "rewards/margins": 1.2306013107299805, "rewards/rejected": -43.02693557739258, "step": 3709 }, { "epoch": 0.5051742919389978, "grad_norm": 36.784808751784745, "learning_rate": 4.623953702781203e-07, "logits/chosen": 13.236488342285156, "logits/rejected": 14.132347106933594, "logps/chosen": -4.006322860717773, "logps/rejected": -4.301856994628906, "loss": 3.6782, "rewards/accuracies": 0.75, "rewards/chosen": -40.063228607177734, "rewards/margins": 2.955338478088379, "rewards/rejected": -43.0185661315918, "step": 3710 }, { "epoch": 0.5053104575163399, "grad_norm": 41.35439292717423, "learning_rate": 4.622075504925582e-07, "logits/chosen": 13.399534225463867, "logits/rejected": 14.676248550415039, "logps/chosen": -4.028219223022461, "logps/rejected": -4.400123596191406, "loss": 4.5144, "rewards/accuracies": 0.75, "rewards/chosen": -40.282196044921875, "rewards/margins": 3.7190446853637695, "rewards/rejected": -44.00123596191406, "step": 3711 }, { "epoch": 0.5054466230936819, "grad_norm": 37.50700793620438, "learning_rate": 4.62019716650686e-07, "logits/chosen": 13.8063325881958, "logits/rejected": 13.937461853027344, "logps/chosen": -4.361696243286133, "logps/rejected": -4.201212406158447, "loss": 4.3389, "rewards/accuracies": 0.25, "rewards/chosen": -43.61696243286133, "rewards/margins": -1.6048402786254883, "rewards/rejected": -42.012123107910156, "step": 3712 }, { "epoch": 0.505582788671024, "grad_norm": 37.75163125544747, "learning_rate": 4.6183186879494603e-07, "logits/chosen": 13.091933250427246, "logits/rejected": 13.892539024353027, "logps/chosen": -3.6662285327911377, "logps/rejected": -4.191458225250244, "loss": 4.1309, "rewards/accuracies": 1.0, "rewards/chosen": -36.66228485107422, "rewards/margins": 5.252297401428223, "rewards/rejected": -41.91458511352539, "step": 3713 }, { "epoch": 0.505718954248366, "grad_norm": 39.91875415566491, "learning_rate": 4.616440069677843e-07, "logits/chosen": 13.473123550415039, "logits/rejected": 13.195207595825195, "logps/chosen": -3.943359613418579, "logps/rejected": -3.9055521488189697, "loss": 4.5458, "rewards/accuracies": 0.25, "rewards/chosen": -39.433597564697266, "rewards/margins": -0.37807369232177734, "rewards/rejected": -39.05552291870117, "step": 3714 }, { "epoch": 0.5058551198257081, "grad_norm": 36.81105240171192, "learning_rate": 4.6145613121164955e-07, "logits/chosen": 13.019323348999023, "logits/rejected": 14.045486450195312, "logps/chosen": -3.642852306365967, "logps/rejected": -4.081232070922852, "loss": 4.0057, "rewards/accuracies": 0.75, "rewards/chosen": -36.42852783203125, "rewards/margins": 4.383798599243164, "rewards/rejected": -40.81232452392578, "step": 3715 }, { "epoch": 0.5059912854030502, "grad_norm": 39.010576632245574, "learning_rate": 4.6126824156899404e-07, "logits/chosen": 14.101935386657715, "logits/rejected": 14.105843544006348, "logps/chosen": -3.883448362350464, "logps/rejected": -4.019222259521484, "loss": 3.8489, "rewards/accuracies": 0.5, "rewards/chosen": -38.83448028564453, "rewards/margins": 1.3577375411987305, "rewards/rejected": -40.192222595214844, "step": 3716 }, { "epoch": 0.5061274509803921, "grad_norm": 36.71349742149015, "learning_rate": 4.6108033808227295e-07, "logits/chosen": 13.746987342834473, "logits/rejected": 14.314157485961914, "logps/chosen": -4.054763317108154, "logps/rejected": -4.327594757080078, "loss": 3.8084, "rewards/accuracies": 0.75, "rewards/chosen": -40.547630310058594, "rewards/margins": 2.7283143997192383, "rewards/rejected": -43.275943756103516, "step": 3717 }, { "epoch": 0.5062636165577342, "grad_norm": 44.35859795337042, "learning_rate": 4.608924207939444e-07, "logits/chosen": 12.926811218261719, "logits/rejected": 12.4734525680542, "logps/chosen": -3.8532674312591553, "logps/rejected": -3.6076836585998535, "loss": 3.6208, "rewards/accuracies": 0.0, "rewards/chosen": -38.532676696777344, "rewards/margins": -2.455838203430176, "rewards/rejected": -36.07683563232422, "step": 3718 }, { "epoch": 0.5063997821350763, "grad_norm": 39.130419576846315, "learning_rate": 4.6070448974647015e-07, "logits/chosen": 13.301280975341797, "logits/rejected": 13.241739273071289, "logps/chosen": -3.5897130966186523, "logps/rejected": -3.8068554401397705, "loss": 3.3341, "rewards/accuracies": 0.5, "rewards/chosen": -35.897132873535156, "rewards/margins": 2.1714224815368652, "rewards/rejected": -38.06855773925781, "step": 3719 }, { "epoch": 0.5065359477124183, "grad_norm": 84.98230558675895, "learning_rate": 4.605165449823146e-07, "logits/chosen": 13.08304214477539, "logits/rejected": 14.038728713989258, "logps/chosen": -3.597261428833008, "logps/rejected": -4.8902387619018555, "loss": 4.1674, "rewards/accuracies": 1.0, "rewards/chosen": -35.97261047363281, "rewards/margins": 12.929773330688477, "rewards/rejected": -48.90238571166992, "step": 3720 }, { "epoch": 0.5066721132897604, "grad_norm": 35.35438391655776, "learning_rate": 4.6032858654394555e-07, "logits/chosen": 13.48090648651123, "logits/rejected": 13.674869537353516, "logps/chosen": -3.900899648666382, "logps/rejected": -3.935330390930176, "loss": 3.611, "rewards/accuracies": 0.75, "rewards/chosen": -39.008995056152344, "rewards/margins": 0.3443107604980469, "rewards/rejected": -39.35330581665039, "step": 3721 }, { "epoch": 0.5068082788671024, "grad_norm": 39.222959318601, "learning_rate": 4.6014061447383367e-07, "logits/chosen": 13.707401275634766, "logits/rejected": 14.144376754760742, "logps/chosen": -4.002793788909912, "logps/rejected": -3.920548439025879, "loss": 3.8953, "rewards/accuracies": 0.75, "rewards/chosen": -40.02793502807617, "rewards/margins": -0.8224515914916992, "rewards/rejected": -39.20548629760742, "step": 3722 }, { "epoch": 0.5069444444444444, "grad_norm": 38.60197155980737, "learning_rate": 4.5995262881445277e-07, "logits/chosen": 13.140287399291992, "logits/rejected": 14.131871223449707, "logps/chosen": -3.7175614833831787, "logps/rejected": -4.134531497955322, "loss": 4.1309, "rewards/accuracies": 1.0, "rewards/chosen": -37.17561340332031, "rewards/margins": 4.169701099395752, "rewards/rejected": -41.34531784057617, "step": 3723 }, { "epoch": 0.5070806100217865, "grad_norm": 35.55424538706183, "learning_rate": 4.597646296082798e-07, "logits/chosen": 13.673868179321289, "logits/rejected": 13.880500793457031, "logps/chosen": -4.159348487854004, "logps/rejected": -4.04018497467041, "loss": 3.9397, "rewards/accuracies": 0.25, "rewards/chosen": -41.593482971191406, "rewards/margins": -1.1916322708129883, "rewards/rejected": -40.401851654052734, "step": 3724 }, { "epoch": 0.5072167755991286, "grad_norm": 39.41340884323736, "learning_rate": 4.595766168977949e-07, "logits/chosen": 12.707221984863281, "logits/rejected": 13.238542556762695, "logps/chosen": -3.7122445106506348, "logps/rejected": -3.8311290740966797, "loss": 3.5155, "rewards/accuracies": 0.5, "rewards/chosen": -37.12244415283203, "rewards/margins": 1.1888446807861328, "rewards/rejected": -38.3112907409668, "step": 3725 }, { "epoch": 0.5073529411764706, "grad_norm": 37.36747407473306, "learning_rate": 4.593885907254807e-07, "logits/chosen": 13.770376205444336, "logits/rejected": 13.82172966003418, "logps/chosen": -4.112585067749023, "logps/rejected": -3.7945778369903564, "loss": 3.9223, "rewards/accuracies": 0.25, "rewards/chosen": -41.1258544921875, "rewards/margins": -3.180074691772461, "rewards/rejected": -37.945777893066406, "step": 3726 }, { "epoch": 0.5074891067538126, "grad_norm": 37.825382449737646, "learning_rate": 4.5920055113382376e-07, "logits/chosen": 13.340105056762695, "logits/rejected": 13.377202033996582, "logps/chosen": -3.661888599395752, "logps/rejected": -3.862450122833252, "loss": 3.9367, "rewards/accuracies": 0.75, "rewards/chosen": -36.6188850402832, "rewards/margins": 2.005615234375, "rewards/rejected": -38.62450408935547, "step": 3727 }, { "epoch": 0.5076252723311547, "grad_norm": 45.28613172527537, "learning_rate": 4.5901249816531287e-07, "logits/chosen": 13.728713989257812, "logits/rejected": 13.534111022949219, "logps/chosen": -4.464786052703857, "logps/rejected": -3.8500313758850098, "loss": 4.9235, "rewards/accuracies": 0.0, "rewards/chosen": -44.64786148071289, "rewards/margins": -6.147550582885742, "rewards/rejected": -38.50031280517578, "step": 3728 }, { "epoch": 0.5077614379084967, "grad_norm": 36.148452045614036, "learning_rate": 4.5882443186244006e-07, "logits/chosen": 13.186102867126465, "logits/rejected": 13.647697448730469, "logps/chosen": -4.047799587249756, "logps/rejected": -4.090826988220215, "loss": 3.6508, "rewards/accuracies": 0.75, "rewards/chosen": -40.477996826171875, "rewards/margins": 0.43027210235595703, "rewards/rejected": -40.90827178955078, "step": 3729 }, { "epoch": 0.5078976034858388, "grad_norm": 38.35437622488243, "learning_rate": 4.586363522677008e-07, "logits/chosen": 14.406911849975586, "logits/rejected": 14.725299835205078, "logps/chosen": -4.263552188873291, "logps/rejected": -4.164470672607422, "loss": 4.0122, "rewards/accuracies": 0.5, "rewards/chosen": -42.635520935058594, "rewards/margins": -0.9908151626586914, "rewards/rejected": -41.64470672607422, "step": 3730 }, { "epoch": 0.5080337690631809, "grad_norm": 40.15215252425789, "learning_rate": 4.58448259423593e-07, "logits/chosen": 13.683032035827637, "logits/rejected": 14.196710586547852, "logps/chosen": -3.691713571548462, "logps/rejected": -3.7348790168762207, "loss": 4.0836, "rewards/accuracies": 0.5, "rewards/chosen": -36.91713333129883, "rewards/margins": 0.4316549301147461, "rewards/rejected": -37.34878921508789, "step": 3731 }, { "epoch": 0.5081699346405228, "grad_norm": 40.07801575349487, "learning_rate": 4.582601533726178e-07, "logits/chosen": 13.959983825683594, "logits/rejected": 14.915107727050781, "logps/chosen": -3.9541783332824707, "logps/rejected": -4.499406337738037, "loss": 3.5654, "rewards/accuracies": 1.0, "rewards/chosen": -39.541786193847656, "rewards/margins": 5.452279090881348, "rewards/rejected": -44.99406433105469, "step": 3732 }, { "epoch": 0.5083061002178649, "grad_norm": 40.94510874139069, "learning_rate": 4.580720341572794e-07, "logits/chosen": 13.287128448486328, "logits/rejected": 13.283363342285156, "logps/chosen": -3.6375975608825684, "logps/rejected": -3.9895269870758057, "loss": 4.0561, "rewards/accuracies": 0.75, "rewards/chosen": -36.3759765625, "rewards/margins": 3.5192956924438477, "rewards/rejected": -39.89527130126953, "step": 3733 }, { "epoch": 0.508442265795207, "grad_norm": 36.34274023393133, "learning_rate": 4.578839018200849e-07, "logits/chosen": 13.159891128540039, "logits/rejected": 13.340267181396484, "logps/chosen": -3.649247407913208, "logps/rejected": -3.9434163570404053, "loss": 3.7773, "rewards/accuracies": 0.75, "rewards/chosen": -36.49247741699219, "rewards/margins": 2.9416885375976562, "rewards/rejected": -39.434165954589844, "step": 3734 }, { "epoch": 0.508578431372549, "grad_norm": 37.05877312298946, "learning_rate": 4.576957564035442e-07, "logits/chosen": 12.400638580322266, "logits/rejected": 13.903390884399414, "logps/chosen": -3.524723529815674, "logps/rejected": -4.11100435256958, "loss": 3.5583, "rewards/accuracies": 1.0, "rewards/chosen": -35.24723434448242, "rewards/margins": 5.862810134887695, "rewards/rejected": -41.110042572021484, "step": 3735 }, { "epoch": 0.5087145969498911, "grad_norm": 56.79801544106982, "learning_rate": 4.5750759795017053e-07, "logits/chosen": 13.101015090942383, "logits/rejected": 13.82496452331543, "logps/chosen": -4.052440643310547, "logps/rejected": -4.105875492095947, "loss": 4.2251, "rewards/accuracies": 0.75, "rewards/chosen": -40.524410247802734, "rewards/margins": 0.5343456268310547, "rewards/rejected": -41.058753967285156, "step": 3736 }, { "epoch": 0.5088507625272332, "grad_norm": 43.04368117411965, "learning_rate": 4.5731942650247975e-07, "logits/chosen": 14.64139461517334, "logits/rejected": 13.583283424377441, "logps/chosen": -4.34086799621582, "logps/rejected": -4.0957183837890625, "loss": 4.2137, "rewards/accuracies": 0.25, "rewards/chosen": -43.40868377685547, "rewards/margins": -2.451496124267578, "rewards/rejected": -40.957183837890625, "step": 3737 }, { "epoch": 0.5089869281045751, "grad_norm": 37.47675339219118, "learning_rate": 4.5713124210299065e-07, "logits/chosen": 12.726774215698242, "logits/rejected": 13.434795379638672, "logps/chosen": -3.648681640625, "logps/rejected": -3.8855953216552734, "loss": 4.3753, "rewards/accuracies": 0.5, "rewards/chosen": -36.48681640625, "rewards/margins": 2.369138717651367, "rewards/rejected": -38.85595703125, "step": 3738 }, { "epoch": 0.5091230936819172, "grad_norm": 36.30071522106621, "learning_rate": 4.5694304479422525e-07, "logits/chosen": 13.909988403320312, "logits/rejected": 14.47462272644043, "logps/chosen": -4.055990695953369, "logps/rejected": -4.276828289031982, "loss": 3.6495, "rewards/accuracies": 0.75, "rewards/chosen": -40.559906005859375, "rewards/margins": 2.2083749771118164, "rewards/rejected": -42.768280029296875, "step": 3739 }, { "epoch": 0.5092592592592593, "grad_norm": 36.30158781874295, "learning_rate": 4.567548346187081e-07, "logits/chosen": 13.71696662902832, "logits/rejected": 13.515626907348633, "logps/chosen": -4.113234043121338, "logps/rejected": -4.082209587097168, "loss": 3.6617, "rewards/accuracies": 0.25, "rewards/chosen": -41.13233947753906, "rewards/margins": -0.3102426528930664, "rewards/rejected": -40.82209396362305, "step": 3740 }, { "epoch": 0.5093954248366013, "grad_norm": 37.71928315840213, "learning_rate": 4.5656661161896695e-07, "logits/chosen": 12.97730827331543, "logits/rejected": 13.23176383972168, "logps/chosen": -3.7225136756896973, "logps/rejected": -4.159213066101074, "loss": 3.8292, "rewards/accuracies": 0.75, "rewards/chosen": -37.225135803222656, "rewards/margins": 4.366996765136719, "rewards/rejected": -41.592132568359375, "step": 3741 }, { "epoch": 0.5095315904139434, "grad_norm": 40.08388167488359, "learning_rate": 4.563783758375323e-07, "logits/chosen": 13.717815399169922, "logits/rejected": 14.809099197387695, "logps/chosen": -3.8497750759124756, "logps/rejected": -4.397520065307617, "loss": 3.7767, "rewards/accuracies": 0.75, "rewards/chosen": -38.49774932861328, "rewards/margins": 5.477449417114258, "rewards/rejected": -43.975196838378906, "step": 3742 }, { "epoch": 0.5096677559912854, "grad_norm": 37.93777405395467, "learning_rate": 4.5619012731693765e-07, "logits/chosen": 13.256850242614746, "logits/rejected": 14.231036186218262, "logps/chosen": -3.834228038787842, "logps/rejected": -4.071171760559082, "loss": 3.8375, "rewards/accuracies": 0.5, "rewards/chosen": -38.342281341552734, "rewards/margins": 2.3694353103637695, "rewards/rejected": -40.71171569824219, "step": 3743 }, { "epoch": 0.5098039215686274, "grad_norm": 43.54278814093535, "learning_rate": 4.5600186609971923e-07, "logits/chosen": 14.389836311340332, "logits/rejected": 14.04520034790039, "logps/chosen": -4.049787521362305, "logps/rejected": -3.975245952606201, "loss": 4.4884, "rewards/accuracies": 0.5, "rewards/chosen": -40.49787902832031, "rewards/margins": -0.7454195022583008, "rewards/rejected": -39.75246047973633, "step": 3744 }, { "epoch": 0.5099400871459695, "grad_norm": 42.5557588682723, "learning_rate": 4.5581359222841626e-07, "logits/chosen": 13.590578079223633, "logits/rejected": 14.129608154296875, "logps/chosen": -4.060914516448975, "logps/rejected": -4.369958877563477, "loss": 4.5297, "rewards/accuracies": 0.75, "rewards/chosen": -40.60914611816406, "rewards/margins": 3.0904407501220703, "rewards/rejected": -43.6995849609375, "step": 3745 }, { "epoch": 0.5100762527233116, "grad_norm": 39.723682191703205, "learning_rate": 4.5562530574557076e-07, "logits/chosen": 12.659399032592773, "logits/rejected": 13.385019302368164, "logps/chosen": -3.530714273452759, "logps/rejected": -3.9129958152770996, "loss": 4.2087, "rewards/accuracies": 0.75, "rewards/chosen": -35.30714416503906, "rewards/margins": 3.822816848754883, "rewards/rejected": -39.12995910644531, "step": 3746 }, { "epoch": 0.5102124183006536, "grad_norm": 37.996773856205046, "learning_rate": 4.5543700669372755e-07, "logits/chosen": 12.81183910369873, "logits/rejected": 13.91339111328125, "logps/chosen": -3.8074584007263184, "logps/rejected": -4.220816612243652, "loss": 3.2603, "rewards/accuracies": 0.75, "rewards/chosen": -38.074581146240234, "rewards/margins": 4.133586883544922, "rewards/rejected": -42.208168029785156, "step": 3747 }, { "epoch": 0.5103485838779956, "grad_norm": 36.48320772516277, "learning_rate": 4.5524869511543453e-07, "logits/chosen": 12.946414947509766, "logits/rejected": 13.178329467773438, "logps/chosen": -4.236767292022705, "logps/rejected": -4.270565032958984, "loss": 4.2276, "rewards/accuracies": 0.5, "rewards/chosen": -42.36767578125, "rewards/margins": 0.33797740936279297, "rewards/rejected": -42.705650329589844, "step": 3748 }, { "epoch": 0.5104847494553377, "grad_norm": 35.316917237655346, "learning_rate": 4.550603710532422e-07, "logits/chosen": 13.771272659301758, "logits/rejected": 14.018260955810547, "logps/chosen": -4.028652667999268, "logps/rejected": -4.470785617828369, "loss": 3.3287, "rewards/accuracies": 1.0, "rewards/chosen": -40.286529541015625, "rewards/margins": 4.421329498291016, "rewards/rejected": -44.707855224609375, "step": 3749 }, { "epoch": 0.5106209150326797, "grad_norm": 38.11403595807267, "learning_rate": 4.5487203454970375e-07, "logits/chosen": 13.050811767578125, "logits/rejected": 13.511486053466797, "logps/chosen": -3.952059507369995, "logps/rejected": -4.218009948730469, "loss": 3.8251, "rewards/accuracies": 1.0, "rewards/chosen": -39.52059555053711, "rewards/margins": 2.659501075744629, "rewards/rejected": -42.18009567260742, "step": 3750 }, { "epoch": 0.5107570806100218, "grad_norm": 41.49271449392801, "learning_rate": 4.5468368564737565e-07, "logits/chosen": 13.394813537597656, "logits/rejected": 13.934735298156738, "logps/chosen": -3.8627328872680664, "logps/rejected": -4.2587080001831055, "loss": 4.3476, "rewards/accuracies": 0.75, "rewards/chosen": -38.6273307800293, "rewards/margins": 3.9597511291503906, "rewards/rejected": -42.58708190917969, "step": 3751 }, { "epoch": 0.5108932461873639, "grad_norm": 33.56286117334342, "learning_rate": 4.544953243888167e-07, "logits/chosen": 13.324541091918945, "logits/rejected": 13.859236717224121, "logps/chosen": -3.6178669929504395, "logps/rejected": -4.287760257720947, "loss": 3.2354, "rewards/accuracies": 1.0, "rewards/chosen": -36.178672790527344, "rewards/margins": 6.698929786682129, "rewards/rejected": -42.877601623535156, "step": 3752 }, { "epoch": 0.5110294117647058, "grad_norm": 38.671313802438235, "learning_rate": 4.543069508165887e-07, "logits/chosen": 13.861710548400879, "logits/rejected": 12.982012748718262, "logps/chosen": -3.8776750564575195, "logps/rejected": -3.936124563217163, "loss": 4.3426, "rewards/accuracies": 0.5, "rewards/chosen": -38.77674865722656, "rewards/margins": 0.5844969749450684, "rewards/rejected": -39.361244201660156, "step": 3753 }, { "epoch": 0.5111655773420479, "grad_norm": 38.004924592538785, "learning_rate": 4.541185649732563e-07, "logits/chosen": 12.605157852172852, "logits/rejected": 12.941219329833984, "logps/chosen": -3.756941556930542, "logps/rejected": -3.9255220890045166, "loss": 3.9055, "rewards/accuracies": 1.0, "rewards/chosen": -37.56941604614258, "rewards/margins": 1.685807228088379, "rewards/rejected": -39.25522232055664, "step": 3754 }, { "epoch": 0.51130174291939, "grad_norm": 39.617493814138975, "learning_rate": 4.539301669013868e-07, "logits/chosen": 13.678513526916504, "logits/rejected": 13.420612335205078, "logps/chosen": -3.9987223148345947, "logps/rejected": -3.931290864944458, "loss": 4.2481, "rewards/accuracies": 0.25, "rewards/chosen": -39.987220764160156, "rewards/margins": -0.6743144989013672, "rewards/rejected": -39.31290817260742, "step": 3755 }, { "epoch": 0.511437908496732, "grad_norm": 40.13558785390202, "learning_rate": 4.5374175664355033e-07, "logits/chosen": 13.17294979095459, "logits/rejected": 13.885499000549316, "logps/chosen": -3.991234064102173, "logps/rejected": -4.135787487030029, "loss": 4.0389, "rewards/accuracies": 0.5, "rewards/chosen": -39.9123420715332, "rewards/margins": 1.4455347061157227, "rewards/rejected": -41.35787582397461, "step": 3756 }, { "epoch": 0.5115740740740741, "grad_norm": 35.4425427860897, "learning_rate": 4.535533342423196e-07, "logits/chosen": 13.10580062866211, "logits/rejected": 13.922304153442383, "logps/chosen": -4.014782905578613, "logps/rejected": -4.386889457702637, "loss": 3.7713, "rewards/accuracies": 1.0, "rewards/chosen": -40.1478271484375, "rewards/margins": 3.721064567565918, "rewards/rejected": -43.868892669677734, "step": 3757 }, { "epoch": 0.5117102396514162, "grad_norm": 38.62773516932735, "learning_rate": 4.5336489974027044e-07, "logits/chosen": 13.360321044921875, "logits/rejected": 13.453645706176758, "logps/chosen": -3.944092273712158, "logps/rejected": -4.064990997314453, "loss": 4.1795, "rewards/accuracies": 0.75, "rewards/chosen": -39.44092559814453, "rewards/margins": 1.2089862823486328, "rewards/rejected": -40.64990997314453, "step": 3758 }, { "epoch": 0.5118464052287581, "grad_norm": 40.57041848791305, "learning_rate": 4.53176453179981e-07, "logits/chosen": 12.780899047851562, "logits/rejected": 13.742382049560547, "logps/chosen": -3.9502062797546387, "logps/rejected": -4.169887542724609, "loss": 4.4745, "rewards/accuracies": 0.75, "rewards/chosen": -39.50205993652344, "rewards/margins": 2.19681453704834, "rewards/rejected": -41.698875427246094, "step": 3759 }, { "epoch": 0.5119825708061002, "grad_norm": 34.72812314972657, "learning_rate": 4.5298799460403244e-07, "logits/chosen": 12.307083129882812, "logits/rejected": 12.979097366333008, "logps/chosen": -3.7170839309692383, "logps/rejected": -3.956725835800171, "loss": 4.0431, "rewards/accuracies": 0.75, "rewards/chosen": -37.170841217041016, "rewards/margins": 2.3964176177978516, "rewards/rejected": -39.567256927490234, "step": 3760 }, { "epoch": 0.5121187363834423, "grad_norm": 40.239016288054, "learning_rate": 4.5279952405500844e-07, "logits/chosen": 12.749614715576172, "logits/rejected": 12.605993270874023, "logps/chosen": -3.7188096046447754, "logps/rejected": -3.575883150100708, "loss": 4.3801, "rewards/accuracies": 0.0, "rewards/chosen": -37.18809509277344, "rewards/margins": -1.4292645454406738, "rewards/rejected": -35.75883102416992, "step": 3761 }, { "epoch": 0.5122549019607843, "grad_norm": 38.34933035952213, "learning_rate": 4.5261104157549567e-07, "logits/chosen": 12.410140037536621, "logits/rejected": 12.722663879394531, "logps/chosen": -3.806065559387207, "logps/rejected": -3.721813678741455, "loss": 4.0506, "rewards/accuracies": 0.5, "rewards/chosen": -38.0606575012207, "rewards/margins": -0.8425216674804688, "rewards/rejected": -37.21813201904297, "step": 3762 }, { "epoch": 0.5123910675381264, "grad_norm": 40.757745896243605, "learning_rate": 4.5242254720808307e-07, "logits/chosen": 12.700061798095703, "logits/rejected": 13.99898910522461, "logps/chosen": -3.716630458831787, "logps/rejected": -4.419391632080078, "loss": 3.6853, "rewards/accuracies": 1.0, "rewards/chosen": -37.16630554199219, "rewards/margins": 7.027612686157227, "rewards/rejected": -44.19391632080078, "step": 3763 }, { "epoch": 0.5125272331154684, "grad_norm": 38.19604422682524, "learning_rate": 4.522340409953625e-07, "logits/chosen": 13.026041030883789, "logits/rejected": 12.985336303710938, "logps/chosen": -4.080806255340576, "logps/rejected": -3.9236862659454346, "loss": 3.8367, "rewards/accuracies": 0.25, "rewards/chosen": -40.80806350708008, "rewards/margins": -1.5711994171142578, "rewards/rejected": -39.23686218261719, "step": 3764 }, { "epoch": 0.5126633986928104, "grad_norm": 34.65865308191223, "learning_rate": 4.520455229799287e-07, "logits/chosen": 12.729635238647461, "logits/rejected": 13.482295036315918, "logps/chosen": -3.756649971008301, "logps/rejected": -4.185482978820801, "loss": 3.7971, "rewards/accuracies": 0.75, "rewards/chosen": -37.56650161743164, "rewards/margins": 4.288326263427734, "rewards/rejected": -41.854827880859375, "step": 3765 }, { "epoch": 0.5127995642701525, "grad_norm": 38.82994470862406, "learning_rate": 4.518569932043787e-07, "logits/chosen": 13.288806915283203, "logits/rejected": 13.700876235961914, "logps/chosen": -4.017117500305176, "logps/rejected": -4.161367416381836, "loss": 3.5919, "rewards/accuracies": 0.75, "rewards/chosen": -40.171173095703125, "rewards/margins": 1.4424972534179688, "rewards/rejected": -41.613670349121094, "step": 3766 }, { "epoch": 0.5129357298474946, "grad_norm": 36.01107522285141, "learning_rate": 4.516684517113126e-07, "logits/chosen": 12.83940315246582, "logits/rejected": 13.987436294555664, "logps/chosen": -4.1019606590271, "logps/rejected": -4.3095903396606445, "loss": 3.9714, "rewards/accuracies": 1.0, "rewards/chosen": -41.01961135864258, "rewards/margins": 2.0762901306152344, "rewards/rejected": -43.09590148925781, "step": 3767 }, { "epoch": 0.5130718954248366, "grad_norm": 39.87909871172806, "learning_rate": 4.514798985433326e-07, "logits/chosen": 13.27290153503418, "logits/rejected": 14.209003448486328, "logps/chosen": -3.9199512004852295, "logps/rejected": -4.095077037811279, "loss": 3.5964, "rewards/accuracies": 0.5, "rewards/chosen": -39.19951248168945, "rewards/margins": 1.7512598037719727, "rewards/rejected": -40.95077133178711, "step": 3768 }, { "epoch": 0.5132080610021786, "grad_norm": 39.018550631254485, "learning_rate": 4.51291333743044e-07, "logits/chosen": 13.009000778198242, "logits/rejected": 13.442261695861816, "logps/chosen": -4.1725029945373535, "logps/rejected": -4.269870758056641, "loss": 4.2107, "rewards/accuracies": 0.75, "rewards/chosen": -41.72502899169922, "rewards/margins": 0.9736804962158203, "rewards/rejected": -42.69871139526367, "step": 3769 }, { "epoch": 0.5133442265795207, "grad_norm": 43.37001431404562, "learning_rate": 4.5110275735305467e-07, "logits/chosen": 12.92611312866211, "logits/rejected": 13.230709075927734, "logps/chosen": -4.222201347351074, "logps/rejected": -4.192404747009277, "loss": 3.3732, "rewards/accuracies": 0.5, "rewards/chosen": -42.22201156616211, "rewards/margins": -0.2979612350463867, "rewards/rejected": -41.924049377441406, "step": 3770 }, { "epoch": 0.5134803921568627, "grad_norm": 39.302487439849024, "learning_rate": 4.509141694159748e-07, "logits/chosen": 12.599834442138672, "logits/rejected": 13.630470275878906, "logps/chosen": -3.942234992980957, "logps/rejected": -4.625652313232422, "loss": 4.4182, "rewards/accuracies": 1.0, "rewards/chosen": -39.4223518371582, "rewards/margins": 6.834175109863281, "rewards/rejected": -46.256526947021484, "step": 3771 }, { "epoch": 0.5136165577342048, "grad_norm": 42.367176495000024, "learning_rate": 4.507255699744175e-07, "logits/chosen": 12.880456924438477, "logits/rejected": 13.007497787475586, "logps/chosen": -4.17625617980957, "logps/rejected": -4.164802551269531, "loss": 3.6983, "rewards/accuracies": 0.5, "rewards/chosen": -41.76255798339844, "rewards/margins": -0.11453437805175781, "rewards/rejected": -41.64802551269531, "step": 3772 }, { "epoch": 0.5137527233115469, "grad_norm": 36.41907403995017, "learning_rate": 4.505369590709984e-07, "logits/chosen": 12.344406127929688, "logits/rejected": 12.74416732788086, "logps/chosen": -3.7854411602020264, "logps/rejected": -3.9681639671325684, "loss": 3.8424, "rewards/accuracies": 0.5, "rewards/chosen": -37.85441207885742, "rewards/margins": 1.8272294998168945, "rewards/rejected": -39.681640625, "step": 3773 }, { "epoch": 0.5138888888888888, "grad_norm": 36.63291577033337, "learning_rate": 4.5034833674833556e-07, "logits/chosen": 13.44710636138916, "logits/rejected": 13.381308555603027, "logps/chosen": -4.137078285217285, "logps/rejected": -4.432437419891357, "loss": 4.0553, "rewards/accuracies": 0.75, "rewards/chosen": -41.37078094482422, "rewards/margins": 2.953592300415039, "rewards/rejected": -44.324371337890625, "step": 3774 }, { "epoch": 0.5140250544662309, "grad_norm": 38.736943164653894, "learning_rate": 4.501597030490499e-07, "logits/chosen": 13.102062225341797, "logits/rejected": 12.625862121582031, "logps/chosen": -3.8742833137512207, "logps/rejected": -4.184597969055176, "loss": 4.1745, "rewards/accuracies": 0.75, "rewards/chosen": -38.742828369140625, "rewards/margins": 3.1031484603881836, "rewards/rejected": -41.845977783203125, "step": 3775 }, { "epoch": 0.514161220043573, "grad_norm": 51.59688605410542, "learning_rate": 4.4997105801576474e-07, "logits/chosen": 12.945079803466797, "logits/rejected": 13.16374397277832, "logps/chosen": -4.294767379760742, "logps/rejected": -4.186925411224365, "loss": 3.66, "rewards/accuracies": 0.25, "rewards/chosen": -42.947669982910156, "rewards/margins": -1.0784168243408203, "rewards/rejected": -41.86925506591797, "step": 3776 }, { "epoch": 0.514297385620915, "grad_norm": 37.36490927278037, "learning_rate": 4.4978240169110596e-07, "logits/chosen": 12.576887130737305, "logits/rejected": 13.927830696105957, "logps/chosen": -4.14310884475708, "logps/rejected": -4.607954025268555, "loss": 3.6841, "rewards/accuracies": 0.75, "rewards/chosen": -41.43109130859375, "rewards/margins": 4.64845085144043, "rewards/rejected": -46.07954025268555, "step": 3777 }, { "epoch": 0.5144335511982571, "grad_norm": 36.62664249374876, "learning_rate": 4.4959373411770194e-07, "logits/chosen": 14.237691879272461, "logits/rejected": 13.155254364013672, "logps/chosen": -4.121145248413086, "logps/rejected": -4.206061363220215, "loss": 3.559, "rewards/accuracies": 0.25, "rewards/chosen": -41.211448669433594, "rewards/margins": 0.8491659164428711, "rewards/rejected": -42.06061553955078, "step": 3778 }, { "epoch": 0.5145697167755992, "grad_norm": 39.39890693056433, "learning_rate": 4.4940505533818384e-07, "logits/chosen": 13.044612884521484, "logits/rejected": 12.973260879516602, "logps/chosen": -3.8424854278564453, "logps/rejected": -4.012454986572266, "loss": 4.0767, "rewards/accuracies": 0.5, "rewards/chosen": -38.42485427856445, "rewards/margins": 1.699692726135254, "rewards/rejected": -40.12454605102539, "step": 3779 }, { "epoch": 0.5147058823529411, "grad_norm": 42.595822482454096, "learning_rate": 4.49216365395185e-07, "logits/chosen": 13.906545639038086, "logits/rejected": 14.006696701049805, "logps/chosen": -4.481637954711914, "logps/rejected": -4.383780002593994, "loss": 4.3193, "rewards/accuracies": 0.25, "rewards/chosen": -44.81637954711914, "rewards/margins": -0.9785757064819336, "rewards/rejected": -43.83780288696289, "step": 3780 }, { "epoch": 0.5148420479302832, "grad_norm": 39.082385689650195, "learning_rate": 4.490276643313417e-07, "logits/chosen": 13.187118530273438, "logits/rejected": 12.926397323608398, "logps/chosen": -4.1569743156433105, "logps/rejected": -4.116525650024414, "loss": 3.4576, "rewards/accuracies": 0.5, "rewards/chosen": -41.569740295410156, "rewards/margins": -0.4044828414916992, "rewards/rejected": -41.16525650024414, "step": 3781 }, { "epoch": 0.5149782135076253, "grad_norm": 39.95446725650409, "learning_rate": 4.4883895218929233e-07, "logits/chosen": 12.765993118286133, "logits/rejected": 13.311088562011719, "logps/chosen": -3.931917905807495, "logps/rejected": -4.293117523193359, "loss": 3.4015, "rewards/accuracies": 0.75, "rewards/chosen": -39.319175720214844, "rewards/margins": 3.61199951171875, "rewards/rejected": -42.93117904663086, "step": 3782 }, { "epoch": 0.5151143790849673, "grad_norm": 242.97376889026495, "learning_rate": 4.486502290116779e-07, "logits/chosen": 12.98375129699707, "logits/rejected": 13.178714752197266, "logps/chosen": -3.9240596294403076, "logps/rejected": -4.194137096405029, "loss": 3.3216, "rewards/accuracies": 1.0, "rewards/chosen": -39.24059295654297, "rewards/margins": 2.7007765769958496, "rewards/rejected": -41.94137191772461, "step": 3783 }, { "epoch": 0.5152505446623094, "grad_norm": 48.93663912260957, "learning_rate": 4.4846149484114226e-07, "logits/chosen": 12.903072357177734, "logits/rejected": 13.599401473999023, "logps/chosen": -4.069379806518555, "logps/rejected": -4.398215293884277, "loss": 4.6259, "rewards/accuracies": 0.75, "rewards/chosen": -40.69379806518555, "rewards/margins": 3.288355827331543, "rewards/rejected": -43.982154846191406, "step": 3784 }, { "epoch": 0.5153867102396514, "grad_norm": 43.288224358442996, "learning_rate": 4.4827274972033116e-07, "logits/chosen": 13.452398300170898, "logits/rejected": 13.567840576171875, "logps/chosen": -4.114901542663574, "logps/rejected": -4.311823844909668, "loss": 3.7304, "rewards/accuracies": 0.75, "rewards/chosen": -41.14901351928711, "rewards/margins": 1.9692277908325195, "rewards/rejected": -43.11824035644531, "step": 3785 }, { "epoch": 0.5155228758169934, "grad_norm": 48.07659214249712, "learning_rate": 4.480839936918932e-07, "logits/chosen": 13.728322982788086, "logits/rejected": 13.020339965820312, "logps/chosen": -4.379891872406006, "logps/rejected": -4.2770609855651855, "loss": 4.5628, "rewards/accuracies": 0.5, "rewards/chosen": -43.79891586303711, "rewards/margins": -1.028306007385254, "rewards/rejected": -42.770606994628906, "step": 3786 }, { "epoch": 0.5156590413943355, "grad_norm": 40.24152880360725, "learning_rate": 4.4789522679847946e-07, "logits/chosen": 12.98793888092041, "logits/rejected": 13.802156448364258, "logps/chosen": -3.757053852081299, "logps/rejected": -4.279419898986816, "loss": 3.4131, "rewards/accuracies": 0.75, "rewards/chosen": -37.57054138183594, "rewards/margins": 5.223658561706543, "rewards/rejected": -42.79419708251953, "step": 3787 }, { "epoch": 0.5157952069716776, "grad_norm": 43.52866699704237, "learning_rate": 4.477064490827434e-07, "logits/chosen": 13.082080841064453, "logits/rejected": 13.10771369934082, "logps/chosen": -3.706634521484375, "logps/rejected": -4.0182037353515625, "loss": 4.007, "rewards/accuracies": 1.0, "rewards/chosen": -37.06634521484375, "rewards/margins": 3.115694046020508, "rewards/rejected": -40.18204116821289, "step": 3788 }, { "epoch": 0.5159313725490197, "grad_norm": 45.18385892750102, "learning_rate": 4.4751766058734065e-07, "logits/chosen": 12.65635871887207, "logits/rejected": 13.05176067352295, "logps/chosen": -4.103686332702637, "logps/rejected": -4.3496551513671875, "loss": 4.7735, "rewards/accuracies": 0.5, "rewards/chosen": -41.036865234375, "rewards/margins": 2.4596872329711914, "rewards/rejected": -43.496551513671875, "step": 3789 }, { "epoch": 0.5160675381263616, "grad_norm": 42.97058348808956, "learning_rate": 4.4732886135492985e-07, "logits/chosen": 13.03309440612793, "logits/rejected": 14.28253173828125, "logps/chosen": -3.799917221069336, "logps/rejected": -4.371269702911377, "loss": 3.648, "rewards/accuracies": 1.0, "rewards/chosen": -37.99917221069336, "rewards/margins": 5.713525772094727, "rewards/rejected": -43.71269607543945, "step": 3790 }, { "epoch": 0.5162037037037037, "grad_norm": 39.43052175236178, "learning_rate": 4.4714005142817155e-07, "logits/chosen": 13.521123886108398, "logits/rejected": 12.710335731506348, "logps/chosen": -3.7991037368774414, "logps/rejected": -4.059488296508789, "loss": 3.3323, "rewards/accuracies": 0.75, "rewards/chosen": -37.99103546142578, "rewards/margins": 2.603849411010742, "rewards/rejected": -40.594886779785156, "step": 3791 }, { "epoch": 0.5163398692810458, "grad_norm": 42.01527996055959, "learning_rate": 4.4695123084972887e-07, "logits/chosen": 14.183685302734375, "logits/rejected": 13.803436279296875, "logps/chosen": -4.383305072784424, "logps/rejected": -4.223597526550293, "loss": 4.432, "rewards/accuracies": 0.5, "rewards/chosen": -43.83305358886719, "rewards/margins": -1.597076416015625, "rewards/rejected": -42.23597717285156, "step": 3792 }, { "epoch": 0.5164760348583878, "grad_norm": 40.72876664903349, "learning_rate": 4.467623996622676e-07, "logits/chosen": 13.04000473022461, "logits/rejected": 13.275419235229492, "logps/chosen": -4.082405090332031, "logps/rejected": -4.125216484069824, "loss": 4.3255, "rewards/accuracies": 0.25, "rewards/chosen": -40.82404708862305, "rewards/margins": 0.4281187057495117, "rewards/rejected": -41.252166748046875, "step": 3793 }, { "epoch": 0.5166122004357299, "grad_norm": 43.8555811744622, "learning_rate": 4.4657355790845564e-07, "logits/chosen": 12.978513717651367, "logits/rejected": 13.657918930053711, "logps/chosen": -4.112308025360107, "logps/rejected": -4.42271089553833, "loss": 4.4825, "rewards/accuracies": 0.75, "rewards/chosen": -41.12308120727539, "rewards/margins": 3.104029655456543, "rewards/rejected": -44.22711181640625, "step": 3794 }, { "epoch": 0.516748366013072, "grad_norm": 42.902163427459584, "learning_rate": 4.4638470563096307e-07, "logits/chosen": 13.247177124023438, "logits/rejected": 13.231805801391602, "logps/chosen": -4.161548614501953, "logps/rejected": -3.921454906463623, "loss": 4.0233, "rewards/accuracies": 0.25, "rewards/chosen": -41.6154899597168, "rewards/margins": -2.40093994140625, "rewards/rejected": -39.21455001831055, "step": 3795 }, { "epoch": 0.5168845315904139, "grad_norm": 39.47017783431696, "learning_rate": 4.4619584287246306e-07, "logits/chosen": 13.58019733428955, "logits/rejected": 13.254776000976562, "logps/chosen": -4.266885757446289, "logps/rejected": -4.471860885620117, "loss": 3.9469, "rewards/accuracies": 0.75, "rewards/chosen": -42.668861389160156, "rewards/margins": 2.0497474670410156, "rewards/rejected": -44.71860885620117, "step": 3796 }, { "epoch": 0.517020697167756, "grad_norm": 39.027151218627004, "learning_rate": 4.4600696967563046e-07, "logits/chosen": 13.008434295654297, "logits/rejected": 13.165806770324707, "logps/chosen": -3.860924005508423, "logps/rejected": -4.054329872131348, "loss": 3.9415, "rewards/accuracies": 0.5, "rewards/chosen": -38.6092414855957, "rewards/margins": 1.9340572357177734, "rewards/rejected": -40.543296813964844, "step": 3797 }, { "epoch": 0.5171568627450981, "grad_norm": 36.12916777514075, "learning_rate": 4.458180860831426e-07, "logits/chosen": 13.177595138549805, "logits/rejected": 13.929386138916016, "logps/chosen": -3.904202461242676, "logps/rejected": -4.453307151794434, "loss": 3.7779, "rewards/accuracies": 1.0, "rewards/chosen": -39.04202651977539, "rewards/margins": 5.491044044494629, "rewards/rejected": -44.5330696105957, "step": 3798 }, { "epoch": 0.5172930283224401, "grad_norm": 39.603275579228836, "learning_rate": 4.4562919213767963e-07, "logits/chosen": 12.746832847595215, "logits/rejected": 13.669816970825195, "logps/chosen": -4.085616111755371, "logps/rejected": -4.241093158721924, "loss": 3.5935, "rewards/accuracies": 0.5, "rewards/chosen": -40.856163024902344, "rewards/margins": 1.5547714233398438, "rewards/rejected": -42.41093444824219, "step": 3799 }, { "epoch": 0.5174291938997821, "grad_norm": 43.875675705045275, "learning_rate": 4.454402878819235e-07, "logits/chosen": 14.235810279846191, "logits/rejected": 13.090825080871582, "logps/chosen": -4.48944616317749, "logps/rejected": -4.287952423095703, "loss": 4.2535, "rewards/accuracies": 0.25, "rewards/chosen": -44.89446258544922, "rewards/margins": -2.014939308166504, "rewards/rejected": -42.87952423095703, "step": 3800 }, { "epoch": 0.5175653594771242, "grad_norm": 38.80474212985557, "learning_rate": 4.4525137335855857e-07, "logits/chosen": 13.213455200195312, "logits/rejected": 13.585043907165527, "logps/chosen": -4.090083599090576, "logps/rejected": -4.141461372375488, "loss": 3.9758, "rewards/accuracies": 0.5, "rewards/chosen": -40.90083312988281, "rewards/margins": 0.5137805938720703, "rewards/rejected": -41.414615631103516, "step": 3801 }, { "epoch": 0.5177015250544662, "grad_norm": 35.476859671937824, "learning_rate": 4.450624486102719e-07, "logits/chosen": 13.586387634277344, "logits/rejected": 13.926193237304688, "logps/chosen": -4.03373908996582, "logps/rejected": -4.405948638916016, "loss": 3.9132, "rewards/accuracies": 1.0, "rewards/chosen": -40.3373908996582, "rewards/margins": 3.722095489501953, "rewards/rejected": -44.059486389160156, "step": 3802 }, { "epoch": 0.5178376906318083, "grad_norm": 38.821213391260436, "learning_rate": 4.4487351367975254e-07, "logits/chosen": 12.434480667114258, "logits/rejected": 14.388665199279785, "logps/chosen": -3.7520995140075684, "logps/rejected": -4.578249931335449, "loss": 3.4953, "rewards/accuracies": 1.0, "rewards/chosen": -37.520992279052734, "rewards/margins": 8.261502265930176, "rewards/rejected": -45.782493591308594, "step": 3803 }, { "epoch": 0.5179738562091504, "grad_norm": 38.94731782900564, "learning_rate": 4.4468456860969165e-07, "logits/chosen": 13.277559280395508, "logits/rejected": 13.475746154785156, "logps/chosen": -4.107166290283203, "logps/rejected": -4.355765342712402, "loss": 3.6232, "rewards/accuracies": 0.5, "rewards/chosen": -41.07166290283203, "rewards/margins": 2.4859933853149414, "rewards/rejected": -43.557655334472656, "step": 3804 }, { "epoch": 0.5181100217864923, "grad_norm": 40.33759183093183, "learning_rate": 4.4449561344278325e-07, "logits/chosen": 14.457645416259766, "logits/rejected": 14.237241744995117, "logps/chosen": -4.373144149780273, "logps/rejected": -4.280690670013428, "loss": 3.957, "rewards/accuracies": 0.5, "rewards/chosen": -43.7314453125, "rewards/margins": -0.9245376586914062, "rewards/rejected": -42.80690383911133, "step": 3805 }, { "epoch": 0.5182461873638344, "grad_norm": 39.140390138504245, "learning_rate": 4.443066482217232e-07, "logits/chosen": 14.125215530395508, "logits/rejected": 13.698784828186035, "logps/chosen": -4.429148197174072, "logps/rejected": -4.667788505554199, "loss": 4.0749, "rewards/accuracies": 0.75, "rewards/chosen": -44.29148483276367, "rewards/margins": 2.3863983154296875, "rewards/rejected": -46.67788314819336, "step": 3806 }, { "epoch": 0.5183823529411765, "grad_norm": 40.19585038028774, "learning_rate": 4.4411767298920966e-07, "logits/chosen": 12.91386604309082, "logits/rejected": 12.993829727172852, "logps/chosen": -3.694887638092041, "logps/rejected": -3.9138574600219727, "loss": 4.3456, "rewards/accuracies": 0.75, "rewards/chosen": -36.948875427246094, "rewards/margins": 2.189697265625, "rewards/rejected": -39.138572692871094, "step": 3807 }, { "epoch": 0.5185185185185185, "grad_norm": 37.8764093041105, "learning_rate": 4.439286877879432e-07, "logits/chosen": 13.411691665649414, "logits/rejected": 13.453086853027344, "logps/chosen": -3.8223378658294678, "logps/rejected": -4.075066566467285, "loss": 3.8875, "rewards/accuracies": 1.0, "rewards/chosen": -38.22338104248047, "rewards/margins": 2.5272884368896484, "rewards/rejected": -40.750667572021484, "step": 3808 }, { "epoch": 0.5186546840958606, "grad_norm": 41.807579130777924, "learning_rate": 4.4373969266062675e-07, "logits/chosen": 13.14992904663086, "logits/rejected": 13.483932495117188, "logps/chosen": -4.203643798828125, "logps/rejected": -4.510344505310059, "loss": 4.3279, "rewards/accuracies": 0.75, "rewards/chosen": -42.03643798828125, "rewards/margins": 3.0670108795166016, "rewards/rejected": -45.10344696044922, "step": 3809 }, { "epoch": 0.5187908496732027, "grad_norm": 37.14256939830993, "learning_rate": 4.4355068764996504e-07, "logits/chosen": 13.047605514526367, "logits/rejected": 13.214554786682129, "logps/chosen": -3.7806332111358643, "logps/rejected": -3.9400527477264404, "loss": 3.8563, "rewards/accuracies": 0.75, "rewards/chosen": -37.80633544921875, "rewards/margins": 1.5941963195800781, "rewards/rejected": -39.40052795410156, "step": 3810 }, { "epoch": 0.5189270152505446, "grad_norm": 42.65714455481354, "learning_rate": 4.433616727986656e-07, "logits/chosen": 13.396869659423828, "logits/rejected": 13.694663047790527, "logps/chosen": -4.088959217071533, "logps/rejected": -4.46164608001709, "loss": 4.0849, "rewards/accuracies": 1.0, "rewards/chosen": -40.88959503173828, "rewards/margins": 3.7268638610839844, "rewards/rejected": -44.616458892822266, "step": 3811 }, { "epoch": 0.5190631808278867, "grad_norm": 37.40738448264117, "learning_rate": 4.431726481494376e-07, "logits/chosen": 12.43388843536377, "logits/rejected": 13.597661018371582, "logps/chosen": -3.686826229095459, "logps/rejected": -4.115617752075195, "loss": 3.9058, "rewards/accuracies": 0.75, "rewards/chosen": -36.868263244628906, "rewards/margins": 4.287914276123047, "rewards/rejected": -41.15618133544922, "step": 3812 }, { "epoch": 0.5191993464052288, "grad_norm": 42.08241418028146, "learning_rate": 4.4298361374499305e-07, "logits/chosen": 13.291372299194336, "logits/rejected": 13.64886474609375, "logps/chosen": -4.0193772315979, "logps/rejected": -4.158469200134277, "loss": 4.4636, "rewards/accuracies": 0.5, "rewards/chosen": -40.19377136230469, "rewards/margins": 1.3909239768981934, "rewards/rejected": -41.584693908691406, "step": 3813 }, { "epoch": 0.5193355119825708, "grad_norm": 42.63466028111096, "learning_rate": 4.4279456962804556e-07, "logits/chosen": 12.860695838928223, "logits/rejected": 13.847959518432617, "logps/chosen": -3.542239189147949, "logps/rejected": -3.9935035705566406, "loss": 3.7041, "rewards/accuracies": 1.0, "rewards/chosen": -35.42238998413086, "rewards/margins": 4.512641906738281, "rewards/rejected": -39.935035705566406, "step": 3814 }, { "epoch": 0.5194716775599129, "grad_norm": 40.0101869299381, "learning_rate": 4.4260551584131135e-07, "logits/chosen": 12.744579315185547, "logits/rejected": 14.0053129196167, "logps/chosen": -3.727703094482422, "logps/rejected": -3.950389862060547, "loss": 4.4511, "rewards/accuracies": 0.5, "rewards/chosen": -37.27703094482422, "rewards/margins": 2.22686767578125, "rewards/rejected": -39.50389862060547, "step": 3815 }, { "epoch": 0.5196078431372549, "grad_norm": 41.27528587499256, "learning_rate": 4.4241645242750865e-07, "logits/chosen": 13.150928497314453, "logits/rejected": 13.50772476196289, "logps/chosen": -4.1697893142700195, "logps/rejected": -4.127622604370117, "loss": 4.4388, "rewards/accuracies": 0.25, "rewards/chosen": -41.69789123535156, "rewards/margins": -0.4216623306274414, "rewards/rejected": -41.27622985839844, "step": 3816 }, { "epoch": 0.5197440087145969, "grad_norm": 39.93464901336178, "learning_rate": 4.422273794293579e-07, "logits/chosen": 13.976102828979492, "logits/rejected": 14.283384323120117, "logps/chosen": -4.264922142028809, "logps/rejected": -4.314258575439453, "loss": 4.1444, "rewards/accuracies": 0.5, "rewards/chosen": -42.64922332763672, "rewards/margins": 0.4933614730834961, "rewards/rejected": -43.14258575439453, "step": 3817 }, { "epoch": 0.519880174291939, "grad_norm": 95.7272598574927, "learning_rate": 4.4203829688958176e-07, "logits/chosen": 13.235504150390625, "logits/rejected": 13.450189590454102, "logps/chosen": -4.151318073272705, "logps/rejected": -4.131092071533203, "loss": 4.2377, "rewards/accuracies": 0.5, "rewards/chosen": -41.513179779052734, "rewards/margins": -0.20225811004638672, "rewards/rejected": -41.31092071533203, "step": 3818 }, { "epoch": 0.5200163398692811, "grad_norm": 35.58158247666492, "learning_rate": 4.4184920485090487e-07, "logits/chosen": 13.123068809509277, "logits/rejected": 13.878602981567383, "logps/chosen": -4.343935966491699, "logps/rejected": -4.661401748657227, "loss": 3.8634, "rewards/accuracies": 0.5, "rewards/chosen": -43.439361572265625, "rewards/margins": 3.174661636352539, "rewards/rejected": -46.61402130126953, "step": 3819 }, { "epoch": 0.5201525054466231, "grad_norm": 38.96384986015354, "learning_rate": 4.4166010335605427e-07, "logits/chosen": 14.057497024536133, "logits/rejected": 13.33959674835205, "logps/chosen": -4.152951240539551, "logps/rejected": -4.227596759796143, "loss": 4.2238, "rewards/accuracies": 0.5, "rewards/chosen": -41.52951431274414, "rewards/margins": 0.7464523315429688, "rewards/rejected": -42.27596664428711, "step": 3820 }, { "epoch": 0.5202886710239651, "grad_norm": 41.94014485262413, "learning_rate": 4.41470992447759e-07, "logits/chosen": 13.468696594238281, "logits/rejected": 14.070149421691895, "logps/chosen": -4.360036849975586, "logps/rejected": -4.510101318359375, "loss": 4.2622, "rewards/accuracies": 0.75, "rewards/chosen": -43.60036849975586, "rewards/margins": 1.500645637512207, "rewards/rejected": -45.10101318359375, "step": 3821 }, { "epoch": 0.5204248366013072, "grad_norm": 40.57453737972062, "learning_rate": 4.4128187216875004e-07, "logits/chosen": 12.829344749450684, "logits/rejected": 13.182231903076172, "logps/chosen": -3.959566354751587, "logps/rejected": -4.063188076019287, "loss": 3.9821, "rewards/accuracies": 0.5, "rewards/chosen": -39.59566116333008, "rewards/margins": 1.036219596862793, "rewards/rejected": -40.63188171386719, "step": 3822 }, { "epoch": 0.5205610021786492, "grad_norm": 38.7086573172362, "learning_rate": 4.4109274256176097e-07, "logits/chosen": 13.198801040649414, "logits/rejected": 13.167850494384766, "logps/chosen": -4.205475807189941, "logps/rejected": -4.2770586013793945, "loss": 4.5142, "rewards/accuracies": 0.5, "rewards/chosen": -42.05475616455078, "rewards/margins": 0.7158288955688477, "rewards/rejected": -42.77058410644531, "step": 3823 }, { "epoch": 0.5206971677559913, "grad_norm": 35.00689124514441, "learning_rate": 4.40903603669527e-07, "logits/chosen": 12.611076354980469, "logits/rejected": 12.885591506958008, "logps/chosen": -4.033106803894043, "logps/rejected": -4.092270374298096, "loss": 3.7915, "rewards/accuracies": 0.75, "rewards/chosen": -40.33106994628906, "rewards/margins": 0.5916337966918945, "rewards/rejected": -40.92270278930664, "step": 3824 }, { "epoch": 0.5208333333333334, "grad_norm": 37.0640368889228, "learning_rate": 4.4071445553478563e-07, "logits/chosen": 14.005807876586914, "logits/rejected": 15.407285690307617, "logps/chosen": -4.132901668548584, "logps/rejected": -4.706826210021973, "loss": 3.9894, "rewards/accuracies": 1.0, "rewards/chosen": -41.329017639160156, "rewards/margins": 5.73924446105957, "rewards/rejected": -47.068260192871094, "step": 3825 }, { "epoch": 0.5209694989106753, "grad_norm": 35.729950944448476, "learning_rate": 4.405252982002765e-07, "logits/chosen": 13.10078239440918, "logits/rejected": 12.81060791015625, "logps/chosen": -4.047125816345215, "logps/rejected": -4.174976825714111, "loss": 4.2253, "rewards/accuracies": 0.75, "rewards/chosen": -40.471256256103516, "rewards/margins": 1.2785139083862305, "rewards/rejected": -41.74977111816406, "step": 3826 }, { "epoch": 0.5211056644880174, "grad_norm": 50.52292441636098, "learning_rate": 4.4033613170874124e-07, "logits/chosen": 12.503620147705078, "logits/rejected": 13.409177780151367, "logps/chosen": -3.708049774169922, "logps/rejected": -4.195406913757324, "loss": 4.0959, "rewards/accuracies": 1.0, "rewards/chosen": -37.08049774169922, "rewards/margins": 4.873573303222656, "rewards/rejected": -41.954071044921875, "step": 3827 }, { "epoch": 0.5212418300653595, "grad_norm": 36.95552984234756, "learning_rate": 4.4014695610292356e-07, "logits/chosen": 13.922500610351562, "logits/rejected": 13.809414863586426, "logps/chosen": -3.8569564819335938, "logps/rejected": -4.070024490356445, "loss": 3.3775, "rewards/accuracies": 0.75, "rewards/chosen": -38.56956481933594, "rewards/margins": 2.1306800842285156, "rewards/rejected": -40.70024108886719, "step": 3828 }, { "epoch": 0.5213779956427015, "grad_norm": 40.00990263371602, "learning_rate": 4.399577714255694e-07, "logits/chosen": 13.25065803527832, "logits/rejected": 13.152144432067871, "logps/chosen": -4.100466728210449, "logps/rejected": -4.39292049407959, "loss": 4.1972, "rewards/accuracies": 0.75, "rewards/chosen": -41.004669189453125, "rewards/margins": 2.9245338439941406, "rewards/rejected": -43.92920684814453, "step": 3829 }, { "epoch": 0.5215141612200436, "grad_norm": 37.81581243653934, "learning_rate": 4.3976857771942643e-07, "logits/chosen": 13.249770164489746, "logits/rejected": 13.805445671081543, "logps/chosen": -3.9074063301086426, "logps/rejected": -4.5381879806518555, "loss": 4.0682, "rewards/accuracies": 0.75, "rewards/chosen": -39.074066162109375, "rewards/margins": 6.307814598083496, "rewards/rejected": -45.38187789916992, "step": 3830 }, { "epoch": 0.5216503267973857, "grad_norm": 37.283242500024954, "learning_rate": 4.395793750272446e-07, "logits/chosen": 12.489370346069336, "logits/rejected": 13.472921371459961, "logps/chosen": -4.189796447753906, "logps/rejected": -4.439708709716797, "loss": 4.1283, "rewards/accuracies": 1.0, "rewards/chosen": -41.89796447753906, "rewards/margins": 2.4991207122802734, "rewards/rejected": -44.39708709716797, "step": 3831 }, { "epoch": 0.5217864923747276, "grad_norm": 37.2895721061085, "learning_rate": 4.3939016339177585e-07, "logits/chosen": 12.734119415283203, "logits/rejected": 13.48347282409668, "logps/chosen": -3.882751941680908, "logps/rejected": -4.21972131729126, "loss": 3.9626, "rewards/accuracies": 0.75, "rewards/chosen": -38.82752227783203, "rewards/margins": 3.3696937561035156, "rewards/rejected": -42.19721221923828, "step": 3832 }, { "epoch": 0.5219226579520697, "grad_norm": 40.71606156891687, "learning_rate": 4.392009428557741e-07, "logits/chosen": 13.399703979492188, "logits/rejected": 14.127534866333008, "logps/chosen": -4.0370025634765625, "logps/rejected": -4.298465251922607, "loss": 3.591, "rewards/accuracies": 1.0, "rewards/chosen": -40.370025634765625, "rewards/margins": 2.614626884460449, "rewards/rejected": -42.984649658203125, "step": 3833 }, { "epoch": 0.5220588235294118, "grad_norm": 40.15318417419233, "learning_rate": 4.3901171346199515e-07, "logits/chosen": 13.235749244689941, "logits/rejected": 13.033068656921387, "logps/chosen": -4.181972503662109, "logps/rejected": -4.225146770477295, "loss": 4.036, "rewards/accuracies": 0.5, "rewards/chosen": -41.819725036621094, "rewards/margins": 0.4317445755004883, "rewards/rejected": -42.251468658447266, "step": 3834 }, { "epoch": 0.5221949891067538, "grad_norm": 40.56613487708788, "learning_rate": 4.388224752531972e-07, "logits/chosen": 12.930395126342773, "logits/rejected": 12.791569709777832, "logps/chosen": -4.152833938598633, "logps/rejected": -4.221101760864258, "loss": 3.7317, "rewards/accuracies": 0.75, "rewards/chosen": -41.52833938598633, "rewards/margins": 0.6826772689819336, "rewards/rejected": -42.21101760864258, "step": 3835 }, { "epoch": 0.5223311546840959, "grad_norm": 38.454658455031726, "learning_rate": 4.3863322827213995e-07, "logits/chosen": 12.860414505004883, "logits/rejected": 12.456714630126953, "logps/chosen": -3.8593878746032715, "logps/rejected": -3.8754868507385254, "loss": 4.0533, "rewards/accuracies": 0.5, "rewards/chosen": -38.59387969970703, "rewards/margins": 0.16099071502685547, "rewards/rejected": -38.75486755371094, "step": 3836 }, { "epoch": 0.5224673202614379, "grad_norm": 37.55826234488932, "learning_rate": 4.3844397256158545e-07, "logits/chosen": 12.917167663574219, "logits/rejected": 13.49643325805664, "logps/chosen": -3.9663784503936768, "logps/rejected": -4.353069305419922, "loss": 3.7719, "rewards/accuracies": 1.0, "rewards/chosen": -39.66378402709961, "rewards/margins": 3.86690616607666, "rewards/rejected": -43.53068923950195, "step": 3837 }, { "epoch": 0.5226034858387799, "grad_norm": 37.109600547061724, "learning_rate": 4.3825470816429763e-07, "logits/chosen": 12.594476699829102, "logits/rejected": 13.931038856506348, "logps/chosen": -4.083076477050781, "logps/rejected": -4.39192008972168, "loss": 4.059, "rewards/accuracies": 0.75, "rewards/chosen": -40.83076477050781, "rewards/margins": 3.0884342193603516, "rewards/rejected": -43.9192008972168, "step": 3838 }, { "epoch": 0.522739651416122, "grad_norm": 38.16756040330261, "learning_rate": 4.380654351230422e-07, "logits/chosen": 13.683530807495117, "logits/rejected": 13.78133773803711, "logps/chosen": -4.256924629211426, "logps/rejected": -4.487871170043945, "loss": 4.5387, "rewards/accuracies": 0.75, "rewards/chosen": -42.569244384765625, "rewards/margins": 2.3094635009765625, "rewards/rejected": -44.87870788574219, "step": 3839 }, { "epoch": 0.5228758169934641, "grad_norm": 39.00439130994063, "learning_rate": 4.3787615348058714e-07, "logits/chosen": 12.925069808959961, "logits/rejected": 13.720613479614258, "logps/chosen": -4.097166061401367, "logps/rejected": -4.344669818878174, "loss": 3.8018, "rewards/accuracies": 1.0, "rewards/chosen": -40.97166061401367, "rewards/margins": 2.475039482116699, "rewards/rejected": -43.44670104980469, "step": 3840 }, { "epoch": 0.523011982570806, "grad_norm": 37.76677374164965, "learning_rate": 4.376868632797021e-07, "logits/chosen": 14.159238815307617, "logits/rejected": 13.834939002990723, "logps/chosen": -4.414105415344238, "logps/rejected": -4.533198356628418, "loss": 3.892, "rewards/accuracies": 0.75, "rewards/chosen": -44.14105224609375, "rewards/margins": 1.1909332275390625, "rewards/rejected": -45.33198547363281, "step": 3841 }, { "epoch": 0.5231481481481481, "grad_norm": 40.886575311033724, "learning_rate": 4.374975645631587e-07, "logits/chosen": 14.228631973266602, "logits/rejected": 13.35081958770752, "logps/chosen": -4.624350547790527, "logps/rejected": -4.337711811065674, "loss": 4.6266, "rewards/accuracies": 0.25, "rewards/chosen": -46.24350357055664, "rewards/margins": -2.8663854598999023, "rewards/rejected": -43.37712097167969, "step": 3842 }, { "epoch": 0.5232843137254902, "grad_norm": 50.03680057086691, "learning_rate": 4.3730825737373065e-07, "logits/chosen": 14.040315628051758, "logits/rejected": 13.628301620483398, "logps/chosen": -4.228298664093018, "logps/rejected": -4.071529388427734, "loss": 4.2219, "rewards/accuracies": 0.25, "rewards/chosen": -42.28298568725586, "rewards/margins": -1.5676889419555664, "rewards/rejected": -40.71529769897461, "step": 3843 }, { "epoch": 0.5234204793028322, "grad_norm": 35.55560195865916, "learning_rate": 4.3711894175419354e-07, "logits/chosen": 13.364517211914062, "logits/rejected": 14.041830062866211, "logps/chosen": -3.9680447578430176, "logps/rejected": -4.340577602386475, "loss": 3.6979, "rewards/accuracies": 1.0, "rewards/chosen": -39.68044662475586, "rewards/margins": 3.725332260131836, "rewards/rejected": -43.40577697753906, "step": 3844 }, { "epoch": 0.5235566448801743, "grad_norm": 39.00377652716353, "learning_rate": 4.369296177473247e-07, "logits/chosen": 12.894149780273438, "logits/rejected": 13.636200904846191, "logps/chosen": -3.832688808441162, "logps/rejected": -4.241735458374023, "loss": 3.7868, "rewards/accuracies": 0.75, "rewards/chosen": -38.32688903808594, "rewards/margins": 4.09046745300293, "rewards/rejected": -42.4173583984375, "step": 3845 }, { "epoch": 0.5236928104575164, "grad_norm": 37.561224033585624, "learning_rate": 4.367402853959033e-07, "logits/chosen": 14.233823776245117, "logits/rejected": 13.382637023925781, "logps/chosen": -4.319019317626953, "logps/rejected": -4.572668075561523, "loss": 3.7457, "rewards/accuracies": 0.75, "rewards/chosen": -43.19019317626953, "rewards/margins": 2.5364866256713867, "rewards/rejected": -45.726680755615234, "step": 3846 }, { "epoch": 0.5238289760348583, "grad_norm": 43.620631639155576, "learning_rate": 4.365509447427109e-07, "logits/chosen": 13.433317184448242, "logits/rejected": 13.302790641784668, "logps/chosen": -3.929508686065674, "logps/rejected": -3.9053475856781006, "loss": 4.3943, "rewards/accuracies": 0.5, "rewards/chosen": -39.29508590698242, "rewards/margins": -0.2416086196899414, "rewards/rejected": -39.05347442626953, "step": 3847 }, { "epoch": 0.5239651416122004, "grad_norm": 39.15447322975777, "learning_rate": 4.3636159583053035e-07, "logits/chosen": 13.359382629394531, "logits/rejected": 12.971476554870605, "logps/chosen": -4.212647438049316, "logps/rejected": -4.03246545791626, "loss": 3.5842, "rewards/accuracies": 0.25, "rewards/chosen": -42.12647247314453, "rewards/margins": -1.8018198013305664, "rewards/rejected": -40.32465362548828, "step": 3848 }, { "epoch": 0.5241013071895425, "grad_norm": 38.72202934126848, "learning_rate": 4.361722387021467e-07, "logits/chosen": 13.04425048828125, "logits/rejected": 13.496349334716797, "logps/chosen": -3.980893135070801, "logps/rejected": -4.247950553894043, "loss": 3.5396, "rewards/accuracies": 0.75, "rewards/chosen": -39.808929443359375, "rewards/margins": 2.6705760955810547, "rewards/rejected": -42.47950744628906, "step": 3849 }, { "epoch": 0.5242374727668845, "grad_norm": 39.399038035116185, "learning_rate": 4.359828734003466e-07, "logits/chosen": 13.317028045654297, "logits/rejected": 14.335197448730469, "logps/chosen": -4.277609825134277, "logps/rejected": -4.287567615509033, "loss": 4.2777, "rewards/accuracies": 0.5, "rewards/chosen": -42.776092529296875, "rewards/margins": 0.09958267211914062, "rewards/rejected": -42.87567901611328, "step": 3850 }, { "epoch": 0.5243736383442266, "grad_norm": 41.897453708552845, "learning_rate": 4.357934999679189e-07, "logits/chosen": 12.699085235595703, "logits/rejected": 12.438331604003906, "logps/chosen": -3.8794641494750977, "logps/rejected": -4.24442195892334, "loss": 4.331, "rewards/accuracies": 0.75, "rewards/chosen": -38.794639587402344, "rewards/margins": 3.6495790481567383, "rewards/rejected": -42.44422149658203, "step": 3851 }, { "epoch": 0.5245098039215687, "grad_norm": 39.23247081056425, "learning_rate": 4.356041184476539e-07, "logits/chosen": 14.489317893981934, "logits/rejected": 13.571878433227539, "logps/chosen": -4.365915298461914, "logps/rejected": -4.2845330238342285, "loss": 4.0935, "rewards/accuracies": 0.5, "rewards/chosen": -43.659156799316406, "rewards/margins": -0.8138227462768555, "rewards/rejected": -42.845333099365234, "step": 3852 }, { "epoch": 0.5246459694989106, "grad_norm": 38.05343148243694, "learning_rate": 4.3541472888234417e-07, "logits/chosen": 12.975942611694336, "logits/rejected": 13.403745651245117, "logps/chosen": -3.878185749053955, "logps/rejected": -4.172152519226074, "loss": 4.1406, "rewards/accuracies": 0.75, "rewards/chosen": -38.7818603515625, "rewards/margins": 2.939669609069824, "rewards/rejected": -41.721527099609375, "step": 3853 }, { "epoch": 0.5247821350762527, "grad_norm": 36.61660474146865, "learning_rate": 4.352253313147837e-07, "logits/chosen": 12.882747650146484, "logits/rejected": 13.470335006713867, "logps/chosen": -4.069919586181641, "logps/rejected": -4.2750749588012695, "loss": 3.8308, "rewards/accuracies": 0.5, "rewards/chosen": -40.699195861816406, "rewards/margins": 2.051553726196289, "rewards/rejected": -42.75074768066406, "step": 3854 }, { "epoch": 0.5249183006535948, "grad_norm": 35.80326091317465, "learning_rate": 4.350359257877684e-07, "logits/chosen": 13.619998931884766, "logits/rejected": 13.472052574157715, "logps/chosen": -3.995314598083496, "logps/rejected": -3.9080278873443604, "loss": 4.096, "rewards/accuracies": 0.5, "rewards/chosen": -39.953147888183594, "rewards/margins": -0.8728694915771484, "rewards/rejected": -39.08027648925781, "step": 3855 }, { "epoch": 0.5250544662309368, "grad_norm": 37.63764345415634, "learning_rate": 4.34846512344096e-07, "logits/chosen": 13.085014343261719, "logits/rejected": 14.696966171264648, "logps/chosen": -4.0189290046691895, "logps/rejected": -4.55775785446167, "loss": 4.0624, "rewards/accuracies": 0.75, "rewards/chosen": -40.18928909301758, "rewards/margins": 5.3882904052734375, "rewards/rejected": -45.57758331298828, "step": 3856 }, { "epoch": 0.5251906318082789, "grad_norm": 39.59033767585389, "learning_rate": 4.3465709102656606e-07, "logits/chosen": 12.825752258300781, "logits/rejected": 13.146388053894043, "logps/chosen": -4.362737655639648, "logps/rejected": -4.451913833618164, "loss": 3.7555, "rewards/accuracies": 0.75, "rewards/chosen": -43.627376556396484, "rewards/margins": 0.8917646408081055, "rewards/rejected": -44.519142150878906, "step": 3857 }, { "epoch": 0.5253267973856209, "grad_norm": 35.536736776796374, "learning_rate": 4.3446766187798013e-07, "logits/chosen": 12.72187614440918, "logits/rejected": 13.386861801147461, "logps/chosen": -4.112753391265869, "logps/rejected": -4.320315361022949, "loss": 4.2498, "rewards/accuracies": 0.75, "rewards/chosen": -41.127532958984375, "rewards/margins": 2.0756235122680664, "rewards/rejected": -43.203155517578125, "step": 3858 }, { "epoch": 0.5254629629629629, "grad_norm": 35.70518821312075, "learning_rate": 4.342782249411409e-07, "logits/chosen": 13.033065795898438, "logits/rejected": 14.513313293457031, "logps/chosen": -3.8687515258789062, "logps/rejected": -4.4990363121032715, "loss": 3.8846, "rewards/accuracies": 1.0, "rewards/chosen": -38.68751525878906, "rewards/margins": 6.302846908569336, "rewards/rejected": -44.99036407470703, "step": 3859 }, { "epoch": 0.525599128540305, "grad_norm": 37.56812898897295, "learning_rate": 4.3408878025885344e-07, "logits/chosen": 13.853536605834961, "logits/rejected": 14.080242156982422, "logps/chosen": -3.920902729034424, "logps/rejected": -4.380066394805908, "loss": 4.0683, "rewards/accuracies": 0.75, "rewards/chosen": -39.20903015136719, "rewards/margins": 4.5916337966918945, "rewards/rejected": -43.800662994384766, "step": 3860 }, { "epoch": 0.5257352941176471, "grad_norm": 36.35545961504996, "learning_rate": 4.338993278739243e-07, "logits/chosen": 13.542739868164062, "logits/rejected": 13.741487503051758, "logps/chosen": -4.1974077224731445, "logps/rejected": -4.443671226501465, "loss": 3.9604, "rewards/accuracies": 0.75, "rewards/chosen": -41.97407913208008, "rewards/margins": 2.4626331329345703, "rewards/rejected": -44.43671417236328, "step": 3861 }, { "epoch": 0.525871459694989, "grad_norm": 38.83067650045933, "learning_rate": 4.337098678291619e-07, "logits/chosen": 12.858833312988281, "logits/rejected": 13.34759521484375, "logps/chosen": -3.8274409770965576, "logps/rejected": -4.105955600738525, "loss": 4.1363, "rewards/accuracies": 0.75, "rewards/chosen": -38.27440643310547, "rewards/margins": 2.785146713256836, "rewards/rejected": -41.05955505371094, "step": 3862 }, { "epoch": 0.5260076252723311, "grad_norm": 34.82490636929619, "learning_rate": 4.3352040016737615e-07, "logits/chosen": 13.380870819091797, "logits/rejected": 13.48563003540039, "logps/chosen": -4.244317054748535, "logps/rejected": -4.38571834564209, "loss": 4.2624, "rewards/accuracies": 0.5, "rewards/chosen": -42.443172454833984, "rewards/margins": 1.4140119552612305, "rewards/rejected": -43.85718536376953, "step": 3863 }, { "epoch": 0.5261437908496732, "grad_norm": 36.813416311816134, "learning_rate": 4.333309249313789e-07, "logits/chosen": 13.87519645690918, "logits/rejected": 13.030864715576172, "logps/chosen": -4.365675926208496, "logps/rejected": -3.928525447845459, "loss": 3.8859, "rewards/accuracies": 0.25, "rewards/chosen": -43.656761169433594, "rewards/margins": -4.3715057373046875, "rewards/rejected": -39.285255432128906, "step": 3864 }, { "epoch": 0.5262799564270153, "grad_norm": 37.67758572971641, "learning_rate": 4.3314144216398364e-07, "logits/chosen": 12.981441497802734, "logits/rejected": 14.053136825561523, "logps/chosen": -3.9882102012634277, "logps/rejected": -4.366727352142334, "loss": 4.1034, "rewards/accuracies": 0.75, "rewards/chosen": -39.882102966308594, "rewards/margins": 3.785172462463379, "rewards/rejected": -43.667274475097656, "step": 3865 }, { "epoch": 0.5264161220043573, "grad_norm": 38.33571780546465, "learning_rate": 4.3295195190800556e-07, "logits/chosen": 13.060373306274414, "logits/rejected": 13.521770477294922, "logps/chosen": -4.040714740753174, "logps/rejected": -4.200361251831055, "loss": 3.6041, "rewards/accuracies": 0.75, "rewards/chosen": -40.40715026855469, "rewards/margins": 1.5964641571044922, "rewards/rejected": -42.00361633300781, "step": 3866 }, { "epoch": 0.5265522875816994, "grad_norm": 40.786418863592836, "learning_rate": 4.327624542062615e-07, "logits/chosen": 13.473968505859375, "logits/rejected": 14.03377914428711, "logps/chosen": -3.7648398876190186, "logps/rejected": -4.204654693603516, "loss": 4.3148, "rewards/accuracies": 1.0, "rewards/chosen": -37.648399353027344, "rewards/margins": 4.398149013519287, "rewards/rejected": -42.046546936035156, "step": 3867 }, { "epoch": 0.5266884531590414, "grad_norm": 35.142850283746945, "learning_rate": 4.3257294910157023e-07, "logits/chosen": 12.305057525634766, "logits/rejected": 12.33597183227539, "logps/chosen": -3.976577043533325, "logps/rejected": -3.9960062503814697, "loss": 3.9188, "rewards/accuracies": 0.25, "rewards/chosen": -39.765769958496094, "rewards/margins": 0.19429397583007812, "rewards/rejected": -39.96006393432617, "step": 3868 }, { "epoch": 0.5268246187363834, "grad_norm": 37.68078851540364, "learning_rate": 4.323834366367519e-07, "logits/chosen": 13.24492073059082, "logits/rejected": 13.705036163330078, "logps/chosen": -4.334400653839111, "logps/rejected": -4.519007682800293, "loss": 3.6437, "rewards/accuracies": 1.0, "rewards/chosen": -43.3440055847168, "rewards/margins": 1.8460731506347656, "rewards/rejected": -45.19007873535156, "step": 3869 }, { "epoch": 0.5269607843137255, "grad_norm": 36.555137961094715, "learning_rate": 4.321939168546282e-07, "logits/chosen": 12.772735595703125, "logits/rejected": 12.445314407348633, "logps/chosen": -3.8109772205352783, "logps/rejected": -3.941762685775757, "loss": 3.9743, "rewards/accuracies": 0.75, "rewards/chosen": -38.109771728515625, "rewards/margins": 1.3078546524047852, "rewards/rejected": -39.417625427246094, "step": 3870 }, { "epoch": 0.5270969498910676, "grad_norm": 35.162157981750234, "learning_rate": 4.32004389798023e-07, "logits/chosen": 12.54244327545166, "logits/rejected": 13.267538070678711, "logps/chosen": -3.825061798095703, "logps/rejected": -3.9188084602355957, "loss": 3.783, "rewards/accuracies": 0.5, "rewards/chosen": -38.25061798095703, "rewards/margins": 0.9374666213989258, "rewards/rejected": -39.18808364868164, "step": 3871 }, { "epoch": 0.5272331154684096, "grad_norm": 37.05123792639246, "learning_rate": 4.318148555097613e-07, "logits/chosen": 12.30999755859375, "logits/rejected": 11.891057968139648, "logps/chosen": -3.752579927444458, "logps/rejected": -3.572378635406494, "loss": 3.9591, "rewards/accuracies": 0.25, "rewards/chosen": -37.52579879760742, "rewards/margins": -1.802011489868164, "rewards/rejected": -35.723785400390625, "step": 3872 }, { "epoch": 0.5273692810457516, "grad_norm": 37.517208749824164, "learning_rate": 4.3162531403267e-07, "logits/chosen": 12.439435958862305, "logits/rejected": 12.481849670410156, "logps/chosen": -3.893662929534912, "logps/rejected": -3.469944477081299, "loss": 4.3714, "rewards/accuracies": 0.0, "rewards/chosen": -38.93662643432617, "rewards/margins": -4.2371826171875, "rewards/rejected": -34.69944763183594, "step": 3873 }, { "epoch": 0.5275054466230937, "grad_norm": 36.53357381019803, "learning_rate": 4.314357654095777e-07, "logits/chosen": 14.171712875366211, "logits/rejected": 13.613388061523438, "logps/chosen": -4.280792713165283, "logps/rejected": -4.136326789855957, "loss": 4.0165, "rewards/accuracies": 0.25, "rewards/chosen": -42.80792999267578, "rewards/margins": -1.4446582794189453, "rewards/rejected": -41.36326599121094, "step": 3874 }, { "epoch": 0.5276416122004357, "grad_norm": 37.346177810769355, "learning_rate": 4.312462096833142e-07, "logits/chosen": 13.023883819580078, "logits/rejected": 13.177656173706055, "logps/chosen": -3.966923952102661, "logps/rejected": -4.086484909057617, "loss": 4.3713, "rewards/accuracies": 0.75, "rewards/chosen": -39.66923904418945, "rewards/margins": 1.1956100463867188, "rewards/rejected": -40.86484909057617, "step": 3875 }, { "epoch": 0.5277777777777778, "grad_norm": 36.63605375107257, "learning_rate": 4.3105664689671144e-07, "logits/chosen": 13.869340896606445, "logits/rejected": 13.798690795898438, "logps/chosen": -4.17853307723999, "logps/rejected": -4.538036346435547, "loss": 4.2869, "rewards/accuracies": 1.0, "rewards/chosen": -41.78533172607422, "rewards/margins": 3.5950279235839844, "rewards/rejected": -45.38036346435547, "step": 3876 }, { "epoch": 0.5279139433551199, "grad_norm": 37.81170173669591, "learning_rate": 4.308670770926026e-07, "logits/chosen": 13.742831230163574, "logits/rejected": 13.408219337463379, "logps/chosen": -4.059459686279297, "logps/rejected": -4.12701416015625, "loss": 3.4586, "rewards/accuracies": 0.75, "rewards/chosen": -40.59459686279297, "rewards/margins": 0.6755428314208984, "rewards/rejected": -41.2701416015625, "step": 3877 }, { "epoch": 0.5280501089324618, "grad_norm": 34.53402252926514, "learning_rate": 4.3067750031382245e-07, "logits/chosen": 12.498351097106934, "logits/rejected": 13.621566772460938, "logps/chosen": -3.991878032684326, "logps/rejected": -4.606746196746826, "loss": 3.8802, "rewards/accuracies": 1.0, "rewards/chosen": -39.91877746582031, "rewards/margins": 6.148684501647949, "rewards/rejected": -46.06746292114258, "step": 3878 }, { "epoch": 0.5281862745098039, "grad_norm": 35.67720425853588, "learning_rate": 4.3048791660320763e-07, "logits/chosen": 13.46396541595459, "logits/rejected": 13.612857818603516, "logps/chosen": -4.004558563232422, "logps/rejected": -4.295696258544922, "loss": 3.9912, "rewards/accuracies": 1.0, "rewards/chosen": -40.04558563232422, "rewards/margins": 2.911376953125, "rewards/rejected": -42.95696258544922, "step": 3879 }, { "epoch": 0.528322440087146, "grad_norm": 37.61483379090814, "learning_rate": 4.3029832600359597e-07, "logits/chosen": 13.619787216186523, "logits/rejected": 12.633180618286133, "logps/chosen": -4.118252277374268, "logps/rejected": -3.838895320892334, "loss": 4.5428, "rewards/accuracies": 0.0, "rewards/chosen": -41.18252182006836, "rewards/margins": -2.7935714721679688, "rewards/rejected": -38.38895034790039, "step": 3880 }, { "epoch": 0.528458605664488, "grad_norm": 40.62455125450866, "learning_rate": 4.3010872855782707e-07, "logits/chosen": 12.241591453552246, "logits/rejected": 12.941045761108398, "logps/chosen": -4.00083065032959, "logps/rejected": -4.216662406921387, "loss": 3.7873, "rewards/accuracies": 1.0, "rewards/chosen": -40.008304595947266, "rewards/margins": 2.158318519592285, "rewards/rejected": -42.1666259765625, "step": 3881 }, { "epoch": 0.5285947712418301, "grad_norm": 34.1064681814458, "learning_rate": 4.2991912430874216e-07, "logits/chosen": 13.6862211227417, "logits/rejected": 12.428146362304688, "logps/chosen": -4.242632865905762, "logps/rejected": -4.086902141571045, "loss": 3.9317, "rewards/accuracies": 0.25, "rewards/chosen": -42.426326751708984, "rewards/margins": -1.557307243347168, "rewards/rejected": -40.8690185546875, "step": 3882 }, { "epoch": 0.5287309368191722, "grad_norm": 34.42273191012572, "learning_rate": 4.297295132991838e-07, "logits/chosen": 12.684375762939453, "logits/rejected": 12.734840393066406, "logps/chosen": -3.510456085205078, "logps/rejected": -4.367501258850098, "loss": 3.9849, "rewards/accuracies": 1.0, "rewards/chosen": -35.10456085205078, "rewards/margins": 8.570453643798828, "rewards/rejected": -43.675010681152344, "step": 3883 }, { "epoch": 0.5288671023965141, "grad_norm": 36.845485074376526, "learning_rate": 4.29539895571996e-07, "logits/chosen": 13.747982025146484, "logits/rejected": 14.33293342590332, "logps/chosen": -4.045414924621582, "logps/rejected": -4.278745651245117, "loss": 3.861, "rewards/accuracies": 0.75, "rewards/chosen": -40.45415115356445, "rewards/margins": 2.333303451538086, "rewards/rejected": -42.787452697753906, "step": 3884 }, { "epoch": 0.5290032679738562, "grad_norm": 35.812048342381935, "learning_rate": 4.293502711700249e-07, "logits/chosen": 12.49781608581543, "logits/rejected": 13.05769157409668, "logps/chosen": -4.145445346832275, "logps/rejected": -4.050803184509277, "loss": 3.6551, "rewards/accuracies": 0.75, "rewards/chosen": -41.45445251464844, "rewards/margins": -0.9464206695556641, "rewards/rejected": -40.508033752441406, "step": 3885 }, { "epoch": 0.5291394335511983, "grad_norm": 37.181668617338644, "learning_rate": 4.2916064013611725e-07, "logits/chosen": 12.030256271362305, "logits/rejected": 12.3492431640625, "logps/chosen": -3.7814552783966064, "logps/rejected": -3.8729758262634277, "loss": 4.1295, "rewards/accuracies": 0.5, "rewards/chosen": -37.814552307128906, "rewards/margins": 0.9152050018310547, "rewards/rejected": -38.72975540161133, "step": 3886 }, { "epoch": 0.5292755991285403, "grad_norm": 45.974892747941794, "learning_rate": 4.289710025131218e-07, "logits/chosen": 12.580554962158203, "logits/rejected": 12.655556678771973, "logps/chosen": -3.665282726287842, "logps/rejected": -4.035331726074219, "loss": 3.8244, "rewards/accuracies": 0.75, "rewards/chosen": -36.652828216552734, "rewards/margins": 3.700488567352295, "rewards/rejected": -40.35331726074219, "step": 3887 }, { "epoch": 0.5294117647058824, "grad_norm": 34.72771702987825, "learning_rate": 4.287813583438891e-07, "logits/chosen": 13.280689239501953, "logits/rejected": 13.100016593933105, "logps/chosen": -3.8301591873168945, "logps/rejected": -3.900743007659912, "loss": 3.7003, "rewards/accuracies": 0.25, "rewards/chosen": -38.30159378051758, "rewards/margins": 0.7058334350585938, "rewards/rejected": -39.00742721557617, "step": 3888 }, { "epoch": 0.5295479302832244, "grad_norm": 40.28945976307175, "learning_rate": 4.285917076712705e-07, "logits/chosen": 12.412135124206543, "logits/rejected": 12.891590118408203, "logps/chosen": -4.117562294006348, "logps/rejected": -4.240791320800781, "loss": 4.2504, "rewards/accuracies": 0.75, "rewards/chosen": -41.175621032714844, "rewards/margins": 1.2322874069213867, "rewards/rejected": -42.40791320800781, "step": 3889 }, { "epoch": 0.5296840958605664, "grad_norm": 41.234654098940695, "learning_rate": 4.284020505381191e-07, "logits/chosen": 13.800697326660156, "logits/rejected": 13.928838729858398, "logps/chosen": -4.0705461502075195, "logps/rejected": -4.306264877319336, "loss": 3.8118, "rewards/accuracies": 0.75, "rewards/chosen": -40.70546340942383, "rewards/margins": 2.3571815490722656, "rewards/rejected": -43.062644958496094, "step": 3890 }, { "epoch": 0.5298202614379085, "grad_norm": 37.14909450171554, "learning_rate": 4.2821238698728966e-07, "logits/chosen": 12.174524307250977, "logits/rejected": 12.50692367553711, "logps/chosen": -3.575273036956787, "logps/rejected": -3.962843418121338, "loss": 4.053, "rewards/accuracies": 0.75, "rewards/chosen": -35.75273132324219, "rewards/margins": 3.875702381134033, "rewards/rejected": -39.62843322753906, "step": 3891 }, { "epoch": 0.5299564270152506, "grad_norm": 42.31243259472596, "learning_rate": 4.280227170616382e-07, "logits/chosen": 13.31936264038086, "logits/rejected": 12.044761657714844, "logps/chosen": -4.132394790649414, "logps/rejected": -3.6688826084136963, "loss": 3.9166, "rewards/accuracies": 0.25, "rewards/chosen": -41.32394790649414, "rewards/margins": -4.6351213455200195, "rewards/rejected": -36.68882751464844, "step": 3892 }, { "epoch": 0.5300925925925926, "grad_norm": 37.69875446378237, "learning_rate": 4.2783304080402215e-07, "logits/chosen": 13.758345603942871, "logits/rejected": 12.860403060913086, "logps/chosen": -4.251049518585205, "logps/rejected": -4.052597999572754, "loss": 4.1948, "rewards/accuracies": 0.25, "rewards/chosen": -42.510494232177734, "rewards/margins": -1.9845161437988281, "rewards/rejected": -40.525978088378906, "step": 3893 }, { "epoch": 0.5302287581699346, "grad_norm": 38.33678280482465, "learning_rate": 4.276433582573005e-07, "logits/chosen": 14.086587905883789, "logits/rejected": 13.103397369384766, "logps/chosen": -3.7029550075531006, "logps/rejected": -3.877244710922241, "loss": 4.5519, "rewards/accuracies": 0.75, "rewards/chosen": -37.02954864501953, "rewards/margins": 1.7428994178771973, "rewards/rejected": -38.77244567871094, "step": 3894 }, { "epoch": 0.5303649237472767, "grad_norm": 38.22334680005771, "learning_rate": 4.274536694643335e-07, "logits/chosen": 13.431039810180664, "logits/rejected": 13.679512023925781, "logps/chosen": -4.008387088775635, "logps/rejected": -4.082256317138672, "loss": 3.8636, "rewards/accuracies": 0.5, "rewards/chosen": -40.08386993408203, "rewards/margins": 0.7386908531188965, "rewards/rejected": -40.82256317138672, "step": 3895 }, { "epoch": 0.5305010893246187, "grad_norm": 35.86279732940759, "learning_rate": 4.2726397446798294e-07, "logits/chosen": 12.55687427520752, "logits/rejected": 13.42734432220459, "logps/chosen": -3.9199986457824707, "logps/rejected": -4.278968811035156, "loss": 3.86, "rewards/accuracies": 0.75, "rewards/chosen": -39.19998550415039, "rewards/margins": 3.589700698852539, "rewards/rejected": -42.78968811035156, "step": 3896 }, { "epoch": 0.5306372549019608, "grad_norm": 37.809475068461246, "learning_rate": 4.2707427331111204e-07, "logits/chosen": 12.364736557006836, "logits/rejected": 12.854589462280273, "logps/chosen": -3.9559662342071533, "logps/rejected": -3.8509726524353027, "loss": 4.1053, "rewards/accuracies": 0.5, "rewards/chosen": -39.559661865234375, "rewards/margins": -1.0499372482299805, "rewards/rejected": -38.50972366333008, "step": 3897 }, { "epoch": 0.5307734204793029, "grad_norm": 37.534915078939335, "learning_rate": 4.268845660365853e-07, "logits/chosen": 12.936979293823242, "logits/rejected": 13.355753898620605, "logps/chosen": -4.025787353515625, "logps/rejected": -4.2879638671875, "loss": 4.1663, "rewards/accuracies": 0.75, "rewards/chosen": -40.257877349853516, "rewards/margins": 2.6217575073242188, "rewards/rejected": -42.879634857177734, "step": 3898 }, { "epoch": 0.5309095860566448, "grad_norm": 37.16761124076241, "learning_rate": 4.266948526872685e-07, "logits/chosen": 13.727190971374512, "logits/rejected": 12.15186882019043, "logps/chosen": -3.8529446125030518, "logps/rejected": -3.843830108642578, "loss": 4.3455, "rewards/accuracies": 0.5, "rewards/chosen": -38.529449462890625, "rewards/margins": -0.09114599227905273, "rewards/rejected": -38.43830108642578, "step": 3899 }, { "epoch": 0.5310457516339869, "grad_norm": 36.902321435626114, "learning_rate": 4.2650513330602924e-07, "logits/chosen": 12.799376487731934, "logits/rejected": 13.178565979003906, "logps/chosen": -3.6971282958984375, "logps/rejected": -4.214571475982666, "loss": 3.8181, "rewards/accuracies": 1.0, "rewards/chosen": -36.971282958984375, "rewards/margins": 5.174435615539551, "rewards/rejected": -42.145713806152344, "step": 3900 }, { "epoch": 0.531181917211329, "grad_norm": 38.33902728769787, "learning_rate": 4.2631540793573597e-07, "logits/chosen": 11.970603942871094, "logits/rejected": 12.45641803741455, "logps/chosen": -3.750028133392334, "logps/rejected": -3.931478500366211, "loss": 3.8641, "rewards/accuracies": 0.75, "rewards/chosen": -37.500282287597656, "rewards/margins": 1.8145017623901367, "rewards/rejected": -39.31478500366211, "step": 3901 }, { "epoch": 0.531318082788671, "grad_norm": 36.742368254154925, "learning_rate": 4.261256766192587e-07, "logits/chosen": 12.076117515563965, "logits/rejected": 13.81096076965332, "logps/chosen": -3.6510777473449707, "logps/rejected": -4.417716979980469, "loss": 3.8979, "rewards/accuracies": 1.0, "rewards/chosen": -36.51077651977539, "rewards/margins": 7.666393756866455, "rewards/rejected": -44.17716979980469, "step": 3902 }, { "epoch": 0.5314542483660131, "grad_norm": 41.62937522385992, "learning_rate": 4.259359393994689e-07, "logits/chosen": 12.869097709655762, "logits/rejected": 13.358606338500977, "logps/chosen": -3.7298336029052734, "logps/rejected": -3.8643646240234375, "loss": 3.6107, "rewards/accuracies": 0.75, "rewards/chosen": -37.298336029052734, "rewards/margins": 1.345311164855957, "rewards/rejected": -38.643646240234375, "step": 3903 }, { "epoch": 0.5315904139433552, "grad_norm": 43.98087582092733, "learning_rate": 4.257461963192392e-07, "logits/chosen": 12.727178573608398, "logits/rejected": 13.528371810913086, "logps/chosen": -3.866666793823242, "logps/rejected": -3.8860960006713867, "loss": 4.7704, "rewards/accuracies": 0.5, "rewards/chosen": -38.66666793823242, "rewards/margins": 0.1942911148071289, "rewards/rejected": -38.860958099365234, "step": 3904 }, { "epoch": 0.5317265795206971, "grad_norm": 38.97062959017014, "learning_rate": 4.255564474214435e-07, "logits/chosen": 12.47575569152832, "logits/rejected": 13.104958534240723, "logps/chosen": -3.651179075241089, "logps/rejected": -4.136730194091797, "loss": 4.629, "rewards/accuracies": 1.0, "rewards/chosen": -36.51179122924805, "rewards/margins": 4.855512619018555, "rewards/rejected": -41.36730194091797, "step": 3905 }, { "epoch": 0.5318627450980392, "grad_norm": 38.4298716328816, "learning_rate": 4.2536669274895735e-07, "logits/chosen": 12.75833511352539, "logits/rejected": 12.764703750610352, "logps/chosen": -3.983438730239868, "logps/rejected": -4.042351245880127, "loss": 3.9071, "rewards/accuracies": 0.75, "rewards/chosen": -39.83438491821289, "rewards/margins": 0.5891275405883789, "rewards/rejected": -40.42351150512695, "step": 3906 }, { "epoch": 0.5319989106753813, "grad_norm": 39.0022562278289, "learning_rate": 4.251769323446574e-07, "logits/chosen": 13.303762435913086, "logits/rejected": 13.595439910888672, "logps/chosen": -4.00177001953125, "logps/rejected": -4.223759651184082, "loss": 4.0004, "rewards/accuracies": 0.75, "rewards/chosen": -40.0177001953125, "rewards/margins": 2.2198972702026367, "rewards/rejected": -42.23759460449219, "step": 3907 }, { "epoch": 0.5321350762527233, "grad_norm": 40.444776320648984, "learning_rate": 4.249871662514213e-07, "logits/chosen": 13.403976440429688, "logits/rejected": 13.010038375854492, "logps/chosen": -3.8364830017089844, "logps/rejected": -3.9931349754333496, "loss": 3.5736, "rewards/accuracies": 0.5, "rewards/chosen": -38.364830017089844, "rewards/margins": 1.566518783569336, "rewards/rejected": -39.93135070800781, "step": 3908 }, { "epoch": 0.5322712418300654, "grad_norm": 39.302688556613816, "learning_rate": 4.247973945121285e-07, "logits/chosen": 12.434908866882324, "logits/rejected": 13.465547561645508, "logps/chosen": -3.8918514251708984, "logps/rejected": -4.168344020843506, "loss": 4.0378, "rewards/accuracies": 1.0, "rewards/chosen": -38.918514251708984, "rewards/margins": 2.764925003051758, "rewards/rejected": -41.683441162109375, "step": 3909 }, { "epoch": 0.5324074074074074, "grad_norm": 37.87347634888631, "learning_rate": 4.246076171696595e-07, "logits/chosen": 13.037452697753906, "logits/rejected": 12.7896089553833, "logps/chosen": -3.801858901977539, "logps/rejected": -3.7524993419647217, "loss": 3.763, "rewards/accuracies": 0.25, "rewards/chosen": -38.01858901977539, "rewards/margins": -0.4935951232910156, "rewards/rejected": -37.524993896484375, "step": 3910 }, { "epoch": 0.5325435729847494, "grad_norm": 37.93191180735482, "learning_rate": 4.2441783426689586e-07, "logits/chosen": 13.129731178283691, "logits/rejected": 13.084518432617188, "logps/chosen": -3.940674066543579, "logps/rejected": -3.739612579345703, "loss": 4.1494, "rewards/accuracies": 0.25, "rewards/chosen": -39.406742095947266, "rewards/margins": -2.010617256164551, "rewards/rejected": -37.396121978759766, "step": 3911 }, { "epoch": 0.5326797385620915, "grad_norm": 40.07139727891665, "learning_rate": 4.242280458467208e-07, "logits/chosen": 12.982379913330078, "logits/rejected": 12.87779426574707, "logps/chosen": -3.8200035095214844, "logps/rejected": -4.11298942565918, "loss": 4.1105, "rewards/accuracies": 0.75, "rewards/chosen": -38.200035095214844, "rewards/margins": 2.929859161376953, "rewards/rejected": -41.1298942565918, "step": 3912 }, { "epoch": 0.5328159041394336, "grad_norm": 37.71683347370567, "learning_rate": 4.2403825195201843e-07, "logits/chosen": 12.875419616699219, "logits/rejected": 12.817106246948242, "logps/chosen": -3.744372844696045, "logps/rejected": -3.9778079986572266, "loss": 4.2918, "rewards/accuracies": 0.75, "rewards/chosen": -37.4437255859375, "rewards/margins": 2.3343544006347656, "rewards/rejected": -39.778079986572266, "step": 3913 }, { "epoch": 0.5329520697167756, "grad_norm": 34.94644310511986, "learning_rate": 4.238484526256744e-07, "logits/chosen": 13.937551498413086, "logits/rejected": 13.826910972595215, "logps/chosen": -3.9936766624450684, "logps/rejected": -4.321643829345703, "loss": 3.4702, "rewards/accuracies": 0.75, "rewards/chosen": -39.936767578125, "rewards/margins": 3.279672622680664, "rewards/rejected": -43.21643829345703, "step": 3914 }, { "epoch": 0.5330882352941176, "grad_norm": 33.30914547650665, "learning_rate": 4.236586479105755e-07, "logits/chosen": 12.86600112915039, "logits/rejected": 12.77480697631836, "logps/chosen": -3.8253445625305176, "logps/rejected": -3.9535653591156006, "loss": 3.9039, "rewards/accuracies": 0.5, "rewards/chosen": -38.253448486328125, "rewards/margins": 1.282205581665039, "rewards/rejected": -39.53565216064453, "step": 3915 }, { "epoch": 0.5332244008714597, "grad_norm": 36.30215116229053, "learning_rate": 4.2346883784960934e-07, "logits/chosen": 12.436117172241211, "logits/rejected": 13.116170883178711, "logps/chosen": -3.7260866165161133, "logps/rejected": -4.103337287902832, "loss": 3.8993, "rewards/accuracies": 1.0, "rewards/chosen": -37.260868072509766, "rewards/margins": 3.772507667541504, "rewards/rejected": -41.03337478637695, "step": 3916 }, { "epoch": 0.5333605664488017, "grad_norm": 36.98300395238748, "learning_rate": 4.2327902248566536e-07, "logits/chosen": 13.503080368041992, "logits/rejected": 13.405820846557617, "logps/chosen": -3.9891610145568848, "logps/rejected": -4.207172393798828, "loss": 4.2685, "rewards/accuracies": 0.5, "rewards/chosen": -39.89160919189453, "rewards/margins": 2.180112838745117, "rewards/rejected": -42.07172393798828, "step": 3917 }, { "epoch": 0.5334967320261438, "grad_norm": 41.302926664945524, "learning_rate": 4.2308920186163376e-07, "logits/chosen": 12.683581352233887, "logits/rejected": 12.653682708740234, "logps/chosen": -4.024956703186035, "logps/rejected": -3.9890787601470947, "loss": 3.1552, "rewards/accuracies": 0.75, "rewards/chosen": -40.24956512451172, "rewards/margins": -0.3587779998779297, "rewards/rejected": -39.890785217285156, "step": 3918 }, { "epoch": 0.5336328976034859, "grad_norm": 115.03389114733625, "learning_rate": 4.228993760204062e-07, "logits/chosen": 12.949970245361328, "logits/rejected": 13.322493553161621, "logps/chosen": -3.9467687606811523, "logps/rejected": -4.239101886749268, "loss": 4.3393, "rewards/accuracies": 1.0, "rewards/chosen": -39.467689514160156, "rewards/margins": 2.9233293533325195, "rewards/rejected": -42.391014099121094, "step": 3919 }, { "epoch": 0.5337690631808278, "grad_norm": 35.01589341769234, "learning_rate": 4.227095450048753e-07, "logits/chosen": 13.045312881469727, "logits/rejected": 12.297979354858398, "logps/chosen": -3.8107266426086426, "logps/rejected": -3.7231369018554688, "loss": 3.693, "rewards/accuracies": 0.5, "rewards/chosen": -38.10726547241211, "rewards/margins": -0.8758974075317383, "rewards/rejected": -37.23136901855469, "step": 3920 }, { "epoch": 0.5339052287581699, "grad_norm": 39.55606624002327, "learning_rate": 4.2251970885793506e-07, "logits/chosen": 12.805835723876953, "logits/rejected": 13.031402587890625, "logps/chosen": -3.315215587615967, "logps/rejected": -3.715240001678467, "loss": 4.2109, "rewards/accuracies": 0.75, "rewards/chosen": -33.15215301513672, "rewards/margins": 4.000243186950684, "rewards/rejected": -37.15239715576172, "step": 3921 }, { "epoch": 0.534041394335512, "grad_norm": 34.05889781472611, "learning_rate": 4.223298676224804e-07, "logits/chosen": 14.141227722167969, "logits/rejected": 13.899238586425781, "logps/chosen": -3.855494737625122, "logps/rejected": -4.059221267700195, "loss": 3.8114, "rewards/accuracies": 0.75, "rewards/chosen": -38.55494689941406, "rewards/margins": 2.0372676849365234, "rewards/rejected": -40.59221649169922, "step": 3922 }, { "epoch": 0.534177559912854, "grad_norm": 38.919702914073184, "learning_rate": 4.2214002134140745e-07, "logits/chosen": 13.514604568481445, "logits/rejected": 13.413778305053711, "logps/chosen": -4.076202392578125, "logps/rejected": -3.8662071228027344, "loss": 4.4171, "rewards/accuracies": 0.5, "rewards/chosen": -40.762020111083984, "rewards/margins": -2.0999488830566406, "rewards/rejected": -38.662071228027344, "step": 3923 }, { "epoch": 0.5343137254901961, "grad_norm": 37.670462476436846, "learning_rate": 4.2195017005761393e-07, "logits/chosen": 12.967294692993164, "logits/rejected": 13.908178329467773, "logps/chosen": -3.7976582050323486, "logps/rejected": -4.3083906173706055, "loss": 3.9739, "rewards/accuracies": 1.0, "rewards/chosen": -37.976585388183594, "rewards/margins": 5.107322692871094, "rewards/rejected": -43.08390426635742, "step": 3924 }, { "epoch": 0.5344498910675382, "grad_norm": 34.995349445989014, "learning_rate": 4.217603138139979e-07, "logits/chosen": 13.378023147583008, "logits/rejected": 13.101945877075195, "logps/chosen": -3.6972594261169434, "logps/rejected": -4.012936592102051, "loss": 3.6323, "rewards/accuracies": 0.75, "rewards/chosen": -36.97259521484375, "rewards/margins": 3.156770706176758, "rewards/rejected": -40.129364013671875, "step": 3925 }, { "epoch": 0.5345860566448801, "grad_norm": 37.67934314253167, "learning_rate": 4.2157045265345906e-07, "logits/chosen": 12.596131324768066, "logits/rejected": 13.2720947265625, "logps/chosen": -3.8684635162353516, "logps/rejected": -4.121563911437988, "loss": 3.9656, "rewards/accuracies": 0.5, "rewards/chosen": -38.684635162353516, "rewards/margins": 2.531002998352051, "rewards/rejected": -41.21563720703125, "step": 3926 }, { "epoch": 0.5347222222222222, "grad_norm": 42.39673974889441, "learning_rate": 4.2138058661889816e-07, "logits/chosen": 13.421653747558594, "logits/rejected": 14.032353401184082, "logps/chosen": -3.6925926208496094, "logps/rejected": -3.994262933731079, "loss": 4.1987, "rewards/accuracies": 1.0, "rewards/chosen": -36.925926208496094, "rewards/margins": 3.016702175140381, "rewards/rejected": -39.942626953125, "step": 3927 }, { "epoch": 0.5348583877995643, "grad_norm": 39.4257246759491, "learning_rate": 4.2119071575321704e-07, "logits/chosen": 14.325437545776367, "logits/rejected": 13.902925491333008, "logps/chosen": -4.363117218017578, "logps/rejected": -4.427635669708252, "loss": 4.4011, "rewards/accuracies": 0.5, "rewards/chosen": -43.631168365478516, "rewards/margins": 0.6451883316040039, "rewards/rejected": -44.27635955810547, "step": 3928 }, { "epoch": 0.5349945533769063, "grad_norm": 35.132993219312084, "learning_rate": 4.210008400993184e-07, "logits/chosen": 12.631848335266113, "logits/rejected": 13.715606689453125, "logps/chosen": -3.320765972137451, "logps/rejected": -3.890899181365967, "loss": 4.0278, "rewards/accuracies": 1.0, "rewards/chosen": -33.20766067504883, "rewards/margins": 5.701332092285156, "rewards/rejected": -38.90898895263672, "step": 3929 }, { "epoch": 0.5351307189542484, "grad_norm": 34.0690395065627, "learning_rate": 4.2081095970010646e-07, "logits/chosen": 12.929553031921387, "logits/rejected": 14.566319465637207, "logps/chosen": -3.9824419021606445, "logps/rejected": -4.20089054107666, "loss": 3.713, "rewards/accuracies": 0.75, "rewards/chosen": -39.82441711425781, "rewards/margins": 2.1844892501831055, "rewards/rejected": -42.00890350341797, "step": 3930 }, { "epoch": 0.5352668845315904, "grad_norm": 41.06754914392922, "learning_rate": 4.2062107459848616e-07, "logits/chosen": 11.921936988830566, "logits/rejected": 12.639105796813965, "logps/chosen": -3.515307903289795, "logps/rejected": -4.071821212768555, "loss": 4.0736, "rewards/accuracies": 1.0, "rewards/chosen": -35.153076171875, "rewards/margins": 5.565136909484863, "rewards/rejected": -40.71821594238281, "step": 3931 }, { "epoch": 0.5354030501089324, "grad_norm": 38.26265174493985, "learning_rate": 4.2043118483736356e-07, "logits/chosen": 12.979331970214844, "logits/rejected": 13.280384063720703, "logps/chosen": -4.196028709411621, "logps/rejected": -4.373703956604004, "loss": 4.1617, "rewards/accuracies": 0.75, "rewards/chosen": -41.960289001464844, "rewards/margins": 1.7767515182495117, "rewards/rejected": -43.737037658691406, "step": 3932 }, { "epoch": 0.5355392156862745, "grad_norm": 31.761222659988377, "learning_rate": 4.2024129045964585e-07, "logits/chosen": 12.544286727905273, "logits/rejected": 13.18247127532959, "logps/chosen": -3.6782829761505127, "logps/rejected": -3.907447338104248, "loss": 3.5583, "rewards/accuracies": 0.75, "rewards/chosen": -36.78282928466797, "rewards/margins": 2.291645050048828, "rewards/rejected": -39.07447052001953, "step": 3933 }, { "epoch": 0.5356753812636166, "grad_norm": 44.74569143811112, "learning_rate": 4.2005139150824134e-07, "logits/chosen": 12.34488296508789, "logits/rejected": 12.995675086975098, "logps/chosen": -3.5969440937042236, "logps/rejected": -3.6366524696350098, "loss": 4.0385, "rewards/accuracies": 0.5, "rewards/chosen": -35.96944046020508, "rewards/margins": 0.39708471298217773, "rewards/rejected": -36.36652374267578, "step": 3934 }, { "epoch": 0.5358115468409586, "grad_norm": 37.46240929470314, "learning_rate": 4.198614880260591e-07, "logits/chosen": 12.186212539672852, "logits/rejected": 14.570337295532227, "logps/chosen": -3.634167194366455, "logps/rejected": -4.293625831604004, "loss": 4.2092, "rewards/accuracies": 1.0, "rewards/chosen": -36.341670989990234, "rewards/margins": 6.594589710235596, "rewards/rejected": -42.93626022338867, "step": 3935 }, { "epoch": 0.5359477124183006, "grad_norm": 39.3542982906159, "learning_rate": 4.196715800560094e-07, "logits/chosen": 13.635398864746094, "logits/rejected": 14.021397590637207, "logps/chosen": -3.607052803039551, "logps/rejected": -4.097030162811279, "loss": 4.1419, "rewards/accuracies": 1.0, "rewards/chosen": -36.07052993774414, "rewards/margins": 4.8997697830200195, "rewards/rejected": -40.970298767089844, "step": 3936 }, { "epoch": 0.5360838779956427, "grad_norm": 42.27263688914702, "learning_rate": 4.194816676410037e-07, "logits/chosen": 12.826013565063477, "logits/rejected": 12.657520294189453, "logps/chosen": -3.977276086807251, "logps/rejected": -3.9157748222351074, "loss": 4.4891, "rewards/accuracies": 0.25, "rewards/chosen": -39.77275848388672, "rewards/margins": -0.6150131225585938, "rewards/rejected": -39.157745361328125, "step": 3937 }, { "epoch": 0.5362200435729847, "grad_norm": 37.034900620582576, "learning_rate": 4.1929175082395404e-07, "logits/chosen": 12.79440975189209, "logits/rejected": 12.873144149780273, "logps/chosen": -3.7010090351104736, "logps/rejected": -4.13266658782959, "loss": 4.006, "rewards/accuracies": 1.0, "rewards/chosen": -37.01008987426758, "rewards/margins": 4.316575050354004, "rewards/rejected": -41.326663970947266, "step": 3938 }, { "epoch": 0.5363562091503268, "grad_norm": 36.135607766397946, "learning_rate": 4.1910182964777385e-07, "logits/chosen": 13.53317642211914, "logits/rejected": 14.128372192382812, "logps/chosen": -4.063310623168945, "logps/rejected": -4.143775463104248, "loss": 3.8057, "rewards/accuracies": 0.75, "rewards/chosen": -40.63310623168945, "rewards/margins": 0.8046493530273438, "rewards/rejected": -41.43775177001953, "step": 3939 }, { "epoch": 0.5364923747276689, "grad_norm": 36.40386951677545, "learning_rate": 4.189119041553774e-07, "logits/chosen": 14.368062973022461, "logits/rejected": 13.683414459228516, "logps/chosen": -3.9697353839874268, "logps/rejected": -4.2861762046813965, "loss": 3.2094, "rewards/accuracies": 0.75, "rewards/chosen": -39.697349548339844, "rewards/margins": 3.164409637451172, "rewards/rejected": -42.86176300048828, "step": 3940 }, { "epoch": 0.536628540305011, "grad_norm": 38.27842228881916, "learning_rate": 4.187219743896798e-07, "logits/chosen": 12.167634963989258, "logits/rejected": 14.162368774414062, "logps/chosen": -3.544997215270996, "logps/rejected": -4.181545257568359, "loss": 3.6267, "rewards/accuracies": 1.0, "rewards/chosen": -35.449974060058594, "rewards/margins": 6.365482330322266, "rewards/rejected": -41.815452575683594, "step": 3941 }, { "epoch": 0.5367647058823529, "grad_norm": 41.181176990369025, "learning_rate": 4.1853204039359743e-07, "logits/chosen": 13.632251739501953, "logits/rejected": 13.761507034301758, "logps/chosen": -4.2677836418151855, "logps/rejected": -4.375482082366943, "loss": 4.1111, "rewards/accuracies": 0.5, "rewards/chosen": -42.67784118652344, "rewards/margins": 1.0769805908203125, "rewards/rejected": -43.75482177734375, "step": 3942 }, { "epoch": 0.536900871459695, "grad_norm": 42.556241792864604, "learning_rate": 4.183421022100473e-07, "logits/chosen": 13.36612606048584, "logits/rejected": 12.950139999389648, "logps/chosen": -3.952152967453003, "logps/rejected": -3.8447046279907227, "loss": 4.3831, "rewards/accuracies": 0.25, "rewards/chosen": -39.52153015136719, "rewards/margins": -1.074483871459961, "rewards/rejected": -38.447044372558594, "step": 3943 }, { "epoch": 0.5370370370370371, "grad_norm": 55.42326854522549, "learning_rate": 4.1815215988194745e-07, "logits/chosen": 13.580610275268555, "logits/rejected": 14.036949157714844, "logps/chosen": -3.976213216781616, "logps/rejected": -4.4419050216674805, "loss": 4.2856, "rewards/accuracies": 1.0, "rewards/chosen": -39.76213073730469, "rewards/margins": 4.656916618347168, "rewards/rejected": -44.41905212402344, "step": 3944 }, { "epoch": 0.5371732026143791, "grad_norm": 32.67265850052651, "learning_rate": 4.1796221345221723e-07, "logits/chosen": 13.159231185913086, "logits/rejected": 13.088932037353516, "logps/chosen": -3.9581360816955566, "logps/rejected": -3.9503190517425537, "loss": 3.5598, "rewards/accuracies": 0.5, "rewards/chosen": -39.581363677978516, "rewards/margins": -0.07817459106445312, "rewards/rejected": -39.50318908691406, "step": 3945 }, { "epoch": 0.5373093681917211, "grad_norm": 36.19789216437654, "learning_rate": 4.1777226296377634e-07, "logits/chosen": 12.746867179870605, "logits/rejected": 14.58942699432373, "logps/chosen": -4.017208576202393, "logps/rejected": -4.187612533569336, "loss": 4.0933, "rewards/accuracies": 0.5, "rewards/chosen": -40.17208480834961, "rewards/margins": 1.70404052734375, "rewards/rejected": -41.876129150390625, "step": 3946 }, { "epoch": 0.5374455337690632, "grad_norm": 38.74429957673256, "learning_rate": 4.175823084595456e-07, "logits/chosen": 13.611797332763672, "logits/rejected": 13.692770004272461, "logps/chosen": -4.056816101074219, "logps/rejected": -4.303862571716309, "loss": 4.0536, "rewards/accuracies": 0.5, "rewards/chosen": -40.56816101074219, "rewards/margins": 2.4704647064208984, "rewards/rejected": -43.03862762451172, "step": 3947 }, { "epoch": 0.5375816993464052, "grad_norm": 36.954569189640665, "learning_rate": 4.173923499824471e-07, "logits/chosen": 14.555984497070312, "logits/rejected": 14.18822956085205, "logps/chosen": -4.358967304229736, "logps/rejected": -4.3761491775512695, "loss": 3.8054, "rewards/accuracies": 0.5, "rewards/chosen": -43.58967208862305, "rewards/margins": 0.17181968688964844, "rewards/rejected": -43.76148986816406, "step": 3948 }, { "epoch": 0.5377178649237473, "grad_norm": 40.28815250832851, "learning_rate": 4.172023875754033e-07, "logits/chosen": 13.450101852416992, "logits/rejected": 13.512454986572266, "logps/chosen": -4.201867580413818, "logps/rejected": -4.3097243309021, "loss": 4.385, "rewards/accuracies": 0.75, "rewards/chosen": -42.01866912841797, "rewards/margins": 1.0785713195800781, "rewards/rejected": -43.09724426269531, "step": 3949 }, { "epoch": 0.5378540305010894, "grad_norm": 37.826300305150966, "learning_rate": 4.170124212813377e-07, "logits/chosen": 13.942575454711914, "logits/rejected": 13.362434387207031, "logps/chosen": -3.930795192718506, "logps/rejected": -3.9181580543518066, "loss": 4.0049, "rewards/accuracies": 0.5, "rewards/chosen": -39.307952880859375, "rewards/margins": -0.1263713836669922, "rewards/rejected": -39.18157958984375, "step": 3950 }, { "epoch": 0.5379901960784313, "grad_norm": 38.71768975987479, "learning_rate": 4.1682245114317503e-07, "logits/chosen": 13.128746032714844, "logits/rejected": 13.059804916381836, "logps/chosen": -3.789083957672119, "logps/rejected": -3.874103307723999, "loss": 3.7778, "rewards/accuracies": 0.5, "rewards/chosen": -37.890838623046875, "rewards/margins": 0.8501949310302734, "rewards/rejected": -38.74103546142578, "step": 3951 }, { "epoch": 0.5381263616557734, "grad_norm": 37.41767104058812, "learning_rate": 4.1663247720384047e-07, "logits/chosen": 13.981317520141602, "logits/rejected": 14.512819290161133, "logps/chosen": -4.33057975769043, "logps/rejected": -4.881007671356201, "loss": 3.6773, "rewards/accuracies": 1.0, "rewards/chosen": -43.30579376220703, "rewards/margins": 5.504281044006348, "rewards/rejected": -48.81007385253906, "step": 3952 }, { "epoch": 0.5382625272331155, "grad_norm": 37.5965054788476, "learning_rate": 4.1644249950626016e-07, "logits/chosen": 12.364400863647461, "logits/rejected": 12.73440170288086, "logps/chosen": -3.728315830230713, "logps/rejected": -3.927619457244873, "loss": 4.2492, "rewards/accuracies": 0.75, "rewards/chosen": -37.28315734863281, "rewards/margins": 1.9930367469787598, "rewards/rejected": -39.27619552612305, "step": 3953 }, { "epoch": 0.5383986928104575, "grad_norm": 96.42788468597085, "learning_rate": 4.1625251809336115e-07, "logits/chosen": 13.23017406463623, "logits/rejected": 13.988726615905762, "logps/chosen": -3.7115061283111572, "logps/rejected": -4.187207221984863, "loss": 4.0979, "rewards/accuracies": 0.75, "rewards/chosen": -37.11506271362305, "rewards/margins": 4.757006645202637, "rewards/rejected": -41.8720703125, "step": 3954 }, { "epoch": 0.5385348583877996, "grad_norm": 37.72036068321837, "learning_rate": 4.1606253300807134e-07, "logits/chosen": 13.402769088745117, "logits/rejected": 13.405820846557617, "logps/chosen": -4.038475036621094, "logps/rejected": -4.004770278930664, "loss": 3.8387, "rewards/accuracies": 0.25, "rewards/chosen": -40.38475036621094, "rewards/margins": -0.3370513916015625, "rewards/rejected": -40.047698974609375, "step": 3955 }, { "epoch": 0.5386710239651417, "grad_norm": 38.414759065817265, "learning_rate": 4.1587254429331946e-07, "logits/chosen": 12.243180274963379, "logits/rejected": 13.098871231079102, "logps/chosen": -3.952998638153076, "logps/rejected": -4.115409851074219, "loss": 3.8526, "rewards/accuracies": 0.75, "rewards/chosen": -39.52998733520508, "rewards/margins": 1.6241121292114258, "rewards/rejected": -41.15409851074219, "step": 3956 }, { "epoch": 0.5388071895424836, "grad_norm": 38.62545683891922, "learning_rate": 4.1568255199203495e-07, "logits/chosen": 13.541681289672852, "logits/rejected": 14.044879913330078, "logps/chosen": -3.9720513820648193, "logps/rejected": -4.410966873168945, "loss": 3.8738, "rewards/accuracies": 0.75, "rewards/chosen": -39.72051239013672, "rewards/margins": 4.389150619506836, "rewards/rejected": -44.10966491699219, "step": 3957 }, { "epoch": 0.5389433551198257, "grad_norm": 38.76319025431661, "learning_rate": 4.154925561471482e-07, "logits/chosen": 12.94920539855957, "logits/rejected": 13.660717964172363, "logps/chosen": -4.086592674255371, "logps/rejected": -4.468383312225342, "loss": 4.1852, "rewards/accuracies": 1.0, "rewards/chosen": -40.86592483520508, "rewards/margins": 3.8179092407226562, "rewards/rejected": -44.683834075927734, "step": 3958 }, { "epoch": 0.5390795206971678, "grad_norm": 37.23141537662013, "learning_rate": 4.153025568015903e-07, "logits/chosen": 14.023797035217285, "logits/rejected": 13.111753463745117, "logps/chosen": -4.210604190826416, "logps/rejected": -4.396651268005371, "loss": 3.9276, "rewards/accuracies": 0.75, "rewards/chosen": -42.106040954589844, "rewards/margins": 1.8604726791381836, "rewards/rejected": -43.966514587402344, "step": 3959 }, { "epoch": 0.5392156862745098, "grad_norm": 42.85191501249315, "learning_rate": 4.1511255399829324e-07, "logits/chosen": 12.411417961120605, "logits/rejected": 12.724778175354004, "logps/chosen": -3.716472864151001, "logps/rejected": -3.8525612354278564, "loss": 4.1797, "rewards/accuracies": 0.5, "rewards/chosen": -37.164730072021484, "rewards/margins": 1.3608818054199219, "rewards/rejected": -38.525611877441406, "step": 3960 }, { "epoch": 0.5393518518518519, "grad_norm": 40.10411899342318, "learning_rate": 4.149225477801897e-07, "logits/chosen": 12.63333511352539, "logits/rejected": 13.783065795898438, "logps/chosen": -3.790128707885742, "logps/rejected": -4.36414098739624, "loss": 4.2012, "rewards/accuracies": 0.75, "rewards/chosen": -37.90128707885742, "rewards/margins": 5.740121841430664, "rewards/rejected": -43.64140701293945, "step": 3961 }, { "epoch": 0.539488017429194, "grad_norm": 35.87360799179059, "learning_rate": 4.1473253819021306e-07, "logits/chosen": 12.738260269165039, "logits/rejected": 12.510419845581055, "logps/chosen": -3.914522647857666, "logps/rejected": -3.976868152618408, "loss": 4.0767, "rewards/accuracies": 0.25, "rewards/chosen": -39.145225524902344, "rewards/margins": 0.6234560012817383, "rewards/rejected": -39.76868438720703, "step": 3962 }, { "epoch": 0.5396241830065359, "grad_norm": 36.85802286182613, "learning_rate": 4.1454252527129767e-07, "logits/chosen": 13.167078018188477, "logits/rejected": 13.847013473510742, "logps/chosen": -4.283563137054443, "logps/rejected": -4.457214832305908, "loss": 4.1078, "rewards/accuracies": 0.75, "rewards/chosen": -42.835636138916016, "rewards/margins": 1.7365169525146484, "rewards/rejected": -44.57215118408203, "step": 3963 }, { "epoch": 0.539760348583878, "grad_norm": 36.841500797654454, "learning_rate": 4.143525090663784e-07, "logits/chosen": 13.334236145019531, "logits/rejected": 13.617402076721191, "logps/chosen": -3.708850860595703, "logps/rejected": -4.045782089233398, "loss": 3.5221, "rewards/accuracies": 1.0, "rewards/chosen": -37.08850860595703, "rewards/margins": 3.369312286376953, "rewards/rejected": -40.457820892333984, "step": 3964 }, { "epoch": 0.5398965141612201, "grad_norm": 41.843401337567215, "learning_rate": 4.141624896183913e-07, "logits/chosen": 13.365981101989746, "logits/rejected": 14.13882064819336, "logps/chosen": -3.829239845275879, "logps/rejected": -4.077088356018066, "loss": 3.3703, "rewards/accuracies": 0.5, "rewards/chosen": -38.29240036010742, "rewards/margins": 2.4784841537475586, "rewards/rejected": -40.77088165283203, "step": 3965 }, { "epoch": 0.5400326797385621, "grad_norm": 35.25964916156603, "learning_rate": 4.1397246697027237e-07, "logits/chosen": 12.423116683959961, "logits/rejected": 14.006834030151367, "logps/chosen": -3.641744613647461, "logps/rejected": -4.081176280975342, "loss": 3.8047, "rewards/accuracies": 1.0, "rewards/chosen": -36.417449951171875, "rewards/margins": 4.39431619644165, "rewards/rejected": -40.811763763427734, "step": 3966 }, { "epoch": 0.5401688453159041, "grad_norm": 86.44526446261548, "learning_rate": 4.137824411649592e-07, "logits/chosen": 13.531784057617188, "logits/rejected": 13.877254486083984, "logps/chosen": -3.9568488597869873, "logps/rejected": -4.199247360229492, "loss": 3.8971, "rewards/accuracies": 0.5, "rewards/chosen": -39.56848907470703, "rewards/margins": 2.42398738861084, "rewards/rejected": -41.99247741699219, "step": 3967 }, { "epoch": 0.5403050108932462, "grad_norm": 40.24535961196266, "learning_rate": 4.135924122453894e-07, "logits/chosen": 13.017839431762695, "logits/rejected": 14.068984031677246, "logps/chosen": -3.9933667182922363, "logps/rejected": -4.323385238647461, "loss": 4.1321, "rewards/accuracies": 0.75, "rewards/chosen": -39.93367004394531, "rewards/margins": 3.3001832962036133, "rewards/rejected": -43.23385238647461, "step": 3968 }, { "epoch": 0.5404411764705882, "grad_norm": 37.788504082407975, "learning_rate": 4.134023802545017e-07, "logits/chosen": 12.079391479492188, "logits/rejected": 12.777360916137695, "logps/chosen": -3.9337289333343506, "logps/rejected": -4.246149063110352, "loss": 4.4201, "rewards/accuracies": 1.0, "rewards/chosen": -39.33728790283203, "rewards/margins": 3.124203681945801, "rewards/rejected": -42.46149444580078, "step": 3969 }, { "epoch": 0.5405773420479303, "grad_norm": 40.906911395429574, "learning_rate": 4.1321234523523546e-07, "logits/chosen": 13.600943565368652, "logits/rejected": 13.822270393371582, "logps/chosen": -4.288196563720703, "logps/rejected": -4.122035980224609, "loss": 3.9454, "rewards/accuracies": 0.25, "rewards/chosen": -42.88196563720703, "rewards/margins": -1.661604881286621, "rewards/rejected": -41.220359802246094, "step": 3970 }, { "epoch": 0.5407135076252724, "grad_norm": 37.70451790367828, "learning_rate": 4.1302230723053053e-07, "logits/chosen": 13.595913887023926, "logits/rejected": 15.415191650390625, "logps/chosen": -4.554178237915039, "logps/rejected": -4.794695854187012, "loss": 3.877, "rewards/accuracies": 0.5, "rewards/chosen": -45.54178237915039, "rewards/margins": 2.40517520904541, "rewards/rejected": -47.946956634521484, "step": 3971 }, { "epoch": 0.5408496732026143, "grad_norm": 40.615411294756754, "learning_rate": 4.1283226628332774e-07, "logits/chosen": 12.307342529296875, "logits/rejected": 13.586860656738281, "logps/chosen": -3.8686933517456055, "logps/rejected": -4.425111770629883, "loss": 3.5786, "rewards/accuracies": 0.75, "rewards/chosen": -38.68693542480469, "rewards/margins": 5.564188003540039, "rewards/rejected": -44.251121520996094, "step": 3972 }, { "epoch": 0.5409858387799564, "grad_norm": 41.157731181980616, "learning_rate": 4.126422224365683e-07, "logits/chosen": 14.339505195617676, "logits/rejected": 14.370981216430664, "logps/chosen": -4.222564220428467, "logps/rejected": -4.455716133117676, "loss": 4.4, "rewards/accuracies": 0.75, "rewards/chosen": -42.22564697265625, "rewards/margins": 2.3315162658691406, "rewards/rejected": -44.557159423828125, "step": 3973 }, { "epoch": 0.5411220043572985, "grad_norm": 37.62146412433766, "learning_rate": 4.1245217573319407e-07, "logits/chosen": 13.350082397460938, "logits/rejected": 13.303573608398438, "logps/chosen": -4.0005903244018555, "logps/rejected": -4.0382795333862305, "loss": 3.8748, "rewards/accuracies": 0.25, "rewards/chosen": -40.005897521972656, "rewards/margins": 0.3768959045410156, "rewards/rejected": -40.38279342651367, "step": 3974 }, { "epoch": 0.5412581699346405, "grad_norm": 34.59178187975167, "learning_rate": 4.1226212621614793e-07, "logits/chosen": 11.937901496887207, "logits/rejected": 13.897003173828125, "logps/chosen": -3.9740798473358154, "logps/rejected": -4.486872673034668, "loss": 3.7244, "rewards/accuracies": 1.0, "rewards/chosen": -39.74079895019531, "rewards/margins": 5.127925872802734, "rewards/rejected": -44.86872482299805, "step": 3975 }, { "epoch": 0.5413943355119826, "grad_norm": 41.546083539144234, "learning_rate": 4.1207207392837306e-07, "logits/chosen": 13.801119804382324, "logits/rejected": 13.80103874206543, "logps/chosen": -3.9856884479522705, "logps/rejected": -4.412466049194336, "loss": 3.3537, "rewards/accuracies": 1.0, "rewards/chosen": -39.85688781738281, "rewards/margins": 4.267778396606445, "rewards/rejected": -44.124664306640625, "step": 3976 }, { "epoch": 0.5415305010893247, "grad_norm": 37.823612309999575, "learning_rate": 4.118820189128131e-07, "logits/chosen": 12.901304244995117, "logits/rejected": 14.857582092285156, "logps/chosen": -3.916879177093506, "logps/rejected": -4.4053192138671875, "loss": 3.6499, "rewards/accuracies": 1.0, "rewards/chosen": -39.168792724609375, "rewards/margins": 4.884401321411133, "rewards/rejected": -44.053192138671875, "step": 3977 }, { "epoch": 0.5416666666666666, "grad_norm": 40.10883356170304, "learning_rate": 4.116919612124129e-07, "logits/chosen": 13.15717887878418, "logits/rejected": 13.546852111816406, "logps/chosen": -4.081745147705078, "logps/rejected": -4.3099870681762695, "loss": 4.1268, "rewards/accuracies": 0.75, "rewards/chosen": -40.81745147705078, "rewards/margins": 2.2824230194091797, "rewards/rejected": -43.099876403808594, "step": 3978 }, { "epoch": 0.5418028322440087, "grad_norm": 41.06215612902499, "learning_rate": 4.115019008701174e-07, "logits/chosen": 13.69424819946289, "logits/rejected": 14.659887313842773, "logps/chosen": -4.197710990905762, "logps/rejected": -4.159328460693359, "loss": 4.0747, "rewards/accuracies": 0.5, "rewards/chosen": -41.977108001708984, "rewards/margins": -0.3838233947753906, "rewards/rejected": -41.593284606933594, "step": 3979 }, { "epoch": 0.5419389978213508, "grad_norm": 41.305635133068265, "learning_rate": 4.113118379288722e-07, "logits/chosen": 13.121564865112305, "logits/rejected": 14.246889114379883, "logps/chosen": -4.090435028076172, "logps/rejected": -4.316261291503906, "loss": 3.7766, "rewards/accuracies": 0.5, "rewards/chosen": -40.90435028076172, "rewards/margins": 2.258260726928711, "rewards/rejected": -43.16261291503906, "step": 3980 }, { "epoch": 0.5420751633986928, "grad_norm": 37.53944924082287, "learning_rate": 4.1112177243162386e-07, "logits/chosen": 12.888885498046875, "logits/rejected": 13.387548446655273, "logps/chosen": -3.634331226348877, "logps/rejected": -3.9636690616607666, "loss": 4.1163, "rewards/accuracies": 0.5, "rewards/chosen": -36.34331130981445, "rewards/margins": 3.293376922607422, "rewards/rejected": -39.636688232421875, "step": 3981 }, { "epoch": 0.5422113289760349, "grad_norm": 37.929437027988754, "learning_rate": 4.109317044213191e-07, "logits/chosen": 13.393341064453125, "logits/rejected": 14.24338150024414, "logps/chosen": -4.2237396240234375, "logps/rejected": -4.567409038543701, "loss": 3.7084, "rewards/accuracies": 1.0, "rewards/chosen": -42.23739242553711, "rewards/margins": 3.4366979598999023, "rewards/rejected": -45.67409133911133, "step": 3982 }, { "epoch": 0.5423474945533769, "grad_norm": 40.62809199735487, "learning_rate": 4.1074163394090535e-07, "logits/chosen": 13.05640983581543, "logits/rejected": 13.293014526367188, "logps/chosen": -4.423371315002441, "logps/rejected": -4.543248176574707, "loss": 4.0972, "rewards/accuracies": 0.5, "rewards/chosen": -44.23371124267578, "rewards/margins": 1.1987686157226562, "rewards/rejected": -45.43247985839844, "step": 3983 }, { "epoch": 0.5424836601307189, "grad_norm": 40.27588378279103, "learning_rate": 4.105515610333306e-07, "logits/chosen": 13.774309158325195, "logits/rejected": 13.348384857177734, "logps/chosen": -4.479032516479492, "logps/rejected": -4.492022514343262, "loss": 3.9936, "rewards/accuracies": 0.75, "rewards/chosen": -44.79032897949219, "rewards/margins": 0.1298999786376953, "rewards/rejected": -44.920230865478516, "step": 3984 }, { "epoch": 0.542619825708061, "grad_norm": 39.98175236983233, "learning_rate": 4.103614857415434e-07, "logits/chosen": 11.998647689819336, "logits/rejected": 13.006964683532715, "logps/chosen": -3.726191520690918, "logps/rejected": -3.875321388244629, "loss": 4.2592, "rewards/accuracies": 0.5, "rewards/chosen": -37.26191711425781, "rewards/margins": 1.4912991523742676, "rewards/rejected": -38.75321578979492, "step": 3985 }, { "epoch": 0.5427559912854031, "grad_norm": 40.400003726278435, "learning_rate": 4.1017140810849285e-07, "logits/chosen": 13.312047004699707, "logits/rejected": 14.012092590332031, "logps/chosen": -4.084247589111328, "logps/rejected": -4.607823371887207, "loss": 4.0031, "rewards/accuracies": 1.0, "rewards/chosen": -40.84247589111328, "rewards/margins": 5.23575496673584, "rewards/rejected": -46.07823181152344, "step": 3986 }, { "epoch": 0.5428921568627451, "grad_norm": 38.266612918686164, "learning_rate": 4.0998132817712853e-07, "logits/chosen": 13.292531967163086, "logits/rejected": 14.265890121459961, "logps/chosen": -4.073969841003418, "logps/rejected": -4.5079121589660645, "loss": 3.7113, "rewards/accuracies": 0.75, "rewards/chosen": -40.73970031738281, "rewards/margins": 4.339424133300781, "rewards/rejected": -45.079124450683594, "step": 3987 }, { "epoch": 0.5430283224400871, "grad_norm": 47.94692370665191, "learning_rate": 4.097912459904007e-07, "logits/chosen": 12.62879753112793, "logits/rejected": 13.532096862792969, "logps/chosen": -3.9993114471435547, "logps/rejected": -4.230180740356445, "loss": 4.7577, "rewards/accuracies": 0.5, "rewards/chosen": -39.99311828613281, "rewards/margins": 2.3086938858032227, "rewards/rejected": -42.30181121826172, "step": 3988 }, { "epoch": 0.5431644880174292, "grad_norm": 37.93744893410589, "learning_rate": 4.096011615912598e-07, "logits/chosen": 12.599924087524414, "logits/rejected": 13.511285781860352, "logps/chosen": -3.8317205905914307, "logps/rejected": -4.147059440612793, "loss": 3.8985, "rewards/accuracies": 0.75, "rewards/chosen": -38.31720733642578, "rewards/margins": 3.153388023376465, "rewards/rejected": -41.47059631347656, "step": 3989 }, { "epoch": 0.5433006535947712, "grad_norm": 42.00488200700917, "learning_rate": 4.094110750226571e-07, "logits/chosen": 14.148747444152832, "logits/rejected": 13.786630630493164, "logps/chosen": -4.37376594543457, "logps/rejected": -4.548366546630859, "loss": 4.3871, "rewards/accuracies": 1.0, "rewards/chosen": -43.73765563964844, "rewards/margins": 1.7460079193115234, "rewards/rejected": -45.483665466308594, "step": 3990 }, { "epoch": 0.5434368191721133, "grad_norm": 38.55057782074192, "learning_rate": 4.0922098632754424e-07, "logits/chosen": 13.618257522583008, "logits/rejected": 14.13418197631836, "logps/chosen": -4.245391845703125, "logps/rejected": -4.44418478012085, "loss": 3.7958, "rewards/accuracies": 1.0, "rewards/chosen": -42.45391845703125, "rewards/margins": 1.9879302978515625, "rewards/rejected": -44.44184875488281, "step": 3991 }, { "epoch": 0.5435729847494554, "grad_norm": 43.167919062888345, "learning_rate": 4.0903089554887324e-07, "logits/chosen": 13.75448226928711, "logits/rejected": 14.574972152709961, "logps/chosen": -4.2590556144714355, "logps/rejected": -4.45339822769165, "loss": 3.9262, "rewards/accuracies": 0.75, "rewards/chosen": -42.59055709838867, "rewards/margins": 1.9434261322021484, "rewards/rejected": -44.53398513793945, "step": 3992 }, { "epoch": 0.5437091503267973, "grad_norm": 46.84040788400778, "learning_rate": 4.088408027295968e-07, "logits/chosen": 13.237590789794922, "logits/rejected": 12.591409683227539, "logps/chosen": -4.198304653167725, "logps/rejected": -4.119689464569092, "loss": 4.0897, "rewards/accuracies": 0.5, "rewards/chosen": -41.98304748535156, "rewards/margins": -0.7861537933349609, "rewards/rejected": -41.19689178466797, "step": 3993 }, { "epoch": 0.5438453159041394, "grad_norm": 41.838504303133426, "learning_rate": 4.0865070791266796e-07, "logits/chosen": 13.483675003051758, "logits/rejected": 13.890586853027344, "logps/chosen": -4.152703762054443, "logps/rejected": -4.455568790435791, "loss": 3.6342, "rewards/accuracies": 1.0, "rewards/chosen": -41.52703857421875, "rewards/margins": 3.02864933013916, "rewards/rejected": -44.55569076538086, "step": 3994 }, { "epoch": 0.5439814814814815, "grad_norm": 41.36250112865281, "learning_rate": 4.0846061114103997e-07, "logits/chosen": 13.37514877319336, "logits/rejected": 13.512572288513184, "logps/chosen": -3.9950485229492188, "logps/rejected": -4.286439895629883, "loss": 3.6501, "rewards/accuracies": 0.75, "rewards/chosen": -39.95048522949219, "rewards/margins": 2.913912773132324, "rewards/rejected": -42.86439895629883, "step": 3995 }, { "epoch": 0.5441176470588235, "grad_norm": 38.82518645125051, "learning_rate": 4.0827051245766714e-07, "logits/chosen": 13.369401931762695, "logits/rejected": 14.000551223754883, "logps/chosen": -3.8415491580963135, "logps/rejected": -4.221872329711914, "loss": 3.8891, "rewards/accuracies": 0.75, "rewards/chosen": -38.415489196777344, "rewards/margins": 3.8032326698303223, "rewards/rejected": -42.21872329711914, "step": 3996 }, { "epoch": 0.5442538126361656, "grad_norm": 41.41560627781409, "learning_rate": 4.080804119055036e-07, "logits/chosen": 12.324790954589844, "logits/rejected": 13.229196548461914, "logps/chosen": -4.0547709465026855, "logps/rejected": -4.541029930114746, "loss": 3.8005, "rewards/accuracies": 1.0, "rewards/chosen": -40.54771041870117, "rewards/margins": 4.862587928771973, "rewards/rejected": -45.41029739379883, "step": 3997 }, { "epoch": 0.5443899782135077, "grad_norm": 44.41784323333986, "learning_rate": 4.0789030952750416e-07, "logits/chosen": 13.306377410888672, "logits/rejected": 13.628856658935547, "logps/chosen": -4.577884197235107, "logps/rejected": -4.314240455627441, "loss": 4.5379, "rewards/accuracies": 0.5, "rewards/chosen": -45.77884292602539, "rewards/margins": -2.6364355087280273, "rewards/rejected": -43.14240646362305, "step": 3998 }, { "epoch": 0.5445261437908496, "grad_norm": 44.499464621596736, "learning_rate": 4.0770020536662406e-07, "logits/chosen": 13.115951538085938, "logits/rejected": 13.640918731689453, "logps/chosen": -4.104519844055176, "logps/rejected": -4.406186103820801, "loss": 4.172, "rewards/accuracies": 0.75, "rewards/chosen": -41.04520034790039, "rewards/margins": 3.016659736633301, "rewards/rejected": -44.061859130859375, "step": 3999 }, { "epoch": 0.5446623093681917, "grad_norm": 36.817539541529, "learning_rate": 4.0751009946581896e-07, "logits/chosen": 13.07565689086914, "logits/rejected": 13.19550895690918, "logps/chosen": -4.085919380187988, "logps/rejected": -4.150641441345215, "loss": 3.8167, "rewards/accuracies": 0.5, "rewards/chosen": -40.859195709228516, "rewards/margins": 0.647216796875, "rewards/rejected": -41.50641632080078, "step": 4000 }, { "epoch": 0.5447984749455338, "grad_norm": 40.73267199384365, "learning_rate": 4.0731999186804476e-07, "logits/chosen": 12.294242858886719, "logits/rejected": 13.804736137390137, "logps/chosen": -3.883226156234741, "logps/rejected": -4.309426307678223, "loss": 3.7387, "rewards/accuracies": 1.0, "rewards/chosen": -38.83226013183594, "rewards/margins": 4.262001037597656, "rewards/rejected": -43.094261169433594, "step": 4001 }, { "epoch": 0.5449346405228758, "grad_norm": 35.81236971218131, "learning_rate": 4.071298826162579e-07, "logits/chosen": 12.509936332702637, "logits/rejected": 13.981880187988281, "logps/chosen": -4.032566070556641, "logps/rejected": -4.374109268188477, "loss": 3.5617, "rewards/accuracies": 0.75, "rewards/chosen": -40.32566452026367, "rewards/margins": 3.4154272079467773, "rewards/rejected": -43.741092681884766, "step": 4002 }, { "epoch": 0.5450708061002179, "grad_norm": 46.605605356670694, "learning_rate": 4.0693977175341514e-07, "logits/chosen": 12.773126602172852, "logits/rejected": 12.823731422424316, "logps/chosen": -3.827695846557617, "logps/rejected": -4.22443962097168, "loss": 3.7933, "rewards/accuracies": 1.0, "rewards/chosen": -38.27695846557617, "rewards/margins": 3.9674367904663086, "rewards/rejected": -42.24439239501953, "step": 4003 }, { "epoch": 0.5452069716775599, "grad_norm": 36.64788117051405, "learning_rate": 4.0674965932247354e-07, "logits/chosen": 12.600664138793945, "logits/rejected": 13.225984573364258, "logps/chosen": -3.750430107116699, "logps/rejected": -4.033307075500488, "loss": 3.828, "rewards/accuracies": 0.5, "rewards/chosen": -37.50429916381836, "rewards/margins": 2.8287715911865234, "rewards/rejected": -40.333072662353516, "step": 4004 }, { "epoch": 0.5453431372549019, "grad_norm": 40.97092722454257, "learning_rate": 4.065595453663907e-07, "logits/chosen": 13.738151550292969, "logits/rejected": 13.758604049682617, "logps/chosen": -4.078794479370117, "logps/rejected": -4.470776557922363, "loss": 3.9842, "rewards/accuracies": 1.0, "rewards/chosen": -40.78794860839844, "rewards/margins": 3.919818878173828, "rewards/rejected": -44.707763671875, "step": 4005 }, { "epoch": 0.545479302832244, "grad_norm": 40.279537790386186, "learning_rate": 4.063694299281244e-07, "logits/chosen": 13.121088027954102, "logits/rejected": 13.27387809753418, "logps/chosen": -3.841905117034912, "logps/rejected": -3.948840618133545, "loss": 4.0308, "rewards/accuracies": 0.5, "rewards/chosen": -38.41904830932617, "rewards/margins": 1.0693540573120117, "rewards/rejected": -39.4884033203125, "step": 4006 }, { "epoch": 0.5456154684095861, "grad_norm": 38.33331856560295, "learning_rate": 4.061793130506326e-07, "logits/chosen": 12.952301025390625, "logits/rejected": 12.445816993713379, "logps/chosen": -4.018735885620117, "logps/rejected": -4.22040319442749, "loss": 4.214, "rewards/accuracies": 0.75, "rewards/chosen": -40.18736267089844, "rewards/margins": 2.016669273376465, "rewards/rejected": -42.20402908325195, "step": 4007 }, { "epoch": 0.545751633986928, "grad_norm": 37.07798913161513, "learning_rate": 4.05989194776874e-07, "logits/chosen": 12.625171661376953, "logits/rejected": 13.253589630126953, "logps/chosen": -4.1354079246521, "logps/rejected": -4.318382740020752, "loss": 3.5848, "rewards/accuracies": 0.75, "rewards/chosen": -41.35408020019531, "rewards/margins": 1.8297452926635742, "rewards/rejected": -43.1838264465332, "step": 4008 }, { "epoch": 0.5458877995642701, "grad_norm": 35.314375203736915, "learning_rate": 4.0579907514980744e-07, "logits/chosen": 11.869270324707031, "logits/rejected": 14.234869003295898, "logps/chosen": -3.6333138942718506, "logps/rejected": -4.424622535705566, "loss": 4.059, "rewards/accuracies": 1.0, "rewards/chosen": -36.33313751220703, "rewards/margins": 7.913087368011475, "rewards/rejected": -44.24622344970703, "step": 4009 }, { "epoch": 0.5460239651416122, "grad_norm": 39.064034928305205, "learning_rate": 4.056089542123917e-07, "logits/chosen": 12.715716361999512, "logits/rejected": 13.593270301818848, "logps/chosen": -3.8475515842437744, "logps/rejected": -4.364304542541504, "loss": 3.7412, "rewards/accuracies": 1.0, "rewards/chosen": -38.47551727294922, "rewards/margins": 5.167531967163086, "rewards/rejected": -43.64304733276367, "step": 4010 }, { "epoch": 0.5461601307189542, "grad_norm": 38.550043340207495, "learning_rate": 4.054188320075866e-07, "logits/chosen": 13.061628341674805, "logits/rejected": 13.40831184387207, "logps/chosen": -4.3103556632995605, "logps/rejected": -4.128303527832031, "loss": 3.9339, "rewards/accuracies": 0.75, "rewards/chosen": -43.10355758666992, "rewards/margins": -1.8205175399780273, "rewards/rejected": -41.28303527832031, "step": 4011 }, { "epoch": 0.5462962962962963, "grad_norm": 38.614129612898125, "learning_rate": 4.052287085783515e-07, "logits/chosen": 12.718242645263672, "logits/rejected": 13.546539306640625, "logps/chosen": -3.7699501514434814, "logps/rejected": -4.198958396911621, "loss": 3.1884, "rewards/accuracies": 0.75, "rewards/chosen": -37.699501037597656, "rewards/margins": 4.290078163146973, "rewards/rejected": -41.98958206176758, "step": 4012 }, { "epoch": 0.5464324618736384, "grad_norm": 42.00399910792403, "learning_rate": 4.0503858396764655e-07, "logits/chosen": 13.485599517822266, "logits/rejected": 14.688230514526367, "logps/chosen": -4.076732635498047, "logps/rejected": -4.553833961486816, "loss": 3.8676, "rewards/accuracies": 1.0, "rewards/chosen": -40.767330169677734, "rewards/margins": 4.7710065841674805, "rewards/rejected": -45.53833770751953, "step": 4013 }, { "epoch": 0.5465686274509803, "grad_norm": 38.32040284946351, "learning_rate": 4.0484845821843184e-07, "logits/chosen": 14.616813659667969, "logits/rejected": 14.551719665527344, "logps/chosen": -4.600590705871582, "logps/rejected": -4.490829944610596, "loss": 3.9234, "rewards/accuracies": 0.25, "rewards/chosen": -46.00590515136719, "rewards/margins": -1.0976085662841797, "rewards/rejected": -44.90829849243164, "step": 4014 }, { "epoch": 0.5467047930283224, "grad_norm": 33.63513431209466, "learning_rate": 4.046583313736679e-07, "logits/chosen": 12.708671569824219, "logits/rejected": 12.897207260131836, "logps/chosen": -3.6469688415527344, "logps/rejected": -3.9102084636688232, "loss": 3.0791, "rewards/accuracies": 0.75, "rewards/chosen": -36.469688415527344, "rewards/margins": 2.632396697998047, "rewards/rejected": -39.10208511352539, "step": 4015 }, { "epoch": 0.5468409586056645, "grad_norm": 38.920190291646776, "learning_rate": 4.0446820347631555e-07, "logits/chosen": 13.423089027404785, "logits/rejected": 15.125178337097168, "logps/chosen": -4.073419094085693, "logps/rejected": -4.635340213775635, "loss": 4.0595, "rewards/accuracies": 0.75, "rewards/chosen": -40.73419189453125, "rewards/margins": 5.6192121505737305, "rewards/rejected": -46.35340118408203, "step": 4016 }, { "epoch": 0.5469771241830066, "grad_norm": 40.723284314930794, "learning_rate": 4.0427807456933565e-07, "logits/chosen": 13.012590408325195, "logits/rejected": 12.970453262329102, "logps/chosen": -3.9347312450408936, "logps/rejected": -4.279823303222656, "loss": 4.3983, "rewards/accuracies": 0.75, "rewards/chosen": -39.347312927246094, "rewards/margins": 3.4509172439575195, "rewards/rejected": -42.7982292175293, "step": 4017 }, { "epoch": 0.5471132897603486, "grad_norm": 39.09250896555479, "learning_rate": 4.0408794469568946e-07, "logits/chosen": 12.255451202392578, "logits/rejected": 13.191821098327637, "logps/chosen": -3.9452872276306152, "logps/rejected": -4.121847629547119, "loss": 3.9323, "rewards/accuracies": 0.75, "rewards/chosen": -39.45287322998047, "rewards/margins": 1.7656011581420898, "rewards/rejected": -41.218475341796875, "step": 4018 }, { "epoch": 0.5472494553376906, "grad_norm": 38.80882613298998, "learning_rate": 4.038978138983383e-07, "logits/chosen": 13.107880592346191, "logits/rejected": 13.43516731262207, "logps/chosen": -3.9941868782043457, "logps/rejected": -4.054452419281006, "loss": 4.1434, "rewards/accuracies": 0.75, "rewards/chosen": -39.941871643066406, "rewards/margins": 0.6026544570922852, "rewards/rejected": -40.544525146484375, "step": 4019 }, { "epoch": 0.5473856209150327, "grad_norm": 40.20362553876469, "learning_rate": 4.0370768222024393e-07, "logits/chosen": 13.734668731689453, "logits/rejected": 13.998716354370117, "logps/chosen": -4.279547214508057, "logps/rejected": -4.246699333190918, "loss": 4.3115, "rewards/accuracies": 0.5, "rewards/chosen": -42.79547119140625, "rewards/margins": -0.3284759521484375, "rewards/rejected": -42.46699523925781, "step": 4020 }, { "epoch": 0.5475217864923747, "grad_norm": 38.157124815470375, "learning_rate": 4.0351754970436815e-07, "logits/chosen": 12.316925048828125, "logits/rejected": 13.589916229248047, "logps/chosen": -3.792701244354248, "logps/rejected": -4.397955894470215, "loss": 4.1171, "rewards/accuracies": 1.0, "rewards/chosen": -37.9270133972168, "rewards/margins": 6.052547454833984, "rewards/rejected": -43.97956085205078, "step": 4021 }, { "epoch": 0.5476579520697168, "grad_norm": 36.42152876533411, "learning_rate": 4.0332741639367285e-07, "logits/chosen": 13.148564338684082, "logits/rejected": 13.020196914672852, "logps/chosen": -3.958383083343506, "logps/rejected": -4.150166034698486, "loss": 3.6179, "rewards/accuracies": 0.75, "rewards/chosen": -39.583831787109375, "rewards/margins": 1.9178276062011719, "rewards/rejected": -41.50165557861328, "step": 4022 }, { "epoch": 0.5477941176470589, "grad_norm": 49.85498876760195, "learning_rate": 4.031372823311204e-07, "logits/chosen": 13.162757873535156, "logits/rejected": 12.877030372619629, "logps/chosen": -4.0110321044921875, "logps/rejected": -4.011477947235107, "loss": 4.1462, "rewards/accuracies": 0.5, "rewards/chosen": -40.110321044921875, "rewards/margins": 0.0044574737548828125, "rewards/rejected": -40.11478042602539, "step": 4023 }, { "epoch": 0.5479302832244008, "grad_norm": 53.62414407667179, "learning_rate": 4.0294714755967307e-07, "logits/chosen": 13.484212875366211, "logits/rejected": 13.524323463439941, "logps/chosen": -4.187908172607422, "logps/rejected": -4.341492652893066, "loss": 3.8695, "rewards/accuracies": 0.5, "rewards/chosen": -41.87908172607422, "rewards/margins": 1.535848617553711, "rewards/rejected": -43.41492462158203, "step": 4024 }, { "epoch": 0.5480664488017429, "grad_norm": 39.25064738509078, "learning_rate": 4.0275701212229335e-07, "logits/chosen": 13.988645553588867, "logits/rejected": 12.979354858398438, "logps/chosen": -4.376850128173828, "logps/rejected": -4.30553674697876, "loss": 4.0031, "rewards/accuracies": 0.25, "rewards/chosen": -43.76850128173828, "rewards/margins": -0.7131290435791016, "rewards/rejected": -43.05537033081055, "step": 4025 }, { "epoch": 0.548202614379085, "grad_norm": 37.71706663173566, "learning_rate": 4.02566876061944e-07, "logits/chosen": 13.08256721496582, "logits/rejected": 13.61188793182373, "logps/chosen": -4.240723609924316, "logps/rejected": -4.314042091369629, "loss": 3.564, "rewards/accuracies": 0.5, "rewards/chosen": -42.40724182128906, "rewards/margins": 0.733180046081543, "rewards/rejected": -43.140419006347656, "step": 4026 }, { "epoch": 0.548338779956427, "grad_norm": 40.39362347547148, "learning_rate": 4.023767394215878e-07, "logits/chosen": 13.07158088684082, "logits/rejected": 12.993026733398438, "logps/chosen": -4.268904685974121, "logps/rejected": -4.1134419441223145, "loss": 4.2969, "rewards/accuracies": 0.0, "rewards/chosen": -42.689048767089844, "rewards/margins": -1.5546283721923828, "rewards/rejected": -41.134422302246094, "step": 4027 }, { "epoch": 0.5484749455337691, "grad_norm": 39.99057256164434, "learning_rate": 4.021866022441875e-07, "logits/chosen": 13.60175895690918, "logits/rejected": 12.993361473083496, "logps/chosen": -4.233473777770996, "logps/rejected": -4.053000450134277, "loss": 3.7999, "rewards/accuracies": 0.25, "rewards/chosen": -42.33473587036133, "rewards/margins": -1.8047304153442383, "rewards/rejected": -40.530006408691406, "step": 4028 }, { "epoch": 0.5486111111111112, "grad_norm": 39.84160253395257, "learning_rate": 4.019964645727065e-07, "logits/chosen": 12.509651184082031, "logits/rejected": 12.75214958190918, "logps/chosen": -4.074534893035889, "logps/rejected": -4.016571998596191, "loss": 3.7518, "rewards/accuracies": 0.5, "rewards/chosen": -40.74534606933594, "rewards/margins": -0.5796232223510742, "rewards/rejected": -40.16572570800781, "step": 4029 }, { "epoch": 0.5487472766884531, "grad_norm": 39.66978940860724, "learning_rate": 4.0180632645010784e-07, "logits/chosen": 13.356664657592773, "logits/rejected": 13.970527648925781, "logps/chosen": -4.340627193450928, "logps/rejected": -4.675766944885254, "loss": 3.9815, "rewards/accuracies": 1.0, "rewards/chosen": -43.406272888183594, "rewards/margins": 3.351400375366211, "rewards/rejected": -46.75767135620117, "step": 4030 }, { "epoch": 0.5488834422657952, "grad_norm": 36.461858518750006, "learning_rate": 4.0161618791935474e-07, "logits/chosen": 12.539082527160645, "logits/rejected": 12.745741844177246, "logps/chosen": -4.047501087188721, "logps/rejected": -4.097991943359375, "loss": 4.2142, "rewards/accuracies": 0.75, "rewards/chosen": -40.47500991821289, "rewards/margins": 0.5049076080322266, "rewards/rejected": -40.97991943359375, "step": 4031 }, { "epoch": 0.5490196078431373, "grad_norm": 34.40066210465939, "learning_rate": 4.0142604902341064e-07, "logits/chosen": 12.887017250061035, "logits/rejected": 12.928442001342773, "logps/chosen": -3.8488235473632812, "logps/rejected": -4.10140323638916, "loss": 3.6767, "rewards/accuracies": 0.75, "rewards/chosen": -38.48823928833008, "rewards/margins": 2.5257997512817383, "rewards/rejected": -41.0140380859375, "step": 4032 }, { "epoch": 0.5491557734204793, "grad_norm": 37.21275211360328, "learning_rate": 4.01235909805239e-07, "logits/chosen": 12.198165893554688, "logits/rejected": 12.968786239624023, "logps/chosen": -3.7479088306427, "logps/rejected": -3.9113430976867676, "loss": 3.7846, "rewards/accuracies": 0.75, "rewards/chosen": -37.479087829589844, "rewards/margins": 1.6343441009521484, "rewards/rejected": -39.113433837890625, "step": 4033 }, { "epoch": 0.5492919389978214, "grad_norm": 38.987297230428894, "learning_rate": 4.0104577030780316e-07, "logits/chosen": 12.42817211151123, "logits/rejected": 13.000333786010742, "logps/chosen": -4.010406970977783, "logps/rejected": -4.254244804382324, "loss": 4.1018, "rewards/accuracies": 1.0, "rewards/chosen": -40.104068756103516, "rewards/margins": 2.4383773803710938, "rewards/rejected": -42.542449951171875, "step": 4034 }, { "epoch": 0.5494281045751634, "grad_norm": 41.206100093998934, "learning_rate": 4.0085563057406714e-07, "logits/chosen": 12.130332946777344, "logits/rejected": 13.975835800170898, "logps/chosen": -4.018001079559326, "logps/rejected": -4.533860206604004, "loss": 3.3817, "rewards/accuracies": 0.75, "rewards/chosen": -40.18001174926758, "rewards/margins": 5.158592224121094, "rewards/rejected": -45.33860397338867, "step": 4035 }, { "epoch": 0.5495642701525054, "grad_norm": 39.58786198388283, "learning_rate": 4.0066549064699415e-07, "logits/chosen": 13.754928588867188, "logits/rejected": 13.402446746826172, "logps/chosen": -3.8470845222473145, "logps/rejected": -3.738032579421997, "loss": 4.1817, "rewards/accuracies": 0.5, "rewards/chosen": -38.47084426879883, "rewards/margins": -1.0905156135559082, "rewards/rejected": -37.38032913208008, "step": 4036 }, { "epoch": 0.5497004357298475, "grad_norm": 35.9078123263217, "learning_rate": 4.00475350569548e-07, "logits/chosen": 12.065162658691406, "logits/rejected": 12.680179595947266, "logps/chosen": -3.98614501953125, "logps/rejected": -4.185490608215332, "loss": 4.3975, "rewards/accuracies": 0.75, "rewards/chosen": -39.8614501953125, "rewards/margins": 1.9934539794921875, "rewards/rejected": -41.85490417480469, "step": 4037 }, { "epoch": 0.5498366013071896, "grad_norm": 34.61876811666007, "learning_rate": 4.0028521038469265e-07, "logits/chosen": 12.899477005004883, "logits/rejected": 12.610111236572266, "logps/chosen": -4.2613372802734375, "logps/rejected": -4.000894546508789, "loss": 3.7657, "rewards/accuracies": 0.5, "rewards/chosen": -42.61336898803711, "rewards/margins": -2.6044273376464844, "rewards/rejected": -40.008941650390625, "step": 4038 }, { "epoch": 0.5499727668845316, "grad_norm": 41.65496884937941, "learning_rate": 4.0009507013539155e-07, "logits/chosen": 13.130285263061523, "logits/rejected": 12.73390007019043, "logps/chosen": -4.035019874572754, "logps/rejected": -3.9507908821105957, "loss": 3.7676, "rewards/accuracies": 0.25, "rewards/chosen": -40.350196838378906, "rewards/margins": -0.8422937393188477, "rewards/rejected": -39.507904052734375, "step": 4039 }, { "epoch": 0.5501089324618736, "grad_norm": 40.7523990879945, "learning_rate": 3.9990492986460847e-07, "logits/chosen": 12.789031982421875, "logits/rejected": 13.536725044250488, "logps/chosen": -4.156532287597656, "logps/rejected": -4.478849411010742, "loss": 4.6094, "rewards/accuracies": 1.0, "rewards/chosen": -41.56532287597656, "rewards/margins": 3.223165512084961, "rewards/rejected": -44.788490295410156, "step": 4040 }, { "epoch": 0.5502450980392157, "grad_norm": 42.729783473384124, "learning_rate": 3.997147896153073e-07, "logits/chosen": 12.912553787231445, "logits/rejected": 14.004674911499023, "logps/chosen": -3.8815224170684814, "logps/rejected": -4.418611526489258, "loss": 4.0088, "rewards/accuracies": 0.75, "rewards/chosen": -38.815223693847656, "rewards/margins": 5.370888710021973, "rewards/rejected": -44.18611526489258, "step": 4041 }, { "epoch": 0.5503812636165577, "grad_norm": 37.067710765551396, "learning_rate": 3.99524649430452e-07, "logits/chosen": 12.801197052001953, "logits/rejected": 12.931827545166016, "logps/chosen": -3.942431926727295, "logps/rejected": -4.184288501739502, "loss": 3.614, "rewards/accuracies": 0.75, "rewards/chosen": -39.42431640625, "rewards/margins": 2.4185657501220703, "rewards/rejected": -41.84288787841797, "step": 4042 }, { "epoch": 0.5505174291938998, "grad_norm": 37.111897524094196, "learning_rate": 3.993345093530058e-07, "logits/chosen": 13.611306190490723, "logits/rejected": 13.041653633117676, "logps/chosen": -4.065317153930664, "logps/rejected": -4.353052139282227, "loss": 3.6318, "rewards/accuracies": 1.0, "rewards/chosen": -40.653167724609375, "rewards/margins": 2.877349853515625, "rewards/rejected": -43.530517578125, "step": 4043 }, { "epoch": 0.5506535947712419, "grad_norm": 34.8344018858426, "learning_rate": 3.991443694259328e-07, "logits/chosen": 13.516124725341797, "logits/rejected": 13.53720474243164, "logps/chosen": -4.261446952819824, "logps/rejected": -4.270631790161133, "loss": 3.6172, "rewards/accuracies": 0.25, "rewards/chosen": -42.614471435546875, "rewards/margins": 0.09184646606445312, "rewards/rejected": -42.70631408691406, "step": 4044 }, { "epoch": 0.5507897603485838, "grad_norm": 36.61284036832653, "learning_rate": 3.989542296921968e-07, "logits/chosen": 13.018214225769043, "logits/rejected": 13.8175630569458, "logps/chosen": -4.119622707366943, "logps/rejected": -4.532376766204834, "loss": 3.4845, "rewards/accuracies": 1.0, "rewards/chosen": -41.19622802734375, "rewards/margins": 4.1275434494018555, "rewards/rejected": -45.323768615722656, "step": 4045 }, { "epoch": 0.5509259259259259, "grad_norm": 44.745564378577264, "learning_rate": 3.9876409019476106e-07, "logits/chosen": 13.30850601196289, "logits/rejected": 14.163471221923828, "logps/chosen": -4.574514389038086, "logps/rejected": -4.335349082946777, "loss": 4.378, "rewards/accuracies": 0.25, "rewards/chosen": -45.745147705078125, "rewards/margins": -2.391653060913086, "rewards/rejected": -43.353492736816406, "step": 4046 }, { "epoch": 0.551062091503268, "grad_norm": 41.63990383264769, "learning_rate": 3.985739509765893e-07, "logits/chosen": 13.000638961791992, "logits/rejected": 13.223258972167969, "logps/chosen": -3.999817132949829, "logps/rejected": -3.940824031829834, "loss": 4.1394, "rewards/accuracies": 0.25, "rewards/chosen": -39.9981689453125, "rewards/margins": -0.589930534362793, "rewards/rejected": -39.408241271972656, "step": 4047 }, { "epoch": 0.55119825708061, "grad_norm": 40.45675116834933, "learning_rate": 3.9838381208064533e-07, "logits/chosen": 13.303169250488281, "logits/rejected": 13.45870590209961, "logps/chosen": -4.068001747131348, "logps/rejected": -4.376870155334473, "loss": 3.6627, "rewards/accuracies": 0.75, "rewards/chosen": -40.68001937866211, "rewards/margins": 3.0886831283569336, "rewards/rejected": -43.768707275390625, "step": 4048 }, { "epoch": 0.5513344226579521, "grad_norm": 41.86732386289002, "learning_rate": 3.981936735498922e-07, "logits/chosen": 12.907155990600586, "logits/rejected": 13.268596649169922, "logps/chosen": -4.078794479370117, "logps/rejected": -4.158078193664551, "loss": 3.8507, "rewards/accuracies": 0.75, "rewards/chosen": -40.78794860839844, "rewards/margins": 0.7928342819213867, "rewards/rejected": -41.580780029296875, "step": 4049 }, { "epoch": 0.5514705882352942, "grad_norm": 41.15791226253708, "learning_rate": 3.980035354272934e-07, "logits/chosen": 13.475969314575195, "logits/rejected": 14.046927452087402, "logps/chosen": -3.944108486175537, "logps/rejected": -4.425227165222168, "loss": 4.2988, "rewards/accuracies": 0.75, "rewards/chosen": -39.44108581542969, "rewards/margins": 4.811182975769043, "rewards/rejected": -44.25226974487305, "step": 4050 }, { "epoch": 0.5516067538126361, "grad_norm": 41.49573356868901, "learning_rate": 3.978133977558125e-07, "logits/chosen": 13.973976135253906, "logits/rejected": 13.683841705322266, "logps/chosen": -4.802279949188232, "logps/rejected": -4.761287689208984, "loss": 3.9926, "rewards/accuracies": 0.75, "rewards/chosen": -48.02280044555664, "rewards/margins": -0.4099254608154297, "rewards/rejected": -47.61287307739258, "step": 4051 }, { "epoch": 0.5517429193899782, "grad_norm": 38.387322662319896, "learning_rate": 3.976232605784123e-07, "logits/chosen": 13.01781940460205, "logits/rejected": 13.652002334594727, "logps/chosen": -4.295495986938477, "logps/rejected": -4.413779258728027, "loss": 3.4493, "rewards/accuracies": 0.75, "rewards/chosen": -42.9549560546875, "rewards/margins": 1.1828336715698242, "rewards/rejected": -44.13779067993164, "step": 4052 }, { "epoch": 0.5518790849673203, "grad_norm": 39.29786046438955, "learning_rate": 3.9743312393805593e-07, "logits/chosen": 13.635698318481445, "logits/rejected": 12.921466827392578, "logps/chosen": -3.8794150352478027, "logps/rejected": -4.276797294616699, "loss": 4.1606, "rewards/accuracies": 1.0, "rewards/chosen": -38.794151306152344, "rewards/margins": 3.973825454711914, "rewards/rejected": -42.767974853515625, "step": 4053 }, { "epoch": 0.5520152505446623, "grad_norm": 42.53569602042426, "learning_rate": 3.9724298787770667e-07, "logits/chosen": 13.504629135131836, "logits/rejected": 13.230798721313477, "logps/chosen": -4.513311386108398, "logps/rejected": -4.181559085845947, "loss": 3.8672, "rewards/accuracies": 0.25, "rewards/chosen": -45.133113861083984, "rewards/margins": -3.3175220489501953, "rewards/rejected": -41.81559371948242, "step": 4054 }, { "epoch": 0.5521514161220044, "grad_norm": 43.819887194971415, "learning_rate": 3.9705285244032695e-07, "logits/chosen": 12.635030746459961, "logits/rejected": 13.848763465881348, "logps/chosen": -3.8607983589172363, "logps/rejected": -4.577146053314209, "loss": 3.9148, "rewards/accuracies": 1.0, "rewards/chosen": -38.60798263549805, "rewards/margins": 7.163476943969727, "rewards/rejected": -45.771461486816406, "step": 4055 }, { "epoch": 0.5522875816993464, "grad_norm": 44.55731160935502, "learning_rate": 3.968627176688795e-07, "logits/chosen": 12.189990043640137, "logits/rejected": 12.922893524169922, "logps/chosen": -4.38102912902832, "logps/rejected": -4.627628326416016, "loss": 3.8925, "rewards/accuracies": 0.75, "rewards/chosen": -43.8102912902832, "rewards/margins": 2.4659929275512695, "rewards/rejected": -46.276283264160156, "step": 4056 }, { "epoch": 0.5524237472766884, "grad_norm": 41.29817932844146, "learning_rate": 3.9667258360632716e-07, "logits/chosen": 12.957901000976562, "logits/rejected": 13.846639633178711, "logps/chosen": -4.012360095977783, "logps/rejected": -4.456742286682129, "loss": 4.0011, "rewards/accuracies": 0.75, "rewards/chosen": -40.12360382080078, "rewards/margins": 4.443821907043457, "rewards/rejected": -44.56742477416992, "step": 4057 }, { "epoch": 0.5525599128540305, "grad_norm": 38.81429899985027, "learning_rate": 3.964824502956318e-07, "logits/chosen": 13.237359046936035, "logits/rejected": 13.140482902526855, "logps/chosen": -4.343313217163086, "logps/rejected": -4.388047218322754, "loss": 3.4175, "rewards/accuracies": 0.5, "rewards/chosen": -43.43313217163086, "rewards/margins": 0.4473390579223633, "rewards/rejected": -43.880470275878906, "step": 4058 }, { "epoch": 0.5526960784313726, "grad_norm": 46.414106357113475, "learning_rate": 3.96292317779756e-07, "logits/chosen": 13.043598175048828, "logits/rejected": 13.561835289001465, "logps/chosen": -4.498007774353027, "logps/rejected": -4.313660621643066, "loss": 3.7274, "rewards/accuracies": 0.5, "rewards/chosen": -44.980079650878906, "rewards/margins": -1.8434743881225586, "rewards/rejected": -43.13660430908203, "step": 4059 }, { "epoch": 0.5528322440087146, "grad_norm": 39.443843447918134, "learning_rate": 3.961021861016617e-07, "logits/chosen": 13.0819091796875, "logits/rejected": 12.719592094421387, "logps/chosen": -3.9098143577575684, "logps/rejected": -3.860116481781006, "loss": 3.9344, "rewards/accuracies": 0.5, "rewards/chosen": -39.09814453125, "rewards/margins": -0.496978759765625, "rewards/rejected": -38.601165771484375, "step": 4060 }, { "epoch": 0.5529684095860566, "grad_norm": 41.71293390216463, "learning_rate": 3.9591205530431056e-07, "logits/chosen": 12.883113861083984, "logits/rejected": 13.369662284851074, "logps/chosen": -4.10029411315918, "logps/rejected": -4.467260360717773, "loss": 4.4467, "rewards/accuracies": 0.75, "rewards/chosen": -41.00294494628906, "rewards/margins": 3.6696596145629883, "rewards/rejected": -44.672603607177734, "step": 4061 }, { "epoch": 0.5531045751633987, "grad_norm": 38.133427222816835, "learning_rate": 3.957219254306643e-07, "logits/chosen": 13.078118324279785, "logits/rejected": 13.629997253417969, "logps/chosen": -4.011451721191406, "logps/rejected": -4.2504658699035645, "loss": 4.19, "rewards/accuracies": 0.75, "rewards/chosen": -40.11451721191406, "rewards/margins": 2.390143394470215, "rewards/rejected": -42.50465774536133, "step": 4062 }, { "epoch": 0.5532407407407407, "grad_norm": 39.672342058037266, "learning_rate": 3.9553179652368447e-07, "logits/chosen": 13.391704559326172, "logits/rejected": 13.941075325012207, "logps/chosen": -3.927602767944336, "logps/rejected": -4.450233459472656, "loss": 3.9857, "rewards/accuracies": 1.0, "rewards/chosen": -39.276031494140625, "rewards/margins": 5.226306915283203, "rewards/rejected": -44.50233459472656, "step": 4063 }, { "epoch": 0.5533769063180828, "grad_norm": 40.63730086768911, "learning_rate": 3.953416686263321e-07, "logits/chosen": 11.998178482055664, "logits/rejected": 13.094493865966797, "logps/chosen": -4.027345657348633, "logps/rejected": -4.173765182495117, "loss": 3.7717, "rewards/accuracies": 0.25, "rewards/chosen": -40.273460388183594, "rewards/margins": 1.4641904830932617, "rewards/rejected": -41.737648010253906, "step": 4064 }, { "epoch": 0.5535130718954249, "grad_norm": 41.510120354720605, "learning_rate": 3.9515154178156817e-07, "logits/chosen": 13.398040771484375, "logits/rejected": 13.80842399597168, "logps/chosen": -4.128852844238281, "logps/rejected": -4.320042610168457, "loss": 3.7176, "rewards/accuracies": 0.5, "rewards/chosen": -41.28852844238281, "rewards/margins": 1.9118976593017578, "rewards/rejected": -43.2004280090332, "step": 4065 }, { "epoch": 0.5536492374727668, "grad_norm": 41.593542150063676, "learning_rate": 3.949614160323535e-07, "logits/chosen": 12.937372207641602, "logits/rejected": 13.96518611907959, "logps/chosen": -4.308514595031738, "logps/rejected": -4.570749282836914, "loss": 4.2843, "rewards/accuracies": 0.75, "rewards/chosen": -43.08514404296875, "rewards/margins": 2.622344970703125, "rewards/rejected": -45.707489013671875, "step": 4066 }, { "epoch": 0.5537854030501089, "grad_norm": 36.83583842725893, "learning_rate": 3.947712914216485e-07, "logits/chosen": 12.276811599731445, "logits/rejected": 13.004015922546387, "logps/chosen": -3.794929027557373, "logps/rejected": -4.095285415649414, "loss": 3.6523, "rewards/accuracies": 0.75, "rewards/chosen": -37.94928741455078, "rewards/margins": 3.0035629272460938, "rewards/rejected": -40.952850341796875, "step": 4067 }, { "epoch": 0.553921568627451, "grad_norm": 38.3586148295879, "learning_rate": 3.945811679924134e-07, "logits/chosen": 12.791716575622559, "logits/rejected": 13.061552047729492, "logps/chosen": -4.079894065856934, "logps/rejected": -4.28134822845459, "loss": 3.2663, "rewards/accuracies": 0.75, "rewards/chosen": -40.79894256591797, "rewards/margins": 2.014537811279297, "rewards/rejected": -42.813480377197266, "step": 4068 }, { "epoch": 0.554057734204793, "grad_norm": 42.70421794233687, "learning_rate": 3.9439104578760824e-07, "logits/chosen": 13.20377254486084, "logits/rejected": 13.344499588012695, "logps/chosen": -3.9611830711364746, "logps/rejected": -4.196141242980957, "loss": 4.1007, "rewards/accuracies": 0.75, "rewards/chosen": -39.61183166503906, "rewards/margins": 2.3495798110961914, "rewards/rejected": -41.96141052246094, "step": 4069 }, { "epoch": 0.5541938997821351, "grad_norm": 62.57050029265619, "learning_rate": 3.9420092485019263e-07, "logits/chosen": 13.828857421875, "logits/rejected": 14.258285522460938, "logps/chosen": -4.053315162658691, "logps/rejected": -4.345518112182617, "loss": 3.6996, "rewards/accuracies": 0.5, "rewards/chosen": -40.53314971923828, "rewards/margins": 2.922030448913574, "rewards/rejected": -43.45518493652344, "step": 4070 }, { "epoch": 0.5543300653594772, "grad_norm": 43.445084715878636, "learning_rate": 3.94010805223126e-07, "logits/chosen": 13.019073486328125, "logits/rejected": 12.756959915161133, "logps/chosen": -4.160458087921143, "logps/rejected": -4.243243217468262, "loss": 4.0029, "rewards/accuracies": 0.5, "rewards/chosen": -41.60457992553711, "rewards/margins": 0.827855110168457, "rewards/rejected": -42.43243408203125, "step": 4071 }, { "epoch": 0.5544662309368191, "grad_norm": 42.82047240424702, "learning_rate": 3.938206869493674e-07, "logits/chosen": 12.846654891967773, "logits/rejected": 13.187251091003418, "logps/chosen": -4.265739917755127, "logps/rejected": -4.362174034118652, "loss": 4.1186, "rewards/accuracies": 0.75, "rewards/chosen": -42.65739822387695, "rewards/margins": 0.9643421173095703, "rewards/rejected": -43.62173843383789, "step": 4072 }, { "epoch": 0.5546023965141612, "grad_norm": 39.406981302278794, "learning_rate": 3.9363057007187563e-07, "logits/chosen": 13.552875518798828, "logits/rejected": 13.158906936645508, "logps/chosen": -4.1991119384765625, "logps/rejected": -4.340928554534912, "loss": 3.7835, "rewards/accuracies": 0.75, "rewards/chosen": -41.991119384765625, "rewards/margins": 1.4181671142578125, "rewards/rejected": -43.40928649902344, "step": 4073 }, { "epoch": 0.5547385620915033, "grad_norm": 39.0393352928664, "learning_rate": 3.934404546336093e-07, "logits/chosen": 12.742607116699219, "logits/rejected": 13.545608520507812, "logps/chosen": -4.104629039764404, "logps/rejected": -4.460268974304199, "loss": 4.0676, "rewards/accuracies": 0.75, "rewards/chosen": -41.046287536621094, "rewards/margins": 3.556403160095215, "rewards/rejected": -44.602691650390625, "step": 4074 }, { "epoch": 0.5548747276688453, "grad_norm": 43.44033513187625, "learning_rate": 3.9325034067752643e-07, "logits/chosen": 11.94410514831543, "logits/rejected": 12.714092254638672, "logps/chosen": -3.777657985687256, "logps/rejected": -4.18187141418457, "loss": 3.6228, "rewards/accuracies": 0.75, "rewards/chosen": -37.776580810546875, "rewards/margins": 4.042133331298828, "rewards/rejected": -41.81871032714844, "step": 4075 }, { "epoch": 0.5550108932461874, "grad_norm": 40.63334672335921, "learning_rate": 3.930602282465848e-07, "logits/chosen": 12.931741714477539, "logits/rejected": 13.185785293579102, "logps/chosen": -3.9906582832336426, "logps/rejected": -3.8858225345611572, "loss": 4.4029, "rewards/accuracies": 0.75, "rewards/chosen": -39.90658187866211, "rewards/margins": -1.048356056213379, "rewards/rejected": -38.85822296142578, "step": 4076 }, { "epoch": 0.5551470588235294, "grad_norm": 37.30065380778341, "learning_rate": 3.9287011738374203e-07, "logits/chosen": 12.350751876831055, "logits/rejected": 14.38973617553711, "logps/chosen": -3.8378820419311523, "logps/rejected": -4.367668151855469, "loss": 4.2687, "rewards/accuracies": 1.0, "rewards/chosen": -38.378822326660156, "rewards/margins": 5.297860145568848, "rewards/rejected": -43.67667770385742, "step": 4077 }, { "epoch": 0.5552832244008714, "grad_norm": 38.17021087832309, "learning_rate": 3.9268000813195526e-07, "logits/chosen": 12.978134155273438, "logits/rejected": 12.86972427368164, "logps/chosen": -4.216504096984863, "logps/rejected": -3.980400562286377, "loss": 3.6086, "rewards/accuracies": 0.5, "rewards/chosen": -42.165042877197266, "rewards/margins": -2.361037254333496, "rewards/rejected": -39.80400466918945, "step": 4078 }, { "epoch": 0.5554193899782135, "grad_norm": 41.57880280367449, "learning_rate": 3.92489900534181e-07, "logits/chosen": 13.14766788482666, "logits/rejected": 13.78899097442627, "logps/chosen": -4.159370422363281, "logps/rejected": -4.560691833496094, "loss": 4.3812, "rewards/accuracies": 1.0, "rewards/chosen": -41.59370803833008, "rewards/margins": 4.013214111328125, "rewards/rejected": -45.6069221496582, "step": 4079 }, { "epoch": 0.5555555555555556, "grad_norm": 36.68337215759666, "learning_rate": 3.9229979463337585e-07, "logits/chosen": 14.19198226928711, "logits/rejected": 14.346216201782227, "logps/chosen": -4.2104811668396, "logps/rejected": -4.449696063995361, "loss": 3.8722, "rewards/accuracies": 1.0, "rewards/chosen": -42.10481262207031, "rewards/margins": 2.392148971557617, "rewards/rejected": -44.49696350097656, "step": 4080 }, { "epoch": 0.5556917211328976, "grad_norm": 40.44493441834306, "learning_rate": 3.9210969047249586e-07, "logits/chosen": 13.594646453857422, "logits/rejected": 13.888999938964844, "logps/chosen": -4.370247840881348, "logps/rejected": -4.532802581787109, "loss": 4.0374, "rewards/accuracies": 0.75, "rewards/chosen": -43.70248031616211, "rewards/margins": 1.6255502700805664, "rewards/rejected": -45.32802963256836, "step": 4081 }, { "epoch": 0.5558278867102396, "grad_norm": 39.542003566463805, "learning_rate": 3.919195880944964e-07, "logits/chosen": 13.353423118591309, "logits/rejected": 13.803361892700195, "logps/chosen": -4.0160417556762695, "logps/rejected": -4.295479774475098, "loss": 3.8318, "rewards/accuracies": 0.5, "rewards/chosen": -40.16041946411133, "rewards/margins": 2.794375419616699, "rewards/rejected": -42.954795837402344, "step": 4082 }, { "epoch": 0.5559640522875817, "grad_norm": 37.507112839871446, "learning_rate": 3.9172948754233277e-07, "logits/chosen": 12.504161834716797, "logits/rejected": 12.57166862487793, "logps/chosen": -3.7571606636047363, "logps/rejected": -3.947350263595581, "loss": 3.7474, "rewards/accuracies": 0.75, "rewards/chosen": -37.57160949707031, "rewards/margins": 1.9018950462341309, "rewards/rejected": -39.47350311279297, "step": 4083 }, { "epoch": 0.5561002178649237, "grad_norm": 44.73970256790003, "learning_rate": 3.9153938885896005e-07, "logits/chosen": 12.982450485229492, "logits/rejected": 12.88125991821289, "logps/chosen": -4.010821342468262, "logps/rejected": -4.110513210296631, "loss": 4.1194, "rewards/accuracies": 0.75, "rewards/chosen": -40.10821533203125, "rewards/margins": 0.9969158172607422, "rewards/rejected": -41.105133056640625, "step": 4084 }, { "epoch": 0.5562363834422658, "grad_norm": 41.09459066387857, "learning_rate": 3.9134929208733205e-07, "logits/chosen": 13.58037281036377, "logits/rejected": 14.181436538696289, "logps/chosen": -4.015735149383545, "logps/rejected": -4.611635208129883, "loss": 4.2613, "rewards/accuracies": 0.75, "rewards/chosen": -40.157352447509766, "rewards/margins": 5.9589996337890625, "rewards/rejected": -46.11634826660156, "step": 4085 }, { "epoch": 0.5563725490196079, "grad_norm": 45.15887308067167, "learning_rate": 3.911591972704031e-07, "logits/chosen": 13.708333015441895, "logits/rejected": 13.639825820922852, "logps/chosen": -4.605100154876709, "logps/rejected": -4.574553489685059, "loss": 4.2395, "rewards/accuracies": 0.5, "rewards/chosen": -46.051002502441406, "rewards/margins": -0.3054685592651367, "rewards/rejected": -45.74553298950195, "step": 4086 }, { "epoch": 0.5565087145969498, "grad_norm": 36.527748867420485, "learning_rate": 3.909691044511268e-07, "logits/chosen": 13.242817878723145, "logits/rejected": 13.655600547790527, "logps/chosen": -4.345986843109131, "logps/rejected": -4.431692123413086, "loss": 3.6828, "rewards/accuracies": 0.5, "rewards/chosen": -43.459869384765625, "rewards/margins": 0.8570489883422852, "rewards/rejected": -44.316917419433594, "step": 4087 }, { "epoch": 0.5566448801742919, "grad_norm": 45.48999074893493, "learning_rate": 3.907790136724558e-07, "logits/chosen": 13.979080200195312, "logits/rejected": 14.066913604736328, "logps/chosen": -4.4583845138549805, "logps/rejected": -4.733768463134766, "loss": 4.1495, "rewards/accuracies": 0.75, "rewards/chosen": -44.58384704589844, "rewards/margins": 2.753835678100586, "rewards/rejected": -47.33768081665039, "step": 4088 }, { "epoch": 0.556781045751634, "grad_norm": 39.61163967113749, "learning_rate": 3.905889249773428e-07, "logits/chosen": 13.142436027526855, "logits/rejected": 13.384550094604492, "logps/chosen": -3.922733783721924, "logps/rejected": -4.425210952758789, "loss": 3.641, "rewards/accuracies": 1.0, "rewards/chosen": -39.227333068847656, "rewards/margins": 5.024776458740234, "rewards/rejected": -44.252113342285156, "step": 4089 }, { "epoch": 0.556917211328976, "grad_norm": 41.834662382635194, "learning_rate": 3.9039883840874027e-07, "logits/chosen": 12.813111305236816, "logits/rejected": 13.216874122619629, "logps/chosen": -3.9031736850738525, "logps/rejected": -3.806199550628662, "loss": 4.2936, "rewards/accuracies": 0.25, "rewards/chosen": -39.03173828125, "rewards/margins": -0.9697413444519043, "rewards/rejected": -38.06199645996094, "step": 4090 }, { "epoch": 0.5570533769063181, "grad_norm": 36.469934134065284, "learning_rate": 3.9020875400959935e-07, "logits/chosen": 13.822149276733398, "logits/rejected": 14.358957290649414, "logps/chosen": -4.24617338180542, "logps/rejected": -4.4805498123168945, "loss": 3.5636, "rewards/accuracies": 0.5, "rewards/chosen": -42.46173095703125, "rewards/margins": 2.3437623977661133, "rewards/rejected": -44.80549621582031, "step": 4091 }, { "epoch": 0.5571895424836601, "grad_norm": 38.54278040000104, "learning_rate": 3.900186718228714e-07, "logits/chosen": 12.393738746643066, "logits/rejected": 12.277843475341797, "logps/chosen": -3.9847190380096436, "logps/rejected": -4.051152229309082, "loss": 3.7364, "rewards/accuracies": 0.5, "rewards/chosen": -39.847190856933594, "rewards/margins": 0.6643333435058594, "rewards/rejected": -40.51152038574219, "step": 4092 }, { "epoch": 0.5573257080610022, "grad_norm": 42.03052234478178, "learning_rate": 3.898285918915072e-07, "logits/chosen": 13.419780731201172, "logits/rejected": 14.014771461486816, "logps/chosen": -4.380608558654785, "logps/rejected": -4.710857391357422, "loss": 3.3986, "rewards/accuracies": 1.0, "rewards/chosen": -43.80608367919922, "rewards/margins": 3.3024892807006836, "rewards/rejected": -47.10857391357422, "step": 4093 }, { "epoch": 0.5574618736383442, "grad_norm": 37.78778714387956, "learning_rate": 3.8963851425845664e-07, "logits/chosen": 14.246501922607422, "logits/rejected": 13.585653305053711, "logps/chosen": -4.547384262084961, "logps/rejected": -4.731720447540283, "loss": 3.7768, "rewards/accuracies": 0.5, "rewards/chosen": -45.47384262084961, "rewards/margins": 1.8433647155761719, "rewards/rejected": -47.31720733642578, "step": 4094 }, { "epoch": 0.5575980392156863, "grad_norm": 39.57414316281996, "learning_rate": 3.8944843896666934e-07, "logits/chosen": 13.516035079956055, "logits/rejected": 13.691275596618652, "logps/chosen": -4.021146774291992, "logps/rejected": -4.343389987945557, "loss": 3.5582, "rewards/accuracies": 0.75, "rewards/chosen": -40.21147155761719, "rewards/margins": 3.2224302291870117, "rewards/rejected": -43.43389892578125, "step": 4095 }, { "epoch": 0.5577342047930284, "grad_norm": 40.148509279088394, "learning_rate": 3.892583660590947e-07, "logits/chosen": 14.40793228149414, "logits/rejected": 14.600759506225586, "logps/chosen": -4.644700050354004, "logps/rejected": -4.915297508239746, "loss": 3.5308, "rewards/accuracies": 0.5, "rewards/chosen": -46.446998596191406, "rewards/margins": 2.7059707641601562, "rewards/rejected": -49.15297317504883, "step": 4096 }, { "epoch": 0.5578703703703703, "grad_norm": 41.72068287624928, "learning_rate": 3.8906829557868093e-07, "logits/chosen": 13.648038864135742, "logits/rejected": 14.685420989990234, "logps/chosen": -4.269303798675537, "logps/rejected": -4.562702178955078, "loss": 3.9908, "rewards/accuracies": 0.75, "rewards/chosen": -42.69303512573242, "rewards/margins": 2.933980941772461, "rewards/rejected": -45.62701416015625, "step": 4097 }, { "epoch": 0.5580065359477124, "grad_norm": 40.31065430188032, "learning_rate": 3.8887822756837605e-07, "logits/chosen": 12.925039291381836, "logits/rejected": 13.745623588562012, "logps/chosen": -4.038259506225586, "logps/rejected": -4.830073356628418, "loss": 4.1513, "rewards/accuracies": 0.75, "rewards/chosen": -40.382598876953125, "rewards/margins": 7.918139457702637, "rewards/rejected": -48.30073547363281, "step": 4098 }, { "epoch": 0.5581427015250545, "grad_norm": 42.01502147401705, "learning_rate": 3.8868816207112776e-07, "logits/chosen": 13.256461143493652, "logits/rejected": 13.422256469726562, "logps/chosen": -4.188589096069336, "logps/rejected": -4.479393005371094, "loss": 3.9607, "rewards/accuracies": 0.5, "rewards/chosen": -41.885894775390625, "rewards/margins": 2.908039093017578, "rewards/rejected": -44.79393005371094, "step": 4099 }, { "epoch": 0.5582788671023965, "grad_norm": 42.80572584808233, "learning_rate": 3.884980991298826e-07, "logits/chosen": 13.29434585571289, "logits/rejected": 13.539131164550781, "logps/chosen": -4.267816543579102, "logps/rejected": -4.381777286529541, "loss": 3.9545, "rewards/accuracies": 1.0, "rewards/chosen": -42.67816925048828, "rewards/margins": 1.1396045684814453, "rewards/rejected": -43.817771911621094, "step": 4100 }, { "epoch": 0.5584150326797386, "grad_norm": 42.27907640380687, "learning_rate": 3.88308038787587e-07, "logits/chosen": 13.998207092285156, "logits/rejected": 13.679277420043945, "logps/chosen": -4.394657135009766, "logps/rejected": -4.18137264251709, "loss": 4.1445, "rewards/accuracies": 0.25, "rewards/chosen": -43.94656753540039, "rewards/margins": -2.132845878601074, "rewards/rejected": -41.813720703125, "step": 4101 }, { "epoch": 0.5585511982570807, "grad_norm": 44.26724724269483, "learning_rate": 3.881179810871869e-07, "logits/chosen": 13.407306671142578, "logits/rejected": 13.973691940307617, "logps/chosen": -4.110629558563232, "logps/rejected": -4.440878868103027, "loss": 4.2778, "rewards/accuracies": 0.75, "rewards/chosen": -41.106292724609375, "rewards/margins": 3.3024959564208984, "rewards/rejected": -44.408790588378906, "step": 4102 }, { "epoch": 0.5586873638344226, "grad_norm": 44.26719569435172, "learning_rate": 3.87927926071627e-07, "logits/chosen": 14.053367614746094, "logits/rejected": 14.073343276977539, "logps/chosen": -4.868559837341309, "logps/rejected": -4.598749160766602, "loss": 4.3289, "rewards/accuracies": 0.0, "rewards/chosen": -48.68560028076172, "rewards/margins": -2.6981096267700195, "rewards/rejected": -45.987491607666016, "step": 4103 }, { "epoch": 0.5588235294117647, "grad_norm": 41.88373078534187, "learning_rate": 3.87737873783852e-07, "logits/chosen": 13.541162490844727, "logits/rejected": 13.229196548461914, "logps/chosen": -3.87046480178833, "logps/rejected": -4.133636474609375, "loss": 4.2573, "rewards/accuracies": 0.75, "rewards/chosen": -38.704647064208984, "rewards/margins": 2.631716728210449, "rewards/rejected": -41.33636474609375, "step": 4104 }, { "epoch": 0.5589596949891068, "grad_norm": 37.492427285913784, "learning_rate": 3.875478242668059e-07, "logits/chosen": 12.574277877807617, "logits/rejected": 13.932126998901367, "logps/chosen": -3.7842354774475098, "logps/rejected": -4.28203821182251, "loss": 3.763, "rewards/accuracies": 1.0, "rewards/chosen": -37.84235382080078, "rewards/margins": 4.978029251098633, "rewards/rejected": -42.82038116455078, "step": 4105 }, { "epoch": 0.5590958605664488, "grad_norm": 42.56028811957282, "learning_rate": 3.873577775634317e-07, "logits/chosen": 13.285747528076172, "logits/rejected": 14.61089038848877, "logps/chosen": -4.175262451171875, "logps/rejected": -4.479986190795898, "loss": 4.335, "rewards/accuracies": 0.75, "rewards/chosen": -41.75262451171875, "rewards/margins": 3.0472373962402344, "rewards/rejected": -44.799861907958984, "step": 4106 }, { "epoch": 0.5592320261437909, "grad_norm": 36.13154565101483, "learning_rate": 3.871677337166722e-07, "logits/chosen": 13.629921913146973, "logits/rejected": 13.369733810424805, "logps/chosen": -4.116804599761963, "logps/rejected": -4.427041530609131, "loss": 3.919, "rewards/accuracies": 0.75, "rewards/chosen": -41.16804504394531, "rewards/margins": 3.102372169494629, "rewards/rejected": -44.270416259765625, "step": 4107 }, { "epoch": 0.559368191721133, "grad_norm": 44.61483708186775, "learning_rate": 3.8697769276946943e-07, "logits/chosen": 13.896929740905762, "logits/rejected": 13.61019515991211, "logps/chosen": -4.4004130363464355, "logps/rejected": -4.381082534790039, "loss": 3.9701, "rewards/accuracies": 0.25, "rewards/chosen": -44.004127502441406, "rewards/margins": -0.19330596923828125, "rewards/rejected": -43.810821533203125, "step": 4108 }, { "epoch": 0.5595043572984749, "grad_norm": 40.496936468613875, "learning_rate": 3.867876547647645e-07, "logits/chosen": 12.850043296813965, "logits/rejected": 13.546477317810059, "logps/chosen": -3.855215549468994, "logps/rejected": -4.216407299041748, "loss": 3.9107, "rewards/accuracies": 0.75, "rewards/chosen": -38.552154541015625, "rewards/margins": 3.611917495727539, "rewards/rejected": -42.16407012939453, "step": 4109 }, { "epoch": 0.559640522875817, "grad_norm": 42.241970515248866, "learning_rate": 3.865976197454982e-07, "logits/chosen": 14.08442497253418, "logits/rejected": 14.781363487243652, "logps/chosen": -4.00945520401001, "logps/rejected": -4.582577705383301, "loss": 4.2542, "rewards/accuracies": 1.0, "rewards/chosen": -40.09455490112305, "rewards/margins": 5.731220245361328, "rewards/rejected": -45.825775146484375, "step": 4110 }, { "epoch": 0.5597766884531591, "grad_norm": 39.85161343217715, "learning_rate": 3.864075877546106e-07, "logits/chosen": 13.52051067352295, "logits/rejected": 13.910444259643555, "logps/chosen": -4.234813213348389, "logps/rejected": -4.640452861785889, "loss": 3.9723, "rewards/accuracies": 1.0, "rewards/chosen": -42.34812927246094, "rewards/margins": 4.056397438049316, "rewards/rejected": -46.40452575683594, "step": 4111 }, { "epoch": 0.5599128540305011, "grad_norm": 38.93347860301182, "learning_rate": 3.862175588350409e-07, "logits/chosen": 15.023839950561523, "logits/rejected": 14.904549598693848, "logps/chosen": -4.512299537658691, "logps/rejected": -4.737921714782715, "loss": 3.4584, "rewards/accuracies": 0.75, "rewards/chosen": -45.12299346923828, "rewards/margins": 2.2562217712402344, "rewards/rejected": -47.379215240478516, "step": 4112 }, { "epoch": 0.5600490196078431, "grad_norm": 38.346673403723486, "learning_rate": 3.860275330297276e-07, "logits/chosen": 13.214984893798828, "logits/rejected": 14.715191841125488, "logps/chosen": -4.137574195861816, "logps/rejected": -4.396928787231445, "loss": 3.7911, "rewards/accuracies": 0.75, "rewards/chosen": -41.37574005126953, "rewards/margins": 2.5935487747192383, "rewards/rejected": -43.96928787231445, "step": 4113 }, { "epoch": 0.5601851851851852, "grad_norm": 41.47505606529205, "learning_rate": 3.8583751038160876e-07, "logits/chosen": 13.559942245483398, "logits/rejected": 14.792583465576172, "logps/chosen": -4.142988681793213, "logps/rejected": -4.690154075622559, "loss": 3.8295, "rewards/accuracies": 1.0, "rewards/chosen": -41.42988586425781, "rewards/margins": 5.471653938293457, "rewards/rejected": -46.90153884887695, "step": 4114 }, { "epoch": 0.5603213507625272, "grad_norm": 36.75964090615985, "learning_rate": 3.8564749093362154e-07, "logits/chosen": 13.231620788574219, "logits/rejected": 13.243099212646484, "logps/chosen": -4.060612678527832, "logps/rejected": -4.229668617248535, "loss": 4.0011, "rewards/accuracies": 0.5, "rewards/chosen": -40.60612869262695, "rewards/margins": 1.6905584335327148, "rewards/rejected": -42.29668426513672, "step": 4115 }, { "epoch": 0.5604575163398693, "grad_norm": 38.251422446275036, "learning_rate": 3.854574747287023e-07, "logits/chosen": 14.46882438659668, "logits/rejected": 15.221412658691406, "logps/chosen": -4.094898223876953, "logps/rejected": -4.640722274780273, "loss": 3.7274, "rewards/accuracies": 1.0, "rewards/chosen": -40.94898223876953, "rewards/margins": 5.4582366943359375, "rewards/rejected": -46.40721893310547, "step": 4116 }, { "epoch": 0.5605936819172114, "grad_norm": 45.56692099103102, "learning_rate": 3.8526746180978696e-07, "logits/chosen": 12.89271354675293, "logits/rejected": 13.387399673461914, "logps/chosen": -4.079300403594971, "logps/rejected": -4.348571300506592, "loss": 4.33, "rewards/accuracies": 1.0, "rewards/chosen": -40.79300308227539, "rewards/margins": 2.69271183013916, "rewards/rejected": -43.485713958740234, "step": 4117 }, { "epoch": 0.5607298474945533, "grad_norm": 40.15351695323408, "learning_rate": 3.850774522198103e-07, "logits/chosen": 12.477513313293457, "logits/rejected": 13.508806228637695, "logps/chosen": -3.961559534072876, "logps/rejected": -4.254344940185547, "loss": 3.6457, "rewards/accuracies": 0.75, "rewards/chosen": -39.61559295654297, "rewards/margins": 2.9278526306152344, "rewards/rejected": -42.54344940185547, "step": 4118 }, { "epoch": 0.5608660130718954, "grad_norm": 37.500445189671034, "learning_rate": 3.8488744600170677e-07, "logits/chosen": 13.806794166564941, "logits/rejected": 13.613398551940918, "logps/chosen": -4.535580635070801, "logps/rejected": -4.51881217956543, "loss": 3.795, "rewards/accuracies": 0.5, "rewards/chosen": -45.355804443359375, "rewards/margins": -0.16768455505371094, "rewards/rejected": -45.18811798095703, "step": 4119 }, { "epoch": 0.5610021786492375, "grad_norm": 39.914192304890484, "learning_rate": 3.846974431984097e-07, "logits/chosen": 14.259824752807617, "logits/rejected": 13.753217697143555, "logps/chosen": -4.822134017944336, "logps/rejected": -4.815227031707764, "loss": 4.2833, "rewards/accuracies": 0.75, "rewards/chosen": -48.221336364746094, "rewards/margins": -0.06906700134277344, "rewards/rejected": -48.15226745605469, "step": 4120 }, { "epoch": 0.5611383442265795, "grad_norm": 43.654127748276636, "learning_rate": 3.8450744385285183e-07, "logits/chosen": 13.583911895751953, "logits/rejected": 13.557758331298828, "logps/chosen": -4.32080602645874, "logps/rejected": -4.241818428039551, "loss": 3.7104, "rewards/accuracies": 0.5, "rewards/chosen": -43.20805740356445, "rewards/margins": -0.7898769378662109, "rewards/rejected": -42.418182373046875, "step": 4121 }, { "epoch": 0.5612745098039216, "grad_norm": 40.66278857700495, "learning_rate": 3.8431744800796507e-07, "logits/chosen": 13.421567916870117, "logits/rejected": 13.822700500488281, "logps/chosen": -4.238955974578857, "logps/rejected": -4.499539852142334, "loss": 3.3834, "rewards/accuracies": 0.75, "rewards/chosen": -42.38956069946289, "rewards/margins": 2.6058349609375, "rewards/rejected": -44.99539566040039, "step": 4122 }, { "epoch": 0.5614106753812637, "grad_norm": 39.133412908753805, "learning_rate": 3.841274557066806e-07, "logits/chosen": 13.562248229980469, "logits/rejected": 13.93796443939209, "logps/chosen": -4.129301071166992, "logps/rejected": -4.224982261657715, "loss": 4.0942, "rewards/accuracies": 0.5, "rewards/chosen": -41.29301452636719, "rewards/margins": 0.9568099975585938, "rewards/rejected": -42.249820709228516, "step": 4123 }, { "epoch": 0.5615468409586056, "grad_norm": 41.76301268481594, "learning_rate": 3.8393746699192863e-07, "logits/chosen": 13.530077934265137, "logits/rejected": 13.912373542785645, "logps/chosen": -4.376669883728027, "logps/rejected": -4.188616752624512, "loss": 3.779, "rewards/accuracies": 0.25, "rewards/chosen": -43.766700744628906, "rewards/margins": -1.8805313110351562, "rewards/rejected": -41.886165618896484, "step": 4124 }, { "epoch": 0.5616830065359477, "grad_norm": 41.04534612533723, "learning_rate": 3.837474819066388e-07, "logits/chosen": 14.308280944824219, "logits/rejected": 15.018905639648438, "logps/chosen": -4.562823295593262, "logps/rejected": -4.697332382202148, "loss": 4.1054, "rewards/accuracies": 0.75, "rewards/chosen": -45.62823486328125, "rewards/margins": 1.3450851440429688, "rewards/rejected": -46.97332000732422, "step": 4125 }, { "epoch": 0.5618191721132898, "grad_norm": 42.507931043500655, "learning_rate": 3.835575004937399e-07, "logits/chosen": 13.382501602172852, "logits/rejected": 13.599202156066895, "logps/chosen": -4.228850841522217, "logps/rejected": -4.309725761413574, "loss": 3.4142, "rewards/accuracies": 0.5, "rewards/chosen": -42.288509368896484, "rewards/margins": 0.8087472915649414, "rewards/rejected": -43.09725570678711, "step": 4126 }, { "epoch": 0.5619553376906318, "grad_norm": 39.636285880278706, "learning_rate": 3.8336752279615955e-07, "logits/chosen": 13.452380180358887, "logits/rejected": 14.656362533569336, "logps/chosen": -4.1529860496521, "logps/rejected": -4.643579959869385, "loss": 4.2908, "rewards/accuracies": 0.75, "rewards/chosen": -41.52985763549805, "rewards/margins": 4.905941963195801, "rewards/rejected": -46.43579864501953, "step": 4127 }, { "epoch": 0.5620915032679739, "grad_norm": 43.12239073612119, "learning_rate": 3.831775488568249e-07, "logits/chosen": 13.572793006896973, "logits/rejected": 12.636124610900879, "logps/chosen": -4.5360517501831055, "logps/rejected": -4.133395671844482, "loss": 4.3597, "rewards/accuracies": 0.25, "rewards/chosen": -45.36051559448242, "rewards/margins": -4.026559829711914, "rewards/rejected": -41.333953857421875, "step": 4128 }, { "epoch": 0.5622276688453159, "grad_norm": 43.44018480571459, "learning_rate": 3.8298757871866226e-07, "logits/chosen": 13.506431579589844, "logits/rejected": 13.937427520751953, "logps/chosen": -4.2307448387146, "logps/rejected": -4.586329460144043, "loss": 4.1013, "rewards/accuracies": 0.75, "rewards/chosen": -42.30744934082031, "rewards/margins": 3.555849075317383, "rewards/rejected": -45.86329650878906, "step": 4129 }, { "epoch": 0.5623638344226579, "grad_norm": 37.80204509113461, "learning_rate": 3.8279761242459674e-07, "logits/chosen": 12.458316802978516, "logits/rejected": 13.264153480529785, "logps/chosen": -4.07737398147583, "logps/rejected": -4.273885250091553, "loss": 3.4458, "rewards/accuracies": 0.75, "rewards/chosen": -40.773738861083984, "rewards/margins": 1.9651155471801758, "rewards/rejected": -42.738853454589844, "step": 4130 }, { "epoch": 0.5625, "grad_norm": 37.816378104931225, "learning_rate": 3.8260765001755286e-07, "logits/chosen": 13.441434860229492, "logits/rejected": 13.921304702758789, "logps/chosen": -4.070791721343994, "logps/rejected": -4.341823577880859, "loss": 4.0648, "rewards/accuracies": 0.75, "rewards/chosen": -40.707916259765625, "rewards/margins": 2.7103166580200195, "rewards/rejected": -43.418235778808594, "step": 4131 }, { "epoch": 0.5626361655773421, "grad_norm": 42.651461803245134, "learning_rate": 3.824176915404544e-07, "logits/chosen": 13.572599411010742, "logits/rejected": 13.795580863952637, "logps/chosen": -4.279265880584717, "logps/rejected": -4.509439468383789, "loss": 3.7436, "rewards/accuracies": 0.5, "rewards/chosen": -42.79265594482422, "rewards/margins": 2.3017377853393555, "rewards/rejected": -45.094398498535156, "step": 4132 }, { "epoch": 0.5627723311546841, "grad_norm": 41.755323636444224, "learning_rate": 3.822277370362237e-07, "logits/chosen": 13.817659378051758, "logits/rejected": 14.242542266845703, "logps/chosen": -4.434763431549072, "logps/rejected": -4.312324523925781, "loss": 4.0443, "rewards/accuracies": 0.25, "rewards/chosen": -44.347633361816406, "rewards/margins": -1.2243881225585938, "rewards/rejected": -43.12324523925781, "step": 4133 }, { "epoch": 0.5629084967320261, "grad_norm": 39.44365158307155, "learning_rate": 3.8203778654778273e-07, "logits/chosen": 13.412601470947266, "logits/rejected": 13.879792213439941, "logps/chosen": -4.273073196411133, "logps/rejected": -4.732794284820557, "loss": 3.7681, "rewards/accuracies": 1.0, "rewards/chosen": -42.73072814941406, "rewards/margins": 4.597210884094238, "rewards/rejected": -47.32794189453125, "step": 4134 }, { "epoch": 0.5630446623093682, "grad_norm": 44.288760933470094, "learning_rate": 3.818478401180525e-07, "logits/chosen": 14.124155044555664, "logits/rejected": 13.48758316040039, "logps/chosen": -4.078949451446533, "logps/rejected": -4.094984531402588, "loss": 4.113, "rewards/accuracies": 0.25, "rewards/chosen": -40.78949737548828, "rewards/margins": 0.16035175323486328, "rewards/rejected": -40.94984436035156, "step": 4135 }, { "epoch": 0.5631808278867102, "grad_norm": 41.927936589765274, "learning_rate": 3.8165789778995265e-07, "logits/chosen": 12.430039405822754, "logits/rejected": 12.25816535949707, "logps/chosen": -4.262358665466309, "logps/rejected": -4.380365371704102, "loss": 4.0635, "rewards/accuracies": 0.5, "rewards/chosen": -42.62358856201172, "rewards/margins": 1.1800651550292969, "rewards/rejected": -43.803653717041016, "step": 4136 }, { "epoch": 0.5633169934640523, "grad_norm": 41.662953803532716, "learning_rate": 3.814679596064025e-07, "logits/chosen": 13.480817794799805, "logits/rejected": 14.980175971984863, "logps/chosen": -4.266933441162109, "logps/rejected": -4.9342041015625, "loss": 3.6952, "rewards/accuracies": 1.0, "rewards/chosen": -42.66933059692383, "rewards/margins": 6.672708511352539, "rewards/rejected": -49.342041015625, "step": 4137 }, { "epoch": 0.5634531590413944, "grad_norm": 40.09360081361984, "learning_rate": 3.812780256103202e-07, "logits/chosen": 13.4620361328125, "logits/rejected": 13.614595413208008, "logps/chosen": -4.215860843658447, "logps/rejected": -4.424941062927246, "loss": 3.7559, "rewards/accuracies": 0.5, "rewards/chosen": -42.15861129760742, "rewards/margins": 2.0908031463623047, "rewards/rejected": -44.249412536621094, "step": 4138 }, { "epoch": 0.5635893246187363, "grad_norm": 38.02364165065286, "learning_rate": 3.8108809584462256e-07, "logits/chosen": 14.005988121032715, "logits/rejected": 14.091033935546875, "logps/chosen": -4.406278610229492, "logps/rejected": -4.588187217712402, "loss": 3.6869, "rewards/accuracies": 0.75, "rewards/chosen": -44.06278991699219, "rewards/margins": 1.8190784454345703, "rewards/rejected": -45.881866455078125, "step": 4139 }, { "epoch": 0.5637254901960784, "grad_norm": 42.40806603394798, "learning_rate": 3.8089817035222606e-07, "logits/chosen": 13.029030799865723, "logits/rejected": 13.269415855407715, "logps/chosen": -4.156286239624023, "logps/rejected": -4.297985553741455, "loss": 4.501, "rewards/accuracies": 1.0, "rewards/chosen": -41.562862396240234, "rewards/margins": 1.4169921875, "rewards/rejected": -42.979854583740234, "step": 4140 }, { "epoch": 0.5638616557734205, "grad_norm": 46.26085639206252, "learning_rate": 3.80708249176046e-07, "logits/chosen": 13.889877319335938, "logits/rejected": 13.638263702392578, "logps/chosen": -4.604635238647461, "logps/rejected": -4.268789291381836, "loss": 4.3524, "rewards/accuracies": 0.0, "rewards/chosen": -46.04635238647461, "rewards/margins": -3.3584585189819336, "rewards/rejected": -42.687896728515625, "step": 4141 }, { "epoch": 0.5639978213507625, "grad_norm": 41.94094942047809, "learning_rate": 3.8051833235899634e-07, "logits/chosen": 13.559555053710938, "logits/rejected": 14.31737995147705, "logps/chosen": -4.264932632446289, "logps/rejected": -4.77169942855835, "loss": 3.7948, "rewards/accuracies": 1.0, "rewards/chosen": -42.64932632446289, "rewards/margins": 5.067670822143555, "rewards/rejected": -47.71699523925781, "step": 4142 }, { "epoch": 0.5641339869281046, "grad_norm": 38.840708968669624, "learning_rate": 3.803284199439905e-07, "logits/chosen": 13.158406257629395, "logits/rejected": 13.766366958618164, "logps/chosen": -4.29014253616333, "logps/rejected": -4.56605339050293, "loss": 3.2619, "rewards/accuracies": 0.75, "rewards/chosen": -42.90142822265625, "rewards/margins": 2.7591066360473633, "rewards/rejected": -45.66053009033203, "step": 4143 }, { "epoch": 0.5642701525054467, "grad_norm": 40.83606301737352, "learning_rate": 3.80138511973941e-07, "logits/chosen": 12.59130859375, "logits/rejected": 13.891250610351562, "logps/chosen": -3.759188413619995, "logps/rejected": -4.118473052978516, "loss": 4.1966, "rewards/accuracies": 0.5, "rewards/chosen": -37.59188461303711, "rewards/margins": 3.5928430557250977, "rewards/rejected": -41.184730529785156, "step": 4144 }, { "epoch": 0.5644063180827886, "grad_norm": 39.449510718463635, "learning_rate": 3.799486084917587e-07, "logits/chosen": 13.338333129882812, "logits/rejected": 14.270038604736328, "logps/chosen": -3.780034065246582, "logps/rejected": -4.350868225097656, "loss": 3.727, "rewards/accuracies": 1.0, "rewards/chosen": -37.80033874511719, "rewards/margins": 5.708340644836426, "rewards/rejected": -43.50868225097656, "step": 4145 }, { "epoch": 0.5645424836601307, "grad_norm": 45.28068235904397, "learning_rate": 3.7975870954035406e-07, "logits/chosen": 13.127006530761719, "logits/rejected": 14.254413604736328, "logps/chosen": -4.050545692443848, "logps/rejected": -4.411030292510986, "loss": 4.5841, "rewards/accuracies": 0.75, "rewards/chosen": -40.505462646484375, "rewards/margins": 3.6048412322998047, "rewards/rejected": -44.11030197143555, "step": 4146 }, { "epoch": 0.5646786492374728, "grad_norm": 37.240383989162545, "learning_rate": 3.7956881516263646e-07, "logits/chosen": 13.601125717163086, "logits/rejected": 13.672880172729492, "logps/chosen": -3.9725115299224854, "logps/rejected": -4.48429536819458, "loss": 3.7629, "rewards/accuracies": 0.75, "rewards/chosen": -39.72511672973633, "rewards/margins": 5.117837905883789, "rewards/rejected": -44.842952728271484, "step": 4147 }, { "epoch": 0.5648148148148148, "grad_norm": 38.2516666838945, "learning_rate": 3.793789254015138e-07, "logits/chosen": 13.678033828735352, "logits/rejected": 13.460482597351074, "logps/chosen": -4.209804534912109, "logps/rejected": -4.226886749267578, "loss": 4.0699, "rewards/accuracies": 0.75, "rewards/chosen": -42.098045349121094, "rewards/margins": 0.1708230972290039, "rewards/rejected": -42.26886749267578, "step": 4148 }, { "epoch": 0.5649509803921569, "grad_norm": 39.989645746930464, "learning_rate": 3.791890402998934e-07, "logits/chosen": 12.958227157592773, "logits/rejected": 13.13061237335205, "logps/chosen": -4.357886791229248, "logps/rejected": -4.195404052734375, "loss": 3.6823, "rewards/accuracies": 0.25, "rewards/chosen": -43.57887268066406, "rewards/margins": -1.624826431274414, "rewards/rejected": -41.954044342041016, "step": 4149 }, { "epoch": 0.5650871459694989, "grad_norm": 43.948733647827225, "learning_rate": 3.7899915990068154e-07, "logits/chosen": 13.388259887695312, "logits/rejected": 13.66500186920166, "logps/chosen": -4.410833358764648, "logps/rejected": -4.672738552093506, "loss": 4.0717, "rewards/accuracies": 0.5, "rewards/chosen": -44.10833740234375, "rewards/margins": 2.6190481185913086, "rewards/rejected": -46.72738265991211, "step": 4150 }, { "epoch": 0.5652233115468409, "grad_norm": 40.67761811163057, "learning_rate": 3.7880928424678293e-07, "logits/chosen": 13.208335876464844, "logits/rejected": 12.635772705078125, "logps/chosen": -4.258500576019287, "logps/rejected": -4.276025772094727, "loss": 3.8818, "rewards/accuracies": 0.5, "rewards/chosen": -42.58500671386719, "rewards/margins": 0.17525386810302734, "rewards/rejected": -42.760257720947266, "step": 4151 }, { "epoch": 0.565359477124183, "grad_norm": 42.348359927615434, "learning_rate": 3.7861941338110175e-07, "logits/chosen": 13.287013053894043, "logits/rejected": 14.376497268676758, "logps/chosen": -4.387129783630371, "logps/rejected": -4.428964614868164, "loss": 3.929, "rewards/accuracies": 0.5, "rewards/chosen": -43.871299743652344, "rewards/margins": 0.4183473587036133, "rewards/rejected": -44.28964614868164, "step": 4152 }, { "epoch": 0.5654956427015251, "grad_norm": 45.846546941490644, "learning_rate": 3.7842954734654096e-07, "logits/chosen": 13.284433364868164, "logits/rejected": 13.375093460083008, "logps/chosen": -4.123608589172363, "logps/rejected": -4.375801086425781, "loss": 3.8088, "rewards/accuracies": 0.75, "rewards/chosen": -41.236083984375, "rewards/margins": 2.5219268798828125, "rewards/rejected": -43.75801086425781, "step": 4153 }, { "epoch": 0.565631808278867, "grad_norm": 43.50721217582923, "learning_rate": 3.7823968618600214e-07, "logits/chosen": 12.432901382446289, "logits/rejected": 13.628730773925781, "logps/chosen": -4.113980293273926, "logps/rejected": -4.489961624145508, "loss": 3.7128, "rewards/accuracies": 0.75, "rewards/chosen": -41.13980484008789, "rewards/margins": 3.7598114013671875, "rewards/rejected": -44.89961624145508, "step": 4154 }, { "epoch": 0.5657679738562091, "grad_norm": 40.98656511807166, "learning_rate": 3.7804982994238603e-07, "logits/chosen": 13.26019287109375, "logits/rejected": 14.54405403137207, "logps/chosen": -4.505645751953125, "logps/rejected": -4.917830944061279, "loss": 3.7285, "rewards/accuracies": 1.0, "rewards/chosen": -45.05645751953125, "rewards/margins": 4.121851921081543, "rewards/rejected": -49.178306579589844, "step": 4155 }, { "epoch": 0.5659041394335512, "grad_norm": 36.34561731863532, "learning_rate": 3.778599786585925e-07, "logits/chosen": 13.192649841308594, "logits/rejected": 13.57366943359375, "logps/chosen": -4.217230796813965, "logps/rejected": -4.560685157775879, "loss": 3.6644, "rewards/accuracies": 0.75, "rewards/chosen": -42.172306060791016, "rewards/margins": 3.4345483779907227, "rewards/rejected": -45.60685729980469, "step": 4156 }, { "epoch": 0.5660403050108932, "grad_norm": 39.923113068447094, "learning_rate": 3.7767013237751964e-07, "logits/chosen": 14.050515174865723, "logits/rejected": 14.088724136352539, "logps/chosen": -4.067734241485596, "logps/rejected": -3.9686145782470703, "loss": 4.1957, "rewards/accuracies": 0.25, "rewards/chosen": -40.677345275878906, "rewards/margins": -0.9911985397338867, "rewards/rejected": -39.6861457824707, "step": 4157 }, { "epoch": 0.5661764705882353, "grad_norm": 39.9534524522439, "learning_rate": 3.774802911420649e-07, "logits/chosen": 13.331476211547852, "logits/rejected": 12.913337707519531, "logps/chosen": -3.9669933319091797, "logps/rejected": -4.179705619812012, "loss": 3.5275, "rewards/accuracies": 0.5, "rewards/chosen": -39.6699333190918, "rewards/margins": 2.127124786376953, "rewards/rejected": -41.79705810546875, "step": 4158 }, { "epoch": 0.5663126361655774, "grad_norm": 43.81800392823716, "learning_rate": 3.7729045499512476e-07, "logits/chosen": 13.334941864013672, "logits/rejected": 13.624046325683594, "logps/chosen": -4.069108009338379, "logps/rejected": -4.279500961303711, "loss": 3.8217, "rewards/accuracies": 0.75, "rewards/chosen": -40.69108200073242, "rewards/margins": 2.103921890258789, "rewards/rejected": -42.795005798339844, "step": 4159 }, { "epoch": 0.5664488017429193, "grad_norm": 38.87416491729457, "learning_rate": 3.7710062397959386e-07, "logits/chosen": 12.386035919189453, "logits/rejected": 13.353580474853516, "logps/chosen": -3.762507200241089, "logps/rejected": -4.042524814605713, "loss": 3.9004, "rewards/accuracies": 0.75, "rewards/chosen": -37.62507247924805, "rewards/margins": 2.8001766204833984, "rewards/rejected": -40.42524719238281, "step": 4160 }, { "epoch": 0.5665849673202614, "grad_norm": 39.44133839623453, "learning_rate": 3.7691079813836615e-07, "logits/chosen": 13.259208679199219, "logits/rejected": 13.46044921875, "logps/chosen": -4.295001029968262, "logps/rejected": -4.282106876373291, "loss": 3.9734, "rewards/accuracies": 0.5, "rewards/chosen": -42.95001220703125, "rewards/margins": -0.12894535064697266, "rewards/rejected": -42.821067810058594, "step": 4161 }, { "epoch": 0.5667211328976035, "grad_norm": 39.7777780091127, "learning_rate": 3.7672097751433466e-07, "logits/chosen": 13.046852111816406, "logits/rejected": 13.530028343200684, "logps/chosen": -4.48066520690918, "logps/rejected": -4.381288528442383, "loss": 4.2211, "rewards/accuracies": 0.25, "rewards/chosen": -44.80664825439453, "rewards/margins": -0.9937639236450195, "rewards/rejected": -43.812889099121094, "step": 4162 }, { "epoch": 0.5668572984749455, "grad_norm": 35.300303442464426, "learning_rate": 3.765311621503907e-07, "logits/chosen": 13.678847312927246, "logits/rejected": 13.448663711547852, "logps/chosen": -3.8423378467559814, "logps/rejected": -4.336581707000732, "loss": 3.8353, "rewards/accuracies": 0.75, "rewards/chosen": -38.423377990722656, "rewards/margins": 4.942435264587402, "rewards/rejected": -43.365814208984375, "step": 4163 }, { "epoch": 0.5669934640522876, "grad_norm": 39.01425274056463, "learning_rate": 3.763413520894245e-07, "logits/chosen": 12.77698802947998, "logits/rejected": 13.297723770141602, "logps/chosen": -3.799220561981201, "logps/rejected": -4.03669548034668, "loss": 3.9175, "rewards/accuracies": 0.75, "rewards/chosen": -37.99220275878906, "rewards/margins": 2.3747482299804688, "rewards/rejected": -40.36695098876953, "step": 4164 }, { "epoch": 0.5671296296296297, "grad_norm": 44.37249049661274, "learning_rate": 3.7615154737432555e-07, "logits/chosen": 12.948588371276855, "logits/rejected": 13.353300094604492, "logps/chosen": -4.084049224853516, "logps/rejected": -4.487638473510742, "loss": 4.1103, "rewards/accuracies": 1.0, "rewards/chosen": -40.840492248535156, "rewards/margins": 4.035893440246582, "rewards/rejected": -44.876380920410156, "step": 4165 }, { "epoch": 0.5672657952069716, "grad_norm": 40.86382664217546, "learning_rate": 3.7596174804798153e-07, "logits/chosen": 13.13615608215332, "logits/rejected": 13.77951717376709, "logps/chosen": -4.15878963470459, "logps/rejected": -4.389912128448486, "loss": 4.4137, "rewards/accuracies": 0.75, "rewards/chosen": -41.58789825439453, "rewards/margins": 2.3112220764160156, "rewards/rejected": -43.89912414550781, "step": 4166 }, { "epoch": 0.5674019607843137, "grad_norm": 42.91061415024828, "learning_rate": 3.757719541532792e-07, "logits/chosen": 13.396251678466797, "logits/rejected": 14.075237274169922, "logps/chosen": -4.094019889831543, "logps/rejected": -4.464138031005859, "loss": 4.2916, "rewards/accuracies": 0.75, "rewards/chosen": -40.94019317626953, "rewards/margins": 3.701186180114746, "rewards/rejected": -44.641380310058594, "step": 4167 }, { "epoch": 0.5675381263616558, "grad_norm": 43.45535058059587, "learning_rate": 3.7558216573310415e-07, "logits/chosen": 14.393716812133789, "logits/rejected": 13.564445495605469, "logps/chosen": -4.281398773193359, "logps/rejected": -4.036080360412598, "loss": 4.7195, "rewards/accuracies": 0.5, "rewards/chosen": -42.81398391723633, "rewards/margins": -2.453179359436035, "rewards/rejected": -40.360801696777344, "step": 4168 }, { "epoch": 0.5676742919389978, "grad_norm": 38.264729979449314, "learning_rate": 3.753923828303405e-07, "logits/chosen": 13.208370208740234, "logits/rejected": 13.518054962158203, "logps/chosen": -4.215946197509766, "logps/rejected": -4.282028675079346, "loss": 4.3079, "rewards/accuracies": 0.5, "rewards/chosen": -42.15946578979492, "rewards/margins": 0.660822868347168, "rewards/rejected": -42.820289611816406, "step": 4169 }, { "epoch": 0.5678104575163399, "grad_norm": 37.27805304711813, "learning_rate": 3.752026054878715e-07, "logits/chosen": 13.187712669372559, "logits/rejected": 13.71596908569336, "logps/chosen": -3.9534525871276855, "logps/rejected": -4.216255187988281, "loss": 3.5203, "rewards/accuracies": 0.75, "rewards/chosen": -39.534523010253906, "rewards/margins": 2.628026008605957, "rewards/rejected": -42.16255187988281, "step": 4170 }, { "epoch": 0.5679466230936819, "grad_norm": 39.69721027142199, "learning_rate": 3.7501283374857874e-07, "logits/chosen": 13.26107120513916, "logits/rejected": 13.307640075683594, "logps/chosen": -4.000174522399902, "logps/rejected": -4.169307708740234, "loss": 3.9106, "rewards/accuracies": 0.5, "rewards/chosen": -40.001747131347656, "rewards/margins": 1.6913280487060547, "rewards/rejected": -41.693077087402344, "step": 4171 }, { "epoch": 0.568082788671024, "grad_norm": 41.26150218536338, "learning_rate": 3.7482306765534264e-07, "logits/chosen": 13.818258285522461, "logits/rejected": 14.41726303100586, "logps/chosen": -4.4844560623168945, "logps/rejected": -4.682026386260986, "loss": 3.711, "rewards/accuracies": 0.75, "rewards/chosen": -44.84456253051758, "rewards/margins": 1.9757003784179688, "rewards/rejected": -46.82026290893555, "step": 4172 }, { "epoch": 0.568218954248366, "grad_norm": 39.41113684363583, "learning_rate": 3.746333072510426e-07, "logits/chosen": 13.990117073059082, "logits/rejected": 13.232022285461426, "logps/chosen": -4.240049362182617, "logps/rejected": -4.251177787780762, "loss": 4.3741, "rewards/accuracies": 0.5, "rewards/chosen": -42.400489807128906, "rewards/margins": 0.11128711700439453, "rewards/rejected": -42.51177978515625, "step": 4173 }, { "epoch": 0.5683551198257081, "grad_norm": 36.78733781462761, "learning_rate": 3.7444355257855646e-07, "logits/chosen": 12.86658000946045, "logits/rejected": 13.565101623535156, "logps/chosen": -3.932957172393799, "logps/rejected": -4.173308372497559, "loss": 4.1359, "rewards/accuracies": 0.5, "rewards/chosen": -39.32957077026367, "rewards/margins": 2.403510093688965, "rewards/rejected": -41.73308181762695, "step": 4174 }, { "epoch": 0.5684912854030502, "grad_norm": 34.64305939032856, "learning_rate": 3.742538036807608e-07, "logits/chosen": 13.62120246887207, "logits/rejected": 14.54944133758545, "logps/chosen": -4.231805801391602, "logps/rejected": -4.569760322570801, "loss": 3.838, "rewards/accuracies": 0.75, "rewards/chosen": -42.318058013916016, "rewards/margins": 3.379542350769043, "rewards/rejected": -45.697601318359375, "step": 4175 }, { "epoch": 0.5686274509803921, "grad_norm": 36.25455674652175, "learning_rate": 3.74064060600531e-07, "logits/chosen": 13.740473747253418, "logits/rejected": 13.469717979431152, "logps/chosen": -4.240509986877441, "logps/rejected": -4.248846054077148, "loss": 4.1301, "rewards/accuracies": 0.5, "rewards/chosen": -42.40509796142578, "rewards/margins": 0.0833578109741211, "rewards/rejected": -42.48845672607422, "step": 4176 }, { "epoch": 0.5687636165577342, "grad_norm": 41.36522615834846, "learning_rate": 3.738743233807413e-07, "logits/chosen": 12.256872177124023, "logits/rejected": 13.513452529907227, "logps/chosen": -3.918421745300293, "logps/rejected": -3.9881887435913086, "loss": 3.7902, "rewards/accuracies": 0.5, "rewards/chosen": -39.18421173095703, "rewards/margins": 0.6976728439331055, "rewards/rejected": -39.88188934326172, "step": 4177 }, { "epoch": 0.5688997821350763, "grad_norm": 39.56528758518745, "learning_rate": 3.7368459206426405e-07, "logits/chosen": 12.699087142944336, "logits/rejected": 13.332496643066406, "logps/chosen": -4.180427074432373, "logps/rejected": -4.437661170959473, "loss": 4.168, "rewards/accuracies": 0.75, "rewards/chosen": -41.80426788330078, "rewards/margins": 2.5723390579223633, "rewards/rejected": -44.37660598754883, "step": 4178 }, { "epoch": 0.5690359477124183, "grad_norm": 40.754879495124214, "learning_rate": 3.7349486669397067e-07, "logits/chosen": 13.675064086914062, "logits/rejected": 13.760839462280273, "logps/chosen": -4.160296440124512, "logps/rejected": -4.401342391967773, "loss": 3.9392, "rewards/accuracies": 1.0, "rewards/chosen": -41.60295867919922, "rewards/margins": 2.4104604721069336, "rewards/rejected": -44.01342010498047, "step": 4179 }, { "epoch": 0.5691721132897604, "grad_norm": 37.147875659206726, "learning_rate": 3.7330514731273147e-07, "logits/chosen": 13.538196563720703, "logits/rejected": 13.400251388549805, "logps/chosen": -4.3321685791015625, "logps/rejected": -4.3927226066589355, "loss": 3.7164, "rewards/accuracies": 0.5, "rewards/chosen": -43.32168197631836, "rewards/margins": 0.6055440902709961, "rewards/rejected": -43.92723083496094, "step": 4180 }, { "epoch": 0.5693082788671024, "grad_norm": 38.26723397458663, "learning_rate": 3.731154339634147e-07, "logits/chosen": 13.563518524169922, "logits/rejected": 13.441747665405273, "logps/chosen": -4.139254570007324, "logps/rejected": -4.3340301513671875, "loss": 3.5849, "rewards/accuracies": 0.5, "rewards/chosen": -41.39254379272461, "rewards/margins": 1.9477615356445312, "rewards/rejected": -43.34030532836914, "step": 4181 }, { "epoch": 0.5694444444444444, "grad_norm": 36.74288617417699, "learning_rate": 3.7292572668888787e-07, "logits/chosen": 13.436485290527344, "logits/rejected": 13.374250411987305, "logps/chosen": -4.124728202819824, "logps/rejected": -3.9850988388061523, "loss": 3.682, "rewards/accuracies": 0.5, "rewards/chosen": -41.247283935546875, "rewards/margins": -1.396296501159668, "rewards/rejected": -39.85098648071289, "step": 4182 }, { "epoch": 0.5695806100217865, "grad_norm": 36.32157053031847, "learning_rate": 3.727360255320171e-07, "logits/chosen": 13.734234809875488, "logits/rejected": 14.217920303344727, "logps/chosen": -4.236310005187988, "logps/rejected": -4.315448760986328, "loss": 3.9548, "rewards/accuracies": 0.75, "rewards/chosen": -42.363101959228516, "rewards/margins": 0.791386604309082, "rewards/rejected": -43.15448760986328, "step": 4183 }, { "epoch": 0.5697167755991286, "grad_norm": 36.29576716795639, "learning_rate": 3.725463305356665e-07, "logits/chosen": 12.121706008911133, "logits/rejected": 12.888187408447266, "logps/chosen": -3.848385810852051, "logps/rejected": -4.293555736541748, "loss": 3.5745, "rewards/accuracies": 1.0, "rewards/chosen": -38.483856201171875, "rewards/margins": 4.4517011642456055, "rewards/rejected": -42.9355583190918, "step": 4184 }, { "epoch": 0.5698529411764706, "grad_norm": 32.697709703015235, "learning_rate": 3.723566417426995e-07, "logits/chosen": 14.27353572845459, "logits/rejected": 13.622535705566406, "logps/chosen": -4.576329231262207, "logps/rejected": -4.394518852233887, "loss": 3.9613, "rewards/accuracies": 0.5, "rewards/chosen": -45.7632942199707, "rewards/margins": -1.8181095123291016, "rewards/rejected": -43.94518280029297, "step": 4185 }, { "epoch": 0.5699891067538126, "grad_norm": 39.460682590378255, "learning_rate": 3.721669591959779e-07, "logits/chosen": 13.241703033447266, "logits/rejected": 13.539870262145996, "logps/chosen": -3.9445760250091553, "logps/rejected": -4.236296653747559, "loss": 3.6403, "rewards/accuracies": 1.0, "rewards/chosen": -39.44575881958008, "rewards/margins": 2.917203903198242, "rewards/rejected": -42.36296081542969, "step": 4186 }, { "epoch": 0.5701252723311547, "grad_norm": 56.53228530739481, "learning_rate": 3.719772829383618e-07, "logits/chosen": 13.675018310546875, "logits/rejected": 13.40875244140625, "logps/chosen": -4.426536560058594, "logps/rejected": -4.162591934204102, "loss": 3.9638, "rewards/accuracies": 0.25, "rewards/chosen": -44.26536560058594, "rewards/margins": -2.639446258544922, "rewards/rejected": -41.62591552734375, "step": 4187 }, { "epoch": 0.5702614379084967, "grad_norm": 33.840396181272375, "learning_rate": 3.7178761301271025e-07, "logits/chosen": 13.10940933227539, "logits/rejected": 13.142721176147461, "logps/chosen": -4.171446323394775, "logps/rejected": -4.268797874450684, "loss": 4.035, "rewards/accuracies": 0.75, "rewards/chosen": -41.71446228027344, "rewards/margins": 0.973515510559082, "rewards/rejected": -42.6879768371582, "step": 4188 }, { "epoch": 0.5703976034858388, "grad_norm": 36.57249445724194, "learning_rate": 3.7159794946188097e-07, "logits/chosen": 12.483672142028809, "logits/rejected": 13.680343627929688, "logps/chosen": -3.9954614639282227, "logps/rejected": -4.4717559814453125, "loss": 4.0153, "rewards/accuracies": 1.0, "rewards/chosen": -39.954612731933594, "rewards/margins": 4.762945175170898, "rewards/rejected": -44.717559814453125, "step": 4189 }, { "epoch": 0.5705337690631809, "grad_norm": 35.437767685264355, "learning_rate": 3.714082923287296e-07, "logits/chosen": 14.257854461669922, "logits/rejected": 13.686323165893555, "logps/chosen": -4.355581760406494, "logps/rejected": -4.1564106941223145, "loss": 3.8469, "rewards/accuracies": 0.25, "rewards/chosen": -43.55582046508789, "rewards/margins": -1.9917106628417969, "rewards/rejected": -41.56410598754883, "step": 4190 }, { "epoch": 0.5706699346405228, "grad_norm": 40.028469753450146, "learning_rate": 3.712186416561109e-07, "logits/chosen": 13.95557975769043, "logits/rejected": 13.89858341217041, "logps/chosen": -4.4054155349731445, "logps/rejected": -4.533228874206543, "loss": 4.2499, "rewards/accuracies": 0.75, "rewards/chosen": -44.05415344238281, "rewards/margins": 1.2781314849853516, "rewards/rejected": -45.33228302001953, "step": 4191 }, { "epoch": 0.5708061002178649, "grad_norm": 38.100040411345695, "learning_rate": 3.7102899748687816e-07, "logits/chosen": 13.392698287963867, "logits/rejected": 14.04337215423584, "logps/chosen": -4.004278182983398, "logps/rejected": -4.52966833114624, "loss": 4.2751, "rewards/accuracies": 1.0, "rewards/chosen": -40.042781829833984, "rewards/margins": 5.253903388977051, "rewards/rejected": -45.29668426513672, "step": 4192 }, { "epoch": 0.570942265795207, "grad_norm": 36.748461008042185, "learning_rate": 3.7083935986388277e-07, "logits/chosen": 12.889656066894531, "logits/rejected": 13.192094802856445, "logps/chosen": -3.9040021896362305, "logps/rejected": -4.026998996734619, "loss": 3.56, "rewards/accuracies": 0.5, "rewards/chosen": -39.04002380371094, "rewards/margins": 1.2299690246582031, "rewards/rejected": -40.269989013671875, "step": 4193 }, { "epoch": 0.571078431372549, "grad_norm": 38.19739724872509, "learning_rate": 3.7064972882997505e-07, "logits/chosen": 13.244280815124512, "logits/rejected": 12.966975212097168, "logps/chosen": -4.058790683746338, "logps/rejected": -4.249579429626465, "loss": 3.8163, "rewards/accuracies": 0.5, "rewards/chosen": -40.58790588378906, "rewards/margins": 1.9078845977783203, "rewards/rejected": -42.495792388916016, "step": 4194 }, { "epoch": 0.5712145969498911, "grad_norm": 36.823382844864085, "learning_rate": 3.7046010442800395e-07, "logits/chosen": 13.736143112182617, "logits/rejected": 13.796772003173828, "logps/chosen": -4.352246284484863, "logps/rejected": -4.372776031494141, "loss": 3.8647, "rewards/accuracies": 0.75, "rewards/chosen": -43.5224609375, "rewards/margins": 0.20529747009277344, "rewards/rejected": -43.727760314941406, "step": 4195 }, { "epoch": 0.5713507625272332, "grad_norm": 37.555696392325544, "learning_rate": 3.702704867008162e-07, "logits/chosen": 13.748069763183594, "logits/rejected": 13.349449157714844, "logps/chosen": -4.0231547355651855, "logps/rejected": -3.9903924465179443, "loss": 4.1469, "rewards/accuracies": 0.5, "rewards/chosen": -40.23155212402344, "rewards/margins": -0.3276252746582031, "rewards/rejected": -39.90392303466797, "step": 4196 }, { "epoch": 0.5714869281045751, "grad_norm": 36.27680291770694, "learning_rate": 3.700808756912577e-07, "logits/chosen": 12.904176712036133, "logits/rejected": 13.563822746276855, "logps/chosen": -3.9384937286376953, "logps/rejected": -4.41910982131958, "loss": 4.0062, "rewards/accuracies": 1.0, "rewards/chosen": -39.38493728637695, "rewards/margins": 4.8061628341674805, "rewards/rejected": -44.191097259521484, "step": 4197 }, { "epoch": 0.5716230936819172, "grad_norm": 40.422064051615074, "learning_rate": 3.698912714421729e-07, "logits/chosen": 14.127805709838867, "logits/rejected": 14.177705764770508, "logps/chosen": -4.229941368103027, "logps/rejected": -4.486137390136719, "loss": 4.334, "rewards/accuracies": 0.5, "rewards/chosen": -42.299415588378906, "rewards/margins": 2.561955451965332, "rewards/rejected": -44.86137008666992, "step": 4198 }, { "epoch": 0.5717592592592593, "grad_norm": 41.74658957355935, "learning_rate": 3.69701673996404e-07, "logits/chosen": 13.449960708618164, "logits/rejected": 13.655130386352539, "logps/chosen": -4.103294849395752, "logps/rejected": -4.393508434295654, "loss": 3.3415, "rewards/accuracies": 0.5, "rewards/chosen": -41.0329475402832, "rewards/margins": 2.9021339416503906, "rewards/rejected": -43.935081481933594, "step": 4199 }, { "epoch": 0.5718954248366013, "grad_norm": 32.96575727781539, "learning_rate": 3.6951208339679234e-07, "logits/chosen": 13.748223304748535, "logits/rejected": 13.927478790283203, "logps/chosen": -3.772470235824585, "logps/rejected": -4.264529228210449, "loss": 3.3553, "rewards/accuracies": 0.75, "rewards/chosen": -37.724700927734375, "rewards/margins": 4.920587539672852, "rewards/rejected": -42.645286560058594, "step": 4200 }, { "epoch": 0.5720315904139434, "grad_norm": 35.554906254663905, "learning_rate": 3.6932249968617757e-07, "logits/chosen": 12.365180969238281, "logits/rejected": 13.325944900512695, "logps/chosen": -4.191171646118164, "logps/rejected": -4.459819316864014, "loss": 3.3335, "rewards/accuracies": 0.75, "rewards/chosen": -41.91171646118164, "rewards/margins": 2.686476707458496, "rewards/rejected": -44.59819030761719, "step": 4201 }, { "epoch": 0.5721677559912854, "grad_norm": 36.43364533146066, "learning_rate": 3.691329229073974e-07, "logits/chosen": 13.157452583312988, "logits/rejected": 13.50495719909668, "logps/chosen": -4.4633259773254395, "logps/rejected": -4.645948886871338, "loss": 4.0603, "rewards/accuracies": 0.5, "rewards/chosen": -44.633262634277344, "rewards/margins": 1.826228141784668, "rewards/rejected": -46.45948791503906, "step": 4202 }, { "epoch": 0.5723039215686274, "grad_norm": 41.988956716329, "learning_rate": 3.689433531032885e-07, "logits/chosen": 14.042818069458008, "logits/rejected": 14.542814254760742, "logps/chosen": -3.9179635047912598, "logps/rejected": -4.36669921875, "loss": 4.4579, "rewards/accuracies": 0.75, "rewards/chosen": -39.17963409423828, "rewards/margins": 4.487361907958984, "rewards/rejected": -43.666996002197266, "step": 4203 }, { "epoch": 0.5724400871459695, "grad_norm": 38.59866805924322, "learning_rate": 3.687537903166858e-07, "logits/chosen": 13.233673095703125, "logits/rejected": 14.761621475219727, "logps/chosen": -4.090453624725342, "logps/rejected": -4.567609786987305, "loss": 3.98, "rewards/accuracies": 0.75, "rewards/chosen": -40.904537200927734, "rewards/margins": 4.771564483642578, "rewards/rejected": -45.67610168457031, "step": 4204 }, { "epoch": 0.5725762527233116, "grad_norm": 40.9224370459201, "learning_rate": 3.685642345904223e-07, "logits/chosen": 13.478094100952148, "logits/rejected": 13.832822799682617, "logps/chosen": -4.297011852264404, "logps/rejected": -4.207695960998535, "loss": 4.1726, "rewards/accuracies": 0.5, "rewards/chosen": -42.97011947631836, "rewards/margins": -0.8931570053100586, "rewards/rejected": -42.076961517333984, "step": 4205 }, { "epoch": 0.5727124183006536, "grad_norm": 38.13579325347539, "learning_rate": 3.683746859673299e-07, "logits/chosen": 13.923118591308594, "logits/rejected": 13.632867813110352, "logps/chosen": -3.8722681999206543, "logps/rejected": -4.426486968994141, "loss": 3.7551, "rewards/accuracies": 1.0, "rewards/chosen": -38.72268295288086, "rewards/margins": 5.542187690734863, "rewards/rejected": -44.264869689941406, "step": 4206 }, { "epoch": 0.5728485838779956, "grad_norm": 38.36217040288148, "learning_rate": 3.6818514449023877e-07, "logits/chosen": 11.944110870361328, "logits/rejected": 13.26052474975586, "logps/chosen": -3.694272041320801, "logps/rejected": -4.381870746612549, "loss": 3.7575, "rewards/accuracies": 1.0, "rewards/chosen": -36.94272232055664, "rewards/margins": 6.875988960266113, "rewards/rejected": -43.81871032714844, "step": 4207 }, { "epoch": 0.5729847494553377, "grad_norm": 39.6930029337717, "learning_rate": 3.6799561020197704e-07, "logits/chosen": 13.121004104614258, "logits/rejected": 12.861414909362793, "logps/chosen": -4.03526496887207, "logps/rejected": -3.899169921875, "loss": 4.2273, "rewards/accuracies": 0.5, "rewards/chosen": -40.3526496887207, "rewards/margins": -1.360952377319336, "rewards/rejected": -38.99169921875, "step": 4208 }, { "epoch": 0.5731209150326797, "grad_norm": 38.005212470091564, "learning_rate": 3.678060831453717e-07, "logits/chosen": 13.106707572937012, "logits/rejected": 12.721797943115234, "logps/chosen": -4.202365875244141, "logps/rejected": -4.085454940795898, "loss": 4.3462, "rewards/accuracies": 0.25, "rewards/chosen": -42.023658752441406, "rewards/margins": -1.1691112518310547, "rewards/rejected": -40.85454559326172, "step": 4209 }, { "epoch": 0.5732570806100218, "grad_norm": 39.22518505959749, "learning_rate": 3.6761656336324815e-07, "logits/chosen": 13.743236541748047, "logits/rejected": 14.533870697021484, "logps/chosen": -4.353898048400879, "logps/rejected": -4.512609481811523, "loss": 4.0716, "rewards/accuracies": 0.5, "rewards/chosen": -43.53898239135742, "rewards/margins": 1.5871105194091797, "rewards/rejected": -45.12609100341797, "step": 4210 }, { "epoch": 0.5733932461873639, "grad_norm": 37.853907127411794, "learning_rate": 3.6742705089842974e-07, "logits/chosen": 14.327634811401367, "logits/rejected": 13.7823486328125, "logps/chosen": -4.270382881164551, "logps/rejected": -4.535137176513672, "loss": 4.2517, "rewards/accuracies": 0.75, "rewards/chosen": -42.70383071899414, "rewards/margins": 2.6475391387939453, "rewards/rejected": -45.35137176513672, "step": 4211 }, { "epoch": 0.5735294117647058, "grad_norm": 43.10706676626207, "learning_rate": 3.6723754579373836e-07, "logits/chosen": 13.664438247680664, "logits/rejected": 14.723007202148438, "logps/chosen": -3.883659839630127, "logps/rejected": -4.443902969360352, "loss": 3.7572, "rewards/accuracies": 1.0, "rewards/chosen": -38.83659744262695, "rewards/margins": 5.602434158325195, "rewards/rejected": -44.43903350830078, "step": 4212 }, { "epoch": 0.5736655773420479, "grad_norm": 43.173193081922776, "learning_rate": 3.670480480919944e-07, "logits/chosen": 13.734890937805176, "logits/rejected": 13.442852973937988, "logps/chosen": -3.865098476409912, "logps/rejected": -3.8510262966156006, "loss": 3.821, "rewards/accuracies": 0.25, "rewards/chosen": -38.65098571777344, "rewards/margins": -0.14071941375732422, "rewards/rejected": -38.5102653503418, "step": 4213 }, { "epoch": 0.57380174291939, "grad_norm": 41.983746740762996, "learning_rate": 3.668585578360164e-07, "logits/chosen": 13.559083938598633, "logits/rejected": 13.735980987548828, "logps/chosen": -4.363584041595459, "logps/rejected": -4.473240852355957, "loss": 3.9918, "rewards/accuracies": 0.5, "rewards/chosen": -43.635841369628906, "rewards/margins": 1.0965690612792969, "rewards/rejected": -44.73240661621094, "step": 4214 }, { "epoch": 0.573937908496732, "grad_norm": 47.985770715472974, "learning_rate": 3.6666907506862107e-07, "logits/chosen": 13.819780349731445, "logits/rejected": 14.325784683227539, "logps/chosen": -4.171360492706299, "logps/rejected": -4.503785133361816, "loss": 3.099, "rewards/accuracies": 1.0, "rewards/chosen": -41.71360778808594, "rewards/margins": 3.3242416381835938, "rewards/rejected": -45.03784942626953, "step": 4215 }, { "epoch": 0.5740740740740741, "grad_norm": 42.35860735050613, "learning_rate": 3.6647959983262387e-07, "logits/chosen": 13.583038330078125, "logits/rejected": 13.621129035949707, "logps/chosen": -4.1299729347229, "logps/rejected": -4.182845115661621, "loss": 4.5714, "rewards/accuracies": 0.5, "rewards/chosen": -41.29972839355469, "rewards/margins": 0.5287227630615234, "rewards/rejected": -41.828453063964844, "step": 4216 }, { "epoch": 0.5742102396514162, "grad_norm": 42.969867785736824, "learning_rate": 3.6629013217083806e-07, "logits/chosen": 13.323556900024414, "logits/rejected": 13.25724983215332, "logps/chosen": -4.22906494140625, "logps/rejected": -4.086461544036865, "loss": 3.7877, "rewards/accuracies": 0.25, "rewards/chosen": -42.290645599365234, "rewards/margins": -1.4260330200195312, "rewards/rejected": -40.8646125793457, "step": 4217 }, { "epoch": 0.5743464052287581, "grad_norm": 36.9880996846473, "learning_rate": 3.6610067212607564e-07, "logits/chosen": 13.107868194580078, "logits/rejected": 13.188173294067383, "logps/chosen": -4.062398910522461, "logps/rejected": -4.084238052368164, "loss": 3.7488, "rewards/accuracies": 0.75, "rewards/chosen": -40.623985290527344, "rewards/margins": 0.21839046478271484, "rewards/rejected": -40.84238052368164, "step": 4218 }, { "epoch": 0.5744825708061002, "grad_norm": 37.096958377847535, "learning_rate": 3.659112197411466e-07, "logits/chosen": 13.71035385131836, "logits/rejected": 12.85999584197998, "logps/chosen": -4.414421081542969, "logps/rejected": -4.293552875518799, "loss": 4.0362, "rewards/accuracies": 0.75, "rewards/chosen": -44.14421081542969, "rewards/margins": -1.2086849212646484, "rewards/rejected": -42.935523986816406, "step": 4219 }, { "epoch": 0.5746187363834423, "grad_norm": 44.416810802248676, "learning_rate": 3.6572177505885905e-07, "logits/chosen": 13.462636947631836, "logits/rejected": 13.733116149902344, "logps/chosen": -4.202265739440918, "logps/rejected": -4.334021091461182, "loss": 3.8632, "rewards/accuracies": 0.75, "rewards/chosen": -42.02265167236328, "rewards/margins": 1.3175554275512695, "rewards/rejected": -43.3402099609375, "step": 4220 }, { "epoch": 0.5747549019607843, "grad_norm": 46.79212737854191, "learning_rate": 3.6553233812201994e-07, "logits/chosen": 12.812165260314941, "logits/rejected": 13.27738094329834, "logps/chosen": -4.1852569580078125, "logps/rejected": -4.568034648895264, "loss": 3.5231, "rewards/accuracies": 1.0, "rewards/chosen": -41.852569580078125, "rewards/margins": 3.827775001525879, "rewards/rejected": -45.68034362792969, "step": 4221 }, { "epoch": 0.5748910675381264, "grad_norm": 42.47881262836984, "learning_rate": 3.653429089734339e-07, "logits/chosen": 14.131891250610352, "logits/rejected": 13.40872573852539, "logps/chosen": -4.458661079406738, "logps/rejected": -4.3809614181518555, "loss": 4.7033, "rewards/accuracies": 0.5, "rewards/chosen": -44.586612701416016, "rewards/margins": -0.7770013809204102, "rewards/rejected": -43.80961227416992, "step": 4222 }, { "epoch": 0.5750272331154684, "grad_norm": 39.381641597318435, "learning_rate": 3.65153487655904e-07, "logits/chosen": 13.653580665588379, "logits/rejected": 13.743583679199219, "logps/chosen": -4.168906211853027, "logps/rejected": -4.354101181030273, "loss": 3.9257, "rewards/accuracies": 0.75, "rewards/chosen": -41.689064025878906, "rewards/margins": 1.8519535064697266, "rewards/rejected": -43.541015625, "step": 4223 }, { "epoch": 0.5751633986928104, "grad_norm": 39.35012834117003, "learning_rate": 3.649640742122316e-07, "logits/chosen": 13.417526245117188, "logits/rejected": 12.957803726196289, "logps/chosen": -4.225160598754883, "logps/rejected": -4.339997291564941, "loss": 4.155, "rewards/accuracies": 0.5, "rewards/chosen": -42.251609802246094, "rewards/margins": 1.1483659744262695, "rewards/rejected": -43.39997482299805, "step": 4224 }, { "epoch": 0.5752995642701525, "grad_norm": 38.70314953855648, "learning_rate": 3.647746686852164e-07, "logits/chosen": 13.71717643737793, "logits/rejected": 13.650489807128906, "logps/chosen": -4.02508544921875, "logps/rejected": -4.510960578918457, "loss": 4.1317, "rewards/accuracies": 0.75, "rewards/chosen": -40.2508544921875, "rewards/margins": 4.858747482299805, "rewards/rejected": -45.10960388183594, "step": 4225 }, { "epoch": 0.5754357298474946, "grad_norm": 38.241660225724004, "learning_rate": 3.6458527111765585e-07, "logits/chosen": 14.065552711486816, "logits/rejected": 13.88300895690918, "logps/chosen": -4.5467329025268555, "logps/rejected": -4.425777435302734, "loss": 4.1011, "rewards/accuracies": 0.5, "rewards/chosen": -45.46733093261719, "rewards/margins": -1.2095556259155273, "rewards/rejected": -44.257774353027344, "step": 4226 }, { "epoch": 0.5755718954248366, "grad_norm": 37.816625975219075, "learning_rate": 3.64395881552346e-07, "logits/chosen": 13.560914993286133, "logits/rejected": 13.229032516479492, "logps/chosen": -4.275584697723389, "logps/rejected": -4.226781845092773, "loss": 3.6526, "rewards/accuracies": 0.25, "rewards/chosen": -42.75584411621094, "rewards/margins": -0.48802947998046875, "rewards/rejected": -42.267818450927734, "step": 4227 }, { "epoch": 0.5757080610021786, "grad_norm": 38.01831707093568, "learning_rate": 3.642065000320812e-07, "logits/chosen": 13.269928932189941, "logits/rejected": 12.936701774597168, "logps/chosen": -4.054030418395996, "logps/rejected": -4.017139434814453, "loss": 3.9219, "rewards/accuracies": 0.75, "rewards/chosen": -40.540306091308594, "rewards/margins": -0.3689088821411133, "rewards/rejected": -40.1713981628418, "step": 4228 }, { "epoch": 0.5758442265795207, "grad_norm": 39.24977622139749, "learning_rate": 3.640171265996534e-07, "logits/chosen": 13.552539825439453, "logits/rejected": 14.596517562866211, "logps/chosen": -4.543489933013916, "logps/rejected": -4.727771282196045, "loss": 4.1286, "rewards/accuracies": 0.75, "rewards/chosen": -45.434898376464844, "rewards/margins": 1.8428153991699219, "rewards/rejected": -47.27771759033203, "step": 4229 }, { "epoch": 0.5759803921568627, "grad_norm": 38.15492677961072, "learning_rate": 3.638277612978533e-07, "logits/chosen": 13.065380096435547, "logits/rejected": 13.489234924316406, "logps/chosen": -3.932619571685791, "logps/rejected": -4.385103225708008, "loss": 3.9553, "rewards/accuracies": 0.75, "rewards/chosen": -39.326194763183594, "rewards/margins": 4.524839401245117, "rewards/rejected": -43.851036071777344, "step": 4230 }, { "epoch": 0.5761165577342048, "grad_norm": 40.72832828786694, "learning_rate": 3.636384041694697e-07, "logits/chosen": 13.64995002746582, "logits/rejected": 13.77260971069336, "logps/chosen": -4.279488563537598, "logps/rejected": -4.126533031463623, "loss": 3.9364, "rewards/accuracies": 0.5, "rewards/chosen": -42.794891357421875, "rewards/margins": -1.5295610427856445, "rewards/rejected": -41.26533126831055, "step": 4231 }, { "epoch": 0.5762527233115469, "grad_norm": 37.551662301450726, "learning_rate": 3.6344905525728907e-07, "logits/chosen": 12.656830787658691, "logits/rejected": 12.740514755249023, "logps/chosen": -3.938197135925293, "logps/rejected": -4.100109100341797, "loss": 3.3262, "rewards/accuracies": 0.75, "rewards/chosen": -39.38197326660156, "rewards/margins": 1.619114875793457, "rewards/rejected": -41.00109100341797, "step": 4232 }, { "epoch": 0.5763888888888888, "grad_norm": 41.46025136089876, "learning_rate": 3.632597146040966e-07, "logits/chosen": 13.482837677001953, "logits/rejected": 13.143841743469238, "logps/chosen": -4.191019058227539, "logps/rejected": -4.124111175537109, "loss": 4.0, "rewards/accuracies": 0.25, "rewards/chosen": -41.910186767578125, "rewards/margins": -0.6690740585327148, "rewards/rejected": -41.241111755371094, "step": 4233 }, { "epoch": 0.5765250544662309, "grad_norm": 37.89491165198298, "learning_rate": 3.630703822526754e-07, "logits/chosen": 13.215099334716797, "logits/rejected": 13.472909927368164, "logps/chosen": -4.2855753898620605, "logps/rejected": -4.780992031097412, "loss": 4.1669, "rewards/accuracies": 1.0, "rewards/chosen": -42.85575485229492, "rewards/margins": 4.954164505004883, "rewards/rejected": -47.80992126464844, "step": 4234 }, { "epoch": 0.576661220043573, "grad_norm": 35.73199672867141, "learning_rate": 3.628810582458065e-07, "logits/chosen": 14.303796768188477, "logits/rejected": 14.508392333984375, "logps/chosen": -4.737360954284668, "logps/rejected": -4.893312454223633, "loss": 3.638, "rewards/accuracies": 0.5, "rewards/chosen": -47.37360763549805, "rewards/margins": 1.5595159530639648, "rewards/rejected": -48.93312454223633, "step": 4235 }, { "epoch": 0.576797385620915, "grad_norm": 38.573788775763035, "learning_rate": 3.6269174262626926e-07, "logits/chosen": 13.616706848144531, "logits/rejected": 14.711307525634766, "logps/chosen": -4.161739349365234, "logps/rejected": -4.517187118530273, "loss": 3.962, "rewards/accuracies": 0.5, "rewards/chosen": -41.617393493652344, "rewards/margins": 3.554476261138916, "rewards/rejected": -45.17186737060547, "step": 4236 }, { "epoch": 0.5769335511982571, "grad_norm": 38.112405046524174, "learning_rate": 3.625024354368413e-07, "logits/chosen": 12.807939529418945, "logits/rejected": 12.748514175415039, "logps/chosen": -3.8581418991088867, "logps/rejected": -4.0418806076049805, "loss": 3.9955, "rewards/accuracies": 0.75, "rewards/chosen": -38.581417083740234, "rewards/margins": 1.8373870849609375, "rewards/rejected": -40.41880416870117, "step": 4237 }, { "epoch": 0.5770697167755992, "grad_norm": 37.74172420801813, "learning_rate": 3.62313136720298e-07, "logits/chosen": 13.625565528869629, "logits/rejected": 14.662578582763672, "logps/chosen": -4.093250274658203, "logps/rejected": -4.3870038986206055, "loss": 3.9017, "rewards/accuracies": 0.75, "rewards/chosen": -40.932498931884766, "rewards/margins": 2.9375429153442383, "rewards/rejected": -43.87004089355469, "step": 4238 }, { "epoch": 0.5772058823529411, "grad_norm": 38.760294072168584, "learning_rate": 3.621238465194128e-07, "logits/chosen": 14.191831588745117, "logits/rejected": 14.520532608032227, "logps/chosen": -4.278399467468262, "logps/rejected": -4.381889343261719, "loss": 4.2579, "rewards/accuracies": 0.5, "rewards/chosen": -42.78399658203125, "rewards/margins": 1.034895896911621, "rewards/rejected": -43.81889343261719, "step": 4239 }, { "epoch": 0.5773420479302832, "grad_norm": 38.19490612810641, "learning_rate": 3.619345648769578e-07, "logits/chosen": 13.81472396850586, "logits/rejected": 14.025976181030273, "logps/chosen": -4.614807605743408, "logps/rejected": -4.608728408813477, "loss": 4.2049, "rewards/accuracies": 0.5, "rewards/chosen": -46.148075103759766, "rewards/margins": -0.06079292297363281, "rewards/rejected": -46.0872802734375, "step": 4240 }, { "epoch": 0.5774782135076253, "grad_norm": 43.650388589711106, "learning_rate": 3.6174529183570244e-07, "logits/chosen": 13.860953330993652, "logits/rejected": 13.640848159790039, "logps/chosen": -4.219461917877197, "logps/rejected": -4.174930572509766, "loss": 4.8088, "rewards/accuracies": 0.5, "rewards/chosen": -42.194618225097656, "rewards/margins": -0.4453134536743164, "rewards/rejected": -41.749305725097656, "step": 4241 }, { "epoch": 0.5776143790849673, "grad_norm": 39.29906235409213, "learning_rate": 3.615560274384145e-07, "logits/chosen": 13.694938659667969, "logits/rejected": 13.20946216583252, "logps/chosen": -4.285162925720215, "logps/rejected": -4.163360595703125, "loss": 3.5382, "rewards/accuracies": 0.5, "rewards/chosen": -42.85163116455078, "rewards/margins": -1.2180280685424805, "rewards/rejected": -41.633602142333984, "step": 4242 }, { "epoch": 0.5777505446623094, "grad_norm": 45.709631920868006, "learning_rate": 3.613667717278601e-07, "logits/chosen": 12.648521423339844, "logits/rejected": 13.416584014892578, "logps/chosen": -3.7313621044158936, "logps/rejected": -3.908172369003296, "loss": 3.3273, "rewards/accuracies": 0.75, "rewards/chosen": -37.313621520996094, "rewards/margins": 1.7681012153625488, "rewards/rejected": -39.081722259521484, "step": 4243 }, { "epoch": 0.5778867102396514, "grad_norm": 42.1256494485577, "learning_rate": 3.611775247468029e-07, "logits/chosen": 13.203603744506836, "logits/rejected": 13.342823028564453, "logps/chosen": -4.218916893005371, "logps/rejected": -3.999452829360962, "loss": 4.3255, "rewards/accuracies": 0.25, "rewards/chosen": -42.189170837402344, "rewards/margins": -2.1946401596069336, "rewards/rejected": -39.994529724121094, "step": 4244 }, { "epoch": 0.5780228758169934, "grad_norm": 41.340010828374886, "learning_rate": 3.609882865380048e-07, "logits/chosen": 14.567031860351562, "logits/rejected": 14.722970008850098, "logps/chosen": -4.468872547149658, "logps/rejected": -4.740121841430664, "loss": 4.4347, "rewards/accuracies": 0.75, "rewards/chosen": -44.688724517822266, "rewards/margins": 2.712493896484375, "rewards/rejected": -47.40121841430664, "step": 4245 }, { "epoch": 0.5781590413943355, "grad_norm": 35.84411669908716, "learning_rate": 3.6079905714422607e-07, "logits/chosen": 13.788658142089844, "logits/rejected": 13.234831809997559, "logps/chosen": -4.509222984313965, "logps/rejected": -4.38861608505249, "loss": 4.0636, "rewards/accuracies": 0.5, "rewards/chosen": -45.09223175048828, "rewards/margins": -1.2060661315917969, "rewards/rejected": -43.88616180419922, "step": 4246 }, { "epoch": 0.5782952069716776, "grad_norm": 38.6853658184452, "learning_rate": 3.6060983660822417e-07, "logits/chosen": 13.371216773986816, "logits/rejected": 13.054372787475586, "logps/chosen": -4.0704345703125, "logps/rejected": -4.179974555969238, "loss": 3.564, "rewards/accuracies": 0.5, "rewards/chosen": -40.704345703125, "rewards/margins": 1.0953989028930664, "rewards/rejected": -41.79974365234375, "step": 4247 }, { "epoch": 0.5784313725490197, "grad_norm": 34.9146677064309, "learning_rate": 3.604206249727554e-07, "logits/chosen": 13.010601043701172, "logits/rejected": 13.81926441192627, "logps/chosen": -4.048076629638672, "logps/rejected": -4.5061821937561035, "loss": 3.7886, "rewards/accuracies": 0.75, "rewards/chosen": -40.48076629638672, "rewards/margins": 4.581057548522949, "rewards/rejected": -45.06182098388672, "step": 4248 }, { "epoch": 0.5785675381263616, "grad_norm": 51.34497100539351, "learning_rate": 3.6023142228057364e-07, "logits/chosen": 12.60179328918457, "logits/rejected": 13.89482307434082, "logps/chosen": -3.9968178272247314, "logps/rejected": -4.242275238037109, "loss": 4.266, "rewards/accuracies": 0.75, "rewards/chosen": -39.968177795410156, "rewards/margins": 2.4545764923095703, "rewards/rejected": -42.422752380371094, "step": 4249 }, { "epoch": 0.5787037037037037, "grad_norm": 39.36678004055825, "learning_rate": 3.600422285744306e-07, "logits/chosen": 12.859249114990234, "logits/rejected": 13.194661140441895, "logps/chosen": -3.7933621406555176, "logps/rejected": -3.909257411956787, "loss": 4.1984, "rewards/accuracies": 0.5, "rewards/chosen": -37.93362045288086, "rewards/margins": 1.1589555740356445, "rewards/rejected": -39.09257507324219, "step": 4250 }, { "epoch": 0.5788398692810458, "grad_norm": 38.55456725546766, "learning_rate": 3.5985304389707635e-07, "logits/chosen": 13.353816032409668, "logits/rejected": 13.264751434326172, "logps/chosen": -4.239008903503418, "logps/rejected": -4.437474250793457, "loss": 4.2207, "rewards/accuracies": 0.75, "rewards/chosen": -42.39009475708008, "rewards/margins": 1.9846506118774414, "rewards/rejected": -44.37474060058594, "step": 4251 }, { "epoch": 0.5789760348583878, "grad_norm": 41.574274396999655, "learning_rate": 3.596638682912589e-07, "logits/chosen": 13.007915496826172, "logits/rejected": 13.934553146362305, "logps/chosen": -4.406652450561523, "logps/rejected": -4.892055511474609, "loss": 4.0304, "rewards/accuracies": 1.0, "rewards/chosen": -44.066524505615234, "rewards/margins": 4.854028701782227, "rewards/rejected": -48.920555114746094, "step": 4252 }, { "epoch": 0.5791122004357299, "grad_norm": 35.45634407741334, "learning_rate": 3.5947470179972355e-07, "logits/chosen": 13.464892387390137, "logits/rejected": 13.483186721801758, "logps/chosen": -3.831451416015625, "logps/rejected": -3.944254159927368, "loss": 3.6537, "rewards/accuracies": 0.75, "rewards/chosen": -38.31451416015625, "rewards/margins": 1.128026008605957, "rewards/rejected": -39.442543029785156, "step": 4253 }, { "epoch": 0.579248366013072, "grad_norm": 38.15662092148279, "learning_rate": 3.5928554446521434e-07, "logits/chosen": 13.738969802856445, "logits/rejected": 13.971899032592773, "logps/chosen": -4.096656799316406, "logps/rejected": -4.2639241218566895, "loss": 3.6738, "rewards/accuracies": 0.75, "rewards/chosen": -40.96656799316406, "rewards/margins": 1.672673225402832, "rewards/rejected": -42.639244079589844, "step": 4254 }, { "epoch": 0.5793845315904139, "grad_norm": 37.61074632028811, "learning_rate": 3.590963963304731e-07, "logits/chosen": 13.228618621826172, "logits/rejected": 13.94039535522461, "logps/chosen": -4.207413196563721, "logps/rejected": -4.430229187011719, "loss": 3.9263, "rewards/accuracies": 0.75, "rewards/chosen": -42.074134826660156, "rewards/margins": 2.2281627655029297, "rewards/rejected": -44.30229568481445, "step": 4255 }, { "epoch": 0.579520697167756, "grad_norm": 42.49667727528975, "learning_rate": 3.5890725743823905e-07, "logits/chosen": 13.334915161132812, "logits/rejected": 13.469512939453125, "logps/chosen": -4.372389793395996, "logps/rejected": -4.444180011749268, "loss": 4.504, "rewards/accuracies": 0.5, "rewards/chosen": -43.723899841308594, "rewards/margins": 0.7178983688354492, "rewards/rejected": -44.44179916381836, "step": 4256 }, { "epoch": 0.5796568627450981, "grad_norm": 36.29194809763268, "learning_rate": 3.5871812783124987e-07, "logits/chosen": 13.882911682128906, "logits/rejected": 14.186830520629883, "logps/chosen": -4.377410411834717, "logps/rejected": -4.679954528808594, "loss": 3.8633, "rewards/accuracies": 0.75, "rewards/chosen": -43.77410888671875, "rewards/margins": 3.025437355041504, "rewards/rejected": -46.79954528808594, "step": 4257 }, { "epoch": 0.5797930283224401, "grad_norm": 40.95013467392382, "learning_rate": 3.5852900755224115e-07, "logits/chosen": 12.430763244628906, "logits/rejected": 13.0740966796875, "logps/chosen": -4.073322296142578, "logps/rejected": -4.226016521453857, "loss": 4.4542, "rewards/accuracies": 0.75, "rewards/chosen": -40.73322296142578, "rewards/margins": 1.5269441604614258, "rewards/rejected": -42.26016616821289, "step": 4258 }, { "epoch": 0.5799291938997821, "grad_norm": 39.16145210898278, "learning_rate": 3.5833989664394574e-07, "logits/chosen": 13.440322875976562, "logits/rejected": 13.677752494812012, "logps/chosen": -4.373043537139893, "logps/rejected": -4.391547679901123, "loss": 4.0893, "rewards/accuracies": 0.25, "rewards/chosen": -43.730438232421875, "rewards/margins": 0.18503952026367188, "rewards/rejected": -43.91547393798828, "step": 4259 }, { "epoch": 0.5800653594771242, "grad_norm": 41.49719171125037, "learning_rate": 3.5815079514909504e-07, "logits/chosen": 13.525541305541992, "logits/rejected": 13.288925170898438, "logps/chosen": -4.471171855926514, "logps/rejected": -4.316394805908203, "loss": 4.4526, "rewards/accuracies": 0.5, "rewards/chosen": -44.71171951293945, "rewards/margins": -1.5477714538574219, "rewards/rejected": -43.16394805908203, "step": 4260 }, { "epoch": 0.5802015250544662, "grad_norm": 35.19225728286332, "learning_rate": 3.5796170311041826e-07, "logits/chosen": 13.10183334350586, "logits/rejected": 13.104421615600586, "logps/chosen": -3.7479286193847656, "logps/rejected": -4.195146560668945, "loss": 3.8951, "rewards/accuracies": 0.75, "rewards/chosen": -37.479286193847656, "rewards/margins": 4.472182273864746, "rewards/rejected": -41.95146942138672, "step": 4261 }, { "epoch": 0.5803376906318083, "grad_norm": 40.22812724350172, "learning_rate": 3.577726205706421e-07, "logits/chosen": 13.478307723999023, "logits/rejected": 13.713424682617188, "logps/chosen": -4.036830902099609, "logps/rejected": -4.357508659362793, "loss": 3.7009, "rewards/accuracies": 1.0, "rewards/chosen": -40.368309020996094, "rewards/margins": 3.206782341003418, "rewards/rejected": -43.57509231567383, "step": 4262 }, { "epoch": 0.5804738562091504, "grad_norm": 36.9582146574571, "learning_rate": 3.575835475724913e-07, "logits/chosen": 13.311845779418945, "logits/rejected": 13.06332778930664, "logps/chosen": -4.124588966369629, "logps/rejected": -4.083412170410156, "loss": 3.78, "rewards/accuracies": 0.5, "rewards/chosen": -41.24589157104492, "rewards/margins": -0.4117708206176758, "rewards/rejected": -40.83412170410156, "step": 4263 }, { "epoch": 0.5806100217864923, "grad_norm": 37.562345445982004, "learning_rate": 3.5739448415868867e-07, "logits/chosen": 13.061286926269531, "logits/rejected": 13.853401184082031, "logps/chosen": -4.078197956085205, "logps/rejected": -4.2812018394470215, "loss": 3.4625, "rewards/accuracies": 0.75, "rewards/chosen": -40.781982421875, "rewards/margins": 2.0300378799438477, "rewards/rejected": -42.81201934814453, "step": 4264 }, { "epoch": 0.5807461873638344, "grad_norm": 40.48209322933946, "learning_rate": 3.572054303719545e-07, "logits/chosen": 12.681683540344238, "logits/rejected": 13.259060859680176, "logps/chosen": -3.8715105056762695, "logps/rejected": -3.9768242835998535, "loss": 3.7971, "rewards/accuracies": 0.5, "rewards/chosen": -38.71510314941406, "rewards/margins": 1.0531396865844727, "rewards/rejected": -39.76824188232422, "step": 4265 }, { "epoch": 0.5808823529411765, "grad_norm": 37.854159648872134, "learning_rate": 3.5701638625500697e-07, "logits/chosen": 12.925999641418457, "logits/rejected": 13.359612464904785, "logps/chosen": -3.807497978210449, "logps/rejected": -3.9873900413513184, "loss": 3.8281, "rewards/accuracies": 0.5, "rewards/chosen": -38.074981689453125, "rewards/margins": 1.798922061920166, "rewards/rejected": -39.8739013671875, "step": 4266 }, { "epoch": 0.5810185185185185, "grad_norm": 36.32152688275263, "learning_rate": 3.5682735185056235e-07, "logits/chosen": 13.335636138916016, "logits/rejected": 13.814115524291992, "logps/chosen": -3.797865390777588, "logps/rejected": -4.159404754638672, "loss": 3.6454, "rewards/accuracies": 0.75, "rewards/chosen": -37.97865295410156, "rewards/margins": 3.6153974533081055, "rewards/rejected": -41.594051361083984, "step": 4267 }, { "epoch": 0.5811546840958606, "grad_norm": 40.13696769685199, "learning_rate": 3.566383272013344e-07, "logits/chosen": 12.278800010681152, "logits/rejected": 13.493602752685547, "logps/chosen": -3.6559813022613525, "logps/rejected": -4.159241676330566, "loss": 4.3793, "rewards/accuracies": 1.0, "rewards/chosen": -36.559814453125, "rewards/margins": 5.032607078552246, "rewards/rejected": -41.59242248535156, "step": 4268 }, { "epoch": 0.5812908496732027, "grad_norm": 44.85665109493625, "learning_rate": 3.56449312350035e-07, "logits/chosen": 12.301254272460938, "logits/rejected": 13.256715774536133, "logps/chosen": -3.902998208999634, "logps/rejected": -4.132973670959473, "loss": 3.7437, "rewards/accuracies": 0.75, "rewards/chosen": -39.02998352050781, "rewards/margins": 2.2997541427612305, "rewards/rejected": -41.329734802246094, "step": 4269 }, { "epoch": 0.5814270152505446, "grad_norm": 39.33915903202537, "learning_rate": 3.562603073393733e-07, "logits/chosen": 13.385677337646484, "logits/rejected": 13.726085662841797, "logps/chosen": -4.1051225662231445, "logps/rejected": -4.507150650024414, "loss": 4.4742, "rewards/accuracies": 0.75, "rewards/chosen": -41.05122375488281, "rewards/margins": 4.020284652709961, "rewards/rejected": -45.071502685546875, "step": 4270 }, { "epoch": 0.5815631808278867, "grad_norm": 39.91003844640914, "learning_rate": 3.5607131221205674e-07, "logits/chosen": 13.265339851379395, "logits/rejected": 13.365432739257812, "logps/chosen": -4.1934356689453125, "logps/rejected": -4.018012523651123, "loss": 4.1155, "rewards/accuracies": 0.25, "rewards/chosen": -41.934356689453125, "rewards/margins": -1.7542304992675781, "rewards/rejected": -40.18012619018555, "step": 4271 }, { "epoch": 0.5816993464052288, "grad_norm": 39.20613853485117, "learning_rate": 3.558823270107904e-07, "logits/chosen": 12.975115776062012, "logits/rejected": 13.566256523132324, "logps/chosen": -4.206681728363037, "logps/rejected": -4.577511310577393, "loss": 3.791, "rewards/accuracies": 1.0, "rewards/chosen": -42.06681823730469, "rewards/margins": 3.7082948684692383, "rewards/rejected": -45.77511215209961, "step": 4272 }, { "epoch": 0.5818355119825708, "grad_norm": 41.31662921241146, "learning_rate": 3.556933517782769e-07, "logits/chosen": 12.827831268310547, "logits/rejected": 13.507173538208008, "logps/chosen": -3.917588710784912, "logps/rejected": -4.234413146972656, "loss": 3.6487, "rewards/accuracies": 1.0, "rewards/chosen": -39.17588424682617, "rewards/margins": 3.1682443618774414, "rewards/rejected": -42.34413146972656, "step": 4273 }, { "epoch": 0.5819716775599129, "grad_norm": 39.572529147253796, "learning_rate": 3.5550438655721676e-07, "logits/chosen": 12.558202743530273, "logits/rejected": 13.110934257507324, "logps/chosen": -3.9962470531463623, "logps/rejected": -4.322517395019531, "loss": 3.9927, "rewards/accuracies": 0.75, "rewards/chosen": -39.96247100830078, "rewards/margins": 3.262704849243164, "rewards/rejected": -43.22517395019531, "step": 4274 }, { "epoch": 0.5821078431372549, "grad_norm": 40.05510623653836, "learning_rate": 3.5531543139030826e-07, "logits/chosen": 13.602797508239746, "logits/rejected": 13.971309661865234, "logps/chosen": -4.1167426109313965, "logps/rejected": -4.126230716705322, "loss": 4.3553, "rewards/accuracies": 0.5, "rewards/chosen": -41.16742706298828, "rewards/margins": 0.09487724304199219, "rewards/rejected": -41.262306213378906, "step": 4275 }, { "epoch": 0.5822440087145969, "grad_norm": 42.730126061468184, "learning_rate": 3.551264863202476e-07, "logits/chosen": 12.23138427734375, "logits/rejected": 12.589784622192383, "logps/chosen": -3.7857561111450195, "logps/rejected": -4.107543468475342, "loss": 4.0815, "rewards/accuracies": 0.5, "rewards/chosen": -37.85755920410156, "rewards/margins": 3.217874526977539, "rewards/rejected": -41.075435638427734, "step": 4276 }, { "epoch": 0.582380174291939, "grad_norm": 36.87843416273103, "learning_rate": 3.549375513897281e-07, "logits/chosen": 12.48792839050293, "logits/rejected": 13.323083877563477, "logps/chosen": -3.828223943710327, "logps/rejected": -4.06245756149292, "loss": 3.8702, "rewards/accuracies": 0.75, "rewards/chosen": -38.28224182128906, "rewards/margins": 2.342336654663086, "rewards/rejected": -40.624576568603516, "step": 4277 }, { "epoch": 0.5825163398692811, "grad_norm": 36.87123116528135, "learning_rate": 3.5474862664144134e-07, "logits/chosen": 13.185978889465332, "logits/rejected": 13.31985092163086, "logps/chosen": -4.364792823791504, "logps/rejected": -4.316437244415283, "loss": 3.961, "rewards/accuracies": 0.75, "rewards/chosen": -43.647926330566406, "rewards/margins": -0.4835548400878906, "rewards/rejected": -43.16437530517578, "step": 4278 }, { "epoch": 0.5826525054466231, "grad_norm": 42.67548822045292, "learning_rate": 3.545597121180766e-07, "logits/chosen": 13.411056518554688, "logits/rejected": 13.659709930419922, "logps/chosen": -4.445974349975586, "logps/rejected": -4.669503688812256, "loss": 4.3668, "rewards/accuracies": 0.5, "rewards/chosen": -44.459747314453125, "rewards/margins": 2.2352895736694336, "rewards/rejected": -46.695037841796875, "step": 4279 }, { "epoch": 0.5827886710239651, "grad_norm": 38.7634665744295, "learning_rate": 3.543708078623204e-07, "logits/chosen": 12.817900657653809, "logits/rejected": 13.974882125854492, "logps/chosen": -3.9347496032714844, "logps/rejected": -3.9129154682159424, "loss": 3.9299, "rewards/accuracies": 0.5, "rewards/chosen": -39.347496032714844, "rewards/margins": -0.21834182739257812, "rewards/rejected": -39.129154205322266, "step": 4280 }, { "epoch": 0.5829248366013072, "grad_norm": 39.24017021220592, "learning_rate": 3.541819139168573e-07, "logits/chosen": 12.827800750732422, "logits/rejected": 14.179933547973633, "logps/chosen": -3.9202334880828857, "logps/rejected": -4.240945816040039, "loss": 4.2276, "rewards/accuracies": 0.75, "rewards/chosen": -39.202335357666016, "rewards/margins": 3.207127571105957, "rewards/rejected": -42.409461975097656, "step": 4281 }, { "epoch": 0.5830610021786492, "grad_norm": 41.88416612677054, "learning_rate": 3.5399303032436967e-07, "logits/chosen": 12.416455268859863, "logits/rejected": 13.321088790893555, "logps/chosen": -3.9702343940734863, "logps/rejected": -4.385807991027832, "loss": 4.4689, "rewards/accuracies": 0.75, "rewards/chosen": -39.70234680175781, "rewards/margins": 4.155734062194824, "rewards/rejected": -43.85807800292969, "step": 4282 }, { "epoch": 0.5831971677559913, "grad_norm": 44.4614100871878, "learning_rate": 3.5380415712753695e-07, "logits/chosen": 14.023828506469727, "logits/rejected": 14.019861221313477, "logps/chosen": -4.328784465789795, "logps/rejected": -4.543615341186523, "loss": 4.044, "rewards/accuracies": 1.0, "rewards/chosen": -43.287845611572266, "rewards/margins": 2.148305892944336, "rewards/rejected": -45.43614959716797, "step": 4283 }, { "epoch": 0.5833333333333334, "grad_norm": 38.05242077013399, "learning_rate": 3.536152943690368e-07, "logits/chosen": 14.09477424621582, "logits/rejected": 13.409561157226562, "logps/chosen": -4.390015602111816, "logps/rejected": -4.111143112182617, "loss": 3.5413, "rewards/accuracies": 0.5, "rewards/chosen": -43.90015411376953, "rewards/margins": -2.7887210845947266, "rewards/rejected": -41.11143493652344, "step": 4284 }, { "epoch": 0.5834694989106753, "grad_norm": 43.38002711122084, "learning_rate": 3.534264420915445e-07, "logits/chosen": 13.651406288146973, "logits/rejected": 13.456954956054688, "logps/chosen": -4.426023006439209, "logps/rejected": -4.360034942626953, "loss": 4.5005, "rewards/accuracies": 0.75, "rewards/chosen": -44.260231018066406, "rewards/margins": -0.6598806381225586, "rewards/rejected": -43.60034942626953, "step": 4285 }, { "epoch": 0.5836056644880174, "grad_norm": 40.03423156681485, "learning_rate": 3.532376003377324e-07, "logits/chosen": 13.372037887573242, "logits/rejected": 13.0363130569458, "logps/chosen": -3.819772481918335, "logps/rejected": -4.025119781494141, "loss": 4.4066, "rewards/accuracies": 0.75, "rewards/chosen": -38.197723388671875, "rewards/margins": 2.0534772872924805, "rewards/rejected": -40.25120162963867, "step": 4286 }, { "epoch": 0.5837418300653595, "grad_norm": 39.90577619939357, "learning_rate": 3.53048769150271e-07, "logits/chosen": 13.327818870544434, "logits/rejected": 12.961023330688477, "logps/chosen": -4.137443542480469, "logps/rejected": -4.201053619384766, "loss": 3.8132, "rewards/accuracies": 0.75, "rewards/chosen": -41.37443542480469, "rewards/margins": 0.6360979080200195, "rewards/rejected": -42.010536193847656, "step": 4287 }, { "epoch": 0.5838779956427015, "grad_norm": 42.25790395641728, "learning_rate": 3.528599485718285e-07, "logits/chosen": 13.92453384399414, "logits/rejected": 14.893209457397461, "logps/chosen": -4.3072686195373535, "logps/rejected": -4.585822105407715, "loss": 4.0689, "rewards/accuracies": 1.0, "rewards/chosen": -43.07268524169922, "rewards/margins": 2.7855348587036133, "rewards/rejected": -45.85822296142578, "step": 4288 }, { "epoch": 0.5840141612200436, "grad_norm": 39.14793910939196, "learning_rate": 3.5267113864507016e-07, "logits/chosen": 12.724369049072266, "logits/rejected": 12.716724395751953, "logps/chosen": -3.658282518386841, "logps/rejected": -3.9520177841186523, "loss": 3.8065, "rewards/accuracies": 0.75, "rewards/chosen": -36.58282470703125, "rewards/margins": 2.9373507499694824, "rewards/rejected": -39.520179748535156, "step": 4289 }, { "epoch": 0.5841503267973857, "grad_norm": 38.72871982319146, "learning_rate": 3.5248233941265926e-07, "logits/chosen": 13.84050178527832, "logits/rejected": 13.087806701660156, "logps/chosen": -4.139301300048828, "logps/rejected": -3.853227138519287, "loss": 4.2677, "rewards/accuracies": 0.25, "rewards/chosen": -41.39301681518555, "rewards/margins": -2.860743522644043, "rewards/rejected": -38.53227233886719, "step": 4290 }, { "epoch": 0.5842864923747276, "grad_norm": 39.07489720645732, "learning_rate": 3.522935509172567e-07, "logits/chosen": 13.558060646057129, "logits/rejected": 13.919966697692871, "logps/chosen": -4.1512579917907715, "logps/rejected": -4.239260196685791, "loss": 3.7499, "rewards/accuracies": 0.5, "rewards/chosen": -41.51258087158203, "rewards/margins": 0.8800239562988281, "rewards/rejected": -42.39260482788086, "step": 4291 }, { "epoch": 0.5844226579520697, "grad_norm": 37.24021250398234, "learning_rate": 3.521047732015205e-07, "logits/chosen": 13.340045928955078, "logits/rejected": 13.601536750793457, "logps/chosen": -4.015135288238525, "logps/rejected": -4.300358295440674, "loss": 3.9455, "rewards/accuracies": 0.75, "rewards/chosen": -40.15135192871094, "rewards/margins": 2.8522281646728516, "rewards/rejected": -43.00358200073242, "step": 4292 }, { "epoch": 0.5845588235294118, "grad_norm": 41.90742226183146, "learning_rate": 3.519160063081067e-07, "logits/chosen": 13.264213562011719, "logits/rejected": 13.293352127075195, "logps/chosen": -4.049216270446777, "logps/rejected": -4.242258071899414, "loss": 4.337, "rewards/accuracies": 0.75, "rewards/chosen": -40.492164611816406, "rewards/margins": 1.9304189682006836, "rewards/rejected": -42.422584533691406, "step": 4293 }, { "epoch": 0.5846949891067538, "grad_norm": 40.94298392168802, "learning_rate": 3.517272502796689e-07, "logits/chosen": 14.019323348999023, "logits/rejected": 14.36801528930664, "logps/chosen": -4.205334663391113, "logps/rejected": -4.453843116760254, "loss": 4.2056, "rewards/accuracies": 0.75, "rewards/chosen": -42.0533447265625, "rewards/margins": 2.4850854873657227, "rewards/rejected": -44.538429260253906, "step": 4294 }, { "epoch": 0.5848311546840959, "grad_norm": 40.81726863084586, "learning_rate": 3.515385051588578e-07, "logits/chosen": 13.06460189819336, "logits/rejected": 13.83283805847168, "logps/chosen": -4.301624298095703, "logps/rejected": -4.713613986968994, "loss": 4.4754, "rewards/accuracies": 1.0, "rewards/chosen": -43.01624298095703, "rewards/margins": 4.119897842407227, "rewards/rejected": -47.136138916015625, "step": 4295 }, { "epoch": 0.5849673202614379, "grad_norm": 39.38079973377912, "learning_rate": 3.5134977098832195e-07, "logits/chosen": 12.260909080505371, "logits/rejected": 12.762991905212402, "logps/chosen": -4.198428153991699, "logps/rejected": -4.007899284362793, "loss": 4.601, "rewards/accuracies": 0.5, "rewards/chosen": -41.984283447265625, "rewards/margins": -1.905287742614746, "rewards/rejected": -40.07899475097656, "step": 4296 }, { "epoch": 0.5851034858387799, "grad_norm": 40.48429005309049, "learning_rate": 3.5116104781070774e-07, "logits/chosen": 12.585498809814453, "logits/rejected": 12.26144790649414, "logps/chosen": -3.995225429534912, "logps/rejected": -3.7787747383117676, "loss": 4.2401, "rewards/accuracies": 0.25, "rewards/chosen": -39.95225524902344, "rewards/margins": -2.164504051208496, "rewards/rejected": -37.78774642944336, "step": 4297 }, { "epoch": 0.585239651416122, "grad_norm": 40.88827567767022, "learning_rate": 3.509723356686583e-07, "logits/chosen": 13.8365478515625, "logits/rejected": 13.206258773803711, "logps/chosen": -4.288195610046387, "logps/rejected": -4.305489540100098, "loss": 3.6642, "rewards/accuracies": 0.5, "rewards/chosen": -42.8819580078125, "rewards/margins": 0.17293548583984375, "rewards/rejected": -43.054893493652344, "step": 4298 }, { "epoch": 0.5853758169934641, "grad_norm": 37.29975020385796, "learning_rate": 3.507836346048149e-07, "logits/chosen": 13.594030380249023, "logits/rejected": 13.50540542602539, "logps/chosen": -4.272902488708496, "logps/rejected": -4.389895915985107, "loss": 4.1332, "rewards/accuracies": 0.5, "rewards/chosen": -42.72902297973633, "rewards/margins": 1.1699352264404297, "rewards/rejected": -43.898956298828125, "step": 4299 }, { "epoch": 0.585511982570806, "grad_norm": 35.8112374808351, "learning_rate": 3.5059494466181623e-07, "logits/chosen": 14.332916259765625, "logits/rejected": 14.056316375732422, "logps/chosen": -4.409590721130371, "logps/rejected": -4.704174518585205, "loss": 3.6817, "rewards/accuracies": 0.75, "rewards/chosen": -44.09590530395508, "rewards/margins": 2.945840835571289, "rewards/rejected": -47.041748046875, "step": 4300 }, { "epoch": 0.5856481481481481, "grad_norm": 39.076346566694426, "learning_rate": 3.5040626588229803e-07, "logits/chosen": 13.723167419433594, "logits/rejected": 13.512081146240234, "logps/chosen": -4.538295745849609, "logps/rejected": -4.518967628479004, "loss": 3.9113, "rewards/accuracies": 0.5, "rewards/chosen": -45.382957458496094, "rewards/margins": -0.19328594207763672, "rewards/rejected": -45.189674377441406, "step": 4301 }, { "epoch": 0.5857843137254902, "grad_norm": 38.85288772422081, "learning_rate": 3.50217598308894e-07, "logits/chosen": 13.663837432861328, "logits/rejected": 13.925735473632812, "logps/chosen": -4.418360710144043, "logps/rejected": -4.390642166137695, "loss": 4.2599, "rewards/accuracies": 0.75, "rewards/chosen": -44.18360900878906, "rewards/margins": -0.2771883010864258, "rewards/rejected": -43.90642166137695, "step": 4302 }, { "epoch": 0.5859204793028322, "grad_norm": 36.49167496663895, "learning_rate": 3.5002894198423533e-07, "logits/chosen": 13.775625228881836, "logits/rejected": 13.578862190246582, "logps/chosen": -4.18446683883667, "logps/rejected": -4.259560585021973, "loss": 3.861, "rewards/accuracies": 0.5, "rewards/chosen": -41.84466552734375, "rewards/margins": 0.7509384155273438, "rewards/rejected": -42.595603942871094, "step": 4303 }, { "epoch": 0.5860566448801743, "grad_norm": 37.47285526832838, "learning_rate": 3.498402969509501e-07, "logits/chosen": 13.889129638671875, "logits/rejected": 14.606204986572266, "logps/chosen": -4.276574611663818, "logps/rejected": -4.518589973449707, "loss": 4.208, "rewards/accuracies": 1.0, "rewards/chosen": -42.7657470703125, "rewards/margins": 2.4201574325561523, "rewards/rejected": -45.18590545654297, "step": 4304 }, { "epoch": 0.5861928104575164, "grad_norm": 38.30357188187501, "learning_rate": 3.496516632516644e-07, "logits/chosen": 13.752565383911133, "logits/rejected": 13.62933349609375, "logps/chosen": -4.277185440063477, "logps/rejected": -4.2754034996032715, "loss": 4.0598, "rewards/accuracies": 0.5, "rewards/chosen": -42.7718505859375, "rewards/margins": -0.017816543579101562, "rewards/rejected": -42.75403594970703, "step": 4305 }, { "epoch": 0.5863289760348583, "grad_norm": 38.114734952894786, "learning_rate": 3.494630409290017e-07, "logits/chosen": 14.19108772277832, "logits/rejected": 14.089987754821777, "logps/chosen": -4.513749122619629, "logps/rejected": -4.480059623718262, "loss": 4.0037, "rewards/accuracies": 0.75, "rewards/chosen": -45.13749313354492, "rewards/margins": -0.3368949890136719, "rewards/rejected": -44.800594329833984, "step": 4306 }, { "epoch": 0.5864651416122004, "grad_norm": 36.94847144073054, "learning_rate": 3.4927443002558255e-07, "logits/chosen": 12.649343490600586, "logits/rejected": 13.224841117858887, "logps/chosen": -3.872968912124634, "logps/rejected": -4.025104999542236, "loss": 3.7972, "rewards/accuracies": 0.75, "rewards/chosen": -38.72968673706055, "rewards/margins": 1.5213603973388672, "rewards/rejected": -40.25104904174805, "step": 4307 }, { "epoch": 0.5866013071895425, "grad_norm": 38.1386122428987, "learning_rate": 3.4908583058402517e-07, "logits/chosen": 14.056288719177246, "logits/rejected": 14.583396911621094, "logps/chosen": -4.772212028503418, "logps/rejected": -4.872341632843018, "loss": 3.8107, "rewards/accuracies": 0.75, "rewards/chosen": -47.72211837768555, "rewards/margins": 1.0012969970703125, "rewards/rejected": -48.72341537475586, "step": 4308 }, { "epoch": 0.5867374727668845, "grad_norm": 38.03069596402022, "learning_rate": 3.488972426469454e-07, "logits/chosen": 12.246881484985352, "logits/rejected": 13.290349960327148, "logps/chosen": -3.839609146118164, "logps/rejected": -4.294493198394775, "loss": 4.1914, "rewards/accuracies": 1.0, "rewards/chosen": -38.39609146118164, "rewards/margins": 4.54884147644043, "rewards/rejected": -42.94493103027344, "step": 4309 }, { "epoch": 0.5868736383442266, "grad_norm": 42.24601645033692, "learning_rate": 3.4870866625695595e-07, "logits/chosen": 14.067134857177734, "logits/rejected": 13.760225296020508, "logps/chosen": -4.303431510925293, "logps/rejected": -4.151589393615723, "loss": 3.3727, "rewards/accuracies": 0.25, "rewards/chosen": -43.03431701660156, "rewards/margins": -1.5184249877929688, "rewards/rejected": -41.515892028808594, "step": 4310 }, { "epoch": 0.5870098039215687, "grad_norm": 34.57812267062475, "learning_rate": 3.4852010145666733e-07, "logits/chosen": 13.726381301879883, "logits/rejected": 13.36335277557373, "logps/chosen": -4.299925804138184, "logps/rejected": -4.436337471008301, "loss": 3.9796, "rewards/accuracies": 0.5, "rewards/chosen": -42.99925994873047, "rewards/margins": 1.3641166687011719, "rewards/rejected": -44.363372802734375, "step": 4311 }, { "epoch": 0.5871459694989106, "grad_norm": 41.01427644660221, "learning_rate": 3.483315482886874e-07, "logits/chosen": 13.86043930053711, "logits/rejected": 14.062722206115723, "logps/chosen": -4.467157363891602, "logps/rejected": -4.521623134613037, "loss": 4.139, "rewards/accuracies": 0.5, "rewards/chosen": -44.67157745361328, "rewards/margins": 0.5446557998657227, "rewards/rejected": -45.21623229980469, "step": 4312 }, { "epoch": 0.5872821350762527, "grad_norm": 38.569016049395316, "learning_rate": 3.4814300679562127e-07, "logits/chosen": 13.692977905273438, "logits/rejected": 14.611987113952637, "logps/chosen": -4.379764556884766, "logps/rejected": -4.926974773406982, "loss": 3.5497, "rewards/accuracies": 0.75, "rewards/chosen": -43.79764938354492, "rewards/margins": 5.472097396850586, "rewards/rejected": -49.269744873046875, "step": 4313 }, { "epoch": 0.5874183006535948, "grad_norm": 36.84793485233523, "learning_rate": 3.4795447702007127e-07, "logits/chosen": 13.084144592285156, "logits/rejected": 13.760452270507812, "logps/chosen": -4.255722999572754, "logps/rejected": -4.4373250007629395, "loss": 4.1063, "rewards/accuracies": 0.75, "rewards/chosen": -42.55723190307617, "rewards/margins": 1.816019058227539, "rewards/rejected": -44.373252868652344, "step": 4314 }, { "epoch": 0.5875544662309368, "grad_norm": 39.42314802443885, "learning_rate": 3.4776595900463745e-07, "logits/chosen": 14.12357234954834, "logits/rejected": 14.477846145629883, "logps/chosen": -4.512610912322998, "logps/rejected": -4.646657943725586, "loss": 3.9722, "rewards/accuracies": 0.75, "rewards/chosen": -45.1261100769043, "rewards/margins": 1.340470314025879, "rewards/rejected": -46.466583251953125, "step": 4315 }, { "epoch": 0.5876906318082789, "grad_norm": 37.112777477556534, "learning_rate": 3.47577452791917e-07, "logits/chosen": 14.357176780700684, "logits/rejected": 14.155025482177734, "logps/chosen": -4.232803821563721, "logps/rejected": -4.33065128326416, "loss": 4.2826, "rewards/accuracies": 0.75, "rewards/chosen": -42.328041076660156, "rewards/margins": 0.9784708023071289, "rewards/rejected": -43.30651092529297, "step": 4316 }, { "epoch": 0.5878267973856209, "grad_norm": 36.08386350942166, "learning_rate": 3.473889584245044e-07, "logits/chosen": 13.308584213256836, "logits/rejected": 13.958389282226562, "logps/chosen": -3.950670003890991, "logps/rejected": -4.4111528396606445, "loss": 3.481, "rewards/accuracies": 1.0, "rewards/chosen": -39.50669860839844, "rewards/margins": 4.604827880859375, "rewards/rejected": -44.11153030395508, "step": 4317 }, { "epoch": 0.5879629629629629, "grad_norm": 37.72441714293938, "learning_rate": 3.472004759449916e-07, "logits/chosen": 14.4815092086792, "logits/rejected": 13.866601943969727, "logps/chosen": -4.525211334228516, "logps/rejected": -4.720281600952148, "loss": 4.1781, "rewards/accuracies": 0.75, "rewards/chosen": -45.252113342285156, "rewards/margins": 1.950704574584961, "rewards/rejected": -47.20281982421875, "step": 4318 }, { "epoch": 0.588099128540305, "grad_norm": 38.735722525661465, "learning_rate": 3.470120053959675e-07, "logits/chosen": 13.862127304077148, "logits/rejected": 14.034194946289062, "logps/chosen": -4.361239910125732, "logps/rejected": -4.545773506164551, "loss": 4.1159, "rewards/accuracies": 0.75, "rewards/chosen": -43.61240005493164, "rewards/margins": 1.8453340530395508, "rewards/rejected": -45.457733154296875, "step": 4319 }, { "epoch": 0.5882352941176471, "grad_norm": 38.739711408122645, "learning_rate": 3.46823546820019e-07, "logits/chosen": 13.764813423156738, "logits/rejected": 13.85952377319336, "logps/chosen": -4.208062171936035, "logps/rejected": -4.359122276306152, "loss": 4.4597, "rewards/accuracies": 0.75, "rewards/chosen": -42.08061981201172, "rewards/margins": 1.510599136352539, "rewards/rejected": -43.591224670410156, "step": 4320 }, { "epoch": 0.588371459694989, "grad_norm": 37.3773240074557, "learning_rate": 3.466351002597296e-07, "logits/chosen": 13.09473705291748, "logits/rejected": 14.067058563232422, "logps/chosen": -3.993889808654785, "logps/rejected": -4.424417495727539, "loss": 3.8482, "rewards/accuracies": 1.0, "rewards/chosen": -39.93889617919922, "rewards/margins": 4.305278778076172, "rewards/rejected": -44.24417495727539, "step": 4321 }, { "epoch": 0.5885076252723311, "grad_norm": 35.9238432084314, "learning_rate": 3.4644666575768035e-07, "logits/chosen": 12.538803100585938, "logits/rejected": 13.725044250488281, "logps/chosen": -3.877128839492798, "logps/rejected": -4.159786224365234, "loss": 3.904, "rewards/accuracies": 0.5, "rewards/chosen": -38.77128982543945, "rewards/margins": 2.8265724182128906, "rewards/rejected": -41.597862243652344, "step": 4322 }, { "epoch": 0.5886437908496732, "grad_norm": 39.132660309643164, "learning_rate": 3.4625824335644963e-07, "logits/chosen": 12.974662780761719, "logits/rejected": 13.748433113098145, "logps/chosen": -4.276216506958008, "logps/rejected": -4.347321510314941, "loss": 4.0842, "rewards/accuracies": 0.25, "rewards/chosen": -42.76216506958008, "rewards/margins": 0.7110509872436523, "rewards/rejected": -43.47322082519531, "step": 4323 }, { "epoch": 0.5887799564270153, "grad_norm": 43.32917298952114, "learning_rate": 3.460698330986132e-07, "logits/chosen": 13.555734634399414, "logits/rejected": 13.507484436035156, "logps/chosen": -4.307680130004883, "logps/rejected": -4.52004861831665, "loss": 3.7355, "rewards/accuracies": 0.5, "rewards/chosen": -43.07680130004883, "rewards/margins": 2.1236820220947266, "rewards/rejected": -45.20048522949219, "step": 4324 }, { "epoch": 0.5889161220043573, "grad_norm": 34.46152930598745, "learning_rate": 3.458814350267437e-07, "logits/chosen": 13.787429809570312, "logits/rejected": 13.337310791015625, "logps/chosen": -4.659398078918457, "logps/rejected": -4.479397773742676, "loss": 3.9205, "rewards/accuracies": 0.25, "rewards/chosen": -46.5939826965332, "rewards/margins": -1.8000068664550781, "rewards/rejected": -44.793975830078125, "step": 4325 }, { "epoch": 0.5890522875816994, "grad_norm": 36.92408396487178, "learning_rate": 3.4569304918341124e-07, "logits/chosen": 13.216463088989258, "logits/rejected": 13.15979290008545, "logps/chosen": -3.8795948028564453, "logps/rejected": -3.9735186100006104, "loss": 3.9429, "rewards/accuracies": 0.75, "rewards/chosen": -38.79594802856445, "rewards/margins": 0.9392366409301758, "rewards/rejected": -39.73518371582031, "step": 4326 }, { "epoch": 0.5891884531590414, "grad_norm": 38.297541649831075, "learning_rate": 3.455046756111834e-07, "logits/chosen": 13.24958610534668, "logits/rejected": 13.834488868713379, "logps/chosen": -4.1557512283325195, "logps/rejected": -4.6426239013671875, "loss": 4.4081, "rewards/accuracies": 1.0, "rewards/chosen": -41.557518005371094, "rewards/margins": 4.868722915649414, "rewards/rejected": -46.426239013671875, "step": 4327 }, { "epoch": 0.5893246187363834, "grad_norm": 39.36875579021716, "learning_rate": 3.453163143526244e-07, "logits/chosen": 14.375615119934082, "logits/rejected": 14.149740219116211, "logps/chosen": -4.743488788604736, "logps/rejected": -4.580954551696777, "loss": 4.0906, "rewards/accuracies": 0.25, "rewards/chosen": -47.43489074707031, "rewards/margins": -1.6253385543823242, "rewards/rejected": -45.80955123901367, "step": 4328 }, { "epoch": 0.5894607843137255, "grad_norm": 39.48744628896191, "learning_rate": 3.4512796545029616e-07, "logits/chosen": 13.354780197143555, "logits/rejected": 13.868285179138184, "logps/chosen": -4.080694198608398, "logps/rejected": -4.749279975891113, "loss": 3.8935, "rewards/accuracies": 1.0, "rewards/chosen": -40.80694580078125, "rewards/margins": 6.685859680175781, "rewards/rejected": -47.49280548095703, "step": 4329 }, { "epoch": 0.5895969498910676, "grad_norm": 37.52630338379116, "learning_rate": 3.4493962894675794e-07, "logits/chosen": 14.48802375793457, "logits/rejected": 13.725677490234375, "logps/chosen": -4.650273323059082, "logps/rejected": -4.7589111328125, "loss": 3.6014, "rewards/accuracies": 0.5, "rewards/chosen": -46.50273132324219, "rewards/margins": 1.0863847732543945, "rewards/rejected": -47.589115142822266, "step": 4330 }, { "epoch": 0.5897331154684096, "grad_norm": 42.656100089654295, "learning_rate": 3.4475130488456543e-07, "logits/chosen": 13.520200729370117, "logits/rejected": 14.023191452026367, "logps/chosen": -4.048078536987305, "logps/rejected": -4.482209205627441, "loss": 4.0378, "rewards/accuracies": 0.75, "rewards/chosen": -40.48078536987305, "rewards/margins": 4.341310501098633, "rewards/rejected": -44.82209777832031, "step": 4331 }, { "epoch": 0.5898692810457516, "grad_norm": 41.27849320091231, "learning_rate": 3.445629933062723e-07, "logits/chosen": 13.465300559997559, "logits/rejected": 14.115345001220703, "logps/chosen": -4.556851387023926, "logps/rejected": -4.457605361938477, "loss": 3.7537, "rewards/accuracies": 0.25, "rewards/chosen": -45.568511962890625, "rewards/margins": -0.9924583435058594, "rewards/rejected": -44.5760498046875, "step": 4332 }, { "epoch": 0.5900054466230937, "grad_norm": 37.64968766025188, "learning_rate": 3.443746942544293e-07, "logits/chosen": 13.357054710388184, "logits/rejected": 13.344141006469727, "logps/chosen": -4.210342884063721, "logps/rejected": -4.079684257507324, "loss": 4.0484, "rewards/accuracies": 0.25, "rewards/chosen": -42.103431701660156, "rewards/margins": -1.3065853118896484, "rewards/rejected": -40.796844482421875, "step": 4333 }, { "epoch": 0.5901416122004357, "grad_norm": 44.01984764960777, "learning_rate": 3.441864077715838e-07, "logits/chosen": 13.58279037475586, "logits/rejected": 13.706653594970703, "logps/chosen": -4.402734279632568, "logps/rejected": -4.304465293884277, "loss": 4.4569, "rewards/accuracies": 0.5, "rewards/chosen": -44.02734375, "rewards/margins": -0.9826898574829102, "rewards/rejected": -43.04465103149414, "step": 4334 }, { "epoch": 0.5902777777777778, "grad_norm": 38.76324277169754, "learning_rate": 3.4399813390028073e-07, "logits/chosen": 13.344270706176758, "logits/rejected": 13.31871223449707, "logps/chosen": -4.054477691650391, "logps/rejected": -4.253724575042725, "loss": 3.651, "rewards/accuracies": 0.75, "rewards/chosen": -40.544776916503906, "rewards/margins": 1.992466926574707, "rewards/rejected": -42.53724670410156, "step": 4335 }, { "epoch": 0.5904139433551199, "grad_norm": 39.262085542067794, "learning_rate": 3.438098726830624e-07, "logits/chosen": 13.297491073608398, "logits/rejected": 13.530534744262695, "logps/chosen": -4.332587242126465, "logps/rejected": -4.678703308105469, "loss": 3.6045, "rewards/accuracies": 0.75, "rewards/chosen": -43.325870513916016, "rewards/margins": 3.4611663818359375, "rewards/rejected": -46.78703689575195, "step": 4336 }, { "epoch": 0.5905501089324618, "grad_norm": 37.396251273375015, "learning_rate": 3.436216241624677e-07, "logits/chosen": 12.89764404296875, "logits/rejected": 13.439208984375, "logps/chosen": -3.7695586681365967, "logps/rejected": -4.259912490844727, "loss": 3.9513, "rewards/accuracies": 1.0, "rewards/chosen": -37.695587158203125, "rewards/margins": 4.903539657592773, "rewards/rejected": -42.59912872314453, "step": 4337 }, { "epoch": 0.5906862745098039, "grad_norm": 39.98848285146997, "learning_rate": 3.43433388381033e-07, "logits/chosen": 13.599515914916992, "logits/rejected": 14.041257858276367, "logps/chosen": -4.422293663024902, "logps/rejected": -4.073714733123779, "loss": 4.3238, "rewards/accuracies": 0.25, "rewards/chosen": -44.222938537597656, "rewards/margins": -3.4857873916625977, "rewards/rejected": -40.73714828491211, "step": 4338 }, { "epoch": 0.590822440087146, "grad_norm": 37.91462262852273, "learning_rate": 3.43245165381292e-07, "logits/chosen": 13.544193267822266, "logits/rejected": 13.507966995239258, "logps/chosen": -4.399207592010498, "logps/rejected": -4.4200263023376465, "loss": 4.0009, "rewards/accuracies": 0.5, "rewards/chosen": -43.9920768737793, "rewards/margins": 0.20818519592285156, "rewards/rejected": -44.200260162353516, "step": 4339 }, { "epoch": 0.590958605664488, "grad_norm": 38.3404929520501, "learning_rate": 3.430569552057748e-07, "logits/chosen": 13.917924880981445, "logits/rejected": 13.532848358154297, "logps/chosen": -3.99137806892395, "logps/rejected": -4.030611038208008, "loss": 3.9791, "rewards/accuracies": 0.5, "rewards/chosen": -39.913780212402344, "rewards/margins": 0.3923320770263672, "rewards/rejected": -40.306114196777344, "step": 4340 }, { "epoch": 0.5910947712418301, "grad_norm": 39.53658860884229, "learning_rate": 3.4286875789700926e-07, "logits/chosen": 13.482172966003418, "logits/rejected": 14.13880729675293, "logps/chosen": -4.4557037353515625, "logps/rejected": -4.603591442108154, "loss": 3.7119, "rewards/accuracies": 0.5, "rewards/chosen": -44.557037353515625, "rewards/margins": 1.4788789749145508, "rewards/rejected": -46.035911560058594, "step": 4341 }, { "epoch": 0.5912309368191722, "grad_norm": 38.637040566671274, "learning_rate": 3.426805734975203e-07, "logits/chosen": 14.0687255859375, "logits/rejected": 14.268158912658691, "logps/chosen": -4.400908946990967, "logps/rejected": -4.463334083557129, "loss": 4.3885, "rewards/accuracies": 0.5, "rewards/chosen": -44.00908660888672, "rewards/margins": 0.6242523193359375, "rewards/rejected": -44.633338928222656, "step": 4342 }, { "epoch": 0.5913671023965141, "grad_norm": 40.19515320169067, "learning_rate": 3.4249240204982944e-07, "logits/chosen": 12.749048233032227, "logits/rejected": 13.259954452514648, "logps/chosen": -4.182737350463867, "logps/rejected": -4.438941478729248, "loss": 4.5953, "rewards/accuracies": 0.5, "rewards/chosen": -41.82737731933594, "rewards/margins": 2.562037467956543, "rewards/rejected": -44.3894157409668, "step": 4343 }, { "epoch": 0.5915032679738562, "grad_norm": 42.90763585278384, "learning_rate": 3.423042435964557e-07, "logits/chosen": 13.947840690612793, "logits/rejected": 13.992996215820312, "logps/chosen": -4.145829677581787, "logps/rejected": -4.697044372558594, "loss": 4.3973, "rewards/accuracies": 0.75, "rewards/chosen": -41.45829772949219, "rewards/margins": 5.512148857116699, "rewards/rejected": -46.9704475402832, "step": 4344 }, { "epoch": 0.5916394335511983, "grad_norm": 39.337748730379644, "learning_rate": 3.421160981799152e-07, "logits/chosen": 14.131926536560059, "logits/rejected": 13.483972549438477, "logps/chosen": -4.695797920227051, "logps/rejected": -4.246277332305908, "loss": 4.5067, "rewards/accuracies": 0.25, "rewards/chosen": -46.957977294921875, "rewards/margins": -4.495206832885742, "rewards/rejected": -42.462772369384766, "step": 4345 }, { "epoch": 0.5917755991285403, "grad_norm": 37.63038582991664, "learning_rate": 3.4192796584272057e-07, "logits/chosen": 12.938699722290039, "logits/rejected": 13.49220085144043, "logps/chosen": -4.0576982498168945, "logps/rejected": -4.45589542388916, "loss": 3.9095, "rewards/accuracies": 1.0, "rewards/chosen": -40.57698059082031, "rewards/margins": 3.9819679260253906, "rewards/rejected": -44.5589485168457, "step": 4346 }, { "epoch": 0.5919117647058824, "grad_norm": 39.9646920933693, "learning_rate": 3.417398466273821e-07, "logits/chosen": 14.624092102050781, "logits/rejected": 13.708362579345703, "logps/chosen": -4.495657920837402, "logps/rejected": -4.200526237487793, "loss": 4.284, "rewards/accuracies": 0.0, "rewards/chosen": -44.956581115722656, "rewards/margins": -2.95131778717041, "rewards/rejected": -42.00526428222656, "step": 4347 }, { "epoch": 0.5920479302832244, "grad_norm": 38.67149269915991, "learning_rate": 3.4155174057640703e-07, "logits/chosen": 13.547174453735352, "logits/rejected": 14.642745018005371, "logps/chosen": -4.226554870605469, "logps/rejected": -4.767749309539795, "loss": 3.9093, "rewards/accuracies": 0.75, "rewards/chosen": -42.26554870605469, "rewards/margins": 5.411943435668945, "rewards/rejected": -47.677490234375, "step": 4348 }, { "epoch": 0.5921840958605664, "grad_norm": 41.17282615872478, "learning_rate": 3.413636477322992e-07, "logits/chosen": 13.72732925415039, "logits/rejected": 13.796581268310547, "logps/chosen": -4.290469646453857, "logps/rejected": -4.340692520141602, "loss": 4.1598, "rewards/accuracies": 0.75, "rewards/chosen": -42.904693603515625, "rewards/margins": 0.5022296905517578, "rewards/rejected": -43.40692901611328, "step": 4349 }, { "epoch": 0.5923202614379085, "grad_norm": 39.379087979611484, "learning_rate": 3.4117556813755985e-07, "logits/chosen": 13.36984634399414, "logits/rejected": 13.302560806274414, "logps/chosen": -4.081461429595947, "logps/rejected": -4.230128288269043, "loss": 4.2723, "rewards/accuracies": 0.75, "rewards/chosen": -40.814613342285156, "rewards/margins": 1.4866724014282227, "rewards/rejected": -42.30128479003906, "step": 4350 }, { "epoch": 0.5924564270152506, "grad_norm": 38.649665248025535, "learning_rate": 3.4098750183468726e-07, "logits/chosen": 13.232181549072266, "logits/rejected": 13.949403762817383, "logps/chosen": -4.13218879699707, "logps/rejected": -4.385784149169922, "loss": 3.8373, "rewards/accuracies": 0.75, "rewards/chosen": -41.3218879699707, "rewards/margins": 2.5359535217285156, "rewards/rejected": -43.85784149169922, "step": 4351 }, { "epoch": 0.5925925925925926, "grad_norm": 39.95618893881031, "learning_rate": 3.407994488661763e-07, "logits/chosen": 13.467432975769043, "logits/rejected": 13.79685115814209, "logps/chosen": -4.16130256652832, "logps/rejected": -4.452337265014648, "loss": 4.11, "rewards/accuracies": 1.0, "rewards/chosen": -41.61302185058594, "rewards/margins": 2.910348892211914, "rewards/rejected": -44.523372650146484, "step": 4352 }, { "epoch": 0.5927287581699346, "grad_norm": 42.75652737391036, "learning_rate": 3.4061140927451915e-07, "logits/chosen": 13.585285186767578, "logits/rejected": 13.546966552734375, "logps/chosen": -4.444923400878906, "logps/rejected": -4.4259233474731445, "loss": 4.3578, "rewards/accuracies": 0.5, "rewards/chosen": -44.44923400878906, "rewards/margins": -0.18999958038330078, "rewards/rejected": -44.25923156738281, "step": 4353 }, { "epoch": 0.5928649237472767, "grad_norm": 43.87916844731516, "learning_rate": 3.4042338310220524e-07, "logits/chosen": 13.333660125732422, "logits/rejected": 14.681329727172852, "logps/chosen": -4.071044445037842, "logps/rejected": -4.552981853485107, "loss": 3.4855, "rewards/accuracies": 1.0, "rewards/chosen": -40.710445404052734, "rewards/margins": 4.819375038146973, "rewards/rejected": -45.52981948852539, "step": 4354 }, { "epoch": 0.5930010893246187, "grad_norm": 37.94105967458056, "learning_rate": 3.4023537039172015e-07, "logits/chosen": 13.156864166259766, "logits/rejected": 13.247808456420898, "logps/chosen": -4.3321943283081055, "logps/rejected": -4.326774597167969, "loss": 3.7376, "rewards/accuracies": 0.5, "rewards/chosen": -43.32194137573242, "rewards/margins": -0.05419635772705078, "rewards/rejected": -43.26774597167969, "step": 4355 }, { "epoch": 0.5931372549019608, "grad_norm": 42.600089945821935, "learning_rate": 3.400473711855472e-07, "logits/chosen": 13.72909164428711, "logits/rejected": 14.038032531738281, "logps/chosen": -4.354201316833496, "logps/rejected": -4.532033920288086, "loss": 3.7128, "rewards/accuracies": 0.5, "rewards/chosen": -43.54201126098633, "rewards/margins": 1.7783279418945312, "rewards/rejected": -45.320343017578125, "step": 4356 }, { "epoch": 0.5932734204793029, "grad_norm": 39.91366139291477, "learning_rate": 3.3985938552616646e-07, "logits/chosen": 13.559768676757812, "logits/rejected": 13.660333633422852, "logps/chosen": -4.122287273406982, "logps/rejected": -4.5477142333984375, "loss": 3.8039, "rewards/accuracies": 1.0, "rewards/chosen": -41.222869873046875, "rewards/margins": 4.254269599914551, "rewards/rejected": -45.477142333984375, "step": 4357 }, { "epoch": 0.5934095860566448, "grad_norm": 39.72743182492158, "learning_rate": 3.396714134560545e-07, "logits/chosen": 13.322929382324219, "logits/rejected": 14.376996040344238, "logps/chosen": -4.202911376953125, "logps/rejected": -4.370260238647461, "loss": 3.8585, "rewards/accuracies": 0.5, "rewards/chosen": -42.02911376953125, "rewards/margins": 1.6734867095947266, "rewards/rejected": -43.702598571777344, "step": 4358 }, { "epoch": 0.5935457516339869, "grad_norm": 46.23636996792269, "learning_rate": 3.394834550176853e-07, "logits/chosen": 13.456262588500977, "logits/rejected": 13.2935791015625, "logps/chosen": -4.404225826263428, "logps/rejected": -4.33476448059082, "loss": 4.0535, "rewards/accuracies": 0.5, "rewards/chosen": -44.042259216308594, "rewards/margins": -0.6946115493774414, "rewards/rejected": -43.3476448059082, "step": 4359 }, { "epoch": 0.593681917211329, "grad_norm": 38.677001080737554, "learning_rate": 3.3929551025352987e-07, "logits/chosen": 13.754498481750488, "logits/rejected": 13.139480590820312, "logps/chosen": -4.2662200927734375, "logps/rejected": -3.9647817611694336, "loss": 4.2341, "rewards/accuracies": 0.25, "rewards/chosen": -42.662200927734375, "rewards/margins": -3.0143814086914062, "rewards/rejected": -39.64781951904297, "step": 4360 }, { "epoch": 0.593818082788671, "grad_norm": 39.70762107897886, "learning_rate": 3.391075792060556e-07, "logits/chosen": 13.615595817565918, "logits/rejected": 13.284366607666016, "logps/chosen": -4.131932258605957, "logps/rejected": -4.115660667419434, "loss": 4.4998, "rewards/accuracies": 0.5, "rewards/chosen": -41.31932067871094, "rewards/margins": -0.16271400451660156, "rewards/rejected": -41.15660858154297, "step": 4361 }, { "epoch": 0.5939542483660131, "grad_norm": 37.23559003269909, "learning_rate": 3.389196619177271e-07, "logits/chosen": 13.594356536865234, "logits/rejected": 14.47036075592041, "logps/chosen": -4.716218948364258, "logps/rejected": -4.722359657287598, "loss": 4.1728, "rewards/accuracies": 0.5, "rewards/chosen": -47.16218948364258, "rewards/margins": 0.06140613555908203, "rewards/rejected": -47.223594665527344, "step": 4362 }, { "epoch": 0.5940904139433552, "grad_norm": 39.723674110511176, "learning_rate": 3.38731758431006e-07, "logits/chosen": 13.947361946105957, "logits/rejected": 14.287342071533203, "logps/chosen": -4.531709671020508, "logps/rejected": -4.5877580642700195, "loss": 3.9153, "rewards/accuracies": 0.5, "rewards/chosen": -45.317100524902344, "rewards/margins": 0.560481071472168, "rewards/rejected": -45.87757873535156, "step": 4363 }, { "epoch": 0.5942265795206971, "grad_norm": 39.29608882404382, "learning_rate": 3.385438687883504e-07, "logits/chosen": 12.479082107543945, "logits/rejected": 13.517268180847168, "logps/chosen": -4.072942733764648, "logps/rejected": -4.393553733825684, "loss": 3.882, "rewards/accuracies": 0.75, "rewards/chosen": -40.729427337646484, "rewards/margins": 3.2061100006103516, "rewards/rejected": -43.93553924560547, "step": 4364 }, { "epoch": 0.5943627450980392, "grad_norm": 34.06763828226013, "learning_rate": 3.3835599303221567e-07, "logits/chosen": 13.43199634552002, "logits/rejected": 14.474605560302734, "logps/chosen": -3.9363582134246826, "logps/rejected": -4.553433418273926, "loss": 3.9982, "rewards/accuracies": 1.0, "rewards/chosen": -39.36358642578125, "rewards/margins": 6.170753479003906, "rewards/rejected": -45.53433609008789, "step": 4365 }, { "epoch": 0.5944989106753813, "grad_norm": 38.97705122427941, "learning_rate": 3.38168131205054e-07, "logits/chosen": 13.529789924621582, "logits/rejected": 13.556604385375977, "logps/chosen": -4.093886375427246, "logps/rejected": -4.515609264373779, "loss": 3.8765, "rewards/accuracies": 1.0, "rewards/chosen": -40.938865661621094, "rewards/margins": 4.217228889465332, "rewards/rejected": -45.15609359741211, "step": 4366 }, { "epoch": 0.5946350762527233, "grad_norm": 35.10258558709529, "learning_rate": 3.3798028334931404e-07, "logits/chosen": 12.980192184448242, "logits/rejected": 13.213353157043457, "logps/chosen": -4.252026557922363, "logps/rejected": -4.077353477478027, "loss": 3.9041, "rewards/accuracies": 0.5, "rewards/chosen": -42.520263671875, "rewards/margins": -1.7467317581176758, "rewards/rejected": -40.77353286743164, "step": 4367 }, { "epoch": 0.5947712418300654, "grad_norm": 40.313230238479065, "learning_rate": 3.3779244950744177e-07, "logits/chosen": 13.314109802246094, "logits/rejected": 13.486270904541016, "logps/chosen": -4.364346504211426, "logps/rejected": -4.109287261962891, "loss": 3.9492, "rewards/accuracies": 0.0, "rewards/chosen": -43.643463134765625, "rewards/margins": -2.550591468811035, "rewards/rejected": -41.092872619628906, "step": 4368 }, { "epoch": 0.5949074074074074, "grad_norm": 39.29070849273491, "learning_rate": 3.376046297218798e-07, "logits/chosen": 14.47038459777832, "logits/rejected": 13.70136547088623, "logps/chosen": -4.245891571044922, "logps/rejected": -4.3706955909729, "loss": 3.388, "rewards/accuracies": 0.5, "rewards/chosen": -42.458919525146484, "rewards/margins": 1.2480363845825195, "rewards/rejected": -43.70695495605469, "step": 4369 }, { "epoch": 0.5950435729847494, "grad_norm": 39.24761896054341, "learning_rate": 3.3741682403506746e-07, "logits/chosen": 13.80859088897705, "logits/rejected": 13.811698913574219, "logps/chosen": -4.3686981201171875, "logps/rejected": -4.3791184425354, "loss": 4.1693, "rewards/accuracies": 0.5, "rewards/chosen": -43.68697738647461, "rewards/margins": 0.10420703887939453, "rewards/rejected": -43.79118728637695, "step": 4370 }, { "epoch": 0.5951797385620915, "grad_norm": 38.64512039907904, "learning_rate": 3.372290324894411e-07, "logits/chosen": 12.859007835388184, "logits/rejected": 13.160604476928711, "logps/chosen": -4.07866096496582, "logps/rejected": -4.24149227142334, "loss": 3.8474, "rewards/accuracies": 0.5, "rewards/chosen": -40.78660583496094, "rewards/margins": 1.628316879272461, "rewards/rejected": -42.41492462158203, "step": 4371 }, { "epoch": 0.5953159041394336, "grad_norm": 35.29559212516307, "learning_rate": 3.370412551274337e-07, "logits/chosen": 13.496221542358398, "logits/rejected": 13.76979923248291, "logps/chosen": -4.019923210144043, "logps/rejected": -3.9449596405029297, "loss": 3.8767, "rewards/accuracies": 0.25, "rewards/chosen": -40.19923782348633, "rewards/margins": -0.749638557434082, "rewards/rejected": -39.4495964050293, "step": 4372 }, { "epoch": 0.5954520697167756, "grad_norm": 37.86590008822852, "learning_rate": 3.36853491991475e-07, "logits/chosen": 13.122566223144531, "logits/rejected": 14.10055923461914, "logps/chosen": -3.8323488235473633, "logps/rejected": -4.357969760894775, "loss": 3.7766, "rewards/accuracies": 0.75, "rewards/chosen": -38.323490142822266, "rewards/margins": 5.256209373474121, "rewards/rejected": -43.57969665527344, "step": 4373 }, { "epoch": 0.5955882352941176, "grad_norm": 36.02336600938672, "learning_rate": 3.3666574312399183e-07, "logits/chosen": 12.81760025024414, "logits/rejected": 12.716046333312988, "logps/chosen": -4.2307586669921875, "logps/rejected": -4.105389595031738, "loss": 4.1814, "rewards/accuracies": 0.5, "rewards/chosen": -42.307586669921875, "rewards/margins": -1.2536945343017578, "rewards/rejected": -41.05389404296875, "step": 4374 }, { "epoch": 0.5957244008714597, "grad_norm": 38.04049329386753, "learning_rate": 3.3647800856740766e-07, "logits/chosen": 13.63264274597168, "logits/rejected": 13.874025344848633, "logps/chosen": -3.990212917327881, "logps/rejected": -4.383299827575684, "loss": 4.1426, "rewards/accuracies": 0.75, "rewards/chosen": -39.902130126953125, "rewards/margins": 3.9308691024780273, "rewards/rejected": -43.83300018310547, "step": 4375 }, { "epoch": 0.5958605664488017, "grad_norm": 42.330856528545475, "learning_rate": 3.362902883641424e-07, "logits/chosen": 12.501839637756348, "logits/rejected": 13.216113090515137, "logps/chosen": -3.8342223167419434, "logps/rejected": -4.160920143127441, "loss": 4.0276, "rewards/accuracies": 1.0, "rewards/chosen": -38.34222412109375, "rewards/margins": 3.266972541809082, "rewards/rejected": -41.609195709228516, "step": 4376 }, { "epoch": 0.5959967320261438, "grad_norm": 41.74179368793354, "learning_rate": 3.3610258255661303e-07, "logits/chosen": 13.603092193603516, "logits/rejected": 13.191059112548828, "logps/chosen": -4.381747722625732, "logps/rejected": -4.480151653289795, "loss": 4.243, "rewards/accuracies": 0.75, "rewards/chosen": -43.817474365234375, "rewards/margins": 0.984039306640625, "rewards/rejected": -44.801513671875, "step": 4377 }, { "epoch": 0.5961328976034859, "grad_norm": 39.30836028970808, "learning_rate": 3.359148911872336e-07, "logits/chosen": 13.241708755493164, "logits/rejected": 13.388435363769531, "logps/chosen": -4.0762619972229, "logps/rejected": -4.295402526855469, "loss": 3.7859, "rewards/accuracies": 0.75, "rewards/chosen": -40.76261901855469, "rewards/margins": 2.1914100646972656, "rewards/rejected": -42.95402908325195, "step": 4378 }, { "epoch": 0.5962690631808278, "grad_norm": 38.07865831253116, "learning_rate": 3.35727214298414e-07, "logits/chosen": 12.966087341308594, "logits/rejected": 13.693506240844727, "logps/chosen": -3.8179380893707275, "logps/rejected": -3.9433324337005615, "loss": 4.3308, "rewards/accuracies": 0.75, "rewards/chosen": -38.17938232421875, "rewards/margins": 1.2539424896240234, "rewards/rejected": -39.43332290649414, "step": 4379 }, { "epoch": 0.5964052287581699, "grad_norm": 42.48806574334113, "learning_rate": 3.355395519325616e-07, "logits/chosen": 13.846010208129883, "logits/rejected": 14.020938873291016, "logps/chosen": -4.412147521972656, "logps/rejected": -4.204104423522949, "loss": 4.1101, "rewards/accuracies": 0.25, "rewards/chosen": -44.12147521972656, "rewards/margins": -2.0804319381713867, "rewards/rejected": -42.04104232788086, "step": 4380 }, { "epoch": 0.596541394335512, "grad_norm": 40.84263677478258, "learning_rate": 3.3535190413208046e-07, "logits/chosen": 12.792808532714844, "logits/rejected": 12.938497543334961, "logps/chosen": -4.160579681396484, "logps/rejected": -4.0961222648620605, "loss": 3.974, "rewards/accuracies": 0.75, "rewards/chosen": -41.605796813964844, "rewards/margins": -0.6445684432983398, "rewards/rejected": -40.96122741699219, "step": 4381 }, { "epoch": 0.596677559912854, "grad_norm": 38.4756292479481, "learning_rate": 3.351642709393708e-07, "logits/chosen": 14.258916854858398, "logits/rejected": 14.14824104309082, "logps/chosen": -4.109044075012207, "logps/rejected": -4.946466445922852, "loss": 3.8226, "rewards/accuracies": 1.0, "rewards/chosen": -41.09043884277344, "rewards/margins": 8.374226570129395, "rewards/rejected": -49.464664459228516, "step": 4382 }, { "epoch": 0.5968137254901961, "grad_norm": 43.061089410291544, "learning_rate": 3.349766523968301e-07, "logits/chosen": 13.074649810791016, "logits/rejected": 13.905048370361328, "logps/chosen": -4.08799934387207, "logps/rejected": -4.644075393676758, "loss": 3.6618, "rewards/accuracies": 0.75, "rewards/chosen": -40.87998962402344, "rewards/margins": 5.560763359069824, "rewards/rejected": -46.44075393676758, "step": 4383 }, { "epoch": 0.5969498910675382, "grad_norm": 37.065514734838196, "learning_rate": 3.347890485468524e-07, "logits/chosen": 13.865801811218262, "logits/rejected": 13.841846466064453, "logps/chosen": -4.541224002838135, "logps/rejected": -4.6501593589782715, "loss": 3.7893, "rewards/accuracies": 0.25, "rewards/chosen": -45.4122428894043, "rewards/margins": 1.08935546875, "rewards/rejected": -46.50159454345703, "step": 4384 }, { "epoch": 0.5970860566448801, "grad_norm": 44.42517069841397, "learning_rate": 3.346014594318281e-07, "logits/chosen": 13.185745239257812, "logits/rejected": 13.370784759521484, "logps/chosen": -3.9835333824157715, "logps/rejected": -4.008126258850098, "loss": 4.0377, "rewards/accuracies": 0.5, "rewards/chosen": -39.83533477783203, "rewards/margins": 0.24592971801757812, "rewards/rejected": -40.08126449584961, "step": 4385 }, { "epoch": 0.5972222222222222, "grad_norm": 38.40115205989403, "learning_rate": 3.344138850941446e-07, "logits/chosen": 12.960807800292969, "logits/rejected": 13.08238697052002, "logps/chosen": -4.027019500732422, "logps/rejected": -4.259662628173828, "loss": 3.3231, "rewards/accuracies": 0.75, "rewards/chosen": -40.27019500732422, "rewards/margins": 2.326430320739746, "rewards/rejected": -42.59662628173828, "step": 4386 }, { "epoch": 0.5973583877995643, "grad_norm": 37.71050893300738, "learning_rate": 3.34226325576186e-07, "logits/chosen": 12.640731811523438, "logits/rejected": 13.476836204528809, "logps/chosen": -3.9916117191314697, "logps/rejected": -4.417053699493408, "loss": 4.1169, "rewards/accuracies": 1.0, "rewards/chosen": -39.916114807128906, "rewards/margins": 4.25441837310791, "rewards/rejected": -44.170536041259766, "step": 4387 }, { "epoch": 0.5974945533769063, "grad_norm": 44.964603989505946, "learning_rate": 3.3403878092033276e-07, "logits/chosen": 13.75992202758789, "logits/rejected": 13.783174514770508, "logps/chosen": -4.17343807220459, "logps/rejected": -3.9322621822357178, "loss": 4.0556, "rewards/accuracies": 0.25, "rewards/chosen": -41.73438262939453, "rewards/margins": -2.411757469177246, "rewards/rejected": -39.32262420654297, "step": 4388 }, { "epoch": 0.5976307189542484, "grad_norm": 37.86320838840871, "learning_rate": 3.338512511689622e-07, "logits/chosen": 13.044689178466797, "logits/rejected": 13.398027420043945, "logps/chosen": -4.300573348999023, "logps/rejected": -4.416031837463379, "loss": 4.0614, "rewards/accuracies": 0.5, "rewards/chosen": -43.005733489990234, "rewards/margins": 1.1545829772949219, "rewards/rejected": -44.160316467285156, "step": 4389 }, { "epoch": 0.5977668845315904, "grad_norm": 42.41749113157436, "learning_rate": 3.336637363644484e-07, "logits/chosen": 13.193803787231445, "logits/rejected": 14.031709671020508, "logps/chosen": -4.068443298339844, "logps/rejected": -4.436221599578857, "loss": 4.3438, "rewards/accuracies": 0.75, "rewards/chosen": -40.68443298339844, "rewards/margins": 3.6777801513671875, "rewards/rejected": -44.36221694946289, "step": 4390 }, { "epoch": 0.5979030501089324, "grad_norm": 37.17641747706821, "learning_rate": 3.3347623654916147e-07, "logits/chosen": 12.442121505737305, "logits/rejected": 13.707361221313477, "logps/chosen": -4.062270641326904, "logps/rejected": -4.276651859283447, "loss": 3.846, "rewards/accuracies": 0.5, "rewards/chosen": -40.622703552246094, "rewards/margins": 2.1438169479370117, "rewards/rejected": -42.76652145385742, "step": 4391 }, { "epoch": 0.5980392156862745, "grad_norm": 37.74668284837683, "learning_rate": 3.332887517654688e-07, "logits/chosen": 13.130621910095215, "logits/rejected": 14.270304679870605, "logps/chosen": -4.274992942810059, "logps/rejected": -4.6221489906311035, "loss": 3.7128, "rewards/accuracies": 1.0, "rewards/chosen": -42.74993133544922, "rewards/margins": 3.4715576171875, "rewards/rejected": -46.22148895263672, "step": 4392 }, { "epoch": 0.5981753812636166, "grad_norm": 37.966349714734925, "learning_rate": 3.331012820557344e-07, "logits/chosen": 14.515009880065918, "logits/rejected": 13.744443893432617, "logps/chosen": -4.485894203186035, "logps/rejected": -4.301605224609375, "loss": 4.0931, "rewards/accuracies": 0.25, "rewards/chosen": -44.858943939208984, "rewards/margins": -1.8428897857666016, "rewards/rejected": -43.01605224609375, "step": 4393 }, { "epoch": 0.5983115468409586, "grad_norm": 39.823445035205644, "learning_rate": 3.32913827462318e-07, "logits/chosen": 13.980203628540039, "logits/rejected": 13.954888343811035, "logps/chosen": -4.190625190734863, "logps/rejected": -4.585171699523926, "loss": 3.5387, "rewards/accuracies": 1.0, "rewards/chosen": -41.90625762939453, "rewards/margins": 3.945465087890625, "rewards/rejected": -45.851722717285156, "step": 4394 }, { "epoch": 0.5984477124183006, "grad_norm": 38.937052440730696, "learning_rate": 3.3272638802757687e-07, "logits/chosen": 13.313962936401367, "logits/rejected": 13.307220458984375, "logps/chosen": -4.0040740966796875, "logps/rejected": -4.316163063049316, "loss": 3.6355, "rewards/accuracies": 0.5, "rewards/chosen": -40.040740966796875, "rewards/margins": 3.120889663696289, "rewards/rejected": -43.16162872314453, "step": 4395 }, { "epoch": 0.5985838779956427, "grad_norm": 38.105182731326984, "learning_rate": 3.325389637938646e-07, "logits/chosen": 13.223004341125488, "logits/rejected": 13.06863021850586, "logps/chosen": -4.126811981201172, "logps/rejected": -4.155432224273682, "loss": 4.1, "rewards/accuracies": 0.75, "rewards/chosen": -41.26811981201172, "rewards/margins": 0.2862062454223633, "rewards/rejected": -41.5543212890625, "step": 4396 }, { "epoch": 0.5987200435729847, "grad_norm": 41.17097336168802, "learning_rate": 3.32351554803531e-07, "logits/chosen": 13.769553184509277, "logits/rejected": 14.221334457397461, "logps/chosen": -4.469333648681641, "logps/rejected": -4.622392654418945, "loss": 4.5318, "rewards/accuracies": 0.5, "rewards/chosen": -44.693336486816406, "rewards/margins": 1.5305900573730469, "rewards/rejected": -46.22392272949219, "step": 4397 }, { "epoch": 0.5988562091503268, "grad_norm": 41.72717401197417, "learning_rate": 3.3216416109892274e-07, "logits/chosen": 13.900495529174805, "logits/rejected": 13.543580055236816, "logps/chosen": -4.091568946838379, "logps/rejected": -4.360380172729492, "loss": 3.9645, "rewards/accuracies": 1.0, "rewards/chosen": -40.915687561035156, "rewards/margins": 2.6881113052368164, "rewards/rejected": -43.603797912597656, "step": 4398 }, { "epoch": 0.5989923747276689, "grad_norm": 43.94711219911716, "learning_rate": 3.3197678272238317e-07, "logits/chosen": 12.407772064208984, "logits/rejected": 13.731979370117188, "logps/chosen": -3.7450573444366455, "logps/rejected": -4.170195579528809, "loss": 3.3662, "rewards/accuracies": 0.75, "rewards/chosen": -37.45057678222656, "rewards/margins": 4.251384735107422, "rewards/rejected": -41.70195770263672, "step": 4399 }, { "epoch": 0.599128540305011, "grad_norm": 41.88078104859519, "learning_rate": 3.317894197162517e-07, "logits/chosen": 13.65312671661377, "logits/rejected": 13.71668815612793, "logps/chosen": -4.051105499267578, "logps/rejected": -4.430812835693359, "loss": 4.3412, "rewards/accuracies": 0.75, "rewards/chosen": -40.511051177978516, "rewards/margins": 3.797079086303711, "rewards/rejected": -44.308128356933594, "step": 4400 }, { "epoch": 0.5992647058823529, "grad_norm": 37.443004742422936, "learning_rate": 3.3160207212286465e-07, "logits/chosen": 13.068492889404297, "logits/rejected": 13.335883140563965, "logps/chosen": -3.9368300437927246, "logps/rejected": -4.424432754516602, "loss": 3.714, "rewards/accuracies": 0.75, "rewards/chosen": -39.36830139160156, "rewards/margins": 4.876028060913086, "rewards/rejected": -44.24433135986328, "step": 4401 }, { "epoch": 0.599400871459695, "grad_norm": 43.954378489141355, "learning_rate": 3.3141473998455495e-07, "logits/chosen": 13.33337116241455, "logits/rejected": 13.81279182434082, "logps/chosen": -4.309429168701172, "logps/rejected": -4.403308868408203, "loss": 4.715, "rewards/accuracies": 0.75, "rewards/chosen": -43.09429168701172, "rewards/margins": 0.9387941360473633, "rewards/rejected": -44.03308868408203, "step": 4402 }, { "epoch": 0.5995370370370371, "grad_norm": 43.34208669398584, "learning_rate": 3.3122742334365154e-07, "logits/chosen": 13.314424514770508, "logits/rejected": 13.719247817993164, "logps/chosen": -4.3558759689331055, "logps/rejected": -4.489473342895508, "loss": 3.9126, "rewards/accuracies": 0.75, "rewards/chosen": -43.558753967285156, "rewards/margins": 1.3359785079956055, "rewards/rejected": -44.894737243652344, "step": 4403 }, { "epoch": 0.5996732026143791, "grad_norm": 41.897983756426235, "learning_rate": 3.310401222424803e-07, "logits/chosen": 13.149590492248535, "logits/rejected": 13.494625091552734, "logps/chosen": -4.2629170417785645, "logps/rejected": -4.455548286437988, "loss": 4.0641, "rewards/accuracies": 0.5, "rewards/chosen": -42.629173278808594, "rewards/margins": 1.926309585571289, "rewards/rejected": -44.55548095703125, "step": 4404 }, { "epoch": 0.5998093681917211, "grad_norm": 36.49784020280638, "learning_rate": 3.3085283672336364e-07, "logits/chosen": 13.425521850585938, "logits/rejected": 14.222713470458984, "logps/chosen": -4.389666557312012, "logps/rejected": -4.403984069824219, "loss": 3.5415, "rewards/accuracies": 0.5, "rewards/chosen": -43.89666748046875, "rewards/margins": 0.14317703247070312, "rewards/rejected": -44.03984069824219, "step": 4405 }, { "epoch": 0.5999455337690632, "grad_norm": 38.83274492779917, "learning_rate": 3.3066556682861987e-07, "logits/chosen": 13.426057815551758, "logits/rejected": 13.898370742797852, "logps/chosen": -4.336933135986328, "logps/rejected": -4.538023948669434, "loss": 3.4421, "rewards/accuracies": 0.5, "rewards/chosen": -43.36933135986328, "rewards/margins": 2.0109081268310547, "rewards/rejected": -45.38024139404297, "step": 4406 }, { "epoch": 0.6000816993464052, "grad_norm": 38.6610161620642, "learning_rate": 3.3047831260056446e-07, "logits/chosen": 13.77161979675293, "logits/rejected": 13.598552703857422, "logps/chosen": -3.9441006183624268, "logps/rejected": -3.995288848876953, "loss": 3.6685, "rewards/accuracies": 0.25, "rewards/chosen": -39.44100570678711, "rewards/margins": 0.5118799209594727, "rewards/rejected": -39.95288848876953, "step": 4407 }, { "epoch": 0.6002178649237473, "grad_norm": 42.35173011995881, "learning_rate": 3.3029107408150903e-07, "logits/chosen": 13.369890213012695, "logits/rejected": 12.85490608215332, "logps/chosen": -4.303526878356934, "logps/rejected": -4.467713356018066, "loss": 4.5352, "rewards/accuracies": 0.5, "rewards/chosen": -43.03527069091797, "rewards/margins": 1.6418657302856445, "rewards/rejected": -44.67713928222656, "step": 4408 }, { "epoch": 0.6003540305010894, "grad_norm": 35.43162313992477, "learning_rate": 3.3010385131376167e-07, "logits/chosen": 14.383111953735352, "logits/rejected": 14.749578475952148, "logps/chosen": -4.487885475158691, "logps/rejected": -4.662581443786621, "loss": 3.5961, "rewards/accuracies": 0.75, "rewards/chosen": -44.87885284423828, "rewards/margins": 1.746962547302246, "rewards/rejected": -46.625816345214844, "step": 4409 }, { "epoch": 0.6004901960784313, "grad_norm": 40.04047174240849, "learning_rate": 3.2991664433962674e-07, "logits/chosen": 13.085380554199219, "logits/rejected": 13.892339706420898, "logps/chosen": -4.085007667541504, "logps/rejected": -4.270379543304443, "loss": 3.8356, "rewards/accuracies": 0.5, "rewards/chosen": -40.85008239746094, "rewards/margins": 1.8537178039550781, "rewards/rejected": -42.70379638671875, "step": 4410 }, { "epoch": 0.6006263616557734, "grad_norm": 42.394899118464345, "learning_rate": 3.297294532014055e-07, "logits/chosen": 13.483287811279297, "logits/rejected": 13.918323516845703, "logps/chosen": -4.530893325805664, "logps/rejected": -4.668519973754883, "loss": 4.2178, "rewards/accuracies": 0.75, "rewards/chosen": -45.308937072753906, "rewards/margins": 1.3762664794921875, "rewards/rejected": -46.68519973754883, "step": 4411 }, { "epoch": 0.6007625272331155, "grad_norm": 45.41192706062203, "learning_rate": 3.2954227794139514e-07, "logits/chosen": 13.855871200561523, "logits/rejected": 13.770716667175293, "logps/chosen": -4.2980804443359375, "logps/rejected": -4.399637699127197, "loss": 4.4686, "rewards/accuracies": 0.75, "rewards/chosen": -42.98080825805664, "rewards/margins": 1.0155696868896484, "rewards/rejected": -43.996376037597656, "step": 4412 }, { "epoch": 0.6008986928104575, "grad_norm": 36.962500676901875, "learning_rate": 3.293551186018894e-07, "logits/chosen": 13.170736312866211, "logits/rejected": 14.234308242797852, "logps/chosen": -3.8786420822143555, "logps/rejected": -4.6119608879089355, "loss": 3.0828, "rewards/accuracies": 1.0, "rewards/chosen": -38.78642272949219, "rewards/margins": 7.333189964294434, "rewards/rejected": -46.11961364746094, "step": 4413 }, { "epoch": 0.6010348583877996, "grad_norm": 39.63542679360138, "learning_rate": 3.291679752251786e-07, "logits/chosen": 13.855096817016602, "logits/rejected": 14.00816822052002, "logps/chosen": -3.9487385749816895, "logps/rejected": -4.438364505767822, "loss": 3.7146, "rewards/accuracies": 1.0, "rewards/chosen": -39.487388610839844, "rewards/margins": 4.896259307861328, "rewards/rejected": -44.383644104003906, "step": 4414 }, { "epoch": 0.6011710239651417, "grad_norm": 41.86608309421618, "learning_rate": 3.2898084785354925e-07, "logits/chosen": 13.503807067871094, "logits/rejected": 13.694178581237793, "logps/chosen": -4.3375349044799805, "logps/rejected": -4.326663970947266, "loss": 4.0025, "rewards/accuracies": 0.25, "rewards/chosen": -43.37534713745117, "rewards/margins": -0.10871315002441406, "rewards/rejected": -43.26663589477539, "step": 4415 }, { "epoch": 0.6013071895424836, "grad_norm": 41.24739734119234, "learning_rate": 3.287937365292845e-07, "logits/chosen": 14.000864028930664, "logits/rejected": 14.00711441040039, "logps/chosen": -4.424051284790039, "logps/rejected": -3.8787379264831543, "loss": 3.7699, "rewards/accuracies": 0.0, "rewards/chosen": -44.240509033203125, "rewards/margins": -5.453129768371582, "rewards/rejected": -38.787384033203125, "step": 4416 }, { "epoch": 0.6014433551198257, "grad_norm": 37.847125461225644, "learning_rate": 3.2860664129466357e-07, "logits/chosen": 12.569791793823242, "logits/rejected": 12.70968246459961, "logps/chosen": -4.28510046005249, "logps/rejected": -3.9800193309783936, "loss": 3.9339, "rewards/accuracies": 0.5, "rewards/chosen": -42.85100555419922, "rewards/margins": -3.050808906555176, "rewards/rejected": -39.800193786621094, "step": 4417 }, { "epoch": 0.6015795206971678, "grad_norm": 40.02788281702667, "learning_rate": 3.284195621919621e-07, "logits/chosen": 14.322118759155273, "logits/rejected": 14.190925598144531, "logps/chosen": -4.300446510314941, "logps/rejected": -4.598320007324219, "loss": 4.3503, "rewards/accuracies": 0.5, "rewards/chosen": -43.00446319580078, "rewards/margins": 2.9787302017211914, "rewards/rejected": -45.98319625854492, "step": 4418 }, { "epoch": 0.6017156862745098, "grad_norm": 38.59286497851525, "learning_rate": 3.2823249926345227e-07, "logits/chosen": 12.967448234558105, "logits/rejected": 13.792684555053711, "logps/chosen": -4.033741474151611, "logps/rejected": -4.336970329284668, "loss": 4.2924, "rewards/accuracies": 0.5, "rewards/chosen": -40.3374137878418, "rewards/margins": 3.032292366027832, "rewards/rejected": -43.36970520019531, "step": 4419 }, { "epoch": 0.6018518518518519, "grad_norm": 39.676410028719346, "learning_rate": 3.280454525514025e-07, "logits/chosen": 14.359707832336426, "logits/rejected": 13.520895004272461, "logps/chosen": -4.353386878967285, "logps/rejected": -4.312063217163086, "loss": 3.8782, "rewards/accuracies": 0.5, "rewards/chosen": -43.53386688232422, "rewards/margins": -0.4132356643676758, "rewards/rejected": -43.12063217163086, "step": 4420 }, { "epoch": 0.601988017429194, "grad_norm": 39.77321877755552, "learning_rate": 3.2785842209807743e-07, "logits/chosen": 13.21146011352539, "logits/rejected": 13.657681465148926, "logps/chosen": -4.208599090576172, "logps/rejected": -4.419805526733398, "loss": 4.1507, "rewards/accuracies": 0.75, "rewards/chosen": -42.08599090576172, "rewards/margins": 2.1120662689208984, "rewards/rejected": -44.19805908203125, "step": 4421 }, { "epoch": 0.6021241830065359, "grad_norm": 47.338063613499926, "learning_rate": 3.276714079457383e-07, "logits/chosen": 13.635817527770996, "logits/rejected": 13.000165939331055, "logps/chosen": -4.214570045471191, "logps/rejected": -4.295273780822754, "loss": 4.0589, "rewards/accuracies": 0.75, "rewards/chosen": -42.14570236206055, "rewards/margins": 0.8070354461669922, "rewards/rejected": -42.95273971557617, "step": 4422 }, { "epoch": 0.602260348583878, "grad_norm": 37.99911692754771, "learning_rate": 3.2748441013664243e-07, "logits/chosen": 14.24710750579834, "logits/rejected": 14.367445945739746, "logps/chosen": -4.17880916595459, "logps/rejected": -4.537502288818359, "loss": 3.8954, "rewards/accuracies": 0.75, "rewards/chosen": -41.7880859375, "rewards/margins": 3.586933135986328, "rewards/rejected": -45.375022888183594, "step": 4423 }, { "epoch": 0.6023965141612201, "grad_norm": 37.26457264590682, "learning_rate": 3.2729742871304347e-07, "logits/chosen": 14.000900268554688, "logits/rejected": 14.94078254699707, "logps/chosen": -4.228253364562988, "logps/rejected": -4.560084819793701, "loss": 3.2235, "rewards/accuracies": 0.75, "rewards/chosen": -42.28253173828125, "rewards/margins": 3.3183164596557617, "rewards/rejected": -45.60084915161133, "step": 4424 }, { "epoch": 0.6025326797385621, "grad_norm": 39.20808781657807, "learning_rate": 3.271104637171914e-07, "logits/chosen": 12.338029861450195, "logits/rejected": 13.770105361938477, "logps/chosen": -3.8700928688049316, "logps/rejected": -4.4463629722595215, "loss": 3.8247, "rewards/accuracies": 1.0, "rewards/chosen": -38.700927734375, "rewards/margins": 5.762699127197266, "rewards/rejected": -44.46363067626953, "step": 4425 }, { "epoch": 0.6026688453159041, "grad_norm": 39.864569365603614, "learning_rate": 3.2692351519133274e-07, "logits/chosen": 13.619447708129883, "logits/rejected": 14.267086029052734, "logps/chosen": -4.155482292175293, "logps/rejected": -4.451877593994141, "loss": 3.5904, "rewards/accuracies": 1.0, "rewards/chosen": -41.55481719970703, "rewards/margins": 2.9639577865600586, "rewards/rejected": -44.518775939941406, "step": 4426 }, { "epoch": 0.6028050108932462, "grad_norm": 37.54587056651021, "learning_rate": 3.2673658317770965e-07, "logits/chosen": 13.193989753723145, "logits/rejected": 13.354312896728516, "logps/chosen": -4.005596160888672, "logps/rejected": -4.3053178787231445, "loss": 3.4775, "rewards/accuracies": 0.75, "rewards/chosen": -40.05596160888672, "rewards/margins": 2.9972171783447266, "rewards/rejected": -43.05318069458008, "step": 4427 }, { "epoch": 0.6029411764705882, "grad_norm": 40.851494674717486, "learning_rate": 3.2654966771856127e-07, "logits/chosen": 12.917998313903809, "logits/rejected": 13.692150115966797, "logps/chosen": -4.041175365447998, "logps/rejected": -4.111695289611816, "loss": 4.2849, "rewards/accuracies": 0.5, "rewards/chosen": -40.41175079345703, "rewards/margins": 0.7051982879638672, "rewards/rejected": -41.11695098876953, "step": 4428 }, { "epoch": 0.6030773420479303, "grad_norm": 38.59671886734552, "learning_rate": 3.263627688561227e-07, "logits/chosen": 14.359935760498047, "logits/rejected": 14.374836921691895, "logps/chosen": -4.552463531494141, "logps/rejected": -5.035675048828125, "loss": 4.4505, "rewards/accuracies": 0.75, "rewards/chosen": -45.524635314941406, "rewards/margins": 4.832117080688477, "rewards/rejected": -50.35675048828125, "step": 4429 }, { "epoch": 0.6032135076252724, "grad_norm": 39.01901916841773, "learning_rate": 3.261758866326251e-07, "logits/chosen": 13.086865425109863, "logits/rejected": 13.424795150756836, "logps/chosen": -4.3281073570251465, "logps/rejected": -4.373597145080566, "loss": 4.0636, "rewards/accuracies": 0.5, "rewards/chosen": -43.281070709228516, "rewards/margins": 0.45490169525146484, "rewards/rejected": -43.73597717285156, "step": 4430 }, { "epoch": 0.6033496732026143, "grad_norm": 35.914977149300384, "learning_rate": 3.259890210902962e-07, "logits/chosen": 12.765119552612305, "logits/rejected": 13.15507984161377, "logps/chosen": -3.7800710201263428, "logps/rejected": -4.112516403198242, "loss": 3.7103, "rewards/accuracies": 0.75, "rewards/chosen": -37.80071258544922, "rewards/margins": 3.3244576454162598, "rewards/rejected": -41.12516784667969, "step": 4431 }, { "epoch": 0.6034858387799564, "grad_norm": 39.261504695820236, "learning_rate": 3.258021722713599e-07, "logits/chosen": 13.725688934326172, "logits/rejected": 13.46550464630127, "logps/chosen": -4.224354267120361, "logps/rejected": -4.292329788208008, "loss": 3.863, "rewards/accuracies": 0.75, "rewards/chosen": -42.24353790283203, "rewards/margins": 0.6797542572021484, "rewards/rejected": -42.92329406738281, "step": 4432 }, { "epoch": 0.6036220043572985, "grad_norm": 45.155454353224215, "learning_rate": 3.2561534021803587e-07, "logits/chosen": 13.333675384521484, "logits/rejected": 13.890088081359863, "logps/chosen": -4.344292163848877, "logps/rejected": -4.576825141906738, "loss": 4.278, "rewards/accuracies": 0.5, "rewards/chosen": -43.44292068481445, "rewards/margins": 2.325326919555664, "rewards/rejected": -45.76824951171875, "step": 4433 }, { "epoch": 0.6037581699346405, "grad_norm": 40.21972971512776, "learning_rate": 3.254285249725407e-07, "logits/chosen": 12.67120361328125, "logits/rejected": 13.166150093078613, "logps/chosen": -4.049642562866211, "logps/rejected": -4.2148756980896, "loss": 4.0732, "rewards/accuracies": 0.75, "rewards/chosen": -40.49642562866211, "rewards/margins": 1.6523323059082031, "rewards/rejected": -42.14875793457031, "step": 4434 }, { "epoch": 0.6038943355119826, "grad_norm": 39.198716356935115, "learning_rate": 3.2524172657708676e-07, "logits/chosen": 13.819207191467285, "logits/rejected": 13.753475189208984, "logps/chosen": -4.494779586791992, "logps/rejected": -4.254535675048828, "loss": 3.838, "rewards/accuracies": 0.5, "rewards/chosen": -44.94779586791992, "rewards/margins": -2.4024410247802734, "rewards/rejected": -42.54535675048828, "step": 4435 }, { "epoch": 0.6040305010893247, "grad_norm": 40.54579013843984, "learning_rate": 3.2505494507388256e-07, "logits/chosen": 13.020180702209473, "logits/rejected": 13.927249908447266, "logps/chosen": -4.3790669441223145, "logps/rejected": -4.6137495040893555, "loss": 4.0012, "rewards/accuracies": 0.75, "rewards/chosen": -43.79066848754883, "rewards/margins": 2.346830368041992, "rewards/rejected": -46.13749694824219, "step": 4436 }, { "epoch": 0.6041666666666666, "grad_norm": 40.57331144342062, "learning_rate": 3.24868180505133e-07, "logits/chosen": 13.438154220581055, "logits/rejected": 13.949230194091797, "logps/chosen": -4.13120698928833, "logps/rejected": -4.5772199630737305, "loss": 3.644, "rewards/accuracies": 1.0, "rewards/chosen": -41.31206512451172, "rewards/margins": 4.4601335525512695, "rewards/rejected": -45.77220153808594, "step": 4437 }, { "epoch": 0.6043028322440087, "grad_norm": 43.100617887790584, "learning_rate": 3.246814329130393e-07, "logits/chosen": 13.931154251098633, "logits/rejected": 14.588518142700195, "logps/chosen": -4.309548377990723, "logps/rejected": -4.508482456207275, "loss": 3.3687, "rewards/accuracies": 1.0, "rewards/chosen": -43.095481872558594, "rewards/margins": 1.989344596862793, "rewards/rejected": -45.0848274230957, "step": 4438 }, { "epoch": 0.6044389978213508, "grad_norm": 43.27224071783225, "learning_rate": 3.2449470233979825e-07, "logits/chosen": 13.367668151855469, "logits/rejected": 13.946290969848633, "logps/chosen": -4.263176918029785, "logps/rejected": -4.528603553771973, "loss": 3.9559, "rewards/accuracies": 0.75, "rewards/chosen": -42.63177490234375, "rewards/margins": 2.6542606353759766, "rewards/rejected": -45.286033630371094, "step": 4439 }, { "epoch": 0.6045751633986928, "grad_norm": 38.84307037045459, "learning_rate": 3.243079888276033e-07, "logits/chosen": 13.742141723632812, "logits/rejected": 13.725237846374512, "logps/chosen": -4.308098793029785, "logps/rejected": -4.187520980834961, "loss": 3.2879, "rewards/accuracies": 0.5, "rewards/chosen": -43.08099365234375, "rewards/margins": -1.2057781219482422, "rewards/rejected": -41.875213623046875, "step": 4440 }, { "epoch": 0.6047113289760349, "grad_norm": 39.28799646299077, "learning_rate": 3.241212924186442e-07, "logits/chosen": 13.214746475219727, "logits/rejected": 13.418669700622559, "logps/chosen": -4.100752353668213, "logps/rejected": -4.320995330810547, "loss": 3.534, "rewards/accuracies": 0.75, "rewards/chosen": -41.00752258300781, "rewards/margins": 2.20242977142334, "rewards/rejected": -43.20995330810547, "step": 4441 }, { "epoch": 0.6048474945533769, "grad_norm": 38.36278319819965, "learning_rate": 3.2393461315510605e-07, "logits/chosen": 12.856014251708984, "logits/rejected": 13.30714225769043, "logps/chosen": -3.862248420715332, "logps/rejected": -4.417849540710449, "loss": 3.6691, "rewards/accuracies": 1.0, "rewards/chosen": -38.62248611450195, "rewards/margins": 5.556007385253906, "rewards/rejected": -44.178489685058594, "step": 4442 }, { "epoch": 0.6049836601307189, "grad_norm": 42.969470608621315, "learning_rate": 3.2374795107917085e-07, "logits/chosen": 12.867237091064453, "logits/rejected": 12.74447250366211, "logps/chosen": -4.269561767578125, "logps/rejected": -4.271808624267578, "loss": 4.0858, "rewards/accuracies": 0.5, "rewards/chosen": -42.695613861083984, "rewards/margins": 0.022467613220214844, "rewards/rejected": -42.718082427978516, "step": 4443 }, { "epoch": 0.605119825708061, "grad_norm": 41.524064397128335, "learning_rate": 3.235613062330166e-07, "logits/chosen": 14.152093887329102, "logits/rejected": 14.141162872314453, "logps/chosen": -4.5709075927734375, "logps/rejected": -4.470149040222168, "loss": 3.6708, "rewards/accuracies": 0.5, "rewards/chosen": -45.70907974243164, "rewards/margins": -1.0075902938842773, "rewards/rejected": -44.70148849487305, "step": 4444 }, { "epoch": 0.6052559912854031, "grad_norm": 43.08421918233014, "learning_rate": 3.233746786588168e-07, "logits/chosen": 12.934672355651855, "logits/rejected": 13.89497184753418, "logps/chosen": -3.8257594108581543, "logps/rejected": -4.296822547912598, "loss": 3.8581, "rewards/accuracies": 0.75, "rewards/chosen": -38.257591247558594, "rewards/margins": 4.71063232421875, "rewards/rejected": -42.968223571777344, "step": 4445 }, { "epoch": 0.6053921568627451, "grad_norm": 39.91766043706711, "learning_rate": 3.231880683987418e-07, "logits/chosen": 13.284496307373047, "logits/rejected": 13.692485809326172, "logps/chosen": -4.513896942138672, "logps/rejected": -4.639167785644531, "loss": 3.7592, "rewards/accuracies": 0.75, "rewards/chosen": -45.13896942138672, "rewards/margins": 1.2527084350585938, "rewards/rejected": -46.39167785644531, "step": 4446 }, { "epoch": 0.6055283224400871, "grad_norm": 41.066730340284344, "learning_rate": 3.230014754949579e-07, "logits/chosen": 12.70003890991211, "logits/rejected": 13.593427658081055, "logps/chosen": -4.05607795715332, "logps/rejected": -4.150320529937744, "loss": 4.171, "rewards/accuracies": 0.5, "rewards/chosen": -40.56077575683594, "rewards/margins": 0.9424285888671875, "rewards/rejected": -41.503204345703125, "step": 4447 }, { "epoch": 0.6056644880174292, "grad_norm": 42.450594956050516, "learning_rate": 3.2281489998962687e-07, "logits/chosen": 12.965997695922852, "logits/rejected": 12.810171127319336, "logps/chosen": -4.206847190856934, "logps/rejected": -4.210084915161133, "loss": 4.4085, "rewards/accuracies": 0.25, "rewards/chosen": -42.06846618652344, "rewards/margins": 0.03238391876220703, "rewards/rejected": -42.100852966308594, "step": 4448 }, { "epoch": 0.6058006535947712, "grad_norm": 42.0600305320561, "learning_rate": 3.2262834192490724e-07, "logits/chosen": 13.566795349121094, "logits/rejected": 13.869732856750488, "logps/chosen": -3.8292908668518066, "logps/rejected": -4.028467178344727, "loss": 4.1616, "rewards/accuracies": 0.75, "rewards/chosen": -38.29290771484375, "rewards/margins": 1.991765022277832, "rewards/rejected": -40.28467559814453, "step": 4449 }, { "epoch": 0.6059368191721133, "grad_norm": 40.99519252137061, "learning_rate": 3.2244180134295347e-07, "logits/chosen": 12.892921447753906, "logits/rejected": 12.248748779296875, "logps/chosen": -3.883446216583252, "logps/rejected": -4.1190409660339355, "loss": 3.7752, "rewards/accuracies": 0.75, "rewards/chosen": -38.8344612121582, "rewards/margins": 2.355947494506836, "rewards/rejected": -41.19041061401367, "step": 4450 }, { "epoch": 0.6060729847494554, "grad_norm": 42.032149423961776, "learning_rate": 3.222552782859156e-07, "logits/chosen": 12.323187828063965, "logits/rejected": 13.351706504821777, "logps/chosen": -4.03549337387085, "logps/rejected": -4.3847174644470215, "loss": 4.2205, "rewards/accuracies": 1.0, "rewards/chosen": -40.35493469238281, "rewards/margins": 3.492238998413086, "rewards/rejected": -43.84717559814453, "step": 4451 }, { "epoch": 0.6062091503267973, "grad_norm": 41.257347058792895, "learning_rate": 3.220687727959402e-07, "logits/chosen": 13.823466300964355, "logits/rejected": 13.928502082824707, "logps/chosen": -4.4179182052612305, "logps/rejected": -4.6883344650268555, "loss": 3.9092, "rewards/accuracies": 0.75, "rewards/chosen": -44.17918395996094, "rewards/margins": 2.704164505004883, "rewards/rejected": -46.88334655761719, "step": 4452 }, { "epoch": 0.6063453159041394, "grad_norm": 42.37321245176176, "learning_rate": 3.2188228491517e-07, "logits/chosen": 13.058427810668945, "logits/rejected": 12.968524932861328, "logps/chosen": -4.218587875366211, "logps/rejected": -4.097040176391602, "loss": 3.6014, "rewards/accuracies": 0.25, "rewards/chosen": -42.185882568359375, "rewards/margins": -1.2154788970947266, "rewards/rejected": -40.97040557861328, "step": 4453 }, { "epoch": 0.6064814814814815, "grad_norm": 37.20913748281863, "learning_rate": 3.216958146857431e-07, "logits/chosen": 13.695229530334473, "logits/rejected": 14.161916732788086, "logps/chosen": -4.120950698852539, "logps/rejected": -4.521122932434082, "loss": 3.452, "rewards/accuracies": 0.75, "rewards/chosen": -41.209510803222656, "rewards/margins": 4.0017194747924805, "rewards/rejected": -45.21122741699219, "step": 4454 }, { "epoch": 0.6066176470588235, "grad_norm": 38.95030584865762, "learning_rate": 3.2150936214979416e-07, "logits/chosen": 13.030658721923828, "logits/rejected": 13.903005599975586, "logps/chosen": -4.115006923675537, "logps/rejected": -4.459905624389648, "loss": 3.8259, "rewards/accuracies": 1.0, "rewards/chosen": -41.15007019042969, "rewards/margins": 3.448986053466797, "rewards/rejected": -44.59905242919922, "step": 4455 }, { "epoch": 0.6067538126361656, "grad_norm": 36.352836779661594, "learning_rate": 3.213229273494537e-07, "logits/chosen": 13.911317825317383, "logits/rejected": 13.845410346984863, "logps/chosen": -4.36012077331543, "logps/rejected": -4.530961036682129, "loss": 3.5357, "rewards/accuracies": 0.5, "rewards/chosen": -43.60120391845703, "rewards/margins": 1.7084035873413086, "rewards/rejected": -45.309608459472656, "step": 4456 }, { "epoch": 0.6068899782135077, "grad_norm": 38.21412858503291, "learning_rate": 3.211365103268481e-07, "logits/chosen": 13.134129524230957, "logits/rejected": 13.615537643432617, "logps/chosen": -4.402887344360352, "logps/rejected": -4.434382438659668, "loss": 3.4249, "rewards/accuracies": 0.5, "rewards/chosen": -44.028873443603516, "rewards/margins": 0.31494903564453125, "rewards/rejected": -44.34382247924805, "step": 4457 }, { "epoch": 0.6070261437908496, "grad_norm": 38.36934495668234, "learning_rate": 3.2095011112409986e-07, "logits/chosen": 13.91380500793457, "logits/rejected": 13.604768753051758, "logps/chosen": -4.400015830993652, "logps/rejected": -4.302953720092773, "loss": 4.0895, "rewards/accuracies": 0.5, "rewards/chosen": -44.00015640258789, "rewards/margins": -0.9706201553344727, "rewards/rejected": -43.029537200927734, "step": 4458 }, { "epoch": 0.6071623093681917, "grad_norm": 39.60939114918218, "learning_rate": 3.2076372978332754e-07, "logits/chosen": 13.735055923461914, "logits/rejected": 14.369790077209473, "logps/chosen": -4.232526779174805, "logps/rejected": -4.489315032958984, "loss": 4.109, "rewards/accuracies": 0.75, "rewards/chosen": -42.32526779174805, "rewards/margins": 2.5678844451904297, "rewards/rejected": -44.893150329589844, "step": 4459 }, { "epoch": 0.6072984749455338, "grad_norm": 37.75504455646742, "learning_rate": 3.205773663466454e-07, "logits/chosen": 12.83946418762207, "logits/rejected": 13.88975715637207, "logps/chosen": -4.047113418579102, "logps/rejected": -4.345072269439697, "loss": 3.9693, "rewards/accuracies": 1.0, "rewards/chosen": -40.471134185791016, "rewards/margins": 2.979588508605957, "rewards/rejected": -43.450721740722656, "step": 4460 }, { "epoch": 0.6074346405228758, "grad_norm": 36.76121175727419, "learning_rate": 3.203910208561638e-07, "logits/chosen": 13.257194519042969, "logits/rejected": 13.945733070373535, "logps/chosen": -4.282134056091309, "logps/rejected": -4.508724212646484, "loss": 3.7685, "rewards/accuracies": 0.5, "rewards/chosen": -42.82133483886719, "rewards/margins": 2.2659082412719727, "rewards/rejected": -45.087242126464844, "step": 4461 }, { "epoch": 0.6075708061002179, "grad_norm": 40.0527855132323, "learning_rate": 3.2020469335398915e-07, "logits/chosen": 12.70681095123291, "logits/rejected": 12.847070693969727, "logps/chosen": -3.953789234161377, "logps/rejected": -4.100037574768066, "loss": 4.1088, "rewards/accuracies": 0.5, "rewards/chosen": -39.53789138793945, "rewards/margins": 1.4624824523925781, "rewards/rejected": -41.00037384033203, "step": 4462 }, { "epoch": 0.6077069716775599, "grad_norm": 39.89697842501137, "learning_rate": 3.2001838388222366e-07, "logits/chosen": 13.476044654846191, "logits/rejected": 13.334144592285156, "logps/chosen": -4.448180198669434, "logps/rejected": -4.473586559295654, "loss": 4.0246, "rewards/accuracies": 0.5, "rewards/chosen": -44.4818000793457, "rewards/margins": 0.25406742095947266, "rewards/rejected": -44.735870361328125, "step": 4463 }, { "epoch": 0.6078431372549019, "grad_norm": 39.19820693684991, "learning_rate": 3.1983209248296537e-07, "logits/chosen": 12.514427185058594, "logits/rejected": 13.565597534179688, "logps/chosen": -3.9145619869232178, "logps/rejected": -4.490889549255371, "loss": 3.2573, "rewards/accuracies": 0.75, "rewards/chosen": -39.1456184387207, "rewards/margins": 5.763280868530273, "rewards/rejected": -44.90890121459961, "step": 4464 }, { "epoch": 0.607979302832244, "grad_norm": 42.12862862814249, "learning_rate": 3.196458191983086e-07, "logits/chosen": 12.189127922058105, "logits/rejected": 12.483135223388672, "logps/chosen": -3.952648878097534, "logps/rejected": -4.294236183166504, "loss": 3.564, "rewards/accuracies": 1.0, "rewards/chosen": -39.5264892578125, "rewards/margins": 3.415874481201172, "rewards/rejected": -42.942359924316406, "step": 4465 }, { "epoch": 0.6081154684095861, "grad_norm": 41.54059999158464, "learning_rate": 3.19459564070343e-07, "logits/chosen": 13.04832649230957, "logits/rejected": 13.67123794555664, "logps/chosen": -4.00125789642334, "logps/rejected": -4.306282043457031, "loss": 3.6621, "rewards/accuracies": 0.75, "rewards/chosen": -40.01258087158203, "rewards/margins": 3.050241470336914, "rewards/rejected": -43.06282043457031, "step": 4466 }, { "epoch": 0.608251633986928, "grad_norm": 42.283356094672044, "learning_rate": 3.192733271411548e-07, "logits/chosen": 12.567310333251953, "logits/rejected": 12.575212478637695, "logps/chosen": -4.171256065368652, "logps/rejected": -4.3576154708862305, "loss": 4.2515, "rewards/accuracies": 0.5, "rewards/chosen": -41.712562561035156, "rewards/margins": 1.8635940551757812, "rewards/rejected": -43.57615661621094, "step": 4467 }, { "epoch": 0.6083877995642701, "grad_norm": 40.82513919991977, "learning_rate": 3.1908710845282564e-07, "logits/chosen": 13.600642204284668, "logits/rejected": 13.374948501586914, "logps/chosen": -4.500308036804199, "logps/rejected": -4.4457550048828125, "loss": 3.8604, "rewards/accuracies": 0.25, "rewards/chosen": -45.00307846069336, "rewards/margins": -0.5455293655395508, "rewards/rejected": -44.457550048828125, "step": 4468 }, { "epoch": 0.6085239651416122, "grad_norm": 45.31715312210238, "learning_rate": 3.1890090804743304e-07, "logits/chosen": 12.37155532836914, "logits/rejected": 13.060784339904785, "logps/chosen": -3.889159917831421, "logps/rejected": -4.307461738586426, "loss": 3.6837, "rewards/accuracies": 0.75, "rewards/chosen": -38.8916015625, "rewards/margins": 4.183017730712891, "rewards/rejected": -43.074615478515625, "step": 4469 }, { "epoch": 0.6086601307189542, "grad_norm": 40.408747129594246, "learning_rate": 3.187147259670507e-07, "logits/chosen": 12.695688247680664, "logits/rejected": 13.677648544311523, "logps/chosen": -4.085854530334473, "logps/rejected": -4.431255340576172, "loss": 3.58, "rewards/accuracies": 0.5, "rewards/chosen": -40.858543395996094, "rewards/margins": 3.4540090560913086, "rewards/rejected": -44.31255340576172, "step": 4470 }, { "epoch": 0.6087962962962963, "grad_norm": 36.58326014769553, "learning_rate": 3.18528562253748e-07, "logits/chosen": 13.693023681640625, "logits/rejected": 13.937971115112305, "logps/chosen": -4.330118179321289, "logps/rejected": -4.442331314086914, "loss": 4.1642, "rewards/accuracies": 0.75, "rewards/chosen": -43.301185607910156, "rewards/margins": 1.1221284866333008, "rewards/rejected": -44.42331314086914, "step": 4471 }, { "epoch": 0.6089324618736384, "grad_norm": 42.2039380610071, "learning_rate": 3.1834241694959e-07, "logits/chosen": 12.478927612304688, "logits/rejected": 13.140924453735352, "logps/chosen": -3.861330509185791, "logps/rejected": -4.065589904785156, "loss": 4.2591, "rewards/accuracies": 0.75, "rewards/chosen": -38.613304138183594, "rewards/margins": 2.0425891876220703, "rewards/rejected": -40.6558952331543, "step": 4472 }, { "epoch": 0.6090686274509803, "grad_norm": 39.72702445648483, "learning_rate": 3.1815629009663786e-07, "logits/chosen": 12.201027870178223, "logits/rejected": 13.9711275100708, "logps/chosen": -4.311647415161133, "logps/rejected": -4.7751665115356445, "loss": 3.7134, "rewards/accuracies": 1.0, "rewards/chosen": -43.11647415161133, "rewards/margins": 4.63519287109375, "rewards/rejected": -47.75166702270508, "step": 4473 }, { "epoch": 0.6092047930283224, "grad_norm": 38.69907078737522, "learning_rate": 3.1797018173694874e-07, "logits/chosen": 14.407022476196289, "logits/rejected": 14.128408432006836, "logps/chosen": -4.215246200561523, "logps/rejected": -4.519899845123291, "loss": 3.3899, "rewards/accuracies": 0.75, "rewards/chosen": -42.1524658203125, "rewards/margins": 3.04653263092041, "rewards/rejected": -45.198997497558594, "step": 4474 }, { "epoch": 0.6093409586056645, "grad_norm": 48.442339013071646, "learning_rate": 3.1778409191257487e-07, "logits/chosen": 13.078781127929688, "logits/rejected": 13.470054626464844, "logps/chosen": -4.020760536193848, "logps/rejected": -4.637372016906738, "loss": 4.6745, "rewards/accuracies": 1.0, "rewards/chosen": -40.207603454589844, "rewards/margins": 6.166120529174805, "rewards/rejected": -46.37372589111328, "step": 4475 }, { "epoch": 0.6094771241830066, "grad_norm": 45.143997604833174, "learning_rate": 3.175980206655651e-07, "logits/chosen": 14.052225112915039, "logits/rejected": 13.855093002319336, "logps/chosen": -4.684267520904541, "logps/rejected": -4.661746501922607, "loss": 4.1276, "rewards/accuracies": 0.5, "rewards/chosen": -46.842674255371094, "rewards/margins": -0.2252063751220703, "rewards/rejected": -46.61746597290039, "step": 4476 }, { "epoch": 0.6096132897603486, "grad_norm": 43.071220672275814, "learning_rate": 3.174119680379638e-07, "logits/chosen": 13.476058959960938, "logits/rejected": 13.162191390991211, "logps/chosen": -4.180771827697754, "logps/rejected": -4.235889434814453, "loss": 4.5629, "rewards/accuracies": 0.5, "rewards/chosen": -41.80772018432617, "rewards/margins": 0.5511751174926758, "rewards/rejected": -42.35889434814453, "step": 4477 }, { "epoch": 0.6097494553376906, "grad_norm": 39.026696171096305, "learning_rate": 3.172259340718109e-07, "logits/chosen": 13.359138488769531, "logits/rejected": 13.575403213500977, "logps/chosen": -4.221770286560059, "logps/rejected": -4.568609237670898, "loss": 3.9813, "rewards/accuracies": 0.5, "rewards/chosen": -42.21770477294922, "rewards/margins": 3.468390464782715, "rewards/rejected": -45.686092376708984, "step": 4478 }, { "epoch": 0.6098856209150327, "grad_norm": 38.93637682139555, "learning_rate": 3.1703991880914236e-07, "logits/chosen": 13.30533218383789, "logits/rejected": 13.486929893493652, "logps/chosen": -4.205724716186523, "logps/rejected": -4.591976642608643, "loss": 3.3924, "rewards/accuracies": 0.75, "rewards/chosen": -42.057247161865234, "rewards/margins": 3.862520217895508, "rewards/rejected": -45.919769287109375, "step": 4479 }, { "epoch": 0.6100217864923747, "grad_norm": 69.4469242807371, "learning_rate": 3.168539222919901e-07, "logits/chosen": 13.636432647705078, "logits/rejected": 13.90222454071045, "logps/chosen": -4.266218185424805, "logps/rejected": -4.296341896057129, "loss": 4.086, "rewards/accuracies": 0.25, "rewards/chosen": -42.66218566894531, "rewards/margins": 0.30123424530029297, "rewards/rejected": -42.963417053222656, "step": 4480 }, { "epoch": 0.6101579520697168, "grad_norm": 60.69174535880984, "learning_rate": 3.166679445623812e-07, "logits/chosen": 13.863451957702637, "logits/rejected": 13.151546478271484, "logps/chosen": -4.242880344390869, "logps/rejected": -4.094677925109863, "loss": 3.99, "rewards/accuracies": 0.5, "rewards/chosen": -42.42880630493164, "rewards/margins": -1.4820222854614258, "rewards/rejected": -40.946781158447266, "step": 4481 }, { "epoch": 0.6102941176470589, "grad_norm": 36.29277223999132, "learning_rate": 3.1648198566233915e-07, "logits/chosen": 13.714380264282227, "logits/rejected": 14.224652290344238, "logps/chosen": -4.301557540893555, "logps/rejected": -4.578843593597412, "loss": 4.0439, "rewards/accuracies": 0.75, "rewards/chosen": -43.01557540893555, "rewards/margins": 2.772859573364258, "rewards/rejected": -45.78843688964844, "step": 4482 }, { "epoch": 0.6104302832244008, "grad_norm": 41.47379316477506, "learning_rate": 3.1629604563388287e-07, "logits/chosen": 12.654739379882812, "logits/rejected": 13.340829849243164, "logps/chosen": -4.544833183288574, "logps/rejected": -4.384744644165039, "loss": 3.8874, "rewards/accuracies": 0.25, "rewards/chosen": -45.44832992553711, "rewards/margins": -1.6008853912353516, "rewards/rejected": -43.847442626953125, "step": 4483 }, { "epoch": 0.6105664488017429, "grad_norm": 33.78900891674889, "learning_rate": 3.161101245190268e-07, "logits/chosen": 13.398088455200195, "logits/rejected": 13.546638488769531, "logps/chosen": -4.464967727661133, "logps/rejected": -4.654763221740723, "loss": 3.727, "rewards/accuracies": 0.75, "rewards/chosen": -44.649681091308594, "rewards/margins": 1.8979558944702148, "rewards/rejected": -46.54763412475586, "step": 4484 }, { "epoch": 0.610702614379085, "grad_norm": 35.06550686052362, "learning_rate": 3.1592422235978164e-07, "logits/chosen": 12.176578521728516, "logits/rejected": 12.605202674865723, "logps/chosen": -3.9991559982299805, "logps/rejected": -4.128352165222168, "loss": 3.7856, "rewards/accuracies": 0.5, "rewards/chosen": -39.99155807495117, "rewards/margins": 1.2919626235961914, "rewards/rejected": -41.28351974487305, "step": 4485 }, { "epoch": 0.610838779956427, "grad_norm": 39.92766801299032, "learning_rate": 3.157383391981535e-07, "logits/chosen": 13.042037963867188, "logits/rejected": 13.407753944396973, "logps/chosen": -4.2016119956970215, "logps/rejected": -4.3533759117126465, "loss": 4.4132, "rewards/accuracies": 0.75, "rewards/chosen": -42.016117095947266, "rewards/margins": 1.5176401138305664, "rewards/rejected": -43.53376007080078, "step": 4486 }, { "epoch": 0.6109749455337691, "grad_norm": 39.69576118185605, "learning_rate": 3.15552475076144e-07, "logits/chosen": 12.271432876586914, "logits/rejected": 13.037230491638184, "logps/chosen": -4.185850143432617, "logps/rejected": -4.300275802612305, "loss": 3.9397, "rewards/accuracies": 0.5, "rewards/chosen": -41.85850524902344, "rewards/margins": 1.144256591796875, "rewards/rejected": -43.00276184082031, "step": 4487 }, { "epoch": 0.6111111111111112, "grad_norm": 41.46653846648763, "learning_rate": 3.1536663003575083e-07, "logits/chosen": 12.625179290771484, "logits/rejected": 13.704673767089844, "logps/chosen": -4.068140506744385, "logps/rejected": -4.417865753173828, "loss": 4.2517, "rewards/accuracies": 0.75, "rewards/chosen": -40.68140411376953, "rewards/margins": 3.497251510620117, "rewards/rejected": -44.17865753173828, "step": 4488 }, { "epoch": 0.6112472766884531, "grad_norm": 37.65846191555139, "learning_rate": 3.1518080411896736e-07, "logits/chosen": 13.624996185302734, "logits/rejected": 13.643220901489258, "logps/chosen": -4.648739337921143, "logps/rejected": -4.746706962585449, "loss": 3.6888, "rewards/accuracies": 0.5, "rewards/chosen": -46.48739242553711, "rewards/margins": 0.9796791076660156, "rewards/rejected": -47.467071533203125, "step": 4489 }, { "epoch": 0.6113834422657952, "grad_norm": 39.74080513315073, "learning_rate": 3.1499499736778214e-07, "logits/chosen": 12.783788681030273, "logits/rejected": 13.402828216552734, "logps/chosen": -3.9515631198883057, "logps/rejected": -4.123316287994385, "loss": 3.7482, "rewards/accuracies": 0.75, "rewards/chosen": -39.51563262939453, "rewards/margins": 1.7175331115722656, "rewards/rejected": -41.23316192626953, "step": 4490 }, { "epoch": 0.6115196078431373, "grad_norm": 42.06232501530169, "learning_rate": 3.1480920982417993e-07, "logits/chosen": 13.290160179138184, "logits/rejected": 13.264970779418945, "logps/chosen": -4.571144104003906, "logps/rejected": -4.264354705810547, "loss": 3.7706, "rewards/accuracies": 0.0, "rewards/chosen": -45.71144104003906, "rewards/margins": -3.0678911209106445, "rewards/rejected": -42.64354705810547, "step": 4491 }, { "epoch": 0.6116557734204793, "grad_norm": 41.087158930320506, "learning_rate": 3.1462344153014107e-07, "logits/chosen": 13.81177043914795, "logits/rejected": 13.195987701416016, "logps/chosen": -4.23939323425293, "logps/rejected": -4.347844123840332, "loss": 3.8576, "rewards/accuracies": 0.75, "rewards/chosen": -42.3939323425293, "rewards/margins": 1.0845069885253906, "rewards/rejected": -43.47843933105469, "step": 4492 }, { "epoch": 0.6117919389978214, "grad_norm": 39.98310579824364, "learning_rate": 3.144376925276412e-07, "logits/chosen": 12.779829978942871, "logits/rejected": 13.211009979248047, "logps/chosen": -4.30797815322876, "logps/rejected": -4.592948913574219, "loss": 4.0486, "rewards/accuracies": 0.75, "rewards/chosen": -43.07978057861328, "rewards/margins": 2.849710464477539, "rewards/rejected": -45.92948913574219, "step": 4493 }, { "epoch": 0.6119281045751634, "grad_norm": 37.04430716464625, "learning_rate": 3.14251962858652e-07, "logits/chosen": 13.815631866455078, "logits/rejected": 13.569382667541504, "logps/chosen": -4.1218342781066895, "logps/rejected": -4.416720867156982, "loss": 4.1687, "rewards/accuracies": 1.0, "rewards/chosen": -41.21834182739258, "rewards/margins": 2.948866844177246, "rewards/rejected": -44.16720962524414, "step": 4494 }, { "epoch": 0.6120642701525054, "grad_norm": 37.62270472768903, "learning_rate": 3.140662525651407e-07, "logits/chosen": 13.007427215576172, "logits/rejected": 14.01728630065918, "logps/chosen": -3.9070470333099365, "logps/rejected": -4.371201515197754, "loss": 4.0173, "rewards/accuracies": 0.75, "rewards/chosen": -39.070472717285156, "rewards/margins": 4.641543388366699, "rewards/rejected": -43.712013244628906, "step": 4495 }, { "epoch": 0.6122004357298475, "grad_norm": 38.62011964639455, "learning_rate": 3.138805616890698e-07, "logits/chosen": 12.91976261138916, "logits/rejected": 13.58272933959961, "logps/chosen": -4.104042053222656, "logps/rejected": -4.393254280090332, "loss": 3.8393, "rewards/accuracies": 1.0, "rewards/chosen": -41.04042053222656, "rewards/margins": 2.892125129699707, "rewards/rejected": -43.93254470825195, "step": 4496 }, { "epoch": 0.6123366013071896, "grad_norm": 36.68185956838093, "learning_rate": 3.1369489027239786e-07, "logits/chosen": 13.341266632080078, "logits/rejected": 12.752918243408203, "logps/chosen": -4.2649760246276855, "logps/rejected": -4.093078136444092, "loss": 3.9564, "rewards/accuracies": 0.0, "rewards/chosen": -42.64976119995117, "rewards/margins": -1.718979835510254, "rewards/rejected": -40.93077850341797, "step": 4497 }, { "epoch": 0.6124727668845316, "grad_norm": 35.394608080660774, "learning_rate": 3.1350923835707907e-07, "logits/chosen": 12.793094635009766, "logits/rejected": 13.758360862731934, "logps/chosen": -4.262631893157959, "logps/rejected": -4.662467002868652, "loss": 3.6784, "rewards/accuracies": 0.75, "rewards/chosen": -42.626319885253906, "rewards/margins": 3.998350143432617, "rewards/rejected": -46.62466812133789, "step": 4498 }, { "epoch": 0.6126089324618736, "grad_norm": 39.283734697053816, "learning_rate": 3.133236059850627e-07, "logits/chosen": 12.577157974243164, "logits/rejected": 13.930137634277344, "logps/chosen": -4.205810070037842, "logps/rejected": -4.766834259033203, "loss": 4.176, "rewards/accuracies": 1.0, "rewards/chosen": -42.058101654052734, "rewards/margins": 5.610240936279297, "rewards/rejected": -47.66834259033203, "step": 4499 }, { "epoch": 0.6127450980392157, "grad_norm": 37.90712641872816, "learning_rate": 3.131379931982939e-07, "logits/chosen": 13.079309463500977, "logits/rejected": 12.769410133361816, "logps/chosen": -4.495579242706299, "logps/rejected": -4.277215480804443, "loss": 4.4934, "rewards/accuracies": 0.25, "rewards/chosen": -44.955787658691406, "rewards/margins": -2.183633804321289, "rewards/rejected": -42.77215576171875, "step": 4500 }, { "epoch": 0.6128812636165577, "grad_norm": 39.0770722837055, "learning_rate": 3.129524000387138e-07, "logits/chosen": 12.631847381591797, "logits/rejected": 13.481973648071289, "logps/chosen": -3.905851125717163, "logps/rejected": -4.360349178314209, "loss": 4.2701, "rewards/accuracies": 1.0, "rewards/chosen": -39.058509826660156, "rewards/margins": 4.544980049133301, "rewards/rejected": -43.603492736816406, "step": 4501 }, { "epoch": 0.6130174291938998, "grad_norm": 37.79986603474537, "learning_rate": 3.127668265482582e-07, "logits/chosen": 12.966157913208008, "logits/rejected": 13.286977767944336, "logps/chosen": -4.180903434753418, "logps/rejected": -4.284766674041748, "loss": 3.9407, "rewards/accuracies": 0.25, "rewards/chosen": -41.80903625488281, "rewards/margins": 1.0386314392089844, "rewards/rejected": -42.84767150878906, "step": 4502 }, { "epoch": 0.6131535947712419, "grad_norm": 39.421911204109264, "learning_rate": 3.1258127276885934e-07, "logits/chosen": 12.70999526977539, "logits/rejected": 13.541360855102539, "logps/chosen": -4.490160942077637, "logps/rejected": -4.552486419677734, "loss": 3.8424, "rewards/accuracies": 0.5, "rewards/chosen": -44.901607513427734, "rewards/margins": 0.6232538223266602, "rewards/rejected": -45.524864196777344, "step": 4503 }, { "epoch": 0.6132897603485838, "grad_norm": 40.62331595099933, "learning_rate": 3.123957387424446e-07, "logits/chosen": 13.312570571899414, "logits/rejected": 13.215967178344727, "logps/chosen": -4.009866237640381, "logps/rejected": -4.379883766174316, "loss": 3.6153, "rewards/accuracies": 0.75, "rewards/chosen": -40.098663330078125, "rewards/margins": 3.7001733779907227, "rewards/rejected": -43.79883575439453, "step": 4504 }, { "epoch": 0.6134259259259259, "grad_norm": 40.59227699008809, "learning_rate": 3.1221022451093666e-07, "logits/chosen": 12.863895416259766, "logits/rejected": 14.223787307739258, "logps/chosen": -3.9868831634521484, "logps/rejected": -4.569980144500732, "loss": 3.8375, "rewards/accuracies": 1.0, "rewards/chosen": -39.86882781982422, "rewards/margins": 5.830972671508789, "rewards/rejected": -45.699806213378906, "step": 4505 }, { "epoch": 0.613562091503268, "grad_norm": 41.163290327399146, "learning_rate": 3.1202473011625423e-07, "logits/chosen": 12.903066635131836, "logits/rejected": 13.67083740234375, "logps/chosen": -4.199457168579102, "logps/rejected": -4.462099075317383, "loss": 3.8534, "rewards/accuracies": 0.75, "rewards/chosen": -41.99456787109375, "rewards/margins": 2.626420021057129, "rewards/rejected": -44.62099075317383, "step": 4506 }, { "epoch": 0.61369825708061, "grad_norm": 41.22802074191203, "learning_rate": 3.118392556003114e-07, "logits/chosen": 12.521814346313477, "logits/rejected": 13.481958389282227, "logps/chosen": -3.885145664215088, "logps/rejected": -4.2145490646362305, "loss": 4.7398, "rewards/accuracies": 1.0, "rewards/chosen": -38.85145568847656, "rewards/margins": 3.294034957885742, "rewards/rejected": -42.14549255371094, "step": 4507 }, { "epoch": 0.6138344226579521, "grad_norm": 38.90363123205027, "learning_rate": 3.116538010050173e-07, "logits/chosen": 13.233997344970703, "logits/rejected": 13.641231536865234, "logps/chosen": -4.2230424880981445, "logps/rejected": -4.483829498291016, "loss": 4.2685, "rewards/accuracies": 0.75, "rewards/chosen": -42.23042678833008, "rewards/margins": 2.6078691482543945, "rewards/rejected": -44.838294982910156, "step": 4508 }, { "epoch": 0.6139705882352942, "grad_norm": 42.81367422022632, "learning_rate": 3.114683663722771e-07, "logits/chosen": 12.974739074707031, "logits/rejected": 13.323415756225586, "logps/chosen": -4.009565353393555, "logps/rejected": -4.115710735321045, "loss": 4.1719, "rewards/accuracies": 1.0, "rewards/chosen": -40.09565734863281, "rewards/margins": 1.0614490509033203, "rewards/rejected": -41.1571044921875, "step": 4509 }, { "epoch": 0.6141067538126361, "grad_norm": 35.53911742138649, "learning_rate": 3.112829517439915e-07, "logits/chosen": 13.343971252441406, "logits/rejected": 13.235204696655273, "logps/chosen": -4.164093971252441, "logps/rejected": -4.184264183044434, "loss": 3.6933, "rewards/accuracies": 0.5, "rewards/chosen": -41.64094161987305, "rewards/margins": 0.20169925689697266, "rewards/rejected": -41.8426399230957, "step": 4510 }, { "epoch": 0.6142429193899782, "grad_norm": 37.586848820173635, "learning_rate": 3.1109755716205625e-07, "logits/chosen": 13.494266510009766, "logits/rejected": 13.2710542678833, "logps/chosen": -4.172026634216309, "logps/rejected": -4.256951332092285, "loss": 3.9642, "rewards/accuracies": 0.5, "rewards/chosen": -41.72026443481445, "rewards/margins": 0.8492507934570312, "rewards/rejected": -42.569515228271484, "step": 4511 }, { "epoch": 0.6143790849673203, "grad_norm": 40.69189922275922, "learning_rate": 3.1091218266836283e-07, "logits/chosen": 12.3139066696167, "logits/rejected": 12.524679183959961, "logps/chosen": -3.9480865001678467, "logps/rejected": -4.281985282897949, "loss": 3.6611, "rewards/accuracies": 1.0, "rewards/chosen": -39.480865478515625, "rewards/margins": 3.3389883041381836, "rewards/rejected": -42.819854736328125, "step": 4512 }, { "epoch": 0.6145152505446623, "grad_norm": 36.267787625924775, "learning_rate": 3.1072682830479815e-07, "logits/chosen": 12.792196273803711, "logits/rejected": 13.25129508972168, "logps/chosen": -4.212339401245117, "logps/rejected": -4.4913201332092285, "loss": 3.8835, "rewards/accuracies": 0.5, "rewards/chosen": -42.12339782714844, "rewards/margins": 2.7898054122924805, "rewards/rejected": -44.91320037841797, "step": 4513 }, { "epoch": 0.6146514161220044, "grad_norm": 35.37657693909682, "learning_rate": 3.1054149411324454e-07, "logits/chosen": 13.149328231811523, "logits/rejected": 13.323512077331543, "logps/chosen": -3.95131254196167, "logps/rejected": -4.175896644592285, "loss": 3.8631, "rewards/accuracies": 0.75, "rewards/chosen": -39.513126373291016, "rewards/margins": 2.245840072631836, "rewards/rejected": -41.75896453857422, "step": 4514 }, { "epoch": 0.6147875816993464, "grad_norm": 41.946014018238564, "learning_rate": 3.1035618013557974e-07, "logits/chosen": 13.015066146850586, "logits/rejected": 13.333114624023438, "logps/chosen": -4.307076454162598, "logps/rejected": -4.464210510253906, "loss": 3.9409, "rewards/accuracies": 0.5, "rewards/chosen": -43.070770263671875, "rewards/margins": 1.5713386535644531, "rewards/rejected": -44.64210510253906, "step": 4515 }, { "epoch": 0.6149237472766884, "grad_norm": 34.57268700396041, "learning_rate": 3.1017088641367714e-07, "logits/chosen": 12.854228973388672, "logits/rejected": 12.676460266113281, "logps/chosen": -4.229137420654297, "logps/rejected": -4.276251792907715, "loss": 3.4335, "rewards/accuracies": 0.5, "rewards/chosen": -42.2913703918457, "rewards/margins": 0.4711465835571289, "rewards/rejected": -42.76251983642578, "step": 4516 }, { "epoch": 0.6150599128540305, "grad_norm": 37.24049222120483, "learning_rate": 3.0998561298940516e-07, "logits/chosen": 13.704561233520508, "logits/rejected": 14.055734634399414, "logps/chosen": -4.4394659996032715, "logps/rejected": -4.47157621383667, "loss": 4.0891, "rewards/accuracies": 0.25, "rewards/chosen": -44.39466094970703, "rewards/margins": 0.3211021423339844, "rewards/rejected": -44.71575927734375, "step": 4517 }, { "epoch": 0.6151960784313726, "grad_norm": 40.40543370935569, "learning_rate": 3.098003599046282e-07, "logits/chosen": 12.630159378051758, "logits/rejected": 13.406782150268555, "logps/chosen": -3.944309949874878, "logps/rejected": -4.251960754394531, "loss": 3.9392, "rewards/accuracies": 0.75, "rewards/chosen": -39.44309997558594, "rewards/margins": 3.076505661010742, "rewards/rejected": -42.51960754394531, "step": 4518 }, { "epoch": 0.6153322440087146, "grad_norm": 38.39867612883457, "learning_rate": 3.096151272012054e-07, "logits/chosen": 13.890467643737793, "logits/rejected": 12.928197860717773, "logps/chosen": -4.192205905914307, "logps/rejected": -4.130555152893066, "loss": 3.4988, "rewards/accuracies": 0.25, "rewards/chosen": -41.92205810546875, "rewards/margins": -0.6165065765380859, "rewards/rejected": -41.3055534362793, "step": 4519 }, { "epoch": 0.6154684095860566, "grad_norm": 38.772465311007664, "learning_rate": 3.0942991492099167e-07, "logits/chosen": 13.608768463134766, "logits/rejected": 13.420136451721191, "logps/chosen": -3.8891050815582275, "logps/rejected": -4.559920310974121, "loss": 4.154, "rewards/accuracies": 0.75, "rewards/chosen": -38.89105224609375, "rewards/margins": 6.708153247833252, "rewards/rejected": -45.599205017089844, "step": 4520 }, { "epoch": 0.6156045751633987, "grad_norm": 44.39315638026413, "learning_rate": 3.092447231058374e-07, "logits/chosen": 13.55765151977539, "logits/rejected": 13.150243759155273, "logps/chosen": -4.439012050628662, "logps/rejected": -4.283686637878418, "loss": 4.3073, "rewards/accuracies": 0.25, "rewards/chosen": -44.39012145996094, "rewards/margins": -1.5532550811767578, "rewards/rejected": -42.83686828613281, "step": 4521 }, { "epoch": 0.6157407407407407, "grad_norm": 36.05861280234858, "learning_rate": 3.090595517975882e-07, "logits/chosen": 12.867323875427246, "logits/rejected": 12.9427490234375, "logps/chosen": -4.041132926940918, "logps/rejected": -4.269185543060303, "loss": 3.3986, "rewards/accuracies": 0.75, "rewards/chosen": -40.41133117675781, "rewards/margins": 2.280527114868164, "rewards/rejected": -42.691856384277344, "step": 4522 }, { "epoch": 0.6158769063180828, "grad_norm": 41.50964309946515, "learning_rate": 3.0887440103808484e-07, "logits/chosen": 14.511638641357422, "logits/rejected": 14.410255432128906, "logps/chosen": -4.778444290161133, "logps/rejected": -4.872000694274902, "loss": 4.2831, "rewards/accuracies": 0.5, "rewards/chosen": -47.78443908691406, "rewards/margins": 0.9355669021606445, "rewards/rejected": -48.72000503540039, "step": 4523 }, { "epoch": 0.6160130718954249, "grad_norm": 47.5428269214324, "learning_rate": 3.0868927086916385e-07, "logits/chosen": 12.510116577148438, "logits/rejected": 13.155183792114258, "logps/chosen": -4.092497825622559, "logps/rejected": -4.493408679962158, "loss": 3.556, "rewards/accuracies": 1.0, "rewards/chosen": -40.92497634887695, "rewards/margins": 4.009113311767578, "rewards/rejected": -44.93408966064453, "step": 4524 }, { "epoch": 0.6161492374727668, "grad_norm": 39.24398715338437, "learning_rate": 3.0850416133265705e-07, "logits/chosen": 13.931894302368164, "logits/rejected": 14.48291015625, "logps/chosen": -4.341281890869141, "logps/rejected": -4.324675559997559, "loss": 3.9914, "rewards/accuracies": 0.5, "rewards/chosen": -43.412818908691406, "rewards/margins": -0.16606807708740234, "rewards/rejected": -43.24674987792969, "step": 4525 }, { "epoch": 0.6162854030501089, "grad_norm": 36.53869218426147, "learning_rate": 3.0831907247039114e-07, "logits/chosen": 13.977336883544922, "logits/rejected": 14.210759162902832, "logps/chosen": -4.573376655578613, "logps/rejected": -4.510471343994141, "loss": 4.1585, "rewards/accuracies": 0.5, "rewards/chosen": -45.7337646484375, "rewards/margins": -0.629054069519043, "rewards/rejected": -45.104713439941406, "step": 4526 }, { "epoch": 0.616421568627451, "grad_norm": 39.07905261676631, "learning_rate": 3.081340043241887e-07, "logits/chosen": 12.750938415527344, "logits/rejected": 12.993314743041992, "logps/chosen": -4.185748100280762, "logps/rejected": -4.2710957527160645, "loss": 4.2748, "rewards/accuracies": 0.5, "rewards/chosen": -41.85748291015625, "rewards/margins": 0.8534774780273438, "rewards/rejected": -42.71095657348633, "step": 4527 }, { "epoch": 0.616557734204793, "grad_norm": 39.5068965228567, "learning_rate": 3.0794895693586746e-07, "logits/chosen": 13.168087005615234, "logits/rejected": 13.116673469543457, "logps/chosen": -4.362700462341309, "logps/rejected": -4.212353706359863, "loss": 4.2913, "rewards/accuracies": 0.25, "rewards/chosen": -43.62700271606445, "rewards/margins": -1.5034637451171875, "rewards/rejected": -42.123538970947266, "step": 4528 }, { "epoch": 0.6166938997821351, "grad_norm": 41.08877747399139, "learning_rate": 3.077639303472401e-07, "logits/chosen": 13.06747817993164, "logits/rejected": 13.67922592163086, "logps/chosen": -4.26835823059082, "logps/rejected": -4.399811267852783, "loss": 3.9866, "rewards/accuracies": 0.5, "rewards/chosen": -42.68357849121094, "rewards/margins": 1.3145341873168945, "rewards/rejected": -43.99811553955078, "step": 4529 }, { "epoch": 0.6168300653594772, "grad_norm": 41.01052596719166, "learning_rate": 3.075789246001152e-07, "logits/chosen": 13.61418628692627, "logits/rejected": 13.881500244140625, "logps/chosen": -4.222742557525635, "logps/rejected": -4.494459629058838, "loss": 4.0937, "rewards/accuracies": 1.0, "rewards/chosen": -42.2274284362793, "rewards/margins": 2.7171688079833984, "rewards/rejected": -44.94459533691406, "step": 4530 }, { "epoch": 0.6169662309368191, "grad_norm": 38.83780523929563, "learning_rate": 3.0739393973629636e-07, "logits/chosen": 13.249496459960938, "logits/rejected": 14.226625442504883, "logps/chosen": -4.379506587982178, "logps/rejected": -4.704248428344727, "loss": 4.0683, "rewards/accuracies": 0.75, "rewards/chosen": -43.79506301879883, "rewards/margins": 3.2474184036254883, "rewards/rejected": -47.042484283447266, "step": 4531 }, { "epoch": 0.6171023965141612, "grad_norm": 38.95753220535026, "learning_rate": 3.0720897579758215e-07, "logits/chosen": 13.875299453735352, "logits/rejected": 14.394627571105957, "logps/chosen": -4.253211498260498, "logps/rejected": -4.6954545974731445, "loss": 4.36, "rewards/accuracies": 1.0, "rewards/chosen": -42.53211975097656, "rewards/margins": 4.422429084777832, "rewards/rejected": -46.95454406738281, "step": 4532 }, { "epoch": 0.6172385620915033, "grad_norm": 40.464597099590115, "learning_rate": 3.070240328257669e-07, "logits/chosen": 13.100183486938477, "logits/rejected": 13.7597074508667, "logps/chosen": -3.9761836528778076, "logps/rejected": -4.467498302459717, "loss": 3.8336, "rewards/accuracies": 1.0, "rewards/chosen": -39.76183319091797, "rewards/margins": 4.913145065307617, "rewards/rejected": -44.67498016357422, "step": 4533 }, { "epoch": 0.6173747276688453, "grad_norm": 39.496468866565614, "learning_rate": 3.068391108626402e-07, "logits/chosen": 13.486162185668945, "logits/rejected": 13.207235336303711, "logps/chosen": -4.064267635345459, "logps/rejected": -4.2900390625, "loss": 3.7019, "rewards/accuracies": 0.75, "rewards/chosen": -40.642677307128906, "rewards/margins": 2.2577199935913086, "rewards/rejected": -42.900394439697266, "step": 4534 }, { "epoch": 0.6175108932461874, "grad_norm": 39.45299438271205, "learning_rate": 3.0665420994998623e-07, "logits/chosen": 14.633190155029297, "logits/rejected": 14.360588073730469, "logps/chosen": -4.225234031677246, "logps/rejected": -4.446699142456055, "loss": 3.8572, "rewards/accuracies": 0.75, "rewards/chosen": -42.25233840942383, "rewards/margins": 2.2146501541137695, "rewards/rejected": -44.46698760986328, "step": 4535 }, { "epoch": 0.6176470588235294, "grad_norm": 36.861808983331514, "learning_rate": 3.0646933012958516e-07, "logits/chosen": 14.304506301879883, "logits/rejected": 14.163055419921875, "logps/chosen": -4.37943172454834, "logps/rejected": -4.579670429229736, "loss": 3.553, "rewards/accuracies": 0.75, "rewards/chosen": -43.79431915283203, "rewards/margins": 2.0023860931396484, "rewards/rejected": -45.79670333862305, "step": 4536 }, { "epoch": 0.6177832244008714, "grad_norm": 43.86862344537051, "learning_rate": 3.0628447144321225e-07, "logits/chosen": 13.074359893798828, "logits/rejected": 12.84716796875, "logps/chosen": -4.185420989990234, "logps/rejected": -4.2774481773376465, "loss": 4.2205, "rewards/accuracies": 0.5, "rewards/chosen": -41.85420608520508, "rewards/margins": 0.9202766418457031, "rewards/rejected": -42.77448272705078, "step": 4537 }, { "epoch": 0.6179193899782135, "grad_norm": 157.1133503220909, "learning_rate": 3.0609963393263745e-07, "logits/chosen": 14.05715560913086, "logits/rejected": 14.224698066711426, "logps/chosen": -4.179851055145264, "logps/rejected": -4.459240913391113, "loss": 4.8815, "rewards/accuracies": 1.0, "rewards/chosen": -41.79851150512695, "rewards/margins": 2.7939023971557617, "rewards/rejected": -44.59241485595703, "step": 4538 }, { "epoch": 0.6180555555555556, "grad_norm": 37.839164450676094, "learning_rate": 3.059148176396266e-07, "logits/chosen": 12.620492935180664, "logits/rejected": 14.631996154785156, "logps/chosen": -4.036359786987305, "logps/rejected": -4.700493335723877, "loss": 3.6676, "rewards/accuracies": 1.0, "rewards/chosen": -40.36360168457031, "rewards/margins": 6.641328811645508, "rewards/rejected": -47.00492858886719, "step": 4539 }, { "epoch": 0.6181917211328976, "grad_norm": 35.677395115870205, "learning_rate": 3.0573002260594064e-07, "logits/chosen": 14.682848930358887, "logits/rejected": 13.546621322631836, "logps/chosen": -4.449558734893799, "logps/rejected": -4.381193161010742, "loss": 3.4829, "rewards/accuracies": 0.75, "rewards/chosen": -44.49558639526367, "rewards/margins": -0.6836585998535156, "rewards/rejected": -43.811927795410156, "step": 4540 }, { "epoch": 0.6183278867102396, "grad_norm": 40.975599504045014, "learning_rate": 3.05545248873335e-07, "logits/chosen": 14.159774780273438, "logits/rejected": 14.737421035766602, "logps/chosen": -4.175103187561035, "logps/rejected": -4.407591819763184, "loss": 4.363, "rewards/accuracies": 0.75, "rewards/chosen": -41.75103759765625, "rewards/margins": 2.3248815536499023, "rewards/rejected": -44.07592010498047, "step": 4541 }, { "epoch": 0.6184640522875817, "grad_norm": 39.508550830441564, "learning_rate": 3.053604964835613e-07, "logits/chosen": 14.164297103881836, "logits/rejected": 13.793119430541992, "logps/chosen": -4.5986714363098145, "logps/rejected": -4.671299934387207, "loss": 3.4442, "rewards/accuracies": 0.75, "rewards/chosen": -45.986717224121094, "rewards/margins": 0.7262802124023438, "rewards/rejected": -46.71299743652344, "step": 4542 }, { "epoch": 0.6186002178649237, "grad_norm": 37.65781044242297, "learning_rate": 3.0517576547836585e-07, "logits/chosen": 13.206573486328125, "logits/rejected": 13.962821006774902, "logps/chosen": -4.337433815002441, "logps/rejected": -4.422892093658447, "loss": 3.381, "rewards/accuracies": 0.5, "rewards/chosen": -43.37433624267578, "rewards/margins": 0.8545808792114258, "rewards/rejected": -44.228919982910156, "step": 4543 }, { "epoch": 0.6187363834422658, "grad_norm": 42.11305950967997, "learning_rate": 3.049910558994898e-07, "logits/chosen": 14.01098346710205, "logits/rejected": 13.663849830627441, "logps/chosen": -4.331215858459473, "logps/rejected": -4.3878679275512695, "loss": 3.675, "rewards/accuracies": 0.5, "rewards/chosen": -43.312156677246094, "rewards/margins": 0.5665187835693359, "rewards/rejected": -43.87867736816406, "step": 4544 }, { "epoch": 0.6188725490196079, "grad_norm": 40.48593194420518, "learning_rate": 3.0480636778867004e-07, "logits/chosen": 13.060518264770508, "logits/rejected": 13.297504425048828, "logps/chosen": -4.427664279937744, "logps/rejected": -4.5397820472717285, "loss": 3.9938, "rewards/accuracies": 0.75, "rewards/chosen": -44.276641845703125, "rewards/margins": 1.1211814880371094, "rewards/rejected": -45.39781951904297, "step": 4545 }, { "epoch": 0.6190087145969498, "grad_norm": 40.94639885076603, "learning_rate": 3.0462170118763856e-07, "logits/chosen": 12.783781051635742, "logits/rejected": 13.733308792114258, "logps/chosen": -3.8488881587982178, "logps/rejected": -4.512844085693359, "loss": 4.1224, "rewards/accuracies": 1.0, "rewards/chosen": -38.4888801574707, "rewards/margins": 6.639559745788574, "rewards/rejected": -45.128440856933594, "step": 4546 }, { "epoch": 0.6191448801742919, "grad_norm": 40.06751688827215, "learning_rate": 3.044370561381219e-07, "logits/chosen": 13.408185958862305, "logits/rejected": 13.823163986206055, "logps/chosen": -4.131013870239258, "logps/rejected": -4.313290119171143, "loss": 3.8076, "rewards/accuracies": 0.75, "rewards/chosen": -41.310142517089844, "rewards/margins": 1.8227596282958984, "rewards/rejected": -43.13290023803711, "step": 4547 }, { "epoch": 0.619281045751634, "grad_norm": 46.53823737331887, "learning_rate": 3.0425243268184233e-07, "logits/chosen": 13.063669204711914, "logits/rejected": 13.875619888305664, "logps/chosen": -4.011388301849365, "logps/rejected": -4.269189357757568, "loss": 4.473, "rewards/accuracies": 0.75, "rewards/chosen": -40.11388397216797, "rewards/margins": 2.5780115127563477, "rewards/rejected": -42.69189453125, "step": 4548 }, { "epoch": 0.619417211328976, "grad_norm": 41.411222791431285, "learning_rate": 3.040678308605172e-07, "logits/chosen": 12.782176971435547, "logits/rejected": 12.868144035339355, "logps/chosen": -4.1183624267578125, "logps/rejected": -4.2767415046691895, "loss": 3.9578, "rewards/accuracies": 0.75, "rewards/chosen": -41.183624267578125, "rewards/margins": 1.5837907791137695, "rewards/rejected": -42.76741409301758, "step": 4549 }, { "epoch": 0.6195533769063181, "grad_norm": 37.02949726563675, "learning_rate": 3.038832507158586e-07, "logits/chosen": 11.75126838684082, "logits/rejected": 13.163373947143555, "logps/chosen": -3.8116443157196045, "logps/rejected": -4.288546085357666, "loss": 4.1276, "rewards/accuracies": 1.0, "rewards/chosen": -38.11643981933594, "rewards/margins": 4.769016265869141, "rewards/rejected": -42.885459899902344, "step": 4550 }, { "epoch": 0.6196895424836601, "grad_norm": 40.71370014627455, "learning_rate": 3.036986922895739e-07, "logits/chosen": 14.497058868408203, "logits/rejected": 13.725439071655273, "logps/chosen": -4.077416896820068, "logps/rejected": -3.933633804321289, "loss": 4.3378, "rewards/accuracies": 0.25, "rewards/chosen": -40.774169921875, "rewards/margins": -1.4378299713134766, "rewards/rejected": -39.336341857910156, "step": 4551 }, { "epoch": 0.6198257080610022, "grad_norm": 37.637246667653855, "learning_rate": 3.0351415562336594e-07, "logits/chosen": 14.099536895751953, "logits/rejected": 14.133256912231445, "logps/chosen": -4.4314775466918945, "logps/rejected": -4.597811222076416, "loss": 3.6485, "rewards/accuracies": 0.75, "rewards/chosen": -44.31477737426758, "rewards/margins": 1.6633377075195312, "rewards/rejected": -45.978111267089844, "step": 4552 }, { "epoch": 0.6199618736383442, "grad_norm": 41.400372263953855, "learning_rate": 3.033296407589319e-07, "logits/chosen": 12.887479782104492, "logits/rejected": 14.186702728271484, "logps/chosen": -3.9745981693267822, "logps/rejected": -4.753375053405762, "loss": 3.3646, "rewards/accuracies": 1.0, "rewards/chosen": -39.74597930908203, "rewards/margins": 7.7877655029296875, "rewards/rejected": -47.53374481201172, "step": 4553 }, { "epoch": 0.6200980392156863, "grad_norm": 37.13685109978138, "learning_rate": 3.0314514773796463e-07, "logits/chosen": 14.109687805175781, "logits/rejected": 13.843223571777344, "logps/chosen": -4.271310329437256, "logps/rejected": -4.114320278167725, "loss": 4.0342, "rewards/accuracies": 0.25, "rewards/chosen": -42.713104248046875, "rewards/margins": -1.569901466369629, "rewards/rejected": -41.1431999206543, "step": 4554 }, { "epoch": 0.6202342047930284, "grad_norm": 39.85469269336558, "learning_rate": 3.02960676602152e-07, "logits/chosen": 13.60194206237793, "logits/rejected": 14.091703414916992, "logps/chosen": -4.280889511108398, "logps/rejected": -4.61430549621582, "loss": 3.9226, "rewards/accuracies": 0.75, "rewards/chosen": -42.80889892578125, "rewards/margins": 3.334157943725586, "rewards/rejected": -46.14305114746094, "step": 4555 }, { "epoch": 0.6203703703703703, "grad_norm": 39.8829832577755, "learning_rate": 3.0277622739317643e-07, "logits/chosen": 13.2882719039917, "logits/rejected": 13.777604103088379, "logps/chosen": -4.078916072845459, "logps/rejected": -4.161056995391846, "loss": 3.62, "rewards/accuracies": 0.5, "rewards/chosen": -40.789161682128906, "rewards/margins": 0.8214101791381836, "rewards/rejected": -41.610572814941406, "step": 4556 }, { "epoch": 0.6205065359477124, "grad_norm": 35.06357726940967, "learning_rate": 3.0259180015271594e-07, "logits/chosen": 13.546521186828613, "logits/rejected": 13.458900451660156, "logps/chosen": -4.1863203048706055, "logps/rejected": -4.36271858215332, "loss": 3.9262, "rewards/accuracies": 0.75, "rewards/chosen": -41.86320114135742, "rewards/margins": 1.7639856338500977, "rewards/rejected": -43.6271858215332, "step": 4557 }, { "epoch": 0.6206427015250545, "grad_norm": 37.41398203181906, "learning_rate": 3.024073949224435e-07, "logits/chosen": 13.27962875366211, "logits/rejected": 13.897335052490234, "logps/chosen": -3.8133819103240967, "logps/rejected": -4.1647844314575195, "loss": 3.4799, "rewards/accuracies": 0.75, "rewards/chosen": -38.133819580078125, "rewards/margins": 3.5140275955200195, "rewards/rejected": -41.64784622192383, "step": 4558 }, { "epoch": 0.6207788671023965, "grad_norm": 43.95415271115668, "learning_rate": 3.0222301174402684e-07, "logits/chosen": 13.97675895690918, "logits/rejected": 14.542976379394531, "logps/chosen": -4.25156831741333, "logps/rejected": -4.5032243728637695, "loss": 4.7174, "rewards/accuracies": 0.5, "rewards/chosen": -42.51567840576172, "rewards/margins": 2.5165624618530273, "rewards/rejected": -45.03224182128906, "step": 4559 }, { "epoch": 0.6209150326797386, "grad_norm": 38.92586067326346, "learning_rate": 3.020386506591289e-07, "logits/chosen": 12.79239273071289, "logits/rejected": 12.857070922851562, "logps/chosen": -4.007376670837402, "logps/rejected": -4.198678493499756, "loss": 3.9378, "rewards/accuracies": 0.5, "rewards/chosen": -40.073768615722656, "rewards/margins": 1.913015365600586, "rewards/rejected": -41.986785888671875, "step": 4560 }, { "epoch": 0.6210511982570807, "grad_norm": 38.53894213789017, "learning_rate": 3.018543117094076e-07, "logits/chosen": 13.577878952026367, "logits/rejected": 13.26353645324707, "logps/chosen": -4.431658744812012, "logps/rejected": -4.33326530456543, "loss": 3.9192, "rewards/accuracies": 0.25, "rewards/chosen": -44.31658935546875, "rewards/margins": -0.9839372634887695, "rewards/rejected": -43.33264923095703, "step": 4561 }, { "epoch": 0.6211873638344226, "grad_norm": 36.52194724338492, "learning_rate": 3.0166999493651595e-07, "logits/chosen": 13.499683380126953, "logits/rejected": 14.784566879272461, "logps/chosen": -4.378808498382568, "logps/rejected": -4.9574384689331055, "loss": 3.5213, "rewards/accuracies": 1.0, "rewards/chosen": -43.7880859375, "rewards/margins": 5.7863006591796875, "rewards/rejected": -49.57438659667969, "step": 4562 }, { "epoch": 0.6213235294117647, "grad_norm": 39.110337910351646, "learning_rate": 3.014857003821016e-07, "logits/chosen": 13.620237350463867, "logits/rejected": 13.825618743896484, "logps/chosen": -4.170143127441406, "logps/rejected": -4.263927936553955, "loss": 3.6604, "rewards/accuracies": 0.5, "rewards/chosen": -41.70143127441406, "rewards/margins": 0.9378490447998047, "rewards/rejected": -42.6392822265625, "step": 4563 }, { "epoch": 0.6214596949891068, "grad_norm": 41.334094666352776, "learning_rate": 3.0130142808780764e-07, "logits/chosen": 14.261564254760742, "logits/rejected": 14.326971054077148, "logps/chosen": -4.1329145431518555, "logps/rejected": -4.239559173583984, "loss": 4.3492, "rewards/accuracies": 0.75, "rewards/chosen": -41.32914352416992, "rewards/margins": 1.0664501190185547, "rewards/rejected": -42.395591735839844, "step": 4564 }, { "epoch": 0.6215958605664488, "grad_norm": 35.606038480013424, "learning_rate": 3.0111717809527185e-07, "logits/chosen": 13.20376968383789, "logits/rejected": 12.72087287902832, "logps/chosen": -3.9359402656555176, "logps/rejected": -4.130475044250488, "loss": 3.7555, "rewards/accuracies": 0.75, "rewards/chosen": -39.359405517578125, "rewards/margins": 1.9453439712524414, "rewards/rejected": -41.30474853515625, "step": 4565 }, { "epoch": 0.6217320261437909, "grad_norm": 39.628072857102744, "learning_rate": 3.0093295044612705e-07, "logits/chosen": 13.394411087036133, "logits/rejected": 13.4578857421875, "logps/chosen": -4.007164001464844, "logps/rejected": -4.142702102661133, "loss": 4.1042, "rewards/accuracies": 0.75, "rewards/chosen": -40.07164001464844, "rewards/margins": 1.3553829193115234, "rewards/rejected": -41.427024841308594, "step": 4566 }, { "epoch": 0.621868191721133, "grad_norm": 37.955997351886595, "learning_rate": 3.00748745182001e-07, "logits/chosen": 12.963722229003906, "logits/rejected": 13.180191993713379, "logps/chosen": -4.09361457824707, "logps/rejected": -4.179415225982666, "loss": 3.7798, "rewards/accuracies": 0.5, "rewards/chosen": -40.9361457824707, "rewards/margins": 0.8580036163330078, "rewards/rejected": -41.79414749145508, "step": 4567 }, { "epoch": 0.6220043572984749, "grad_norm": 41.37763641694989, "learning_rate": 3.005645623445163e-07, "logits/chosen": 12.983621597290039, "logits/rejected": 13.547836303710938, "logps/chosen": -4.209332466125488, "logps/rejected": -4.315553665161133, "loss": 4.7673, "rewards/accuracies": 0.5, "rewards/chosen": -42.093326568603516, "rewards/margins": 1.062209129333496, "rewards/rejected": -43.15553665161133, "step": 4568 }, { "epoch": 0.622140522875817, "grad_norm": 37.54012136549944, "learning_rate": 3.003804019752908e-07, "logits/chosen": 12.842893600463867, "logits/rejected": 13.20598030090332, "logps/chosen": -4.493707656860352, "logps/rejected": -4.372782230377197, "loss": 3.3699, "rewards/accuracies": 0.5, "rewards/chosen": -44.93707275390625, "rewards/margins": -1.2092504501342773, "rewards/rejected": -43.727821350097656, "step": 4569 }, { "epoch": 0.6222766884531591, "grad_norm": 39.911492085362454, "learning_rate": 3.0019626411593695e-07, "logits/chosen": 13.78178596496582, "logits/rejected": 13.812287330627441, "logps/chosen": -4.829073905944824, "logps/rejected": -4.640079498291016, "loss": 3.6708, "rewards/accuracies": 0.25, "rewards/chosen": -48.29073715209961, "rewards/margins": -1.8899412155151367, "rewards/rejected": -46.400794982910156, "step": 4570 }, { "epoch": 0.6224128540305011, "grad_norm": 36.86479018963451, "learning_rate": 3.0001214880806213e-07, "logits/chosen": 13.16189956665039, "logits/rejected": 14.404607772827148, "logps/chosen": -4.138396263122559, "logps/rejected": -4.817712783813477, "loss": 3.58, "rewards/accuracies": 1.0, "rewards/chosen": -41.38396453857422, "rewards/margins": 6.793161392211914, "rewards/rejected": -48.177127838134766, "step": 4571 }, { "epoch": 0.6225490196078431, "grad_norm": 38.77835981228009, "learning_rate": 2.998280560932688e-07, "logits/chosen": 13.353645324707031, "logits/rejected": 13.459917068481445, "logps/chosen": -4.243333339691162, "logps/rejected": -4.372696876525879, "loss": 4.0844, "rewards/accuracies": 0.5, "rewards/chosen": -42.43333053588867, "rewards/margins": 1.2936391830444336, "rewards/rejected": -43.72697067260742, "step": 4572 }, { "epoch": 0.6226851851851852, "grad_norm": 39.239169348098706, "learning_rate": 2.996439860131543e-07, "logits/chosen": 13.260122299194336, "logits/rejected": 13.47935676574707, "logps/chosen": -3.6798527240753174, "logps/rejected": -4.0286455154418945, "loss": 3.9809, "rewards/accuracies": 1.0, "rewards/chosen": -36.798526763916016, "rewards/margins": 3.4879302978515625, "rewards/rejected": -40.28645706176758, "step": 4573 }, { "epoch": 0.6228213507625272, "grad_norm": 43.48195215287337, "learning_rate": 2.9945993860931066e-07, "logits/chosen": 13.805700302124023, "logits/rejected": 12.797832489013672, "logps/chosen": -4.536929130554199, "logps/rejected": -4.42237663269043, "loss": 4.6114, "rewards/accuracies": 0.0, "rewards/chosen": -45.369293212890625, "rewards/margins": -1.1455297470092773, "rewards/rejected": -44.22376251220703, "step": 4574 }, { "epoch": 0.6229575163398693, "grad_norm": 36.856901332361424, "learning_rate": 2.992759139233249e-07, "logits/chosen": 12.730560302734375, "logits/rejected": 13.302595138549805, "logps/chosen": -4.116412162780762, "logps/rejected": -4.267290115356445, "loss": 3.4305, "rewards/accuracies": 0.75, "rewards/chosen": -41.164119720458984, "rewards/margins": 1.508779525756836, "rewards/rejected": -42.67290115356445, "step": 4575 }, { "epoch": 0.6230936819172114, "grad_norm": 38.537629506103755, "learning_rate": 2.9909191199677917e-07, "logits/chosen": 12.028642654418945, "logits/rejected": 13.29924201965332, "logps/chosen": -3.9092371463775635, "logps/rejected": -4.182476997375488, "loss": 3.4407, "rewards/accuracies": 0.5, "rewards/chosen": -39.092369079589844, "rewards/margins": 2.7323946952819824, "rewards/rejected": -41.82476806640625, "step": 4576 }, { "epoch": 0.6232298474945533, "grad_norm": 38.07470591550161, "learning_rate": 2.9890793287124987e-07, "logits/chosen": 13.169088363647461, "logits/rejected": 12.872323989868164, "logps/chosen": -4.321077823638916, "logps/rejected": -4.517241477966309, "loss": 3.8595, "rewards/accuracies": 0.75, "rewards/chosen": -43.210777282714844, "rewards/margins": 1.9616327285766602, "rewards/rejected": -45.17241287231445, "step": 4577 }, { "epoch": 0.6233660130718954, "grad_norm": 43.07016054513338, "learning_rate": 2.987239765883088e-07, "logits/chosen": 14.00674057006836, "logits/rejected": 14.391266822814941, "logps/chosen": -4.382697105407715, "logps/rejected": -4.519504547119141, "loss": 4.1855, "rewards/accuracies": 0.75, "rewards/chosen": -43.826969146728516, "rewards/margins": 1.3680782318115234, "rewards/rejected": -45.195045471191406, "step": 4578 }, { "epoch": 0.6235021786492375, "grad_norm": 37.530409524448594, "learning_rate": 2.985400431895225e-07, "logits/chosen": 14.260982513427734, "logits/rejected": 14.419378280639648, "logps/chosen": -4.664959907531738, "logps/rejected": -4.71897029876709, "loss": 4.0948, "rewards/accuracies": 0.75, "rewards/chosen": -46.64959716796875, "rewards/margins": 0.5401067733764648, "rewards/rejected": -47.18970489501953, "step": 4579 }, { "epoch": 0.6236383442265795, "grad_norm": 36.204251489787694, "learning_rate": 2.9835613271645194e-07, "logits/chosen": 12.346029281616211, "logits/rejected": 12.638872146606445, "logps/chosen": -4.0435895919799805, "logps/rejected": -4.159560203552246, "loss": 3.8788, "rewards/accuracies": 0.5, "rewards/chosen": -40.43589401245117, "rewards/margins": 1.1597061157226562, "rewards/rejected": -41.59560012817383, "step": 4580 }, { "epoch": 0.6237745098039216, "grad_norm": 36.71692432563135, "learning_rate": 2.981722452106534e-07, "logits/chosen": 13.20638656616211, "logits/rejected": 12.755115509033203, "logps/chosen": -4.255669593811035, "logps/rejected": -4.790901184082031, "loss": 3.4487, "rewards/accuracies": 0.75, "rewards/chosen": -42.556697845458984, "rewards/margins": 5.352313995361328, "rewards/rejected": -47.90901184082031, "step": 4581 }, { "epoch": 0.6239106753812637, "grad_norm": 41.421694703379075, "learning_rate": 2.9798838071367797e-07, "logits/chosen": 13.153517723083496, "logits/rejected": 13.201221466064453, "logps/chosen": -4.334979057312012, "logps/rejected": -4.388270854949951, "loss": 4.2373, "rewards/accuracies": 0.25, "rewards/chosen": -43.34978485107422, "rewards/margins": 0.5329227447509766, "rewards/rejected": -43.88270950317383, "step": 4582 }, { "epoch": 0.6240468409586056, "grad_norm": 37.9207226250289, "learning_rate": 2.97804539267071e-07, "logits/chosen": 13.379125595092773, "logits/rejected": 13.30006217956543, "logps/chosen": -4.245995044708252, "logps/rejected": -4.401847839355469, "loss": 3.9691, "rewards/accuracies": 0.75, "rewards/chosen": -42.4599494934082, "rewards/margins": 1.55853271484375, "rewards/rejected": -44.01848220825195, "step": 4583 }, { "epoch": 0.6241830065359477, "grad_norm": 40.54970852642419, "learning_rate": 2.976207209123731e-07, "logits/chosen": 12.658381462097168, "logits/rejected": 12.571521759033203, "logps/chosen": -4.243352890014648, "logps/rejected": -4.406312465667725, "loss": 4.0949, "rewards/accuracies": 0.75, "rewards/chosen": -42.43353271484375, "rewards/margins": 1.6295900344848633, "rewards/rejected": -44.0631217956543, "step": 4584 }, { "epoch": 0.6243191721132898, "grad_norm": 40.22268219710607, "learning_rate": 2.974369256911197e-07, "logits/chosen": 12.799676895141602, "logits/rejected": 13.629667282104492, "logps/chosen": -4.024300575256348, "logps/rejected": -4.445627212524414, "loss": 4.5947, "rewards/accuracies": 1.0, "rewards/chosen": -40.243003845214844, "rewards/margins": 4.213266372680664, "rewards/rejected": -44.456268310546875, "step": 4585 }, { "epoch": 0.6244553376906318, "grad_norm": 40.459673963383416, "learning_rate": 2.9725315364484067e-07, "logits/chosen": 13.662151336669922, "logits/rejected": 13.870417594909668, "logps/chosen": -4.505556583404541, "logps/rejected": -4.702478408813477, "loss": 4.1084, "rewards/accuracies": 1.0, "rewards/chosen": -45.055564880371094, "rewards/margins": 1.9692182540893555, "rewards/rejected": -47.02478790283203, "step": 4586 }, { "epoch": 0.6245915032679739, "grad_norm": 35.57972882766003, "learning_rate": 2.9706940481506085e-07, "logits/chosen": 12.606372833251953, "logits/rejected": 13.911494255065918, "logps/chosen": -3.7194485664367676, "logps/rejected": -4.060031890869141, "loss": 3.6999, "rewards/accuracies": 1.0, "rewards/chosen": -37.19448471069336, "rewards/margins": 3.405834197998047, "rewards/rejected": -40.600318908691406, "step": 4587 }, { "epoch": 0.6247276688453159, "grad_norm": 38.914089234217464, "learning_rate": 2.9688567924329995e-07, "logits/chosen": 12.317533493041992, "logits/rejected": 13.16159439086914, "logps/chosen": -3.9052700996398926, "logps/rejected": -4.23576545715332, "loss": 4.1189, "rewards/accuracies": 0.75, "rewards/chosen": -39.052703857421875, "rewards/margins": 3.3049488067626953, "rewards/rejected": -42.35765075683594, "step": 4588 }, { "epoch": 0.6248638344226579, "grad_norm": 53.7274278434664, "learning_rate": 2.967019769710721e-07, "logits/chosen": 12.624895095825195, "logits/rejected": 12.824508666992188, "logps/chosen": -4.101576805114746, "logps/rejected": -4.07073974609375, "loss": 4.5128, "rewards/accuracies": 0.5, "rewards/chosen": -41.015769958496094, "rewards/margins": -0.3083686828613281, "rewards/rejected": -40.707401275634766, "step": 4589 }, { "epoch": 0.625, "grad_norm": 35.30602551079966, "learning_rate": 2.965182980398864e-07, "logits/chosen": 12.556110382080078, "logits/rejected": 12.573650360107422, "logps/chosen": -4.109709739685059, "logps/rejected": -4.266674518585205, "loss": 3.8719, "rewards/accuracies": 0.75, "rewards/chosen": -41.09709548950195, "rewards/margins": 1.5696477890014648, "rewards/rejected": -42.666748046875, "step": 4590 }, { "epoch": 0.6251361655773421, "grad_norm": 35.80175707956458, "learning_rate": 2.9633464249124683e-07, "logits/chosen": 12.59286117553711, "logits/rejected": 12.526395797729492, "logps/chosen": -4.180413246154785, "logps/rejected": -4.32375431060791, "loss": 3.874, "rewards/accuracies": 0.75, "rewards/chosen": -41.80413055419922, "rewards/margins": 1.433415412902832, "rewards/rejected": -43.237548828125, "step": 4591 }, { "epoch": 0.6252723311546841, "grad_norm": 37.56714287857309, "learning_rate": 2.9615101036665147e-07, "logits/chosen": 13.60584545135498, "logits/rejected": 13.343789100646973, "logps/chosen": -4.592801570892334, "logps/rejected": -4.4625725746154785, "loss": 4.3489, "rewards/accuracies": 0.25, "rewards/chosen": -45.928016662597656, "rewards/margins": -1.3022890090942383, "rewards/rejected": -44.62572479248047, "step": 4592 }, { "epoch": 0.6254084967320261, "grad_norm": 37.45684732867918, "learning_rate": 2.9596740170759377e-07, "logits/chosen": 13.278766632080078, "logits/rejected": 13.804563522338867, "logps/chosen": -4.179252624511719, "logps/rejected": -4.378993988037109, "loss": 3.7808, "rewards/accuracies": 0.75, "rewards/chosen": -41.79252624511719, "rewards/margins": 1.997415542602539, "rewards/rejected": -43.789939880371094, "step": 4593 }, { "epoch": 0.6255446623093682, "grad_norm": 40.547754684653945, "learning_rate": 2.9578381655556175e-07, "logits/chosen": 13.632575988769531, "logits/rejected": 14.063629150390625, "logps/chosen": -3.9609057903289795, "logps/rejected": -4.386662006378174, "loss": 3.4175, "rewards/accuracies": 1.0, "rewards/chosen": -39.60906219482422, "rewards/margins": 4.257561683654785, "rewards/rejected": -43.86662292480469, "step": 4594 }, { "epoch": 0.6256808278867102, "grad_norm": 36.130881514359025, "learning_rate": 2.956002549520377e-07, "logits/chosen": 12.452825546264648, "logits/rejected": 11.88704776763916, "logps/chosen": -4.013990879058838, "logps/rejected": -3.9997177124023438, "loss": 3.9477, "rewards/accuracies": 0.5, "rewards/chosen": -40.13990783691406, "rewards/margins": -0.1427316665649414, "rewards/rejected": -39.99717712402344, "step": 4595 }, { "epoch": 0.6258169934640523, "grad_norm": 40.599632530792654, "learning_rate": 2.9541671693849904e-07, "logits/chosen": 12.554954528808594, "logits/rejected": 13.90115737915039, "logps/chosen": -4.010290145874023, "logps/rejected": -4.511492729187012, "loss": 4.0742, "rewards/accuracies": 0.75, "rewards/chosen": -40.10289764404297, "rewards/margins": 5.012027740478516, "rewards/rejected": -45.11492919921875, "step": 4596 }, { "epoch": 0.6259531590413944, "grad_norm": 36.115895337663886, "learning_rate": 2.9523320255641785e-07, "logits/chosen": 12.883647918701172, "logits/rejected": 13.418840408325195, "logps/chosen": -3.8562257289886475, "logps/rejected": -4.174644947052002, "loss": 3.7631, "rewards/accuracies": 1.0, "rewards/chosen": -38.562255859375, "rewards/margins": 3.1841936111450195, "rewards/rejected": -41.74645233154297, "step": 4597 }, { "epoch": 0.6260893246187363, "grad_norm": 38.026188769724214, "learning_rate": 2.9504971184726037e-07, "logits/chosen": 13.034043312072754, "logits/rejected": 13.255369186401367, "logps/chosen": -4.181951999664307, "logps/rejected": -4.6982855796813965, "loss": 4.0806, "rewards/accuracies": 0.75, "rewards/chosen": -41.81951904296875, "rewards/margins": 5.163336753845215, "rewards/rejected": -46.98285675048828, "step": 4598 }, { "epoch": 0.6262254901960784, "grad_norm": 35.67786831606108, "learning_rate": 2.9486624485248797e-07, "logits/chosen": 12.92203140258789, "logits/rejected": 13.0397367477417, "logps/chosen": -4.033382415771484, "logps/rejected": -4.32426643371582, "loss": 3.7676, "rewards/accuracies": 0.75, "rewards/chosen": -40.333824157714844, "rewards/margins": 2.908839225769043, "rewards/rejected": -43.2426643371582, "step": 4599 }, { "epoch": 0.6263616557734205, "grad_norm": 37.80035293095747, "learning_rate": 2.9468280161355677e-07, "logits/chosen": 12.701495170593262, "logits/rejected": 12.907684326171875, "logps/chosen": -4.188953399658203, "logps/rejected": -4.17622184753418, "loss": 3.6494, "rewards/accuracies": 0.25, "rewards/chosen": -41.88953399658203, "rewards/margins": -0.12731170654296875, "rewards/rejected": -41.76222229003906, "step": 4600 }, { "epoch": 0.6264978213507625, "grad_norm": 37.77954498801564, "learning_rate": 2.94499382171917e-07, "logits/chosen": 11.909233093261719, "logits/rejected": 12.780191421508789, "logps/chosen": -3.9479308128356934, "logps/rejected": -4.231630802154541, "loss": 4.1468, "rewards/accuracies": 0.5, "rewards/chosen": -39.47930908203125, "rewards/margins": 2.8370018005371094, "rewards/rejected": -42.31631088256836, "step": 4601 }, { "epoch": 0.6266339869281046, "grad_norm": 38.20340419195539, "learning_rate": 2.9431598656901387e-07, "logits/chosen": 13.0208740234375, "logits/rejected": 12.899856567382812, "logps/chosen": -4.396977424621582, "logps/rejected": -4.264639377593994, "loss": 3.848, "rewards/accuracies": 0.0, "rewards/chosen": -43.96977615356445, "rewards/margins": -1.3233842849731445, "rewards/rejected": -42.646392822265625, "step": 4602 }, { "epoch": 0.6267701525054467, "grad_norm": 36.0796792360859, "learning_rate": 2.941326148462873e-07, "logits/chosen": 12.290063858032227, "logits/rejected": 13.696674346923828, "logps/chosen": -4.062004089355469, "logps/rejected": -4.357231140136719, "loss": 3.1885, "rewards/accuracies": 1.0, "rewards/chosen": -40.62004089355469, "rewards/margins": 2.9522695541381836, "rewards/rejected": -43.57231140136719, "step": 4603 }, { "epoch": 0.6269063180827886, "grad_norm": 38.98592555163606, "learning_rate": 2.939492670451714e-07, "logits/chosen": 12.71774673461914, "logits/rejected": 12.16464614868164, "logps/chosen": -4.108983039855957, "logps/rejected": -4.0297160148620605, "loss": 4.0057, "rewards/accuracies": 0.5, "rewards/chosen": -41.08983612060547, "rewards/margins": -0.7926740646362305, "rewards/rejected": -40.297157287597656, "step": 4604 }, { "epoch": 0.6270424836601307, "grad_norm": 37.409560968995514, "learning_rate": 2.9376594320709523e-07, "logits/chosen": 12.576406478881836, "logits/rejected": 13.28459644317627, "logps/chosen": -4.03444766998291, "logps/rejected": -4.388443470001221, "loss": 4.0824, "rewards/accuracies": 1.0, "rewards/chosen": -40.34447479248047, "rewards/margins": 3.539958953857422, "rewards/rejected": -43.884437561035156, "step": 4605 }, { "epoch": 0.6271786492374728, "grad_norm": 37.29351932318885, "learning_rate": 2.935826433734825e-07, "logits/chosen": 13.334938049316406, "logits/rejected": 13.618192672729492, "logps/chosen": -4.020843029022217, "logps/rejected": -4.430676460266113, "loss": 4.316, "rewards/accuracies": 0.75, "rewards/chosen": -40.208431243896484, "rewards/margins": 4.098332405090332, "rewards/rejected": -44.3067626953125, "step": 4606 }, { "epoch": 0.6273148148148148, "grad_norm": 38.6066124988711, "learning_rate": 2.9339936758575097e-07, "logits/chosen": 11.124507904052734, "logits/rejected": 11.691043853759766, "logps/chosen": -4.034204959869385, "logps/rejected": -4.245724201202393, "loss": 3.8746, "rewards/accuracies": 1.0, "rewards/chosen": -40.34204864501953, "rewards/margins": 2.1151952743530273, "rewards/rejected": -42.457244873046875, "step": 4607 }, { "epoch": 0.6274509803921569, "grad_norm": 34.13834633794326, "learning_rate": 2.932161158853135e-07, "logits/chosen": 13.22783374786377, "logits/rejected": 13.259414672851562, "logps/chosen": -4.164796352386475, "logps/rejected": -4.107420444488525, "loss": 3.789, "rewards/accuracies": 0.25, "rewards/chosen": -41.64796447753906, "rewards/margins": -0.5737600326538086, "rewards/rejected": -41.07420349121094, "step": 4608 }, { "epoch": 0.6275871459694989, "grad_norm": 41.39391349921025, "learning_rate": 2.9303288831357744e-07, "logits/chosen": 14.061141014099121, "logits/rejected": 13.402463912963867, "logps/chosen": -4.661767959594727, "logps/rejected": -4.294689178466797, "loss": 3.9189, "rewards/accuracies": 0.0, "rewards/chosen": -46.61768341064453, "rewards/margins": -3.670792579650879, "rewards/rejected": -42.94689178466797, "step": 4609 }, { "epoch": 0.6277233115468409, "grad_norm": 39.011276700166384, "learning_rate": 2.9284968491194447e-07, "logits/chosen": 11.838845252990723, "logits/rejected": 12.922897338867188, "logps/chosen": -3.6687421798706055, "logps/rejected": -4.298677444458008, "loss": 4.2807, "rewards/accuracies": 1.0, "rewards/chosen": -36.68742370605469, "rewards/margins": 6.299349784851074, "rewards/rejected": -42.98677444458008, "step": 4610 }, { "epoch": 0.627859477124183, "grad_norm": 38.12140575687847, "learning_rate": 2.9266650572181084e-07, "logits/chosen": 13.161985397338867, "logits/rejected": 12.19472885131836, "logps/chosen": -4.2170867919921875, "logps/rejected": -4.3112688064575195, "loss": 3.7572, "rewards/accuracies": 0.5, "rewards/chosen": -42.170867919921875, "rewards/margins": 0.9418210983276367, "rewards/rejected": -43.11268997192383, "step": 4611 }, { "epoch": 0.6279956427015251, "grad_norm": 40.40331235188343, "learning_rate": 2.9248335078456746e-07, "logits/chosen": 12.848206520080566, "logits/rejected": 11.85842227935791, "logps/chosen": -4.021195888519287, "logps/rejected": -4.089815139770508, "loss": 4.0375, "rewards/accuracies": 0.75, "rewards/chosen": -40.21195983886719, "rewards/margins": 0.6861867904663086, "rewards/rejected": -40.89814758300781, "step": 4612 }, { "epoch": 0.628131808278867, "grad_norm": 39.35743972207002, "learning_rate": 2.9230022014159976e-07, "logits/chosen": 12.006778717041016, "logits/rejected": 12.436771392822266, "logps/chosen": -3.881138563156128, "logps/rejected": -4.101362705230713, "loss": 3.772, "rewards/accuracies": 1.0, "rewards/chosen": -38.81138610839844, "rewards/margins": 2.2022390365600586, "rewards/rejected": -41.01362609863281, "step": 4613 }, { "epoch": 0.6282679738562091, "grad_norm": 39.550202643013805, "learning_rate": 2.921171138342875e-07, "logits/chosen": 13.008934020996094, "logits/rejected": 13.257359504699707, "logps/chosen": -4.313503265380859, "logps/rejected": -4.091933250427246, "loss": 3.5094, "rewards/accuracies": 0.5, "rewards/chosen": -43.13502883911133, "rewards/margins": -2.215695381164551, "rewards/rejected": -40.919334411621094, "step": 4614 }, { "epoch": 0.6284041394335512, "grad_norm": 35.89543286993632, "learning_rate": 2.9193403190400524e-07, "logits/chosen": 13.291954040527344, "logits/rejected": 13.660536766052246, "logps/chosen": -4.344786643981934, "logps/rejected": -4.596292018890381, "loss": 4.0932, "rewards/accuracies": 0.5, "rewards/chosen": -43.44786834716797, "rewards/margins": 2.51505184173584, "rewards/rejected": -45.962921142578125, "step": 4615 }, { "epoch": 0.6285403050108932, "grad_norm": 36.37746950797726, "learning_rate": 2.9175097439212166e-07, "logits/chosen": 13.262938499450684, "logits/rejected": 13.445642471313477, "logps/chosen": -4.338376045227051, "logps/rejected": -4.511255264282227, "loss": 3.7856, "rewards/accuracies": 0.5, "rewards/chosen": -43.383758544921875, "rewards/margins": 1.7287988662719727, "rewards/rejected": -45.11255645751953, "step": 4616 }, { "epoch": 0.6286764705882353, "grad_norm": 39.652180641361895, "learning_rate": 2.915679413400003e-07, "logits/chosen": 12.0955810546875, "logits/rejected": 12.631404876708984, "logps/chosen": -4.105066776275635, "logps/rejected": -4.120337963104248, "loss": 4.1046, "rewards/accuracies": 0.5, "rewards/chosen": -41.05066680908203, "rewards/margins": 0.15271377563476562, "rewards/rejected": -41.2033805847168, "step": 4617 }, { "epoch": 0.6288126361655774, "grad_norm": 39.64539752305667, "learning_rate": 2.9138493278899886e-07, "logits/chosen": 13.444742202758789, "logits/rejected": 13.3421630859375, "logps/chosen": -4.2085347175598145, "logps/rejected": -4.5937652587890625, "loss": 3.595, "rewards/accuracies": 1.0, "rewards/chosen": -42.085350036621094, "rewards/margins": 3.852304458618164, "rewards/rejected": -45.937652587890625, "step": 4618 }, { "epoch": 0.6289488017429193, "grad_norm": 34.28885578804631, "learning_rate": 2.9120194878046964e-07, "logits/chosen": 12.387930870056152, "logits/rejected": 12.87417984008789, "logps/chosen": -3.8624157905578613, "logps/rejected": -4.26625919342041, "loss": 3.5623, "rewards/accuracies": 0.5, "rewards/chosen": -38.62415313720703, "rewards/margins": 4.0384345054626465, "rewards/rejected": -42.66259002685547, "step": 4619 }, { "epoch": 0.6290849673202614, "grad_norm": 37.814113208491506, "learning_rate": 2.9101898935575946e-07, "logits/chosen": 14.042963981628418, "logits/rejected": 13.480087280273438, "logps/chosen": -4.335552215576172, "logps/rejected": -4.198332786560059, "loss": 3.9215, "rewards/accuracies": 0.5, "rewards/chosen": -43.35552215576172, "rewards/margins": -1.3721923828125, "rewards/rejected": -41.98332977294922, "step": 4620 }, { "epoch": 0.6292211328976035, "grad_norm": 42.52580916176346, "learning_rate": 2.9083605455620954e-07, "logits/chosen": 12.960580825805664, "logits/rejected": 12.26295280456543, "logps/chosen": -4.155116081237793, "logps/rejected": -4.161434650421143, "loss": 4.6008, "rewards/accuracies": 0.5, "rewards/chosen": -41.55116271972656, "rewards/margins": 0.06318283081054688, "rewards/rejected": -41.61434555053711, "step": 4621 }, { "epoch": 0.6293572984749455, "grad_norm": 38.05161699785544, "learning_rate": 2.906531444231553e-07, "logits/chosen": 13.344768524169922, "logits/rejected": 12.99141788482666, "logps/chosen": -4.505483627319336, "logps/rejected": -4.610166549682617, "loss": 4.1693, "rewards/accuracies": 0.5, "rewards/chosen": -45.054832458496094, "rewards/margins": 1.0468292236328125, "rewards/rejected": -46.10166549682617, "step": 4622 }, { "epoch": 0.6294934640522876, "grad_norm": 38.884208508904315, "learning_rate": 2.90470258997927e-07, "logits/chosen": 11.94809627532959, "logits/rejected": 12.788660049438477, "logps/chosen": -3.6915032863616943, "logps/rejected": -3.9338815212249756, "loss": 4.413, "rewards/accuracies": 0.75, "rewards/chosen": -36.91503143310547, "rewards/margins": 2.4237828254699707, "rewards/rejected": -39.33881378173828, "step": 4623 }, { "epoch": 0.6296296296296297, "grad_norm": 40.22319343335744, "learning_rate": 2.9028739832184925e-07, "logits/chosen": 13.299212455749512, "logits/rejected": 13.226703643798828, "logps/chosen": -4.199244499206543, "logps/rejected": -4.289059638977051, "loss": 4.2041, "rewards/accuracies": 0.5, "rewards/chosen": -41.99244689941406, "rewards/margins": 0.8981533050537109, "rewards/rejected": -42.890602111816406, "step": 4624 }, { "epoch": 0.6297657952069716, "grad_norm": 42.63514072466779, "learning_rate": 2.9010456243624056e-07, "logits/chosen": 13.369548797607422, "logits/rejected": 13.716571807861328, "logps/chosen": -3.874532461166382, "logps/rejected": -4.375993728637695, "loss": 4.3341, "rewards/accuracies": 1.0, "rewards/chosen": -38.74532699584961, "rewards/margins": 5.014614105224609, "rewards/rejected": -43.75994110107422, "step": 4625 }, { "epoch": 0.6299019607843137, "grad_norm": 37.093688674082784, "learning_rate": 2.8992175138241435e-07, "logits/chosen": 12.880678176879883, "logits/rejected": 13.179062843322754, "logps/chosen": -4.0404534339904785, "logps/rejected": -4.234529972076416, "loss": 3.7689, "rewards/accuracies": 0.75, "rewards/chosen": -40.40453338623047, "rewards/margins": 1.9407691955566406, "rewards/rejected": -42.345298767089844, "step": 4626 }, { "epoch": 0.6300381263616558, "grad_norm": 44.61703504348637, "learning_rate": 2.897389652016786e-07, "logits/chosen": 13.209924697875977, "logits/rejected": 13.51419448852539, "logps/chosen": -4.498538017272949, "logps/rejected": -4.50212287902832, "loss": 3.8691, "rewards/accuracies": 0.25, "rewards/chosen": -44.98537826538086, "rewards/margins": 0.035849571228027344, "rewards/rejected": -45.02123260498047, "step": 4627 }, { "epoch": 0.6301742919389978, "grad_norm": 40.37182646167838, "learning_rate": 2.895562039353348e-07, "logits/chosen": 13.302581787109375, "logits/rejected": 12.9591064453125, "logps/chosen": -4.347902297973633, "logps/rejected": -3.907165765762329, "loss": 4.1484, "rewards/accuracies": 0.25, "rewards/chosen": -43.47901916503906, "rewards/margins": -4.407362937927246, "rewards/rejected": -39.071659088134766, "step": 4628 }, { "epoch": 0.6303104575163399, "grad_norm": 39.16136257978451, "learning_rate": 2.8937346762467974e-07, "logits/chosen": 13.226424217224121, "logits/rejected": 13.467193603515625, "logps/chosen": -4.321005821228027, "logps/rejected": -4.517484664916992, "loss": 3.6885, "rewards/accuracies": 0.75, "rewards/chosen": -43.21005630493164, "rewards/margins": 1.9647912979125977, "rewards/rejected": -45.17484664916992, "step": 4629 }, { "epoch": 0.6304466230936819, "grad_norm": 37.33711650781562, "learning_rate": 2.8919075631100424e-07, "logits/chosen": 13.486095428466797, "logits/rejected": 13.97723388671875, "logps/chosen": -4.244588851928711, "logps/rejected": -4.29616117477417, "loss": 3.7601, "rewards/accuracies": 0.5, "rewards/chosen": -42.44588851928711, "rewards/margins": 0.5157251358032227, "rewards/rejected": -42.961612701416016, "step": 4630 }, { "epoch": 0.630582788671024, "grad_norm": 39.337242575373004, "learning_rate": 2.890080700355932e-07, "logits/chosen": 13.050338745117188, "logits/rejected": 13.854639053344727, "logps/chosen": -4.518428802490234, "logps/rejected": -4.542789459228516, "loss": 3.9178, "rewards/accuracies": 0.75, "rewards/chosen": -45.184288024902344, "rewards/margins": 0.2436075210571289, "rewards/rejected": -45.427894592285156, "step": 4631 }, { "epoch": 0.630718954248366, "grad_norm": 41.02259058259796, "learning_rate": 2.8882540883972606e-07, "logits/chosen": 12.882343292236328, "logits/rejected": 13.52837085723877, "logps/chosen": -3.990100860595703, "logps/rejected": -4.47768497467041, "loss": 3.5341, "rewards/accuracies": 0.75, "rewards/chosen": -39.90100860595703, "rewards/margins": 4.875842094421387, "rewards/rejected": -44.776851654052734, "step": 4632 }, { "epoch": 0.6308551198257081, "grad_norm": 42.532498096742366, "learning_rate": 2.8864277276467706e-07, "logits/chosen": 13.994056701660156, "logits/rejected": 13.937058448791504, "logps/chosen": -4.680801868438721, "logps/rejected": -4.530023574829102, "loss": 4.0302, "rewards/accuracies": 0.25, "rewards/chosen": -46.808021545410156, "rewards/margins": -1.5077829360961914, "rewards/rejected": -45.30023956298828, "step": 4633 }, { "epoch": 0.6309912854030502, "grad_norm": 40.43172528941401, "learning_rate": 2.8846016185171384e-07, "logits/chosen": 13.243444442749023, "logits/rejected": 13.591889381408691, "logps/chosen": -4.4442830085754395, "logps/rejected": -4.652334213256836, "loss": 3.5823, "rewards/accuracies": 0.75, "rewards/chosen": -44.442832946777344, "rewards/margins": 2.0805130004882812, "rewards/rejected": -46.52334213256836, "step": 4634 }, { "epoch": 0.6311274509803921, "grad_norm": 37.485921848541444, "learning_rate": 2.882775761420991e-07, "logits/chosen": 12.681264877319336, "logits/rejected": 13.512725830078125, "logps/chosen": -3.8748714923858643, "logps/rejected": -4.1717939376831055, "loss": 3.4858, "rewards/accuracies": 1.0, "rewards/chosen": -38.748714447021484, "rewards/margins": 2.9692249298095703, "rewards/rejected": -41.71793746948242, "step": 4635 }, { "epoch": 0.6312636165577342, "grad_norm": 39.18504901070485, "learning_rate": 2.8809501567708967e-07, "logits/chosen": 12.857115745544434, "logits/rejected": 13.162375450134277, "logps/chosen": -4.288280487060547, "logps/rejected": -4.410964012145996, "loss": 4.0807, "rewards/accuracies": 0.75, "rewards/chosen": -42.88280487060547, "rewards/margins": 1.2268400192260742, "rewards/rejected": -44.109642028808594, "step": 4636 }, { "epoch": 0.6313997821350763, "grad_norm": 40.559286508922575, "learning_rate": 2.8791248049793624e-07, "logits/chosen": 12.315377235412598, "logits/rejected": 13.607498168945312, "logps/chosen": -4.185537338256836, "logps/rejected": -4.508829116821289, "loss": 3.6277, "rewards/accuracies": 1.0, "rewards/chosen": -41.855369567871094, "rewards/margins": 3.2329206466674805, "rewards/rejected": -45.088294982910156, "step": 4637 }, { "epoch": 0.6315359477124183, "grad_norm": 41.2137366375681, "learning_rate": 2.8772997064588443e-07, "logits/chosen": 13.337148666381836, "logits/rejected": 13.276479721069336, "logps/chosen": -4.211836814880371, "logps/rejected": -4.222238540649414, "loss": 4.3487, "rewards/accuracies": 0.5, "rewards/chosen": -42.11836624145508, "rewards/margins": 0.1040182113647461, "rewards/rejected": -42.22238540649414, "step": 4638 }, { "epoch": 0.6316721132897604, "grad_norm": 37.407713634552465, "learning_rate": 2.87547486162174e-07, "logits/chosen": 13.502862930297852, "logits/rejected": 13.940376281738281, "logps/chosen": -4.338901996612549, "logps/rejected": -4.580426216125488, "loss": 3.5538, "rewards/accuracies": 0.5, "rewards/chosen": -43.38902282714844, "rewards/margins": 2.4152450561523438, "rewards/rejected": -45.80426788330078, "step": 4639 }, { "epoch": 0.6318082788671024, "grad_norm": 39.56593389609664, "learning_rate": 2.8736502708803835e-07, "logits/chosen": 12.461769104003906, "logits/rejected": 13.376800537109375, "logps/chosen": -4.292471408843994, "logps/rejected": -4.518154144287109, "loss": 3.7996, "rewards/accuracies": 0.75, "rewards/chosen": -42.924713134765625, "rewards/margins": 2.2568283081054688, "rewards/rejected": -45.181541442871094, "step": 4640 }, { "epoch": 0.6319444444444444, "grad_norm": 38.397407603872516, "learning_rate": 2.8718259346470593e-07, "logits/chosen": 12.983439445495605, "logits/rejected": 13.513349533081055, "logps/chosen": -4.352521896362305, "logps/rejected": -4.484686851501465, "loss": 3.3951, "rewards/accuracies": 0.5, "rewards/chosen": -43.52521514892578, "rewards/margins": 1.3216562271118164, "rewards/rejected": -44.84687042236328, "step": 4641 }, { "epoch": 0.6320806100217865, "grad_norm": 41.10845756790606, "learning_rate": 2.870001853333992e-07, "logits/chosen": 13.36117172241211, "logits/rejected": 13.654865264892578, "logps/chosen": -4.324845790863037, "logps/rejected": -4.418642520904541, "loss": 4.0203, "rewards/accuracies": 0.75, "rewards/chosen": -43.24845886230469, "rewards/margins": 0.9379663467407227, "rewards/rejected": -44.186424255371094, "step": 4642 }, { "epoch": 0.6322167755991286, "grad_norm": 37.031874099878834, "learning_rate": 2.8681780273533454e-07, "logits/chosen": 13.375663757324219, "logits/rejected": 13.523533821105957, "logps/chosen": -4.14021110534668, "logps/rejected": -4.4587225914001465, "loss": 3.7966, "rewards/accuracies": 0.75, "rewards/chosen": -41.4021110534668, "rewards/margins": 3.1851158142089844, "rewards/rejected": -44.58722686767578, "step": 4643 }, { "epoch": 0.6323529411764706, "grad_norm": 42.02735465972544, "learning_rate": 2.866354457117229e-07, "logits/chosen": 13.802760124206543, "logits/rejected": 13.575445175170898, "logps/chosen": -3.9912972450256348, "logps/rejected": -4.132959365844727, "loss": 4.0555, "rewards/accuracies": 1.0, "rewards/chosen": -39.9129753112793, "rewards/margins": 1.4166231155395508, "rewards/rejected": -41.32959747314453, "step": 4644 }, { "epoch": 0.6324891067538126, "grad_norm": 36.66831969773986, "learning_rate": 2.8645311430376957e-07, "logits/chosen": 12.804399490356445, "logits/rejected": 13.624931335449219, "logps/chosen": -4.287837028503418, "logps/rejected": -4.5887346267700195, "loss": 4.1762, "rewards/accuracies": 0.75, "rewards/chosen": -42.87836837768555, "rewards/margins": 3.0089731216430664, "rewards/rejected": -45.88734436035156, "step": 4645 }, { "epoch": 0.6326252723311547, "grad_norm": 40.133263172925766, "learning_rate": 2.8627080855267344e-07, "logits/chosen": 13.223489761352539, "logits/rejected": 13.292984008789062, "logps/chosen": -4.420421600341797, "logps/rejected": -4.281291961669922, "loss": 4.0578, "rewards/accuracies": 0.5, "rewards/chosen": -44.2042121887207, "rewards/margins": -1.3912935256958008, "rewards/rejected": -42.81291961669922, "step": 4646 }, { "epoch": 0.6327614379084967, "grad_norm": 37.707734607164205, "learning_rate": 2.8608852849962826e-07, "logits/chosen": 13.237491607666016, "logits/rejected": 13.216584205627441, "logps/chosen": -4.352968692779541, "logps/rejected": -4.261867046356201, "loss": 3.8202, "rewards/accuracies": 0.25, "rewards/chosen": -43.52968978881836, "rewards/margins": -0.9110193252563477, "rewards/rejected": -42.61866760253906, "step": 4647 }, { "epoch": 0.6328976034858388, "grad_norm": 39.62745856569356, "learning_rate": 2.859062741858218e-07, "logits/chosen": 12.934650421142578, "logits/rejected": 13.016731262207031, "logps/chosen": -4.473252296447754, "logps/rejected": -4.419097900390625, "loss": 4.016, "rewards/accuracies": 0.5, "rewards/chosen": -44.73252487182617, "rewards/margins": -0.5415468215942383, "rewards/rejected": -44.19097900390625, "step": 4648 }, { "epoch": 0.6330337690631809, "grad_norm": 39.399726528079256, "learning_rate": 2.857240456524357e-07, "logits/chosen": 13.30403995513916, "logits/rejected": 12.974666595458984, "logps/chosen": -4.2007341384887695, "logps/rejected": -4.270268440246582, "loss": 3.7232, "rewards/accuracies": 0.5, "rewards/chosen": -42.00733947753906, "rewards/margins": 0.6953458786010742, "rewards/rejected": -42.70269012451172, "step": 4649 }, { "epoch": 0.6331699346405228, "grad_norm": 37.58968941990673, "learning_rate": 2.85541842940646e-07, "logits/chosen": 13.798683166503906, "logits/rejected": 13.992639541625977, "logps/chosen": -4.332642555236816, "logps/rejected": -4.6009297370910645, "loss": 3.7923, "rewards/accuracies": 0.75, "rewards/chosen": -43.32642364501953, "rewards/margins": 2.6828737258911133, "rewards/rejected": -46.00929641723633, "step": 4650 }, { "epoch": 0.6333061002178649, "grad_norm": 41.747217100687664, "learning_rate": 2.8535966609162325e-07, "logits/chosen": 13.205376625061035, "logits/rejected": 13.06021785736084, "logps/chosen": -4.534360408782959, "logps/rejected": -4.489508628845215, "loss": 3.8496, "rewards/accuracies": 0.75, "rewards/chosen": -45.343605041503906, "rewards/margins": -0.4485197067260742, "rewards/rejected": -44.89508819580078, "step": 4651 }, { "epoch": 0.633442265795207, "grad_norm": 39.583992513966976, "learning_rate": 2.851775151465314e-07, "logits/chosen": 12.910688400268555, "logits/rejected": 13.75993537902832, "logps/chosen": -4.561190605163574, "logps/rejected": -4.436790466308594, "loss": 3.7617, "rewards/accuracies": 0.5, "rewards/chosen": -45.611907958984375, "rewards/margins": -1.2440013885498047, "rewards/rejected": -44.36790466308594, "step": 4652 }, { "epoch": 0.633578431372549, "grad_norm": 39.23666699555756, "learning_rate": 2.849953901465291e-07, "logits/chosen": 12.871383666992188, "logits/rejected": 14.251073837280273, "logps/chosen": -4.489158630371094, "logps/rejected": -5.236176490783691, "loss": 3.9716, "rewards/accuracies": 0.75, "rewards/chosen": -44.89158630371094, "rewards/margins": 7.470183372497559, "rewards/rejected": -52.36177062988281, "step": 4653 }, { "epoch": 0.6337145969498911, "grad_norm": 39.331835723762666, "learning_rate": 2.848132911327692e-07, "logits/chosen": 13.144405364990234, "logits/rejected": 13.485072135925293, "logps/chosen": -4.356381416320801, "logps/rejected": -4.537479400634766, "loss": 3.704, "rewards/accuracies": 0.75, "rewards/chosen": -43.563812255859375, "rewards/margins": 1.810983657836914, "rewards/rejected": -45.37479782104492, "step": 4654 }, { "epoch": 0.6338507625272332, "grad_norm": 38.07548205961801, "learning_rate": 2.8463121814639816e-07, "logits/chosen": 13.046852111816406, "logits/rejected": 13.757408142089844, "logps/chosen": -4.16322660446167, "logps/rejected": -4.50950288772583, "loss": 4.4789, "rewards/accuracies": 0.75, "rewards/chosen": -41.63226318359375, "rewards/margins": 3.4627647399902344, "rewards/rejected": -45.09503173828125, "step": 4655 }, { "epoch": 0.6339869281045751, "grad_norm": 38.202017077052325, "learning_rate": 2.84449171228557e-07, "logits/chosen": 13.25305461883545, "logits/rejected": 13.126617431640625, "logps/chosen": -4.4356489181518555, "logps/rejected": -4.3648223876953125, "loss": 3.7428, "rewards/accuracies": 0.25, "rewards/chosen": -44.35648727416992, "rewards/margins": -0.7082595825195312, "rewards/rejected": -43.648223876953125, "step": 4656 }, { "epoch": 0.6341230936819172, "grad_norm": 47.49396140087037, "learning_rate": 2.8426715042038084e-07, "logits/chosen": 12.676492691040039, "logits/rejected": 12.84465217590332, "logps/chosen": -4.125319957733154, "logps/rejected": -4.269970893859863, "loss": 4.1391, "rewards/accuracies": 0.75, "rewards/chosen": -41.253196716308594, "rewards/margins": 1.4465084075927734, "rewards/rejected": -42.69970703125, "step": 4657 }, { "epoch": 0.6342592592592593, "grad_norm": 41.90855818512029, "learning_rate": 2.8408515576299875e-07, "logits/chosen": 13.390228271484375, "logits/rejected": 13.352296829223633, "logps/chosen": -4.011252403259277, "logps/rejected": -4.467644691467285, "loss": 3.6961, "rewards/accuracies": 1.0, "rewards/chosen": -40.112525939941406, "rewards/margins": 4.563921928405762, "rewards/rejected": -44.676448822021484, "step": 4658 }, { "epoch": 0.6343954248366013, "grad_norm": 42.334604203891196, "learning_rate": 2.8390318729753373e-07, "logits/chosen": 13.070074081420898, "logits/rejected": 13.826037406921387, "logps/chosen": -3.9983725547790527, "logps/rejected": -4.618884563446045, "loss": 3.9492, "rewards/accuracies": 1.0, "rewards/chosen": -39.98372268676758, "rewards/margins": 6.205122947692871, "rewards/rejected": -46.188846588134766, "step": 4659 }, { "epoch": 0.6345315904139434, "grad_norm": 38.40073942368573, "learning_rate": 2.837212450651034e-07, "logits/chosen": 13.313148498535156, "logits/rejected": 12.974531173706055, "logps/chosen": -4.433140754699707, "logps/rejected": -4.135263919830322, "loss": 4.0045, "rewards/accuracies": 0.25, "rewards/chosen": -44.33140563964844, "rewards/margins": -2.9787673950195312, "rewards/rejected": -41.352638244628906, "step": 4660 }, { "epoch": 0.6346677559912854, "grad_norm": 40.876985990534415, "learning_rate": 2.835393291068188e-07, "logits/chosen": 14.472343444824219, "logits/rejected": 14.056480407714844, "logps/chosen": -4.353021621704102, "logps/rejected": -4.890207290649414, "loss": 4.5496, "rewards/accuracies": 0.75, "rewards/chosen": -43.53021240234375, "rewards/margins": 5.371858596801758, "rewards/rejected": -48.90207290649414, "step": 4661 }, { "epoch": 0.6348039215686274, "grad_norm": 40.78754184044854, "learning_rate": 2.833574394637854e-07, "logits/chosen": 13.200235366821289, "logits/rejected": 13.593509674072266, "logps/chosen": -3.8817806243896484, "logps/rejected": -4.295443534851074, "loss": 4.3518, "rewards/accuracies": 0.5, "rewards/chosen": -38.817806243896484, "rewards/margins": 4.136630058288574, "rewards/rejected": -42.954437255859375, "step": 4662 }, { "epoch": 0.6349400871459695, "grad_norm": 40.002091540712634, "learning_rate": 2.8317557617710285e-07, "logits/chosen": 12.882863998413086, "logits/rejected": 13.256834030151367, "logps/chosen": -3.850914478302002, "logps/rejected": -4.4156060218811035, "loss": 3.9195, "rewards/accuracies": 1.0, "rewards/chosen": -38.5091438293457, "rewards/margins": 5.646918296813965, "rewards/rejected": -44.156063079833984, "step": 4663 }, { "epoch": 0.6350762527233116, "grad_norm": 40.48101873376764, "learning_rate": 2.829937392878645e-07, "logits/chosen": 12.97140121459961, "logits/rejected": 13.432025909423828, "logps/chosen": -4.244368553161621, "logps/rejected": -4.380309104919434, "loss": 3.9662, "rewards/accuracies": 0.5, "rewards/chosen": -42.44368362426758, "rewards/margins": 1.359410285949707, "rewards/rejected": -43.80309295654297, "step": 4664 }, { "epoch": 0.6352124183006536, "grad_norm": 54.70008990650941, "learning_rate": 2.8281192883715795e-07, "logits/chosen": 12.769305229187012, "logits/rejected": 13.778951644897461, "logps/chosen": -4.339612007141113, "logps/rejected": -4.636162757873535, "loss": 4.1806, "rewards/accuracies": 0.75, "rewards/chosen": -43.3961181640625, "rewards/margins": 2.965510368347168, "rewards/rejected": -46.361629486083984, "step": 4665 }, { "epoch": 0.6353485838779956, "grad_norm": 43.95988617873291, "learning_rate": 2.826301448660648e-07, "logits/chosen": 13.58126449584961, "logits/rejected": 13.5220947265625, "logps/chosen": -4.462917804718018, "logps/rejected": -4.402159690856934, "loss": 4.4565, "rewards/accuracies": 0.25, "rewards/chosen": -44.629173278808594, "rewards/margins": -0.6075773239135742, "rewards/rejected": -44.02159881591797, "step": 4666 }, { "epoch": 0.6354847494553377, "grad_norm": 37.836230196130124, "learning_rate": 2.824483874156605e-07, "logits/chosen": 11.781709671020508, "logits/rejected": 12.524099349975586, "logps/chosen": -4.009467601776123, "logps/rejected": -4.302520751953125, "loss": 3.9574, "rewards/accuracies": 0.75, "rewards/chosen": -40.09467315673828, "rewards/margins": 2.9305286407470703, "rewards/rejected": -43.02520751953125, "step": 4667 }, { "epoch": 0.6356209150326797, "grad_norm": 40.7336312808159, "learning_rate": 2.822666565270149e-07, "logits/chosen": 12.93841552734375, "logits/rejected": 13.936012268066406, "logps/chosen": -4.40571403503418, "logps/rejected": -4.635040283203125, "loss": 3.6048, "rewards/accuracies": 0.5, "rewards/chosen": -44.05713653564453, "rewards/margins": 2.2932634353637695, "rewards/rejected": -46.35040283203125, "step": 4668 }, { "epoch": 0.6357570806100218, "grad_norm": 42.67618354555431, "learning_rate": 2.8208495224119137e-07, "logits/chosen": 12.734631538391113, "logits/rejected": 12.85153865814209, "logps/chosen": -4.309492588043213, "logps/rejected": -4.510396957397461, "loss": 3.8647, "rewards/accuracies": 0.75, "rewards/chosen": -43.094932556152344, "rewards/margins": 2.009037971496582, "rewards/rejected": -45.103965759277344, "step": 4669 }, { "epoch": 0.6358932461873639, "grad_norm": 46.44834013854828, "learning_rate": 2.8190327459924746e-07, "logits/chosen": 12.38272762298584, "logits/rejected": 12.824468612670898, "logps/chosen": -4.115175247192383, "logps/rejected": -4.203295707702637, "loss": 3.3134, "rewards/accuracies": 0.75, "rewards/chosen": -41.15175247192383, "rewards/margins": 0.8812026977539062, "rewards/rejected": -42.03295135498047, "step": 4670 }, { "epoch": 0.6360294117647058, "grad_norm": 42.04337608528053, "learning_rate": 2.817216236422349e-07, "logits/chosen": 12.745638847351074, "logits/rejected": 12.486104965209961, "logps/chosen": -4.360185623168945, "logps/rejected": -4.408994197845459, "loss": 3.3302, "rewards/accuracies": 0.75, "rewards/chosen": -43.60186004638672, "rewards/margins": 0.4880857467651367, "rewards/rejected": -44.089942932128906, "step": 4671 }, { "epoch": 0.6361655773420479, "grad_norm": 50.10286042450904, "learning_rate": 2.815399994111994e-07, "logits/chosen": 13.594484329223633, "logits/rejected": 12.694660186767578, "logps/chosen": -4.209625720977783, "logps/rejected": -4.197959899902344, "loss": 3.8444, "rewards/accuracies": 0.5, "rewards/chosen": -42.09626007080078, "rewards/margins": -0.11665916442871094, "rewards/rejected": -41.97959899902344, "step": 4672 }, { "epoch": 0.63630174291939, "grad_norm": 38.56079828909006, "learning_rate": 2.813584019471801e-07, "logits/chosen": 13.503467559814453, "logits/rejected": 12.396200180053711, "logps/chosen": -4.435986518859863, "logps/rejected": -4.113597869873047, "loss": 4.2695, "rewards/accuracies": 0.0, "rewards/chosen": -44.35986328125, "rewards/margins": -3.2238855361938477, "rewards/rejected": -41.13597869873047, "step": 4673 }, { "epoch": 0.636437908496732, "grad_norm": 36.96693440282306, "learning_rate": 2.8117683129121043e-07, "logits/chosen": 13.924373626708984, "logits/rejected": 13.807305335998535, "logps/chosen": -4.581697940826416, "logps/rejected": -4.649932861328125, "loss": 4.1444, "rewards/accuracies": 0.75, "rewards/chosen": -45.816978454589844, "rewards/margins": 0.6823501586914062, "rewards/rejected": -46.49932861328125, "step": 4674 }, { "epoch": 0.6365740740740741, "grad_norm": 41.127725746085815, "learning_rate": 2.809952874843182e-07, "logits/chosen": 14.18162727355957, "logits/rejected": 14.4566650390625, "logps/chosen": -4.396309852600098, "logps/rejected": -4.713400840759277, "loss": 4.3312, "rewards/accuracies": 1.0, "rewards/chosen": -43.963096618652344, "rewards/margins": 3.170905113220215, "rewards/rejected": -47.134002685546875, "step": 4675 }, { "epoch": 0.6367102396514162, "grad_norm": 40.423904126248544, "learning_rate": 2.808137705675243e-07, "logits/chosen": 14.199678421020508, "logits/rejected": 14.18475341796875, "logps/chosen": -4.288159370422363, "logps/rejected": -4.495240688323975, "loss": 3.7421, "rewards/accuracies": 0.75, "rewards/chosen": -42.881591796875, "rewards/margins": 2.070815086364746, "rewards/rejected": -44.95240783691406, "step": 4676 }, { "epoch": 0.6368464052287581, "grad_norm": 44.53425430199183, "learning_rate": 2.806322805818441e-07, "logits/chosen": 13.313325881958008, "logits/rejected": 14.04749870300293, "logps/chosen": -4.145981788635254, "logps/rejected": -4.55491828918457, "loss": 3.6668, "rewards/accuracies": 1.0, "rewards/chosen": -41.459815979003906, "rewards/margins": 4.08936882019043, "rewards/rejected": -45.54918670654297, "step": 4677 }, { "epoch": 0.6369825708061002, "grad_norm": 39.829187113965965, "learning_rate": 2.8045081756828695e-07, "logits/chosen": 12.690608978271484, "logits/rejected": 13.700187683105469, "logps/chosen": -4.340522766113281, "logps/rejected": -4.518106937408447, "loss": 4.2946, "rewards/accuracies": 0.5, "rewards/chosen": -43.40522766113281, "rewards/margins": 1.7758455276489258, "rewards/rejected": -45.18107223510742, "step": 4678 }, { "epoch": 0.6371187363834423, "grad_norm": 40.9049719060979, "learning_rate": 2.802693815678557e-07, "logits/chosen": 13.031408309936523, "logits/rejected": 14.460371017456055, "logps/chosen": -4.072898864746094, "logps/rejected": -4.409015655517578, "loss": 4.2015, "rewards/accuracies": 0.75, "rewards/chosen": -40.72898864746094, "rewards/margins": 3.3611698150634766, "rewards/rejected": -44.09015655517578, "step": 4679 }, { "epoch": 0.6372549019607843, "grad_norm": 44.73562478112533, "learning_rate": 2.800879726215473e-07, "logits/chosen": 13.771523475646973, "logits/rejected": 14.280172348022461, "logps/chosen": -4.317543983459473, "logps/rejected": -4.494577407836914, "loss": 3.7549, "rewards/accuracies": 0.5, "rewards/chosen": -43.175445556640625, "rewards/margins": 1.770329475402832, "rewards/rejected": -44.945770263671875, "step": 4680 }, { "epoch": 0.6373910675381264, "grad_norm": 36.322766385346355, "learning_rate": 2.799065907703529e-07, "logits/chosen": 13.048101425170898, "logits/rejected": 13.714431762695312, "logps/chosen": -4.130257606506348, "logps/rejected": -4.374900817871094, "loss": 3.7295, "rewards/accuracies": 0.75, "rewards/chosen": -41.302574157714844, "rewards/margins": 2.4464340209960938, "rewards/rejected": -43.74900817871094, "step": 4681 }, { "epoch": 0.6375272331154684, "grad_norm": 39.50266089744789, "learning_rate": 2.7972523605525684e-07, "logits/chosen": 13.157669067382812, "logits/rejected": 13.661067008972168, "logps/chosen": -4.311223030090332, "logps/rejected": -4.168013095855713, "loss": 4.101, "rewards/accuracies": 0.5, "rewards/chosen": -43.11222839355469, "rewards/margins": -1.4320974349975586, "rewards/rejected": -41.68013000488281, "step": 4682 }, { "epoch": 0.6376633986928104, "grad_norm": 44.40139011474442, "learning_rate": 2.7954390851723793e-07, "logits/chosen": 13.54839038848877, "logits/rejected": 14.213388442993164, "logps/chosen": -4.372539043426514, "logps/rejected": -4.567400932312012, "loss": 4.331, "rewards/accuracies": 0.5, "rewards/chosen": -43.72538757324219, "rewards/margins": 1.9486217498779297, "rewards/rejected": -45.67401123046875, "step": 4683 }, { "epoch": 0.6377995642701525, "grad_norm": 47.11867244991714, "learning_rate": 2.793626081972687e-07, "logits/chosen": 13.7659330368042, "logits/rejected": 13.970080375671387, "logps/chosen": -4.298036575317383, "logps/rejected": -4.156435012817383, "loss": 3.8786, "rewards/accuracies": 0.5, "rewards/chosen": -42.98036193847656, "rewards/margins": -1.416015625, "rewards/rejected": -41.56434631347656, "step": 4684 }, { "epoch": 0.6379357298474946, "grad_norm": 39.046608059312284, "learning_rate": 2.791813351363152e-07, "logits/chosen": 13.982868194580078, "logits/rejected": 14.478525161743164, "logps/chosen": -4.7508463859558105, "logps/rejected": -4.757472991943359, "loss": 4.1551, "rewards/accuracies": 0.25, "rewards/chosen": -47.50846481323242, "rewards/margins": 0.06626701354980469, "rewards/rejected": -47.574729919433594, "step": 4685 }, { "epoch": 0.6380718954248366, "grad_norm": 41.32935133022096, "learning_rate": 2.790000893753377e-07, "logits/chosen": 13.164302825927734, "logits/rejected": 12.839689254760742, "logps/chosen": -4.487361431121826, "logps/rejected": -4.411524772644043, "loss": 4.132, "rewards/accuracies": 0.25, "rewards/chosen": -44.87361145019531, "rewards/margins": -0.7583646774291992, "rewards/rejected": -44.11524963378906, "step": 4686 }, { "epoch": 0.6382080610021786, "grad_norm": 38.85167662643899, "learning_rate": 2.788188709552904e-07, "logits/chosen": 14.120573043823242, "logits/rejected": 14.044597625732422, "logps/chosen": -4.453903675079346, "logps/rejected": -4.976012229919434, "loss": 3.6775, "rewards/accuracies": 1.0, "rewards/chosen": -44.539031982421875, "rewards/margins": 5.221091270446777, "rewards/rejected": -49.76012420654297, "step": 4687 }, { "epoch": 0.6383442265795207, "grad_norm": 39.24057671625262, "learning_rate": 2.7863767991712075e-07, "logits/chosen": 13.398094177246094, "logits/rejected": 13.648931503295898, "logps/chosen": -4.198008060455322, "logps/rejected": -4.5081562995910645, "loss": 3.7566, "rewards/accuracies": 1.0, "rewards/chosen": -41.980079650878906, "rewards/margins": 3.1014833450317383, "rewards/rejected": -45.08156204223633, "step": 4688 }, { "epoch": 0.6384803921568627, "grad_norm": 41.63112120332154, "learning_rate": 2.7845651630177045e-07, "logits/chosen": 13.212053298950195, "logits/rejected": 13.329176902770996, "logps/chosen": -3.9824113845825195, "logps/rejected": -4.3439106941223145, "loss": 4.513, "rewards/accuracies": 0.75, "rewards/chosen": -39.82411193847656, "rewards/margins": 3.614992141723633, "rewards/rejected": -43.43910598754883, "step": 4689 }, { "epoch": 0.6386165577342048, "grad_norm": 41.788431626970244, "learning_rate": 2.7827538015017523e-07, "logits/chosen": 13.568552017211914, "logits/rejected": 13.783907890319824, "logps/chosen": -4.040060997009277, "logps/rejected": -4.283910751342773, "loss": 3.8167, "rewards/accuracies": 0.5, "rewards/chosen": -40.40060806274414, "rewards/margins": 2.4384965896606445, "rewards/rejected": -42.83910369873047, "step": 4690 }, { "epoch": 0.6387527233115469, "grad_norm": 37.44106748510257, "learning_rate": 2.780942715032639e-07, "logits/chosen": 13.43307876586914, "logits/rejected": 14.024229049682617, "logps/chosen": -4.066688537597656, "logps/rejected": -4.161060333251953, "loss": 3.8519, "rewards/accuracies": 0.75, "rewards/chosen": -40.66688537597656, "rewards/margins": 0.9437150955200195, "rewards/rejected": -41.61060333251953, "step": 4691 }, { "epoch": 0.6388888888888888, "grad_norm": 42.7477403179044, "learning_rate": 2.779131904019595e-07, "logits/chosen": 13.332467079162598, "logits/rejected": 13.418148040771484, "logps/chosen": -4.349688529968262, "logps/rejected": -4.631101608276367, "loss": 3.9546, "rewards/accuracies": 0.75, "rewards/chosen": -43.49688720703125, "rewards/margins": 2.8141307830810547, "rewards/rejected": -46.31101608276367, "step": 4692 }, { "epoch": 0.6390250544662309, "grad_norm": 42.19888333259724, "learning_rate": 2.7773213688717914e-07, "logits/chosen": 13.491188049316406, "logits/rejected": 12.959548950195312, "logps/chosen": -4.169837474822998, "logps/rejected": -4.420658111572266, "loss": 3.8563, "rewards/accuracies": 0.75, "rewards/chosen": -41.6983757019043, "rewards/margins": 2.5082054138183594, "rewards/rejected": -44.206581115722656, "step": 4693 }, { "epoch": 0.639161220043573, "grad_norm": 45.33431525808146, "learning_rate": 2.775511109998329e-07, "logits/chosen": 13.216302871704102, "logits/rejected": 13.24574089050293, "logps/chosen": -4.100140571594238, "logps/rejected": -4.173101902008057, "loss": 3.7292, "rewards/accuracies": 0.5, "rewards/chosen": -41.00140380859375, "rewards/margins": 0.7296142578125, "rewards/rejected": -41.73101806640625, "step": 4694 }, { "epoch": 0.639297385620915, "grad_norm": 39.33411769347935, "learning_rate": 2.773701127808254e-07, "logits/chosen": 13.487030982971191, "logits/rejected": 14.868558883666992, "logps/chosen": -4.474177837371826, "logps/rejected": -4.931878566741943, "loss": 3.9232, "rewards/accuracies": 0.75, "rewards/chosen": -44.74177932739258, "rewards/margins": 4.5770063400268555, "rewards/rejected": -49.318782806396484, "step": 4695 }, { "epoch": 0.6394335511982571, "grad_norm": 53.38061971728945, "learning_rate": 2.771891422710547e-07, "logits/chosen": 14.07438850402832, "logits/rejected": 14.237547874450684, "logps/chosen": -4.7999773025512695, "logps/rejected": -4.463384628295898, "loss": 4.9608, "rewards/accuracies": 0.25, "rewards/chosen": -47.9997673034668, "rewards/margins": -3.3659210205078125, "rewards/rejected": -44.63385009765625, "step": 4696 }, { "epoch": 0.6395697167755992, "grad_norm": 39.93619177893207, "learning_rate": 2.770081995114123e-07, "logits/chosen": 13.178531646728516, "logits/rejected": 13.659114837646484, "logps/chosen": -4.315981864929199, "logps/rejected": -4.517945289611816, "loss": 3.7731, "rewards/accuracies": 0.5, "rewards/chosen": -43.159820556640625, "rewards/margins": 2.019632339477539, "rewards/rejected": -45.17945098876953, "step": 4697 }, { "epoch": 0.6397058823529411, "grad_norm": 39.678522709770725, "learning_rate": 2.768272845427839e-07, "logits/chosen": 13.559867858886719, "logits/rejected": 14.795787811279297, "logps/chosen": -4.427992343902588, "logps/rejected": -4.766927719116211, "loss": 3.9609, "rewards/accuracies": 1.0, "rewards/chosen": -44.27992248535156, "rewards/margins": 3.3893537521362305, "rewards/rejected": -47.66927719116211, "step": 4698 }, { "epoch": 0.6398420479302832, "grad_norm": 39.96724525184667, "learning_rate": 2.766463974060489e-07, "logits/chosen": 14.003509521484375, "logits/rejected": 13.961227416992188, "logps/chosen": -4.57505989074707, "logps/rejected": -4.8238983154296875, "loss": 3.9971, "rewards/accuracies": 0.75, "rewards/chosen": -45.75060272216797, "rewards/margins": 2.488382339477539, "rewards/rejected": -48.238983154296875, "step": 4699 }, { "epoch": 0.6399782135076253, "grad_norm": 38.110728827739216, "learning_rate": 2.764655381420798e-07, "logits/chosen": 13.275618553161621, "logits/rejected": 13.177667617797852, "logps/chosen": -4.210662364959717, "logps/rejected": -4.463911056518555, "loss": 3.9619, "rewards/accuracies": 0.75, "rewards/chosen": -42.10662078857422, "rewards/margins": 2.5324859619140625, "rewards/rejected": -44.63910675048828, "step": 4700 }, { "epoch": 0.6401143790849673, "grad_norm": 38.87019939975228, "learning_rate": 2.7628470679174357e-07, "logits/chosen": 12.377958297729492, "logits/rejected": 13.249605178833008, "logps/chosen": -4.240107536315918, "logps/rejected": -4.384015083312988, "loss": 3.9947, "rewards/accuracies": 0.75, "rewards/chosen": -42.40108108520508, "rewards/margins": 1.4390716552734375, "rewards/rejected": -43.840152740478516, "step": 4701 }, { "epoch": 0.6402505446623094, "grad_norm": 39.49833962863604, "learning_rate": 2.761039033959006e-07, "logits/chosen": 12.538835525512695, "logits/rejected": 13.300674438476562, "logps/chosen": -4.149441719055176, "logps/rejected": -4.536191940307617, "loss": 3.7858, "rewards/accuracies": 1.0, "rewards/chosen": -41.494415283203125, "rewards/margins": 3.8675003051757812, "rewards/rejected": -45.361915588378906, "step": 4702 }, { "epoch": 0.6403867102396514, "grad_norm": 40.42547510674941, "learning_rate": 2.759231279954047e-07, "logits/chosen": 13.741278648376465, "logits/rejected": 13.172445297241211, "logps/chosen": -4.352996826171875, "logps/rejected": -4.177571773529053, "loss": 4.3916, "rewards/accuracies": 0.25, "rewards/chosen": -43.52996826171875, "rewards/margins": -1.7542486190795898, "rewards/rejected": -41.775718688964844, "step": 4703 }, { "epoch": 0.6405228758169934, "grad_norm": 40.52795106538501, "learning_rate": 2.757423806311036e-07, "logits/chosen": 13.595176696777344, "logits/rejected": 13.717540740966797, "logps/chosen": -4.654598236083984, "logps/rejected": -4.6840667724609375, "loss": 3.7783, "rewards/accuracies": 0.5, "rewards/chosen": -46.545982360839844, "rewards/margins": 0.2946891784667969, "rewards/rejected": -46.840667724609375, "step": 4704 }, { "epoch": 0.6406590413943355, "grad_norm": 37.32720794135362, "learning_rate": 2.7556166134383895e-07, "logits/chosen": 13.149393081665039, "logits/rejected": 14.503899574279785, "logps/chosen": -4.267043113708496, "logps/rejected": -4.7150983810424805, "loss": 3.7132, "rewards/accuracies": 1.0, "rewards/chosen": -42.67042922973633, "rewards/margins": 4.480554580688477, "rewards/rejected": -47.15098190307617, "step": 4705 }, { "epoch": 0.6407952069716776, "grad_norm": 39.99931795149041, "learning_rate": 2.753809701744453e-07, "logits/chosen": 12.885705947875977, "logits/rejected": 13.304444313049316, "logps/chosen": -4.13383674621582, "logps/rejected": -4.3833112716674805, "loss": 3.3423, "rewards/accuracies": 1.0, "rewards/chosen": -41.33836364746094, "rewards/margins": 2.494744300842285, "rewards/rejected": -43.833106994628906, "step": 4706 }, { "epoch": 0.6409313725490197, "grad_norm": 38.77733545731405, "learning_rate": 2.752003071637516e-07, "logits/chosen": 13.036974906921387, "logits/rejected": 12.73293399810791, "logps/chosen": -4.137280464172363, "logps/rejected": -3.8714513778686523, "loss": 4.381, "rewards/accuracies": 0.25, "rewards/chosen": -41.372802734375, "rewards/margins": -2.6582870483398438, "rewards/rejected": -38.714515686035156, "step": 4707 }, { "epoch": 0.6410675381263616, "grad_norm": 39.345722610621024, "learning_rate": 2.750196723525802e-07, "logits/chosen": 12.743850708007812, "logits/rejected": 13.595118522644043, "logps/chosen": -4.406959533691406, "logps/rejected": -4.704925537109375, "loss": 3.9306, "rewards/accuracies": 0.75, "rewards/chosen": -44.06959533691406, "rewards/margins": 2.979656219482422, "rewards/rejected": -47.04925537109375, "step": 4708 }, { "epoch": 0.6412037037037037, "grad_norm": 40.378563056354054, "learning_rate": 2.7483906578174686e-07, "logits/chosen": 12.890420913696289, "logits/rejected": 13.692886352539062, "logps/chosen": -4.361499309539795, "logps/rejected": -4.66607141494751, "loss": 3.9057, "rewards/accuracies": 0.75, "rewards/chosen": -43.614994049072266, "rewards/margins": 3.0457191467285156, "rewards/rejected": -46.66071319580078, "step": 4709 }, { "epoch": 0.6413398692810458, "grad_norm": 39.27568817312224, "learning_rate": 2.7465848749206115e-07, "logits/chosen": 13.00580883026123, "logits/rejected": 13.340896606445312, "logps/chosen": -3.986706018447876, "logps/rejected": -4.312512397766113, "loss": 3.5385, "rewards/accuracies": 0.75, "rewards/chosen": -39.867061614990234, "rewards/margins": 3.258061408996582, "rewards/rejected": -43.1251220703125, "step": 4710 }, { "epoch": 0.6414760348583878, "grad_norm": 39.684375056828685, "learning_rate": 2.7447793752432635e-07, "logits/chosen": 13.042448043823242, "logits/rejected": 13.429890632629395, "logps/chosen": -4.273083686828613, "logps/rejected": -4.382174491882324, "loss": 3.9866, "rewards/accuracies": 0.75, "rewards/chosen": -42.7308349609375, "rewards/margins": 1.0909051895141602, "rewards/rejected": -43.821739196777344, "step": 4711 }, { "epoch": 0.6416122004357299, "grad_norm": 34.66888389286409, "learning_rate": 2.742974159193392e-07, "logits/chosen": 14.245322227478027, "logits/rejected": 13.94122314453125, "logps/chosen": -4.608138084411621, "logps/rejected": -4.616571426391602, "loss": 4.0506, "rewards/accuracies": 0.75, "rewards/chosen": -46.08137893676758, "rewards/margins": 0.0843353271484375, "rewards/rejected": -46.165714263916016, "step": 4712 }, { "epoch": 0.641748366013072, "grad_norm": 43.53403878939162, "learning_rate": 2.741169227178898e-07, "logits/chosen": 14.521129608154297, "logits/rejected": 14.289302825927734, "logps/chosen": -4.734393119812012, "logps/rejected": -4.456251621246338, "loss": 4.2466, "rewards/accuracies": 0.0, "rewards/chosen": -47.34393310546875, "rewards/margins": -2.7814149856567383, "rewards/rejected": -44.56251525878906, "step": 4713 }, { "epoch": 0.6418845315904139, "grad_norm": 39.26324135568798, "learning_rate": 2.739364579607624e-07, "logits/chosen": 12.832377433776855, "logits/rejected": 12.435900688171387, "logps/chosen": -4.502418518066406, "logps/rejected": -4.322979927062988, "loss": 4.2541, "rewards/accuracies": 0.5, "rewards/chosen": -45.02418899536133, "rewards/margins": -1.7943925857543945, "rewards/rejected": -43.22979736328125, "step": 4714 }, { "epoch": 0.642020697167756, "grad_norm": 38.45052587227088, "learning_rate": 2.7375602168873435e-07, "logits/chosen": 11.994682312011719, "logits/rejected": 13.113851547241211, "logps/chosen": -4.0282721519470215, "logps/rejected": -4.291051864624023, "loss": 3.6466, "rewards/accuracies": 0.5, "rewards/chosen": -40.28272247314453, "rewards/margins": 2.627800941467285, "rewards/rejected": -42.9105224609375, "step": 4715 }, { "epoch": 0.6421568627450981, "grad_norm": 38.250411935804145, "learning_rate": 2.735756139425768e-07, "logits/chosen": 12.94471549987793, "logits/rejected": 13.078168869018555, "logps/chosen": -3.860795021057129, "logps/rejected": -4.267982482910156, "loss": 3.5528, "rewards/accuracies": 1.0, "rewards/chosen": -38.607948303222656, "rewards/margins": 4.071872711181641, "rewards/rejected": -42.67982482910156, "step": 4716 }, { "epoch": 0.6422930283224401, "grad_norm": 39.0366977384253, "learning_rate": 2.7339523476305426e-07, "logits/chosen": 13.548620223999023, "logits/rejected": 13.902416229248047, "logps/chosen": -4.23185920715332, "logps/rejected": -4.297962188720703, "loss": 3.7099, "rewards/accuracies": 0.5, "rewards/chosen": -42.31859588623047, "rewards/margins": 0.6610307693481445, "rewards/rejected": -42.9796257019043, "step": 4717 }, { "epoch": 0.6424291938997821, "grad_norm": 42.189183095065594, "learning_rate": 2.732148841909249e-07, "logits/chosen": 12.64063835144043, "logits/rejected": 13.05148983001709, "logps/chosen": -3.970440626144409, "logps/rejected": -4.356861114501953, "loss": 4.0152, "rewards/accuracies": 1.0, "rewards/chosen": -39.70440673828125, "rewards/margins": 3.8642044067382812, "rewards/rejected": -43.56861114501953, "step": 4718 }, { "epoch": 0.6425653594771242, "grad_norm": 36.34727802773603, "learning_rate": 2.7303456226694056e-07, "logits/chosen": 13.66161823272705, "logits/rejected": 13.68348503112793, "logps/chosen": -4.558903694152832, "logps/rejected": -4.5340728759765625, "loss": 3.7215, "rewards/accuracies": 0.5, "rewards/chosen": -45.58904266357422, "rewards/margins": -0.24831295013427734, "rewards/rejected": -45.340728759765625, "step": 4719 }, { "epoch": 0.6427015250544662, "grad_norm": 46.762839882400165, "learning_rate": 2.7285426903184636e-07, "logits/chosen": 12.747830390930176, "logits/rejected": 13.63693618774414, "logps/chosen": -4.107867240905762, "logps/rejected": -4.388791084289551, "loss": 4.2094, "rewards/accuracies": 0.5, "rewards/chosen": -41.07867431640625, "rewards/margins": 2.809239387512207, "rewards/rejected": -43.887916564941406, "step": 4720 }, { "epoch": 0.6428376906318083, "grad_norm": 40.65312645026179, "learning_rate": 2.726740045263811e-07, "logits/chosen": 12.807123184204102, "logits/rejected": 13.507511138916016, "logps/chosen": -4.542518615722656, "logps/rejected": -4.842299461364746, "loss": 4.0033, "rewards/accuracies": 1.0, "rewards/chosen": -45.42518615722656, "rewards/margins": 2.997807502746582, "rewards/rejected": -48.42299270629883, "step": 4721 }, { "epoch": 0.6429738562091504, "grad_norm": 36.998304556222756, "learning_rate": 2.724937687912769e-07, "logits/chosen": 12.90865707397461, "logits/rejected": 13.392271041870117, "logps/chosen": -4.2432146072387695, "logps/rejected": -4.172891616821289, "loss": 3.5922, "rewards/accuracies": 0.5, "rewards/chosen": -42.43214416503906, "rewards/margins": -0.7032260894775391, "rewards/rejected": -41.728919982910156, "step": 4722 }, { "epoch": 0.6431100217864923, "grad_norm": 39.242835194654894, "learning_rate": 2.7231356186725976e-07, "logits/chosen": 12.805709838867188, "logits/rejected": 13.172588348388672, "logps/chosen": -3.9786667823791504, "logps/rejected": -4.212008476257324, "loss": 3.7296, "rewards/accuracies": 0.75, "rewards/chosen": -39.78666687011719, "rewards/margins": 2.333420753479004, "rewards/rejected": -42.120086669921875, "step": 4723 }, { "epoch": 0.6432461873638344, "grad_norm": 36.523592812037066, "learning_rate": 2.721333837950486e-07, "logits/chosen": 12.361515045166016, "logits/rejected": 13.687826156616211, "logps/chosen": -3.98880672454834, "logps/rejected": -4.374948024749756, "loss": 3.9843, "rewards/accuracies": 1.0, "rewards/chosen": -39.888065338134766, "rewards/margins": 3.8614139556884766, "rewards/rejected": -43.749481201171875, "step": 4724 }, { "epoch": 0.6433823529411765, "grad_norm": 39.84623061370271, "learning_rate": 2.7195323461535644e-07, "logits/chosen": 13.511301040649414, "logits/rejected": 14.047772407531738, "logps/chosen": -4.310632705688477, "logps/rejected": -4.273255348205566, "loss": 4.1673, "rewards/accuracies": 0.25, "rewards/chosen": -43.10633087158203, "rewards/margins": -0.3737783432006836, "rewards/rejected": -42.732547760009766, "step": 4725 }, { "epoch": 0.6435185185185185, "grad_norm": 37.94698151432036, "learning_rate": 2.717731143688895e-07, "logits/chosen": 11.753694534301758, "logits/rejected": 13.386940002441406, "logps/chosen": -3.5874900817871094, "logps/rejected": -4.375616073608398, "loss": 3.937, "rewards/accuracies": 1.0, "rewards/chosen": -35.87489700317383, "rewards/margins": 7.88126277923584, "rewards/rejected": -43.75616455078125, "step": 4726 }, { "epoch": 0.6436546840958606, "grad_norm": 37.44980004068919, "learning_rate": 2.7159302309634705e-07, "logits/chosen": 13.450170516967773, "logits/rejected": 13.96882152557373, "logps/chosen": -4.655773162841797, "logps/rejected": -4.87638521194458, "loss": 4.1381, "rewards/accuracies": 0.5, "rewards/chosen": -46.55772399902344, "rewards/margins": 2.2061262130737305, "rewards/rejected": -48.76385498046875, "step": 4727 }, { "epoch": 0.6437908496732027, "grad_norm": 34.0083564402871, "learning_rate": 2.7141296083842255e-07, "logits/chosen": 11.880212783813477, "logits/rejected": 12.7874755859375, "logps/chosen": -3.794196605682373, "logps/rejected": -4.259637355804443, "loss": 3.7772, "rewards/accuracies": 1.0, "rewards/chosen": -37.94196701049805, "rewards/margins": 4.654404640197754, "rewards/rejected": -42.59637451171875, "step": 4728 }, { "epoch": 0.6439270152505446, "grad_norm": 40.21544650110489, "learning_rate": 2.712329276358026e-07, "logits/chosen": 12.585783004760742, "logits/rejected": 13.212276458740234, "logps/chosen": -4.055981159210205, "logps/rejected": -4.3178486824035645, "loss": 4.2691, "rewards/accuracies": 0.5, "rewards/chosen": -40.559814453125, "rewards/margins": 2.618675708770752, "rewards/rejected": -43.178489685058594, "step": 4729 }, { "epoch": 0.6440631808278867, "grad_norm": 43.69582400596463, "learning_rate": 2.710529235291669e-07, "logits/chosen": 13.208440780639648, "logits/rejected": 12.692699432373047, "logps/chosen": -4.604549407958984, "logps/rejected": -4.291451930999756, "loss": 4.5236, "rewards/accuracies": 0.5, "rewards/chosen": -46.045494079589844, "rewards/margins": -3.1309738159179688, "rewards/rejected": -42.914520263671875, "step": 4730 }, { "epoch": 0.6441993464052288, "grad_norm": 40.748933142848735, "learning_rate": 2.708729485591889e-07, "logits/chosen": 13.68096923828125, "logits/rejected": 13.593387603759766, "logps/chosen": -4.425037860870361, "logps/rejected": -4.446794509887695, "loss": 4.1897, "rewards/accuracies": 0.5, "rewards/chosen": -44.25037384033203, "rewards/margins": 0.21756935119628906, "rewards/rejected": -44.46794509887695, "step": 4731 }, { "epoch": 0.6443355119825708, "grad_norm": 40.04281630244232, "learning_rate": 2.7069300276653584e-07, "logits/chosen": 12.612619400024414, "logits/rejected": 14.080018043518066, "logps/chosen": -4.381283760070801, "logps/rejected": -4.869791507720947, "loss": 3.8052, "rewards/accuracies": 1.0, "rewards/chosen": -43.81283950805664, "rewards/margins": 4.885076522827148, "rewards/rejected": -48.697914123535156, "step": 4732 }, { "epoch": 0.6444716775599129, "grad_norm": 38.04582794797178, "learning_rate": 2.7051308619186744e-07, "logits/chosen": 13.37948226928711, "logits/rejected": 14.686767578125, "logps/chosen": -4.541315078735352, "logps/rejected": -4.754268646240234, "loss": 3.6362, "rewards/accuracies": 0.75, "rewards/chosen": -45.41314697265625, "rewards/margins": 2.129535675048828, "rewards/rejected": -47.54268264770508, "step": 4733 }, { "epoch": 0.6446078431372549, "grad_norm": 39.75747742278025, "learning_rate": 2.7033319887583765e-07, "logits/chosen": 13.571035385131836, "logits/rejected": 13.294962882995605, "logps/chosen": -4.034256935119629, "logps/rejected": -4.1577630043029785, "loss": 4.1515, "rewards/accuracies": 0.5, "rewards/chosen": -40.342567443847656, "rewards/margins": 1.235062599182129, "rewards/rejected": -41.57762908935547, "step": 4734 }, { "epoch": 0.6447440087145969, "grad_norm": 39.96577330593487, "learning_rate": 2.701533408590935e-07, "logits/chosen": 13.604455947875977, "logits/rejected": 13.54586124420166, "logps/chosen": -4.363683223724365, "logps/rejected": -4.420561790466309, "loss": 4.1186, "rewards/accuracies": 0.5, "rewards/chosen": -43.63683319091797, "rewards/margins": 0.5687799453735352, "rewards/rejected": -44.20561218261719, "step": 4735 }, { "epoch": 0.644880174291939, "grad_norm": 39.91347094745743, "learning_rate": 2.6997351218227515e-07, "logits/chosen": 12.701071739196777, "logits/rejected": 12.351579666137695, "logps/chosen": -4.02301025390625, "logps/rejected": -3.9734926223754883, "loss": 3.8433, "rewards/accuracies": 0.25, "rewards/chosen": -40.2301025390625, "rewards/margins": -0.4951772689819336, "rewards/rejected": -39.73492431640625, "step": 4736 }, { "epoch": 0.6450163398692811, "grad_norm": 40.66397723992694, "learning_rate": 2.697937128860166e-07, "logits/chosen": 13.6336669921875, "logits/rejected": 13.513965606689453, "logps/chosen": -4.409253120422363, "logps/rejected": -4.482010364532471, "loss": 3.6277, "rewards/accuracies": 0.5, "rewards/chosen": -44.09253692626953, "rewards/margins": 0.7275667190551758, "rewards/rejected": -44.820098876953125, "step": 4737 }, { "epoch": 0.6451525054466231, "grad_norm": 38.12292665025931, "learning_rate": 2.69613943010945e-07, "logits/chosen": 13.280569076538086, "logits/rejected": 13.500819206237793, "logps/chosen": -4.38803243637085, "logps/rejected": -4.2222514152526855, "loss": 3.4559, "rewards/accuracies": 0.25, "rewards/chosen": -43.88032150268555, "rewards/margins": -1.6578121185302734, "rewards/rejected": -42.222511291503906, "step": 4738 }, { "epoch": 0.6452886710239651, "grad_norm": 39.93528605259363, "learning_rate": 2.6943420259768063e-07, "logits/chosen": 13.698736190795898, "logits/rejected": 13.302305221557617, "logps/chosen": -4.664191246032715, "logps/rejected": -4.636147499084473, "loss": 4.3178, "rewards/accuracies": 0.25, "rewards/chosen": -46.64191436767578, "rewards/margins": -0.28043556213378906, "rewards/rejected": -46.36147689819336, "step": 4739 }, { "epoch": 0.6454248366013072, "grad_norm": 37.35969425793782, "learning_rate": 2.6925449168683736e-07, "logits/chosen": 13.535758972167969, "logits/rejected": 13.40118408203125, "logps/chosen": -4.207859039306641, "logps/rejected": -4.442892551422119, "loss": 3.8798, "rewards/accuracies": 0.75, "rewards/chosen": -42.078590393066406, "rewards/margins": 2.3503332138061523, "rewards/rejected": -44.428924560546875, "step": 4740 }, { "epoch": 0.6455610021786492, "grad_norm": 36.61123107150328, "learning_rate": 2.690748103190227e-07, "logits/chosen": 13.569284439086914, "logits/rejected": 13.825000762939453, "logps/chosen": -4.4333343505859375, "logps/rejected": -4.796553611755371, "loss": 4.2716, "rewards/accuracies": 0.75, "rewards/chosen": -44.333343505859375, "rewards/margins": 3.6321868896484375, "rewards/rejected": -47.96553039550781, "step": 4741 }, { "epoch": 0.6456971677559913, "grad_norm": 41.07470592038811, "learning_rate": 2.688951585348367e-07, "logits/chosen": 13.288576126098633, "logits/rejected": 13.801140785217285, "logps/chosen": -4.187750816345215, "logps/rejected": -4.412484169006348, "loss": 3.4946, "rewards/accuracies": 0.5, "rewards/chosen": -41.87751007080078, "rewards/margins": 2.2473297119140625, "rewards/rejected": -44.12483596801758, "step": 4742 }, { "epoch": 0.6458333333333334, "grad_norm": 42.368811783863954, "learning_rate": 2.687155363748734e-07, "logits/chosen": 13.16050910949707, "logits/rejected": 13.985824584960938, "logps/chosen": -4.258853435516357, "logps/rejected": -4.466813087463379, "loss": 3.8608, "rewards/accuracies": 0.5, "rewards/chosen": -42.588531494140625, "rewards/margins": 2.079596519470215, "rewards/rejected": -44.668128967285156, "step": 4743 }, { "epoch": 0.6459694989106753, "grad_norm": 41.05415537265269, "learning_rate": 2.6853594387972005e-07, "logits/chosen": 13.264509201049805, "logits/rejected": 13.229400634765625, "logps/chosen": -4.444889068603516, "logps/rejected": -4.168723106384277, "loss": 4.1981, "rewards/accuracies": 0.0, "rewards/chosen": -44.44888687133789, "rewards/margins": -2.761655807495117, "rewards/rejected": -41.687232971191406, "step": 4744 }, { "epoch": 0.6461056644880174, "grad_norm": 37.248816455184695, "learning_rate": 2.683563810899566e-07, "logits/chosen": 12.712812423706055, "logits/rejected": 13.277624130249023, "logps/chosen": -3.9664034843444824, "logps/rejected": -4.3154754638671875, "loss": 3.8521, "rewards/accuracies": 1.0, "rewards/chosen": -39.66403579711914, "rewards/margins": 3.490719795227051, "rewards/rejected": -43.154754638671875, "step": 4745 }, { "epoch": 0.6462418300653595, "grad_norm": 39.72193740959342, "learning_rate": 2.6817684804615706e-07, "logits/chosen": 13.229045867919922, "logits/rejected": 14.036518096923828, "logps/chosen": -4.251955986022949, "logps/rejected": -4.483489036560059, "loss": 3.8703, "rewards/accuracies": 0.5, "rewards/chosen": -42.519561767578125, "rewards/margins": 2.315328598022461, "rewards/rejected": -44.83488845825195, "step": 4746 }, { "epoch": 0.6463779956427015, "grad_norm": 38.17762274434748, "learning_rate": 2.6799734478888855e-07, "logits/chosen": 13.468244552612305, "logits/rejected": 13.86473274230957, "logps/chosen": -4.292396545410156, "logps/rejected": -4.726401329040527, "loss": 4.1459, "rewards/accuracies": 0.75, "rewards/chosen": -42.92396545410156, "rewards/margins": 4.340047836303711, "rewards/rejected": -47.264015197753906, "step": 4747 }, { "epoch": 0.6465141612200436, "grad_norm": 38.10348087875334, "learning_rate": 2.6781787135871097e-07, "logits/chosen": 13.30885124206543, "logits/rejected": 14.297405242919922, "logps/chosen": -4.154751777648926, "logps/rejected": -4.661530494689941, "loss": 3.5135, "rewards/accuracies": 1.0, "rewards/chosen": -41.547515869140625, "rewards/margins": 5.0677900314331055, "rewards/rejected": -46.61530303955078, "step": 4748 }, { "epoch": 0.6466503267973857, "grad_norm": 41.59008670934385, "learning_rate": 2.6763842779617793e-07, "logits/chosen": 12.7710599899292, "logits/rejected": 12.905282020568848, "logps/chosen": -4.132315158843994, "logps/rejected": -4.2720627784729, "loss": 4.3486, "rewards/accuracies": 0.5, "rewards/chosen": -41.32315444946289, "rewards/margins": 1.3974742889404297, "rewards/rejected": -42.72062683105469, "step": 4749 }, { "epoch": 0.6467864923747276, "grad_norm": 39.90552674115288, "learning_rate": 2.674590141418365e-07, "logits/chosen": 13.870094299316406, "logits/rejected": 14.16952133178711, "logps/chosen": -4.245183944702148, "logps/rejected": -4.697723865509033, "loss": 4.004, "rewards/accuracies": 0.75, "rewards/chosen": -42.451839447021484, "rewards/margins": 4.525400161743164, "rewards/rejected": -46.97724151611328, "step": 4750 }, { "epoch": 0.6469226579520697, "grad_norm": 36.447684214338274, "learning_rate": 2.672796304362262e-07, "logits/chosen": 13.192602157592773, "logits/rejected": 13.68783950805664, "logps/chosen": -4.559352874755859, "logps/rejected": -4.572892189025879, "loss": 3.8097, "rewards/accuracies": 0.5, "rewards/chosen": -45.593528747558594, "rewards/margins": 0.1353931427001953, "rewards/rejected": -45.728919982910156, "step": 4751 }, { "epoch": 0.6470588235294118, "grad_norm": 36.765544246712494, "learning_rate": 2.6710027671988044e-07, "logits/chosen": 12.907363891601562, "logits/rejected": 13.663880348205566, "logps/chosen": -4.0475077629089355, "logps/rejected": -4.404423713684082, "loss": 3.765, "rewards/accuracies": 0.75, "rewards/chosen": -40.475074768066406, "rewards/margins": 3.5691566467285156, "rewards/rejected": -44.04423522949219, "step": 4752 }, { "epoch": 0.6471949891067538, "grad_norm": 40.10200135382267, "learning_rate": 2.6692095303332596e-07, "logits/chosen": 13.564157485961914, "logits/rejected": 13.730162620544434, "logps/chosen": -4.091520309448242, "logps/rejected": -4.258577823638916, "loss": 3.8464, "rewards/accuracies": 0.75, "rewards/chosen": -40.91520309448242, "rewards/margins": 1.6705732345581055, "rewards/rejected": -42.585777282714844, "step": 4753 }, { "epoch": 0.6473311546840959, "grad_norm": 40.54097257686643, "learning_rate": 2.66741659417082e-07, "logits/chosen": 13.19379711151123, "logits/rejected": 13.028024673461914, "logps/chosen": -4.0518107414245605, "logps/rejected": -4.06958532333374, "loss": 3.7371, "rewards/accuracies": 0.5, "rewards/chosen": -40.51810836791992, "rewards/margins": 0.17774677276611328, "rewards/rejected": -40.69585418701172, "step": 4754 }, { "epoch": 0.6474673202614379, "grad_norm": 36.545222438937266, "learning_rate": 2.665623959116616e-07, "logits/chosen": 12.078474044799805, "logits/rejected": 13.156986236572266, "logps/chosen": -3.787116765975952, "logps/rejected": -4.35162353515625, "loss": 3.7689, "rewards/accuracies": 1.0, "rewards/chosen": -37.87117004394531, "rewards/margins": 5.645069122314453, "rewards/rejected": -43.5162353515625, "step": 4755 }, { "epoch": 0.6476034858387799, "grad_norm": 40.21630038417836, "learning_rate": 2.6638316255757094e-07, "logits/chosen": 12.691761016845703, "logits/rejected": 13.930326461791992, "logps/chosen": -4.11287784576416, "logps/rejected": -4.43571662902832, "loss": 4.3015, "rewards/accuracies": 0.75, "rewards/chosen": -41.1287841796875, "rewards/margins": 3.228386878967285, "rewards/rejected": -44.35717010498047, "step": 4756 }, { "epoch": 0.647739651416122, "grad_norm": 38.25820943407501, "learning_rate": 2.662039593953092e-07, "logits/chosen": 12.999734878540039, "logits/rejected": 13.784786224365234, "logps/chosen": -4.067040920257568, "logps/rejected": -4.443770408630371, "loss": 4.1148, "rewards/accuracies": 0.75, "rewards/chosen": -40.67041015625, "rewards/margins": 3.767293930053711, "rewards/rejected": -44.437705993652344, "step": 4757 }, { "epoch": 0.6478758169934641, "grad_norm": 37.385385768524124, "learning_rate": 2.660247864653687e-07, "logits/chosen": 12.694656372070312, "logits/rejected": 13.698690414428711, "logps/chosen": -3.9834322929382324, "logps/rejected": -4.5444183349609375, "loss": 3.8059, "rewards/accuracies": 1.0, "rewards/chosen": -39.83432388305664, "rewards/margins": 5.609856605529785, "rewards/rejected": -45.444183349609375, "step": 4758 }, { "epoch": 0.648011982570806, "grad_norm": 35.63691450640866, "learning_rate": 2.658456438082352e-07, "logits/chosen": 13.629167556762695, "logits/rejected": 13.676661491394043, "logps/chosen": -4.1959428787231445, "logps/rejected": -4.529530048370361, "loss": 3.3537, "rewards/accuracies": 0.5, "rewards/chosen": -41.959434509277344, "rewards/margins": 3.335869789123535, "rewards/rejected": -45.29530334472656, "step": 4759 }, { "epoch": 0.6481481481481481, "grad_norm": 40.67689211039443, "learning_rate": 2.656665314643875e-07, "logits/chosen": 13.451765060424805, "logits/rejected": 13.489507675170898, "logps/chosen": -4.1169281005859375, "logps/rejected": -4.128021240234375, "loss": 4.4481, "rewards/accuracies": 0.5, "rewards/chosen": -41.169281005859375, "rewards/margins": 0.1109323501586914, "rewards/rejected": -41.28021240234375, "step": 4760 }, { "epoch": 0.6482843137254902, "grad_norm": 38.88565758518848, "learning_rate": 2.6548744947429725e-07, "logits/chosen": 13.757157325744629, "logits/rejected": 13.130374908447266, "logps/chosen": -4.584486484527588, "logps/rejected": -4.31961727142334, "loss": 3.6685, "rewards/accuracies": 0.25, "rewards/chosen": -45.84486389160156, "rewards/margins": -2.6486940383911133, "rewards/rejected": -43.196170806884766, "step": 4761 }, { "epoch": 0.6484204793028322, "grad_norm": 40.91599326716376, "learning_rate": 2.6530839787842986e-07, "logits/chosen": 12.437112808227539, "logits/rejected": 12.795106887817383, "logps/chosen": -4.158336639404297, "logps/rejected": -4.4092912673950195, "loss": 3.7905, "rewards/accuracies": 0.75, "rewards/chosen": -41.58336639404297, "rewards/margins": 2.50954532623291, "rewards/rejected": -44.09291076660156, "step": 4762 }, { "epoch": 0.6485566448801743, "grad_norm": 39.74714780558259, "learning_rate": 2.6512937671724315e-07, "logits/chosen": 13.113933563232422, "logits/rejected": 13.770914077758789, "logps/chosen": -4.314401626586914, "logps/rejected": -4.590219497680664, "loss": 4.0649, "rewards/accuracies": 1.0, "rewards/chosen": -43.14401626586914, "rewards/margins": 2.758181571960449, "rewards/rejected": -45.902198791503906, "step": 4763 }, { "epoch": 0.6486928104575164, "grad_norm": 36.857426493766035, "learning_rate": 2.6495038603118873e-07, "logits/chosen": 13.798055648803711, "logits/rejected": 13.670093536376953, "logps/chosen": -4.368202209472656, "logps/rejected": -4.440040111541748, "loss": 3.9095, "rewards/accuracies": 0.5, "rewards/chosen": -43.6820182800293, "rewards/margins": 0.7183847427368164, "rewards/rejected": -44.4004020690918, "step": 4764 }, { "epoch": 0.6488289760348583, "grad_norm": 41.09480602220481, "learning_rate": 2.6477142586071104e-07, "logits/chosen": 13.192566871643066, "logits/rejected": 12.92794418334961, "logps/chosen": -3.9911463260650635, "logps/rejected": -4.163193702697754, "loss": 3.496, "rewards/accuracies": 0.75, "rewards/chosen": -39.911460876464844, "rewards/margins": 1.7204742431640625, "rewards/rejected": -41.63193893432617, "step": 4765 }, { "epoch": 0.6489651416122004, "grad_norm": 35.87942547869524, "learning_rate": 2.645924962462473e-07, "logits/chosen": 13.966809272766113, "logits/rejected": 13.360036849975586, "logps/chosen": -3.9700846672058105, "logps/rejected": -4.389682292938232, "loss": 3.5284, "rewards/accuracies": 1.0, "rewards/chosen": -39.70084762573242, "rewards/margins": 4.195977210998535, "rewards/rejected": -43.89682388305664, "step": 4766 }, { "epoch": 0.6491013071895425, "grad_norm": 40.88323683489746, "learning_rate": 2.644135972282284e-07, "logits/chosen": 13.16372013092041, "logits/rejected": 12.538582801818848, "logps/chosen": -3.9747815132141113, "logps/rejected": -4.000197410583496, "loss": 4.1084, "rewards/accuracies": 0.5, "rewards/chosen": -39.74781799316406, "rewards/margins": 0.25415802001953125, "rewards/rejected": -40.001976013183594, "step": 4767 }, { "epoch": 0.6492374727668845, "grad_norm": 36.22186475982032, "learning_rate": 2.6423472884707803e-07, "logits/chosen": 13.650996208190918, "logits/rejected": 13.548255920410156, "logps/chosen": -4.188801288604736, "logps/rejected": -4.4035139083862305, "loss": 3.6364, "rewards/accuracies": 1.0, "rewards/chosen": -41.88801574707031, "rewards/margins": 2.147125244140625, "rewards/rejected": -44.03514099121094, "step": 4768 }, { "epoch": 0.6493736383442266, "grad_norm": 41.25023360957064, "learning_rate": 2.640558911432128e-07, "logits/chosen": 13.040765762329102, "logits/rejected": 14.036667823791504, "logps/chosen": -4.525237560272217, "logps/rejected": -4.7839555740356445, "loss": 3.9435, "rewards/accuracies": 0.75, "rewards/chosen": -45.25237274169922, "rewards/margins": 2.5871782302856445, "rewards/rejected": -47.83955383300781, "step": 4769 }, { "epoch": 0.6495098039215687, "grad_norm": 40.920304609876496, "learning_rate": 2.638770841570427e-07, "logits/chosen": 12.710808753967285, "logits/rejected": 12.804664611816406, "logps/chosen": -4.357916831970215, "logps/rejected": -4.465837478637695, "loss": 4.0098, "rewards/accuracies": 0.5, "rewards/chosen": -43.57917022705078, "rewards/margins": 1.0792036056518555, "rewards/rejected": -44.65837097167969, "step": 4770 }, { "epoch": 0.6496459694989106, "grad_norm": 39.866790048069646, "learning_rate": 2.636983079289708e-07, "logits/chosen": 13.286696434020996, "logits/rejected": 13.407742500305176, "logps/chosen": -4.36168098449707, "logps/rejected": -4.531546115875244, "loss": 3.8294, "rewards/accuracies": 0.75, "rewards/chosen": -43.61680603027344, "rewards/margins": 1.6986570358276367, "rewards/rejected": -45.315460205078125, "step": 4771 }, { "epoch": 0.6497821350762527, "grad_norm": 40.85812409185394, "learning_rate": 2.635195624993927e-07, "logits/chosen": 12.61882209777832, "logits/rejected": 13.816003799438477, "logps/chosen": -4.132763862609863, "logps/rejected": -4.7122368812561035, "loss": 4.2555, "rewards/accuracies": 1.0, "rewards/chosen": -41.32763671875, "rewards/margins": 5.794729232788086, "rewards/rejected": -47.12236785888672, "step": 4772 }, { "epoch": 0.6499183006535948, "grad_norm": 39.241435249075515, "learning_rate": 2.6334084790869766e-07, "logits/chosen": 13.88475227355957, "logits/rejected": 14.315589904785156, "logps/chosen": -3.9731953144073486, "logps/rejected": -4.6079936027526855, "loss": 3.4171, "rewards/accuracies": 1.0, "rewards/chosen": -39.73195266723633, "rewards/margins": 6.347982406616211, "rewards/rejected": -46.07993698120117, "step": 4773 }, { "epoch": 0.6500544662309368, "grad_norm": 37.35000587034455, "learning_rate": 2.631621641972678e-07, "logits/chosen": 13.296921730041504, "logits/rejected": 13.129829406738281, "logps/chosen": -4.125950813293457, "logps/rejected": -4.399165153503418, "loss": 3.9179, "rewards/accuracies": 0.75, "rewards/chosen": -41.25950622558594, "rewards/margins": 2.7321434020996094, "rewards/rejected": -43.99164962768555, "step": 4774 }, { "epoch": 0.6501906318082789, "grad_norm": 41.98720690235972, "learning_rate": 2.62983511405478e-07, "logits/chosen": 12.634504318237305, "logits/rejected": 12.600589752197266, "logps/chosen": -4.349497318267822, "logps/rejected": -4.453205108642578, "loss": 3.8004, "rewards/accuracies": 0.75, "rewards/chosen": -43.49497604370117, "rewards/margins": 1.0370759963989258, "rewards/rejected": -44.53205108642578, "step": 4775 }, { "epoch": 0.6503267973856209, "grad_norm": 42.75308528537219, "learning_rate": 2.628048895736963e-07, "logits/chosen": 14.000402450561523, "logits/rejected": 13.887458801269531, "logps/chosen": -4.421389579772949, "logps/rejected": -4.37325382232666, "loss": 4.3634, "rewards/accuracies": 0.5, "rewards/chosen": -44.213897705078125, "rewards/margins": -0.4813575744628906, "rewards/rejected": -43.73253631591797, "step": 4776 }, { "epoch": 0.6504629629629629, "grad_norm": 40.33720396605906, "learning_rate": 2.6262629874228386e-07, "logits/chosen": 13.0881986618042, "logits/rejected": 14.107248306274414, "logps/chosen": -4.026988506317139, "logps/rejected": -4.586010932922363, "loss": 3.1388, "rewards/accuracies": 1.0, "rewards/chosen": -40.2698860168457, "rewards/margins": 5.590222358703613, "rewards/rejected": -45.860107421875, "step": 4777 }, { "epoch": 0.650599128540305, "grad_norm": 39.98855907169053, "learning_rate": 2.6244773895159495e-07, "logits/chosen": 13.551302909851074, "logits/rejected": 13.552406311035156, "logps/chosen": -4.0944929122924805, "logps/rejected": -4.41850471496582, "loss": 3.91, "rewards/accuracies": 0.75, "rewards/chosen": -40.94492721557617, "rewards/margins": 3.2401180267333984, "rewards/rejected": -44.18504333496094, "step": 4778 }, { "epoch": 0.6507352941176471, "grad_norm": 39.31012148071809, "learning_rate": 2.6226921024197627e-07, "logits/chosen": 13.547611236572266, "logits/rejected": 13.847177505493164, "logps/chosen": -4.535327911376953, "logps/rejected": -4.773970127105713, "loss": 4.1234, "rewards/accuracies": 0.75, "rewards/chosen": -45.35327911376953, "rewards/margins": 2.386423110961914, "rewards/rejected": -47.73970031738281, "step": 4779 }, { "epoch": 0.650871459694989, "grad_norm": 39.98665131786521, "learning_rate": 2.6209071265376806e-07, "logits/chosen": 12.439516067504883, "logits/rejected": 13.167640686035156, "logps/chosen": -4.084953308105469, "logps/rejected": -4.431154727935791, "loss": 4.2204, "rewards/accuracies": 0.5, "rewards/chosen": -40.84953308105469, "rewards/margins": 3.4620113372802734, "rewards/rejected": -44.311546325683594, "step": 4780 }, { "epoch": 0.6510076252723311, "grad_norm": 41.15109765662894, "learning_rate": 2.619122462273034e-07, "logits/chosen": 13.714498519897461, "logits/rejected": 13.875816345214844, "logps/chosen": -4.254837989807129, "logps/rejected": -4.483837604522705, "loss": 3.5207, "rewards/accuracies": 0.5, "rewards/chosen": -42.548377990722656, "rewards/margins": 2.289999008178711, "rewards/rejected": -44.838375091552734, "step": 4781 }, { "epoch": 0.6511437908496732, "grad_norm": 42.82105867215818, "learning_rate": 2.6173381100290803e-07, "logits/chosen": 13.018596649169922, "logits/rejected": 14.499759674072266, "logps/chosen": -4.325807094573975, "logps/rejected": -4.930941581726074, "loss": 3.3596, "rewards/accuracies": 1.0, "rewards/chosen": -43.25807189941406, "rewards/margins": 6.0513458251953125, "rewards/rejected": -49.309417724609375, "step": 4782 }, { "epoch": 0.6512799564270153, "grad_norm": 36.392591938368135, "learning_rate": 2.6155540702090094e-07, "logits/chosen": 13.649375915527344, "logits/rejected": 14.141101837158203, "logps/chosen": -4.5217790603637695, "logps/rejected": -4.5793304443359375, "loss": 4.0023, "rewards/accuracies": 0.5, "rewards/chosen": -45.21778869628906, "rewards/margins": 0.5755157470703125, "rewards/rejected": -45.79330825805664, "step": 4783 }, { "epoch": 0.6514161220043573, "grad_norm": 42.156877432869855, "learning_rate": 2.6137703432159423e-07, "logits/chosen": 13.949546813964844, "logits/rejected": 13.652750015258789, "logps/chosen": -4.809566497802734, "logps/rejected": -4.302540302276611, "loss": 4.0432, "rewards/accuracies": 0.0, "rewards/chosen": -48.095664978027344, "rewards/margins": -5.0702619552612305, "rewards/rejected": -43.02540588378906, "step": 4784 }, { "epoch": 0.6515522875816994, "grad_norm": 38.93480135071886, "learning_rate": 2.611986929452923e-07, "logits/chosen": 13.137985229492188, "logits/rejected": 13.774768829345703, "logps/chosen": -4.522547245025635, "logps/rejected": -4.8865437507629395, "loss": 3.5893, "rewards/accuracies": 0.75, "rewards/chosen": -45.22547149658203, "rewards/margins": 3.6399669647216797, "rewards/rejected": -48.865440368652344, "step": 4785 }, { "epoch": 0.6516884531590414, "grad_norm": 41.956359072057985, "learning_rate": 2.6102038293229306e-07, "logits/chosen": 12.940839767456055, "logits/rejected": 13.006429672241211, "logps/chosen": -3.8854153156280518, "logps/rejected": -4.051152229309082, "loss": 3.7945, "rewards/accuracies": 0.5, "rewards/chosen": -38.85415267944336, "rewards/margins": 1.6573667526245117, "rewards/rejected": -40.51152038574219, "step": 4786 }, { "epoch": 0.6518246187363834, "grad_norm": 42.43228828505683, "learning_rate": 2.6084210432288727e-07, "logits/chosen": 14.115578651428223, "logits/rejected": 13.709772109985352, "logps/chosen": -4.795140743255615, "logps/rejected": -4.587773323059082, "loss": 4.2327, "rewards/accuracies": 0.5, "rewards/chosen": -47.95140838623047, "rewards/margins": -2.0736751556396484, "rewards/rejected": -45.87773132324219, "step": 4787 }, { "epoch": 0.6519607843137255, "grad_norm": 42.334154123591794, "learning_rate": 2.6066385715735815e-07, "logits/chosen": 12.857433319091797, "logits/rejected": 13.07674789428711, "logps/chosen": -4.188223838806152, "logps/rejected": -4.202815055847168, "loss": 3.97, "rewards/accuracies": 0.75, "rewards/chosen": -41.882240295410156, "rewards/margins": 0.14590930938720703, "rewards/rejected": -42.02815246582031, "step": 4788 }, { "epoch": 0.6520969498910676, "grad_norm": 98.807038851161, "learning_rate": 2.6048564147598227e-07, "logits/chosen": 13.478954315185547, "logits/rejected": 12.781099319458008, "logps/chosen": -4.4114603996276855, "logps/rejected": -4.2729902267456055, "loss": 4.0003, "rewards/accuracies": 0.5, "rewards/chosen": -44.114601135253906, "rewards/margins": -1.3846979141235352, "rewards/rejected": -42.72990417480469, "step": 4789 }, { "epoch": 0.6522331154684096, "grad_norm": 41.36086182454188, "learning_rate": 2.6030745731902905e-07, "logits/chosen": 13.516220092773438, "logits/rejected": 14.117504119873047, "logps/chosen": -4.596346855163574, "logps/rejected": -4.662139892578125, "loss": 4.1615, "rewards/accuracies": 0.5, "rewards/chosen": -45.96347427368164, "rewards/margins": 0.6579246520996094, "rewards/rejected": -46.62139892578125, "step": 4790 }, { "epoch": 0.6523692810457516, "grad_norm": 40.056460801609305, "learning_rate": 2.6012930472676047e-07, "logits/chosen": 13.696474075317383, "logits/rejected": 14.643707275390625, "logps/chosen": -4.329598426818848, "logps/rejected": -4.515409469604492, "loss": 3.6031, "rewards/accuracies": 0.75, "rewards/chosen": -43.295989990234375, "rewards/margins": 1.8581056594848633, "rewards/rejected": -45.15409469604492, "step": 4791 }, { "epoch": 0.6525054466230937, "grad_norm": 42.24052470100378, "learning_rate": 2.599511837394316e-07, "logits/chosen": 14.509441375732422, "logits/rejected": 14.845494270324707, "logps/chosen": -4.521416664123535, "logps/rejected": -4.572079658508301, "loss": 4.2875, "rewards/accuracies": 0.5, "rewards/chosen": -45.21416473388672, "rewards/margins": 0.5066337585449219, "rewards/rejected": -45.72079849243164, "step": 4792 }, { "epoch": 0.6526416122004357, "grad_norm": 43.75674646213913, "learning_rate": 2.5977309439729064e-07, "logits/chosen": 13.67498779296875, "logits/rejected": 13.831245422363281, "logps/chosen": -4.224151611328125, "logps/rejected": -4.233166217803955, "loss": 3.7134, "rewards/accuracies": 0.25, "rewards/chosen": -42.24151611328125, "rewards/margins": 0.09014511108398438, "rewards/rejected": -42.331661224365234, "step": 4793 }, { "epoch": 0.6527777777777778, "grad_norm": 40.060395473668635, "learning_rate": 2.5959503674057786e-07, "logits/chosen": 13.677241325378418, "logits/rejected": 13.663933753967285, "logps/chosen": -4.400413513183594, "logps/rejected": -4.496016979217529, "loss": 3.4672, "rewards/accuracies": 0.5, "rewards/chosen": -44.0041389465332, "rewards/margins": 0.9560327529907227, "rewards/rejected": -44.96017074584961, "step": 4794 }, { "epoch": 0.6529139433551199, "grad_norm": 44.8813597665342, "learning_rate": 2.594170108095272e-07, "logits/chosen": 13.694074630737305, "logits/rejected": 13.668174743652344, "logps/chosen": -4.499245643615723, "logps/rejected": -4.362691879272461, "loss": 4.4202, "rewards/accuracies": 0.5, "rewards/chosen": -44.992454528808594, "rewards/margins": -1.3655357360839844, "rewards/rejected": -43.626922607421875, "step": 4795 }, { "epoch": 0.6530501089324618, "grad_norm": 36.607415167415226, "learning_rate": 2.5923901664436524e-07, "logits/chosen": 13.623406410217285, "logits/rejected": 13.4310941696167, "logps/chosen": -4.106148719787598, "logps/rejected": -4.238533973693848, "loss": 3.5467, "rewards/accuracies": 0.75, "rewards/chosen": -41.061485290527344, "rewards/margins": 1.3238554000854492, "rewards/rejected": -42.38534164428711, "step": 4796 }, { "epoch": 0.6531862745098039, "grad_norm": 42.78267368192151, "learning_rate": 2.590610542853108e-07, "logits/chosen": 13.354511260986328, "logits/rejected": 13.344730377197266, "logps/chosen": -4.4480180740356445, "logps/rejected": -4.497673034667969, "loss": 4.2933, "rewards/accuracies": 0.5, "rewards/chosen": -44.48017883300781, "rewards/margins": 0.4965486526489258, "rewards/rejected": -44.97673034667969, "step": 4797 }, { "epoch": 0.653322440087146, "grad_norm": 39.8606454214103, "learning_rate": 2.5888312377257616e-07, "logits/chosen": 13.596338272094727, "logits/rejected": 14.281675338745117, "logps/chosen": -4.520359992980957, "logps/rejected": -4.877377986907959, "loss": 3.9329, "rewards/accuracies": 0.75, "rewards/chosen": -45.2036018371582, "rewards/margins": 3.570178985595703, "rewards/rejected": -48.773780822753906, "step": 4798 }, { "epoch": 0.653458605664488, "grad_norm": 45.80866752953896, "learning_rate": 2.587052251463663e-07, "logits/chosen": 13.141212463378906, "logits/rejected": 14.643762588500977, "logps/chosen": -3.9652280807495117, "logps/rejected": -4.8046064376831055, "loss": 4.0193, "rewards/accuracies": 1.0, "rewards/chosen": -39.65228271484375, "rewards/margins": 8.393783569335938, "rewards/rejected": -48.04606628417969, "step": 4799 }, { "epoch": 0.6535947712418301, "grad_norm": 36.984178465046114, "learning_rate": 2.5852735844687867e-07, "logits/chosen": 12.740483283996582, "logits/rejected": 13.67361831665039, "logps/chosen": -4.116132736206055, "logps/rejected": -4.71335506439209, "loss": 3.5114, "rewards/accuracies": 1.0, "rewards/chosen": -41.16132736206055, "rewards/margins": 5.972220420837402, "rewards/rejected": -47.133548736572266, "step": 4800 }, { "epoch": 0.6537309368191722, "grad_norm": 38.99335538960185, "learning_rate": 2.5834952371430383e-07, "logits/chosen": 13.441682815551758, "logits/rejected": 14.244173049926758, "logps/chosen": -4.229549407958984, "logps/rejected": -4.418850898742676, "loss": 3.5729, "rewards/accuracies": 0.75, "rewards/chosen": -42.29549026489258, "rewards/margins": 1.8930187225341797, "rewards/rejected": -44.18851089477539, "step": 4801 }, { "epoch": 0.6538671023965141, "grad_norm": 37.46793596832549, "learning_rate": 2.5817172098882513e-07, "logits/chosen": 13.43725299835205, "logits/rejected": 13.542497634887695, "logps/chosen": -4.366237640380859, "logps/rejected": -4.569806098937988, "loss": 3.7318, "rewards/accuracies": 0.75, "rewards/chosen": -43.66238021850586, "rewards/margins": 2.0356836318969727, "rewards/rejected": -45.698062896728516, "step": 4802 }, { "epoch": 0.6540032679738562, "grad_norm": 36.88197686669504, "learning_rate": 2.579939503106183e-07, "logits/chosen": 13.663905143737793, "logits/rejected": 13.846170425415039, "logps/chosen": -4.368288993835449, "logps/rejected": -4.664748668670654, "loss": 3.8155, "rewards/accuracies": 0.75, "rewards/chosen": -43.682884216308594, "rewards/margins": 2.9645986557006836, "rewards/rejected": -46.647483825683594, "step": 4803 }, { "epoch": 0.6541394335511983, "grad_norm": 39.450684285941506, "learning_rate": 2.5781621171985215e-07, "logits/chosen": 13.32059097290039, "logits/rejected": 13.665424346923828, "logps/chosen": -4.284050464630127, "logps/rejected": -4.319565296173096, "loss": 4.3544, "rewards/accuracies": 0.5, "rewards/chosen": -42.84050750732422, "rewards/margins": 0.3551483154296875, "rewards/rejected": -43.195655822753906, "step": 4804 }, { "epoch": 0.6542755991285403, "grad_norm": 38.513805252228806, "learning_rate": 2.5763850525668857e-07, "logits/chosen": 13.353078842163086, "logits/rejected": 14.78966236114502, "logps/chosen": -4.127921104431152, "logps/rejected": -4.704689025878906, "loss": 3.069, "rewards/accuracies": 1.0, "rewards/chosen": -41.279212951660156, "rewards/margins": 5.7676801681518555, "rewards/rejected": -47.04689025878906, "step": 4805 }, { "epoch": 0.6544117647058824, "grad_norm": 35.321296796464715, "learning_rate": 2.574608309612812e-07, "logits/chosen": 13.61834716796875, "logits/rejected": 13.604026794433594, "logps/chosen": -4.400272369384766, "logps/rejected": -4.565072059631348, "loss": 3.4529, "rewards/accuracies": 0.5, "rewards/chosen": -44.002723693847656, "rewards/margins": 1.6479921340942383, "rewards/rejected": -45.650718688964844, "step": 4806 }, { "epoch": 0.6545479302832244, "grad_norm": 37.53374188895349, "learning_rate": 2.5728318887377744e-07, "logits/chosen": 13.835969924926758, "logits/rejected": 14.083658218383789, "logps/chosen": -4.134494781494141, "logps/rejected": -4.464445114135742, "loss": 3.834, "rewards/accuracies": 0.75, "rewards/chosen": -41.344947814941406, "rewards/margins": 3.2995033264160156, "rewards/rejected": -44.64445495605469, "step": 4807 }, { "epoch": 0.6546840958605664, "grad_norm": 38.78752980296031, "learning_rate": 2.571055790343169e-07, "logits/chosen": 12.192878723144531, "logits/rejected": 13.915903091430664, "logps/chosen": -3.9798004627227783, "logps/rejected": -4.559413909912109, "loss": 3.7852, "rewards/accuracies": 1.0, "rewards/chosen": -39.798004150390625, "rewards/margins": 5.796133041381836, "rewards/rejected": -45.594139099121094, "step": 4808 }, { "epoch": 0.6548202614379085, "grad_norm": 42.41161389288173, "learning_rate": 2.5692800148303193e-07, "logits/chosen": 13.434623718261719, "logits/rejected": 13.282869338989258, "logps/chosen": -4.003956317901611, "logps/rejected": -4.337987899780273, "loss": 4.035, "rewards/accuracies": 0.75, "rewards/chosen": -40.0395622253418, "rewards/margins": 3.3403167724609375, "rewards/rejected": -43.3798828125, "step": 4809 }, { "epoch": 0.6549564270152506, "grad_norm": 41.93847515579185, "learning_rate": 2.5675045626004756e-07, "logits/chosen": 14.167510032653809, "logits/rejected": 15.146672248840332, "logps/chosen": -4.4093098640441895, "logps/rejected": -4.913282871246338, "loss": 3.9975, "rewards/accuracies": 0.75, "rewards/chosen": -44.09309768676758, "rewards/margins": 5.039730072021484, "rewards/rejected": -49.13282775878906, "step": 4810 }, { "epoch": 0.6550925925925926, "grad_norm": 37.550917028294236, "learning_rate": 2.565729434054819e-07, "logits/chosen": 13.575526237487793, "logits/rejected": 13.249198913574219, "logps/chosen": -4.4088826179504395, "logps/rejected": -4.419816970825195, "loss": 4.1601, "rewards/accuracies": 0.5, "rewards/chosen": -44.088829040527344, "rewards/margins": 0.10934257507324219, "rewards/rejected": -44.19816970825195, "step": 4811 }, { "epoch": 0.6552287581699346, "grad_norm": 40.84129328588744, "learning_rate": 2.563954629594451e-07, "logits/chosen": 13.599228858947754, "logits/rejected": 12.903081893920898, "logps/chosen": -4.3871002197265625, "logps/rejected": -4.212184429168701, "loss": 4.1347, "rewards/accuracies": 0.25, "rewards/chosen": -43.871002197265625, "rewards/margins": -1.7491626739501953, "rewards/rejected": -42.12184143066406, "step": 4812 }, { "epoch": 0.6553649237472767, "grad_norm": 39.09257255358579, "learning_rate": 2.562180149620405e-07, "logits/chosen": 13.360834121704102, "logits/rejected": 14.207454681396484, "logps/chosen": -4.191705703735352, "logps/rejected": -4.541092872619629, "loss": 3.7534, "rewards/accuracies": 0.75, "rewards/chosen": -41.91706085205078, "rewards/margins": 3.493870735168457, "rewards/rejected": -45.41093063354492, "step": 4813 }, { "epoch": 0.6555010893246187, "grad_norm": 41.32091953765504, "learning_rate": 2.56040599453364e-07, "logits/chosen": 11.963092803955078, "logits/rejected": 13.348389625549316, "logps/chosen": -4.049815654754639, "logps/rejected": -4.414061069488525, "loss": 3.5819, "rewards/accuracies": 0.75, "rewards/chosen": -40.4981575012207, "rewards/margins": 3.6424560546875, "rewards/rejected": -44.1406135559082, "step": 4814 }, { "epoch": 0.6556372549019608, "grad_norm": 38.31905908585336, "learning_rate": 2.5586321647350405e-07, "logits/chosen": 12.698919296264648, "logits/rejected": 13.951339721679688, "logps/chosen": -3.9050228595733643, "logps/rejected": -4.339143753051758, "loss": 3.7066, "rewards/accuracies": 0.75, "rewards/chosen": -39.050228118896484, "rewards/margins": 4.3412065505981445, "rewards/rejected": -43.39143371582031, "step": 4815 }, { "epoch": 0.6557734204793029, "grad_norm": 39.44227145721226, "learning_rate": 2.556858660625417e-07, "logits/chosen": 13.28169059753418, "logits/rejected": 12.982597351074219, "logps/chosen": -4.413775444030762, "logps/rejected": -4.563205718994141, "loss": 4.0167, "rewards/accuracies": 0.75, "rewards/chosen": -44.13775634765625, "rewards/margins": 1.4943008422851562, "rewards/rejected": -45.632057189941406, "step": 4816 }, { "epoch": 0.6559095860566448, "grad_norm": 43.575397697298406, "learning_rate": 2.5550854826055095e-07, "logits/chosen": 13.537092208862305, "logits/rejected": 14.3514986038208, "logps/chosen": -4.046420574188232, "logps/rejected": -4.713009834289551, "loss": 3.7825, "rewards/accuracies": 0.75, "rewards/chosen": -40.46420669555664, "rewards/margins": 6.665890693664551, "rewards/rejected": -47.130096435546875, "step": 4817 }, { "epoch": 0.6560457516339869, "grad_norm": 37.82515699279722, "learning_rate": 2.55331263107598e-07, "logits/chosen": 13.922933578491211, "logits/rejected": 13.202276229858398, "logps/chosen": -4.634975433349609, "logps/rejected": -4.34827995300293, "loss": 3.8772, "rewards/accuracies": 0.25, "rewards/chosen": -46.349754333496094, "rewards/margins": -2.866954803466797, "rewards/rejected": -43.48279571533203, "step": 4818 }, { "epoch": 0.656181917211329, "grad_norm": 42.79638386102671, "learning_rate": 2.5515401064374196e-07, "logits/chosen": 13.102453231811523, "logits/rejected": 13.617422103881836, "logps/chosen": -4.28736686706543, "logps/rejected": -4.371030330657959, "loss": 4.6456, "rewards/accuracies": 0.5, "rewards/chosen": -42.8736686706543, "rewards/margins": 0.8366355895996094, "rewards/rejected": -43.710304260253906, "step": 4819 }, { "epoch": 0.656318082788671, "grad_norm": 46.39557253086919, "learning_rate": 2.549767909090346e-07, "logits/chosen": 13.152923583984375, "logits/rejected": 13.253018379211426, "logps/chosen": -4.211711883544922, "logps/rejected": -4.580142021179199, "loss": 4.1186, "rewards/accuracies": 0.75, "rewards/chosen": -42.11711883544922, "rewards/margins": 3.684300422668457, "rewards/rejected": -45.80141830444336, "step": 4820 }, { "epoch": 0.6564542483660131, "grad_norm": 38.54189115431984, "learning_rate": 2.5479960394352e-07, "logits/chosen": 12.499759674072266, "logits/rejected": 12.914050102233887, "logps/chosen": -3.7997045516967773, "logps/rejected": -3.9072179794311523, "loss": 3.9147, "rewards/accuracies": 0.5, "rewards/chosen": -37.997047424316406, "rewards/margins": 1.0751338005065918, "rewards/rejected": -39.072181701660156, "step": 4821 }, { "epoch": 0.6565904139433552, "grad_norm": 45.48928956305655, "learning_rate": 2.546224497872353e-07, "logits/chosen": 12.918437957763672, "logits/rejected": 13.804858207702637, "logps/chosen": -4.186949729919434, "logps/rejected": -4.595046520233154, "loss": 3.9362, "rewards/accuracies": 0.75, "rewards/chosen": -41.86949920654297, "rewards/margins": 4.080965995788574, "rewards/rejected": -45.95046615600586, "step": 4822 }, { "epoch": 0.6567265795206971, "grad_norm": 39.6991643995779, "learning_rate": 2.544453284802097e-07, "logits/chosen": 13.290372848510742, "logits/rejected": 13.385204315185547, "logps/chosen": -4.316393852233887, "logps/rejected": -4.08953857421875, "loss": 4.2465, "rewards/accuracies": 0.25, "rewards/chosen": -43.1639404296875, "rewards/margins": -2.2685546875, "rewards/rejected": -40.8953857421875, "step": 4823 }, { "epoch": 0.6568627450980392, "grad_norm": 37.38650548577441, "learning_rate": 2.5426824006246527e-07, "logits/chosen": 13.165406227111816, "logits/rejected": 12.819613456726074, "logps/chosen": -4.480608940124512, "logps/rejected": -4.275928497314453, "loss": 4.1005, "rewards/accuracies": 0.25, "rewards/chosen": -44.806087493896484, "rewards/margins": -2.0468063354492188, "rewards/rejected": -42.759281158447266, "step": 4824 }, { "epoch": 0.6569989106753813, "grad_norm": 39.25339893588813, "learning_rate": 2.540911845740167e-07, "logits/chosen": 13.522743225097656, "logits/rejected": 13.340696334838867, "logps/chosen": -4.286852836608887, "logps/rejected": -4.307141304016113, "loss": 3.9725, "rewards/accuracies": 0.5, "rewards/chosen": -42.8685302734375, "rewards/margins": 0.20288658142089844, "rewards/rejected": -43.071414947509766, "step": 4825 }, { "epoch": 0.6571350762527233, "grad_norm": 36.76978172826653, "learning_rate": 2.53914162054871e-07, "logits/chosen": 13.775714874267578, "logits/rejected": 13.183612823486328, "logps/chosen": -4.317099571228027, "logps/rejected": -4.337435722351074, "loss": 4.2744, "rewards/accuracies": 0.75, "rewards/chosen": -43.17099380493164, "rewards/margins": 0.20336532592773438, "rewards/rejected": -43.374359130859375, "step": 4826 }, { "epoch": 0.6572712418300654, "grad_norm": 40.346624785405076, "learning_rate": 2.537371725450279e-07, "logits/chosen": 13.425617218017578, "logits/rejected": 12.821235656738281, "logps/chosen": -4.457151412963867, "logps/rejected": -4.485288619995117, "loss": 4.0844, "rewards/accuracies": 0.5, "rewards/chosen": -44.57151412963867, "rewards/margins": 0.2813739776611328, "rewards/rejected": -44.85289001464844, "step": 4827 }, { "epoch": 0.6574074074074074, "grad_norm": 37.20645796880053, "learning_rate": 2.5356021608447967e-07, "logits/chosen": 12.652421951293945, "logits/rejected": 13.963013648986816, "logps/chosen": -4.4530930519104, "logps/rejected": -4.601134300231934, "loss": 4.111, "rewards/accuracies": 0.75, "rewards/chosen": -44.53093338012695, "rewards/margins": 1.4804086685180664, "rewards/rejected": -46.01134490966797, "step": 4828 }, { "epoch": 0.6575435729847494, "grad_norm": 42.3274031158597, "learning_rate": 2.533832927132113e-07, "logits/chosen": 13.544857025146484, "logits/rejected": 13.264984130859375, "logps/chosen": -4.447079181671143, "logps/rejected": -4.196536540985107, "loss": 4.1926, "rewards/accuracies": 0.25, "rewards/chosen": -44.470787048339844, "rewards/margins": -2.5054235458374023, "rewards/rejected": -41.965362548828125, "step": 4829 }, { "epoch": 0.6576797385620915, "grad_norm": 34.57386411856214, "learning_rate": 2.5320640247119966e-07, "logits/chosen": 13.229137420654297, "logits/rejected": 13.53646469116211, "logps/chosen": -3.875026226043701, "logps/rejected": -4.125757217407227, "loss": 3.8308, "rewards/accuracies": 0.5, "rewards/chosen": -38.75026321411133, "rewards/margins": 2.5073060989379883, "rewards/rejected": -41.257568359375, "step": 4830 }, { "epoch": 0.6578159041394336, "grad_norm": 35.83659716741173, "learning_rate": 2.530295453984149e-07, "logits/chosen": 13.629814147949219, "logits/rejected": 12.650178909301758, "logps/chosen": -4.265775680541992, "logps/rejected": -4.101973533630371, "loss": 3.7956, "rewards/accuracies": 0.5, "rewards/chosen": -42.65775680541992, "rewards/margins": -1.6380224227905273, "rewards/rejected": -41.019737243652344, "step": 4831 }, { "epoch": 0.6579520697167756, "grad_norm": 37.53909978506034, "learning_rate": 2.5285272153481926e-07, "logits/chosen": 13.268956184387207, "logits/rejected": 12.261849403381348, "logps/chosen": -4.62788200378418, "logps/rejected": -4.2756547927856445, "loss": 4.0661, "rewards/accuracies": 0.0, "rewards/chosen": -46.27881622314453, "rewards/margins": -3.522273063659668, "rewards/rejected": -42.75654602050781, "step": 4832 }, { "epoch": 0.6580882352941176, "grad_norm": 36.97864969063935, "learning_rate": 2.5267593092036754e-07, "logits/chosen": 12.826034545898438, "logits/rejected": 13.654508590698242, "logps/chosen": -4.046111106872559, "logps/rejected": -4.3429412841796875, "loss": 3.7133, "rewards/accuracies": 0.75, "rewards/chosen": -40.46111297607422, "rewards/margins": 2.9683027267456055, "rewards/rejected": -43.429412841796875, "step": 4833 }, { "epoch": 0.6582244008714597, "grad_norm": 39.039421216322076, "learning_rate": 2.5249917359500685e-07, "logits/chosen": 13.035436630249023, "logits/rejected": 13.143471717834473, "logps/chosen": -4.3762640953063965, "logps/rejected": -4.383537769317627, "loss": 3.8072, "rewards/accuracies": 0.5, "rewards/chosen": -43.76264190673828, "rewards/margins": 0.0727376937866211, "rewards/rejected": -43.83538055419922, "step": 4834 }, { "epoch": 0.6583605664488017, "grad_norm": 39.806070817200236, "learning_rate": 2.5232244959867734e-07, "logits/chosen": 13.162335395812988, "logits/rejected": 13.668524742126465, "logps/chosen": -4.35324764251709, "logps/rejected": -4.608092308044434, "loss": 3.9281, "rewards/accuracies": 0.75, "rewards/chosen": -43.53247833251953, "rewards/margins": 2.5484418869018555, "rewards/rejected": -46.08091735839844, "step": 4835 }, { "epoch": 0.6584967320261438, "grad_norm": 39.489053823104506, "learning_rate": 2.521457589713109e-07, "logits/chosen": 13.654815673828125, "logits/rejected": 13.00877571105957, "logps/chosen": -4.389807224273682, "logps/rejected": -4.2751851081848145, "loss": 4.3981, "rewards/accuracies": 0.5, "rewards/chosen": -43.8980712890625, "rewards/margins": -1.1462211608886719, "rewards/rejected": -42.751853942871094, "step": 4836 }, { "epoch": 0.6586328976034859, "grad_norm": 43.3403221910397, "learning_rate": 2.519691017528324e-07, "logits/chosen": 13.310576438903809, "logits/rejected": 13.443609237670898, "logps/chosen": -4.494898319244385, "logps/rejected": -4.530233383178711, "loss": 3.9377, "rewards/accuracies": 0.5, "rewards/chosen": -44.9489860534668, "rewards/margins": 0.3533477783203125, "rewards/rejected": -45.302330017089844, "step": 4837 }, { "epoch": 0.6587690631808278, "grad_norm": 38.744526951686154, "learning_rate": 2.517924779831592e-07, "logits/chosen": 12.967906951904297, "logits/rejected": 13.139425277709961, "logps/chosen": -4.059077262878418, "logps/rejected": -4.264595985412598, "loss": 4.2085, "rewards/accuracies": 0.75, "rewards/chosen": -40.59077453613281, "rewards/margins": 2.055187225341797, "rewards/rejected": -42.64596176147461, "step": 4838 }, { "epoch": 0.6589052287581699, "grad_norm": 36.23083267675365, "learning_rate": 2.516158877022005e-07, "logits/chosen": 12.893081665039062, "logits/rejected": 12.951070785522461, "logps/chosen": -4.137144088745117, "logps/rejected": -4.408598899841309, "loss": 3.7529, "rewards/accuracies": 1.0, "rewards/chosen": -41.371437072753906, "rewards/margins": 2.7145490646362305, "rewards/rejected": -44.08598709106445, "step": 4839 }, { "epoch": 0.659041394335512, "grad_norm": 37.105807501424636, "learning_rate": 2.5143933094985855e-07, "logits/chosen": 13.655677795410156, "logits/rejected": 13.113525390625, "logps/chosen": -4.1590495109558105, "logps/rejected": -4.393404006958008, "loss": 4.0674, "rewards/accuracies": 0.75, "rewards/chosen": -41.590492248535156, "rewards/margins": 2.343547821044922, "rewards/rejected": -43.934043884277344, "step": 4840 }, { "epoch": 0.659177559912854, "grad_norm": 39.74886877795797, "learning_rate": 2.512628077660279e-07, "logits/chosen": 12.895916938781738, "logits/rejected": 13.627399444580078, "logps/chosen": -4.060539722442627, "logps/rejected": -4.519064426422119, "loss": 3.9178, "rewards/accuracies": 1.0, "rewards/chosen": -40.60540008544922, "rewards/margins": 4.585247993469238, "rewards/rejected": -45.190643310546875, "step": 4841 }, { "epoch": 0.6593137254901961, "grad_norm": 38.755497306911444, "learning_rate": 2.510863181905952e-07, "logits/chosen": 12.697797775268555, "logits/rejected": 13.728996276855469, "logps/chosen": -4.340597152709961, "logps/rejected": -4.560462951660156, "loss": 4.3393, "rewards/accuracies": 1.0, "rewards/chosen": -43.40597152709961, "rewards/margins": 2.1986570358276367, "rewards/rejected": -45.60462951660156, "step": 4842 }, { "epoch": 0.6594498910675382, "grad_norm": 37.99397908811653, "learning_rate": 2.509098622634398e-07, "logits/chosen": 13.40469741821289, "logits/rejected": 13.634500503540039, "logps/chosen": -4.335436820983887, "logps/rejected": -4.40635871887207, "loss": 3.9583, "rewards/accuracies": 0.5, "rewards/chosen": -43.3543701171875, "rewards/margins": 0.709223747253418, "rewards/rejected": -44.06359100341797, "step": 4843 }, { "epoch": 0.6595860566448801, "grad_norm": 41.95901102762601, "learning_rate": 2.507334400244336e-07, "logits/chosen": 12.97705364227295, "logits/rejected": 13.779773712158203, "logps/chosen": -4.222442626953125, "logps/rejected": -4.181116580963135, "loss": 4.6982, "rewards/accuracies": 0.5, "rewards/chosen": -42.224422454833984, "rewards/margins": -0.4132566452026367, "rewards/rejected": -41.81116485595703, "step": 4844 }, { "epoch": 0.6597222222222222, "grad_norm": 35.181112044779105, "learning_rate": 2.5055705151344033e-07, "logits/chosen": 13.194506645202637, "logits/rejected": 13.574525833129883, "logps/chosen": -4.229313850402832, "logps/rejected": -4.51213264465332, "loss": 4.1268, "rewards/accuracies": 1.0, "rewards/chosen": -42.29314041137695, "rewards/margins": 2.828184127807617, "rewards/rejected": -45.12132263183594, "step": 4845 }, { "epoch": 0.6598583877995643, "grad_norm": 38.65510938004856, "learning_rate": 2.5038069677031657e-07, "logits/chosen": 13.159873008728027, "logits/rejected": 12.510632514953613, "logps/chosen": -4.339042663574219, "logps/rejected": -4.154626369476318, "loss": 3.8558, "rewards/accuracies": 0.5, "rewards/chosen": -43.39042663574219, "rewards/margins": -1.8441638946533203, "rewards/rejected": -41.5462646484375, "step": 4846 }, { "epoch": 0.6599945533769063, "grad_norm": 37.1750556366727, "learning_rate": 2.5020437583491126e-07, "logits/chosen": 13.595762252807617, "logits/rejected": 13.811083793640137, "logps/chosen": -4.525599002838135, "logps/rejected": -4.443509101867676, "loss": 4.439, "rewards/accuracies": 0.5, "rewards/chosen": -45.25598907470703, "rewards/margins": -0.8209009170532227, "rewards/rejected": -44.435089111328125, "step": 4847 }, { "epoch": 0.6601307189542484, "grad_norm": 39.4373727082619, "learning_rate": 2.5002808874706535e-07, "logits/chosen": 13.779195785522461, "logits/rejected": 13.403692245483398, "logps/chosen": -4.555117607116699, "logps/rejected": -4.589176177978516, "loss": 4.3968, "rewards/accuracies": 0.75, "rewards/chosen": -45.55117416381836, "rewards/margins": 0.3405904769897461, "rewards/rejected": -45.89176559448242, "step": 4848 }, { "epoch": 0.6602668845315904, "grad_norm": 43.826570710821464, "learning_rate": 2.498518355466124e-07, "logits/chosen": 12.32767105102539, "logits/rejected": 13.374229431152344, "logps/chosen": -4.150470733642578, "logps/rejected": -4.537067413330078, "loss": 3.6035, "rewards/accuracies": 0.75, "rewards/chosen": -41.50470733642578, "rewards/margins": 3.865966796875, "rewards/rejected": -45.37067413330078, "step": 4849 }, { "epoch": 0.6604030501089324, "grad_norm": 47.59515070163812, "learning_rate": 2.4967561627337854e-07, "logits/chosen": 13.198001861572266, "logits/rejected": 13.114124298095703, "logps/chosen": -4.329489707946777, "logps/rejected": -4.32598876953125, "loss": 3.786, "rewards/accuracies": 0.5, "rewards/chosen": -43.29489517211914, "rewards/margins": -0.035004615783691406, "rewards/rejected": -43.259891510009766, "step": 4850 }, { "epoch": 0.6605392156862745, "grad_norm": 36.33984881413884, "learning_rate": 2.494994309671816e-07, "logits/chosen": 13.35954475402832, "logits/rejected": 13.402687072753906, "logps/chosen": -4.266571998596191, "logps/rejected": -4.344299793243408, "loss": 4.2701, "rewards/accuracies": 0.75, "rewards/chosen": -42.66571807861328, "rewards/margins": 0.7772798538208008, "rewards/rejected": -43.44300079345703, "step": 4851 }, { "epoch": 0.6606753812636166, "grad_norm": 40.46081221589932, "learning_rate": 2.493232796678323e-07, "logits/chosen": 13.149087905883789, "logits/rejected": 13.824531555175781, "logps/chosen": -4.173754692077637, "logps/rejected": -4.421557426452637, "loss": 3.8726, "rewards/accuracies": 0.75, "rewards/chosen": -41.737548828125, "rewards/margins": 2.4780263900756836, "rewards/rejected": -44.215576171875, "step": 4852 }, { "epoch": 0.6608115468409586, "grad_norm": 41.888477627434966, "learning_rate": 2.4914716241513366e-07, "logits/chosen": 13.161407470703125, "logits/rejected": 13.561172485351562, "logps/chosen": -4.278337478637695, "logps/rejected": -4.821101665496826, "loss": 3.4646, "rewards/accuracies": 0.75, "rewards/chosen": -42.78337478637695, "rewards/margins": 5.427641868591309, "rewards/rejected": -48.21101760864258, "step": 4853 }, { "epoch": 0.6609477124183006, "grad_norm": 37.83546837586009, "learning_rate": 2.4897107924888044e-07, "logits/chosen": 13.45736312866211, "logits/rejected": 13.360803604125977, "logps/chosen": -4.583794593811035, "logps/rejected": -4.474701881408691, "loss": 4.1772, "rewards/accuracies": 0.5, "rewards/chosen": -45.837947845458984, "rewards/margins": -1.090928077697754, "rewards/rejected": -44.74702072143555, "step": 4854 }, { "epoch": 0.6610838779956427, "grad_norm": 37.54661544739782, "learning_rate": 2.4879503020886025e-07, "logits/chosen": 13.106157302856445, "logits/rejected": 12.960798263549805, "logps/chosen": -4.240002632141113, "logps/rejected": -4.240347862243652, "loss": 3.6238, "rewards/accuracies": 0.5, "rewards/chosen": -42.400028228759766, "rewards/margins": 0.003448486328125, "rewards/rejected": -42.40347671508789, "step": 4855 }, { "epoch": 0.6612200435729847, "grad_norm": 43.856712652031675, "learning_rate": 2.486190153348531e-07, "logits/chosen": 13.202564239501953, "logits/rejected": 13.23360824584961, "logps/chosen": -4.017281532287598, "logps/rejected": -4.101322174072266, "loss": 3.5589, "rewards/accuracies": 0.5, "rewards/chosen": -40.17281723022461, "rewards/margins": 0.8404035568237305, "rewards/rejected": -41.013221740722656, "step": 4856 }, { "epoch": 0.6613562091503268, "grad_norm": 39.02178462324242, "learning_rate": 2.484430346666305e-07, "logits/chosen": 13.050468444824219, "logits/rejected": 13.639520645141602, "logps/chosen": -4.077480316162109, "logps/rejected": -4.395856857299805, "loss": 4.3451, "rewards/accuracies": 0.75, "rewards/chosen": -40.77479934692383, "rewards/margins": 3.1837711334228516, "rewards/rejected": -43.95856857299805, "step": 4857 }, { "epoch": 0.6614923747276689, "grad_norm": 39.892213698645435, "learning_rate": 2.482670882439571e-07, "logits/chosen": 13.305688858032227, "logits/rejected": 13.038759231567383, "logps/chosen": -4.246244430541992, "logps/rejected": -4.236846446990967, "loss": 4.2526, "rewards/accuracies": 0.5, "rewards/chosen": -42.462440490722656, "rewards/margins": -0.09397506713867188, "rewards/rejected": -42.368465423583984, "step": 4858 }, { "epoch": 0.661628540305011, "grad_norm": 37.94025896267701, "learning_rate": 2.4809117610658943e-07, "logits/chosen": 12.49548053741455, "logits/rejected": 14.20744800567627, "logps/chosen": -4.10904598236084, "logps/rejected": -4.5067315101623535, "loss": 4.21, "rewards/accuracies": 1.0, "rewards/chosen": -41.09046173095703, "rewards/margins": 3.976853370666504, "rewards/rejected": -45.06731414794922, "step": 4859 }, { "epoch": 0.6617647058823529, "grad_norm": 42.904632411213164, "learning_rate": 2.479152982942761e-07, "logits/chosen": 12.519515991210938, "logits/rejected": 12.8483304977417, "logps/chosen": -4.042803764343262, "logps/rejected": -3.903578996658325, "loss": 3.8812, "rewards/accuracies": 0.25, "rewards/chosen": -40.428035736083984, "rewards/margins": -1.3922452926635742, "rewards/rejected": -39.035789489746094, "step": 4860 }, { "epoch": 0.661900871459695, "grad_norm": 40.506169643073015, "learning_rate": 2.4773945484675824e-07, "logits/chosen": 12.337100982666016, "logits/rejected": 11.871604919433594, "logps/chosen": -4.220588684082031, "logps/rejected": -4.202945232391357, "loss": 4.4359, "rewards/accuracies": 0.5, "rewards/chosen": -42.20589065551758, "rewards/margins": -0.1764373779296875, "rewards/rejected": -42.029457092285156, "step": 4861 }, { "epoch": 0.6620370370370371, "grad_norm": 39.96395709055153, "learning_rate": 2.475636458037692e-07, "logits/chosen": 13.531402587890625, "logits/rejected": 12.921293258666992, "logps/chosen": -4.270475387573242, "logps/rejected": -4.190761089324951, "loss": 4.0915, "rewards/accuracies": 0.25, "rewards/chosen": -42.70475387573242, "rewards/margins": -0.7971439361572266, "rewards/rejected": -41.90760803222656, "step": 4862 }, { "epoch": 0.6621732026143791, "grad_norm": 41.83723499388397, "learning_rate": 2.4738787120503454e-07, "logits/chosen": 13.308738708496094, "logits/rejected": 13.574594497680664, "logps/chosen": -4.050820350646973, "logps/rejected": -4.33294677734375, "loss": 3.9991, "rewards/accuracies": 0.75, "rewards/chosen": -40.50820541381836, "rewards/margins": 2.82126522064209, "rewards/rejected": -43.3294677734375, "step": 4863 }, { "epoch": 0.6623093681917211, "grad_norm": 41.69733314680514, "learning_rate": 2.4721213109027174e-07, "logits/chosen": 12.16340160369873, "logits/rejected": 12.974522590637207, "logps/chosen": -4.063485145568848, "logps/rejected": -4.3734893798828125, "loss": 4.4495, "rewards/accuracies": 0.75, "rewards/chosen": -40.63485336303711, "rewards/margins": 3.100043296813965, "rewards/rejected": -43.73489761352539, "step": 4864 }, { "epoch": 0.6624455337690632, "grad_norm": 39.48647063830999, "learning_rate": 2.4703642549919095e-07, "logits/chosen": 13.00646686553955, "logits/rejected": 13.408809661865234, "logps/chosen": -4.211130142211914, "logps/rejected": -4.64903450012207, "loss": 4.0656, "rewards/accuracies": 1.0, "rewards/chosen": -42.111305236816406, "rewards/margins": 4.379039764404297, "rewards/rejected": -46.4903450012207, "step": 4865 }, { "epoch": 0.6625816993464052, "grad_norm": 38.029796973088075, "learning_rate": 2.468607544714943e-07, "logits/chosen": 12.916231155395508, "logits/rejected": 13.807936668395996, "logps/chosen": -4.185110092163086, "logps/rejected": -4.386717796325684, "loss": 3.797, "rewards/accuracies": 0.75, "rewards/chosen": -41.85110092163086, "rewards/margins": 2.0160751342773438, "rewards/rejected": -43.8671760559082, "step": 4866 }, { "epoch": 0.6627178649237473, "grad_norm": 39.79206700376609, "learning_rate": 2.466851180468759e-07, "logits/chosen": 13.492803573608398, "logits/rejected": 13.961149215698242, "logps/chosen": -4.291825294494629, "logps/rejected": -4.7024335861206055, "loss": 4.0149, "rewards/accuracies": 0.75, "rewards/chosen": -42.918251037597656, "rewards/margins": 4.106082916259766, "rewards/rejected": -47.02433395385742, "step": 4867 }, { "epoch": 0.6628540305010894, "grad_norm": 40.87707770706523, "learning_rate": 2.4650951626502247e-07, "logits/chosen": 13.911609649658203, "logits/rejected": 13.428110122680664, "logps/chosen": -4.555719375610352, "logps/rejected": -4.237744331359863, "loss": 4.5648, "rewards/accuracies": 0.0, "rewards/chosen": -45.557193756103516, "rewards/margins": -3.1797523498535156, "rewards/rejected": -42.37744140625, "step": 4868 }, { "epoch": 0.6629901960784313, "grad_norm": 38.81676768910928, "learning_rate": 2.463339491656125e-07, "logits/chosen": 13.36289119720459, "logits/rejected": 13.304292678833008, "logps/chosen": -3.951695680618286, "logps/rejected": -4.155756950378418, "loss": 3.471, "rewards/accuracies": 0.5, "rewards/chosen": -39.5169563293457, "rewards/margins": 2.040616035461426, "rewards/rejected": -41.55757141113281, "step": 4869 }, { "epoch": 0.6631263616557734, "grad_norm": 40.75872742303176, "learning_rate": 2.4615841678831705e-07, "logits/chosen": 13.479339599609375, "logits/rejected": 13.043819427490234, "logps/chosen": -4.218274116516113, "logps/rejected": -4.277840614318848, "loss": 3.8811, "rewards/accuracies": 0.5, "rewards/chosen": -42.182735443115234, "rewards/margins": 0.595667839050293, "rewards/rejected": -42.778404235839844, "step": 4870 }, { "epoch": 0.6632625272331155, "grad_norm": 39.7255389045365, "learning_rate": 2.45982919172799e-07, "logits/chosen": 14.48377799987793, "logits/rejected": 13.542808532714844, "logps/chosen": -4.624621391296387, "logps/rejected": -4.53362512588501, "loss": 3.6641, "rewards/accuracies": 0.5, "rewards/chosen": -46.246212005615234, "rewards/margins": -0.9099569320678711, "rewards/rejected": -45.33625411987305, "step": 4871 }, { "epoch": 0.6633986928104575, "grad_norm": 42.88845852411591, "learning_rate": 2.4580745635871336e-07, "logits/chosen": 13.43820571899414, "logits/rejected": 13.837074279785156, "logps/chosen": -4.44187068939209, "logps/rejected": -4.42436408996582, "loss": 3.8232, "rewards/accuracies": 0.5, "rewards/chosen": -44.41870880126953, "rewards/margins": -0.17506790161132812, "rewards/rejected": -44.24364471435547, "step": 4872 }, { "epoch": 0.6635348583877996, "grad_norm": 36.96342207688596, "learning_rate": 2.4563202838570763e-07, "logits/chosen": 13.292739868164062, "logits/rejected": 13.8895263671875, "logps/chosen": -4.312509059906006, "logps/rejected": -4.679782390594482, "loss": 4.0651, "rewards/accuracies": 1.0, "rewards/chosen": -43.125091552734375, "rewards/margins": 3.6727352142333984, "rewards/rejected": -46.797828674316406, "step": 4873 }, { "epoch": 0.6636710239651417, "grad_norm": 38.266156753338144, "learning_rate": 2.4545663529342116e-07, "logits/chosen": 14.005125045776367, "logits/rejected": 14.074859619140625, "logps/chosen": -4.285960674285889, "logps/rejected": -4.470226287841797, "loss": 4.3184, "rewards/accuracies": 0.75, "rewards/chosen": -42.85960388183594, "rewards/margins": 1.8426589965820312, "rewards/rejected": -44.70226287841797, "step": 4874 }, { "epoch": 0.6638071895424836, "grad_norm": 38.952485189422205, "learning_rate": 2.4528127712148523e-07, "logits/chosen": 13.73367691040039, "logits/rejected": 14.449801445007324, "logps/chosen": -4.341462135314941, "logps/rejected": -4.663976192474365, "loss": 3.9003, "rewards/accuracies": 0.5, "rewards/chosen": -43.41462326049805, "rewards/margins": 3.2251386642456055, "rewards/rejected": -46.63976287841797, "step": 4875 }, { "epoch": 0.6639433551198257, "grad_norm": 34.21732449693932, "learning_rate": 2.451059539095237e-07, "logits/chosen": 13.228862762451172, "logits/rejected": 12.896547317504883, "logps/chosen": -4.157122611999512, "logps/rejected": -4.514700412750244, "loss": 3.6858, "rewards/accuracies": 0.75, "rewards/chosen": -41.57122802734375, "rewards/margins": 3.5757713317871094, "rewards/rejected": -45.147003173828125, "step": 4876 }, { "epoch": 0.6640795206971678, "grad_norm": 37.33258877493983, "learning_rate": 2.449306656971524e-07, "logits/chosen": 12.713971138000488, "logits/rejected": 13.663305282592773, "logps/chosen": -3.8539514541625977, "logps/rejected": -4.279020309448242, "loss": 4.3253, "rewards/accuracies": 0.75, "rewards/chosen": -38.53951644897461, "rewards/margins": 4.250688552856445, "rewards/rejected": -42.79020309448242, "step": 4877 }, { "epoch": 0.6642156862745098, "grad_norm": 44.70301378707247, "learning_rate": 2.447554125239789e-07, "logits/chosen": 13.314961433410645, "logits/rejected": 13.871601104736328, "logps/chosen": -4.198825836181641, "logps/rejected": -4.368594169616699, "loss": 4.5332, "rewards/accuracies": 0.75, "rewards/chosen": -41.988258361816406, "rewards/margins": 1.6976814270019531, "rewards/rejected": -43.685943603515625, "step": 4878 }, { "epoch": 0.6643518518518519, "grad_norm": 54.81587162442357, "learning_rate": 2.4458019442960315e-07, "logits/chosen": 13.807870864868164, "logits/rejected": 13.855365753173828, "logps/chosen": -4.412487506866455, "logps/rejected": -4.56239128112793, "loss": 3.7737, "rewards/accuracies": 1.0, "rewards/chosen": -44.1248779296875, "rewards/margins": 1.4990348815917969, "rewards/rejected": -45.62390899658203, "step": 4879 }, { "epoch": 0.664488017429194, "grad_norm": 35.315655636492075, "learning_rate": 2.4440501145361734e-07, "logits/chosen": 12.719067573547363, "logits/rejected": 13.287002563476562, "logps/chosen": -3.986736536026001, "logps/rejected": -4.283681392669678, "loss": 3.9833, "rewards/accuracies": 1.0, "rewards/chosen": -39.867366790771484, "rewards/margins": 2.9694480895996094, "rewards/rejected": -42.836814880371094, "step": 4880 }, { "epoch": 0.6646241830065359, "grad_norm": 39.24342293134405, "learning_rate": 2.442298636356052e-07, "logits/chosen": 13.316488265991211, "logits/rejected": 13.59782600402832, "logps/chosen": -4.200104236602783, "logps/rejected": -4.498316764831543, "loss": 3.8472, "rewards/accuracies": 0.75, "rewards/chosen": -42.00104522705078, "rewards/margins": 2.982126235961914, "rewards/rejected": -44.98316955566406, "step": 4881 }, { "epoch": 0.664760348583878, "grad_norm": 38.25915420442179, "learning_rate": 2.44054751015143e-07, "logits/chosen": 12.61713981628418, "logits/rejected": 13.267435073852539, "logps/chosen": -3.876985788345337, "logps/rejected": -4.091450214385986, "loss": 4.0868, "rewards/accuracies": 0.75, "rewards/chosen": -38.769859313964844, "rewards/margins": 2.144643783569336, "rewards/rejected": -40.91450500488281, "step": 4882 }, { "epoch": 0.6648965141612201, "grad_norm": 36.836720948867736, "learning_rate": 2.4387967363179903e-07, "logits/chosen": 13.746519088745117, "logits/rejected": 13.972346305847168, "logps/chosen": -4.15513801574707, "logps/rejected": -4.344461441040039, "loss": 3.8886, "rewards/accuracies": 0.75, "rewards/chosen": -41.55137634277344, "rewards/margins": 1.893239974975586, "rewards/rejected": -43.444618225097656, "step": 4883 }, { "epoch": 0.6650326797385621, "grad_norm": 40.60891295768302, "learning_rate": 2.437046315251331e-07, "logits/chosen": 13.745168685913086, "logits/rejected": 13.484902381896973, "logps/chosen": -4.379672527313232, "logps/rejected": -4.240506649017334, "loss": 4.2373, "rewards/accuracies": 0.25, "rewards/chosen": -43.796722412109375, "rewards/margins": -1.3916568756103516, "rewards/rejected": -42.405067443847656, "step": 4884 }, { "epoch": 0.6651688453159041, "grad_norm": 38.5029796287181, "learning_rate": 2.4352962473469766e-07, "logits/chosen": 12.959358215332031, "logits/rejected": 14.339929580688477, "logps/chosen": -4.011704444885254, "logps/rejected": -4.58074951171875, "loss": 4.0117, "rewards/accuracies": 0.75, "rewards/chosen": -40.117042541503906, "rewards/margins": 5.69045352935791, "rewards/rejected": -45.8074951171875, "step": 4885 }, { "epoch": 0.6653050108932462, "grad_norm": 40.255992106266355, "learning_rate": 2.433546533000371e-07, "logits/chosen": 14.223343849182129, "logits/rejected": 13.033658027648926, "logps/chosen": -4.444300174713135, "logps/rejected": -4.396630764007568, "loss": 3.5199, "rewards/accuracies": 0.25, "rewards/chosen": -44.4430046081543, "rewards/margins": -0.47669410705566406, "rewards/rejected": -43.96630859375, "step": 4886 }, { "epoch": 0.6654411764705882, "grad_norm": 35.083586088237865, "learning_rate": 2.431797172606872e-07, "logits/chosen": 12.252670288085938, "logits/rejected": 13.573446273803711, "logps/chosen": -3.9755725860595703, "logps/rejected": -4.177070617675781, "loss": 4.0709, "rewards/accuracies": 0.5, "rewards/chosen": -39.7557258605957, "rewards/margins": 2.014979362487793, "rewards/rejected": -41.77070617675781, "step": 4887 }, { "epoch": 0.6655773420479303, "grad_norm": 33.98627320898889, "learning_rate": 2.430048166561766e-07, "logits/chosen": 12.981037139892578, "logits/rejected": 12.391304016113281, "logps/chosen": -4.0798187255859375, "logps/rejected": -4.0912861824035645, "loss": 4.2534, "rewards/accuracies": 0.75, "rewards/chosen": -40.798187255859375, "rewards/margins": 0.11467647552490234, "rewards/rejected": -40.912864685058594, "step": 4888 }, { "epoch": 0.6657135076252724, "grad_norm": 36.040862600386184, "learning_rate": 2.428299515260255e-07, "logits/chosen": 13.799348831176758, "logits/rejected": 13.795671463012695, "logps/chosen": -4.600368976593018, "logps/rejected": -4.436066150665283, "loss": 3.8377, "rewards/accuracies": 0.5, "rewards/chosen": -46.003692626953125, "rewards/margins": -1.6430292129516602, "rewards/rejected": -44.360660552978516, "step": 4889 }, { "epoch": 0.6658496732026143, "grad_norm": 40.30263922482236, "learning_rate": 2.426551219097459e-07, "logits/chosen": 13.437372207641602, "logits/rejected": 13.784109115600586, "logps/chosen": -4.201653480529785, "logps/rejected": -4.467006683349609, "loss": 4.2668, "rewards/accuracies": 1.0, "rewards/chosen": -42.01653289794922, "rewards/margins": 2.6535301208496094, "rewards/rejected": -44.67006301879883, "step": 4890 }, { "epoch": 0.6659858387799564, "grad_norm": 40.127506995821655, "learning_rate": 2.4248032784684216e-07, "logits/chosen": 12.214473724365234, "logits/rejected": 13.511804580688477, "logps/chosen": -3.836343765258789, "logps/rejected": -4.372257232666016, "loss": 4.0642, "rewards/accuracies": 1.0, "rewards/chosen": -38.36343765258789, "rewards/margins": 5.359132766723633, "rewards/rejected": -43.722572326660156, "step": 4891 }, { "epoch": 0.6661220043572985, "grad_norm": 36.96752218788296, "learning_rate": 2.423055693768105e-07, "logits/chosen": 13.240928649902344, "logits/rejected": 13.09143352508545, "logps/chosen": -3.902435064315796, "logps/rejected": -4.127865791320801, "loss": 3.7891, "rewards/accuracies": 1.0, "rewards/chosen": -39.024349212646484, "rewards/margins": 2.254307746887207, "rewards/rejected": -41.27865982055664, "step": 4892 }, { "epoch": 0.6662581699346405, "grad_norm": 35.470921580760354, "learning_rate": 2.4213084653913886e-07, "logits/chosen": 13.954673767089844, "logits/rejected": 13.895381927490234, "logps/chosen": -4.370124816894531, "logps/rejected": -4.572744846343994, "loss": 4.0008, "rewards/accuracies": 0.5, "rewards/chosen": -43.70124816894531, "rewards/margins": 2.026200294494629, "rewards/rejected": -45.727447509765625, "step": 4893 }, { "epoch": 0.6663943355119826, "grad_norm": 37.142266721736114, "learning_rate": 2.419561593733074e-07, "logits/chosen": 13.47160530090332, "logits/rejected": 13.828895568847656, "logps/chosen": -4.167013168334961, "logps/rejected": -4.432659149169922, "loss": 4.3394, "rewards/accuracies": 0.75, "rewards/chosen": -41.670127868652344, "rewards/margins": 2.6564626693725586, "rewards/rejected": -44.32659149169922, "step": 4894 }, { "epoch": 0.6665305010893247, "grad_norm": 33.045952755379524, "learning_rate": 2.417815079187883e-07, "logits/chosen": 12.997096061706543, "logits/rejected": 13.197023391723633, "logps/chosen": -4.05055570602417, "logps/rejected": -4.299088478088379, "loss": 3.3953, "rewards/accuracies": 0.5, "rewards/chosen": -40.505558013916016, "rewards/margins": 2.485325813293457, "rewards/rejected": -42.990882873535156, "step": 4895 }, { "epoch": 0.6666666666666666, "grad_norm": 33.34938281701614, "learning_rate": 2.416068922150451e-07, "logits/chosen": 13.262237548828125, "logits/rejected": 12.880409240722656, "logps/chosen": -4.048097133636475, "logps/rejected": -3.875021457672119, "loss": 4.0356, "rewards/accuracies": 0.5, "rewards/chosen": -40.48097229003906, "rewards/margins": -1.7307558059692383, "rewards/rejected": -38.750213623046875, "step": 4896 }, { "epoch": 0.6668028322440087, "grad_norm": 36.14467393873719, "learning_rate": 2.4143231230153397e-07, "logits/chosen": 12.857439994812012, "logits/rejected": 13.494808197021484, "logps/chosen": -4.071862697601318, "logps/rejected": -4.57085657119751, "loss": 3.6412, "rewards/accuracies": 1.0, "rewards/chosen": -40.7186279296875, "rewards/margins": 4.989936828613281, "rewards/rejected": -45.70856475830078, "step": 4897 }, { "epoch": 0.6669389978213508, "grad_norm": 36.51035074494145, "learning_rate": 2.4125776821770275e-07, "logits/chosen": 13.34500503540039, "logits/rejected": 13.874116897583008, "logps/chosen": -4.188426494598389, "logps/rejected": -4.372127056121826, "loss": 3.9612, "rewards/accuracies": 0.75, "rewards/chosen": -41.88426208496094, "rewards/margins": 1.837005615234375, "rewards/rejected": -43.72126770019531, "step": 4898 }, { "epoch": 0.6670751633986928, "grad_norm": 40.686063391663225, "learning_rate": 2.4108326000299077e-07, "logits/chosen": 14.569009780883789, "logits/rejected": 13.681344985961914, "logps/chosen": -4.5699381828308105, "logps/rejected": -4.408959865570068, "loss": 4.2815, "rewards/accuracies": 0.25, "rewards/chosen": -45.69938659667969, "rewards/margins": -1.6097850799560547, "rewards/rejected": -44.089599609375, "step": 4899 }, { "epoch": 0.6672113289760349, "grad_norm": 41.16164504825334, "learning_rate": 2.409087876968298e-07, "logits/chosen": 13.346953392028809, "logits/rejected": 14.07773208618164, "logps/chosen": -4.11587381362915, "logps/rejected": -4.47202205657959, "loss": 3.4689, "rewards/accuracies": 0.75, "rewards/chosen": -41.15873718261719, "rewards/margins": 3.561488151550293, "rewards/rejected": -44.72022247314453, "step": 4900 }, { "epoch": 0.6673474945533769, "grad_norm": 40.087311842072914, "learning_rate": 2.4073435133864353e-07, "logits/chosen": 12.914536476135254, "logits/rejected": 13.9666748046875, "logps/chosen": -4.239808082580566, "logps/rejected": -4.438214302062988, "loss": 3.9323, "rewards/accuracies": 0.75, "rewards/chosen": -42.39807891845703, "rewards/margins": 1.9840612411499023, "rewards/rejected": -44.38214111328125, "step": 4901 }, { "epoch": 0.6674836601307189, "grad_norm": 37.16308767270718, "learning_rate": 2.4055995096784696e-07, "logits/chosen": 13.351945877075195, "logits/rejected": 14.344343185424805, "logps/chosen": -4.148166179656982, "logps/rejected": -4.526390075683594, "loss": 4.0625, "rewards/accuracies": 1.0, "rewards/chosen": -41.481658935546875, "rewards/margins": 3.782238006591797, "rewards/rejected": -45.26390075683594, "step": 4902 }, { "epoch": 0.667619825708061, "grad_norm": 36.25450472955772, "learning_rate": 2.4038558662384736e-07, "logits/chosen": 13.2191801071167, "logits/rejected": 13.436605453491211, "logps/chosen": -4.080862045288086, "logps/rejected": -4.377584457397461, "loss": 4.1224, "rewards/accuracies": 0.75, "rewards/chosen": -40.808624267578125, "rewards/margins": 2.9672183990478516, "rewards/rejected": -43.775840759277344, "step": 4903 }, { "epoch": 0.6677559912854031, "grad_norm": 36.419184273071394, "learning_rate": 2.4021125834604394e-07, "logits/chosen": 12.37600326538086, "logits/rejected": 12.986783981323242, "logps/chosen": -4.1572723388671875, "logps/rejected": -4.3599677085876465, "loss": 3.6345, "rewards/accuracies": 0.5, "rewards/chosen": -41.572723388671875, "rewards/margins": 2.0269546508789062, "rewards/rejected": -43.599674224853516, "step": 4904 }, { "epoch": 0.6678921568627451, "grad_norm": 39.717979529176084, "learning_rate": 2.400369661738275e-07, "logits/chosen": 13.611309051513672, "logits/rejected": 12.846359252929688, "logps/chosen": -4.154314994812012, "logps/rejected": -4.210684776306152, "loss": 4.4352, "rewards/accuracies": 0.5, "rewards/chosen": -41.543148040771484, "rewards/margins": 0.5636987686157227, "rewards/rejected": -42.106849670410156, "step": 4905 }, { "epoch": 0.6680283224400871, "grad_norm": 37.65540979209848, "learning_rate": 2.3986271014658076e-07, "logits/chosen": 12.237987518310547, "logits/rejected": 12.603513717651367, "logps/chosen": -4.063506126403809, "logps/rejected": -4.218340873718262, "loss": 4.2784, "rewards/accuracies": 0.75, "rewards/chosen": -40.635066986083984, "rewards/margins": 1.548344612121582, "rewards/rejected": -42.18341064453125, "step": 4906 }, { "epoch": 0.6681644880174292, "grad_norm": 40.2879562789938, "learning_rate": 2.396884903036785e-07, "logits/chosen": 12.871512413024902, "logits/rejected": 13.459024429321289, "logps/chosen": -4.041773796081543, "logps/rejected": -4.251333236694336, "loss": 4.0869, "rewards/accuracies": 0.5, "rewards/chosen": -40.41773986816406, "rewards/margins": 2.095597267150879, "rewards/rejected": -42.513336181640625, "step": 4907 }, { "epoch": 0.6683006535947712, "grad_norm": 35.27258257588094, "learning_rate": 2.3951430668448686e-07, "logits/chosen": 13.122472763061523, "logits/rejected": 13.578678131103516, "logps/chosen": -3.953181743621826, "logps/rejected": -4.614073753356934, "loss": 3.7875, "rewards/accuracies": 0.75, "rewards/chosen": -39.53181838989258, "rewards/margins": 6.608916282653809, "rewards/rejected": -46.1407356262207, "step": 4908 }, { "epoch": 0.6684368191721133, "grad_norm": 35.35742447286887, "learning_rate": 2.3934015932836425e-07, "logits/chosen": 12.432158470153809, "logits/rejected": 13.466690063476562, "logps/chosen": -3.944998264312744, "logps/rejected": -4.306756019592285, "loss": 3.491, "rewards/accuracies": 1.0, "rewards/chosen": -39.44998550415039, "rewards/margins": 3.61757755279541, "rewards/rejected": -43.06755828857422, "step": 4909 }, { "epoch": 0.6685729847494554, "grad_norm": 39.36745635271413, "learning_rate": 2.3916604827466076e-07, "logits/chosen": 13.99261474609375, "logits/rejected": 13.663995742797852, "logps/chosen": -4.2743682861328125, "logps/rejected": -4.560935974121094, "loss": 4.3758, "rewards/accuracies": 0.75, "rewards/chosen": -42.74367904663086, "rewards/margins": 2.865682601928711, "rewards/rejected": -45.60935974121094, "step": 4910 }, { "epoch": 0.6687091503267973, "grad_norm": 41.13211568634603, "learning_rate": 2.3899197356271804e-07, "logits/chosen": 13.567344665527344, "logits/rejected": 14.537107467651367, "logps/chosen": -4.457188606262207, "logps/rejected": -4.484097003936768, "loss": 3.3569, "rewards/accuracies": 0.5, "rewards/chosen": -44.5718879699707, "rewards/margins": 0.26908302307128906, "rewards/rejected": -44.84096908569336, "step": 4911 }, { "epoch": 0.6688453159041394, "grad_norm": 37.95581237902473, "learning_rate": 2.3881793523186975e-07, "logits/chosen": 13.465801239013672, "logits/rejected": 13.496240615844727, "logps/chosen": -4.102547645568848, "logps/rejected": -4.455968379974365, "loss": 3.7492, "rewards/accuracies": 0.75, "rewards/chosen": -41.025474548339844, "rewards/margins": 3.534205436706543, "rewards/rejected": -44.55968475341797, "step": 4912 }, { "epoch": 0.6689814814814815, "grad_norm": 36.711372458416875, "learning_rate": 2.3864393332144143e-07, "logits/chosen": 14.117782592773438, "logits/rejected": 13.393546104431152, "logps/chosen": -4.556977272033691, "logps/rejected": -4.326817035675049, "loss": 3.8432, "rewards/accuracies": 0.25, "rewards/chosen": -45.56977081298828, "rewards/margins": -2.301603317260742, "rewards/rejected": -43.26816940307617, "step": 4913 }, { "epoch": 0.6691176470588235, "grad_norm": 38.033416449419185, "learning_rate": 2.384699678707502e-07, "logits/chosen": 13.040664672851562, "logits/rejected": 13.261428833007812, "logps/chosen": -4.2291717529296875, "logps/rejected": -4.214115142822266, "loss": 4.3101, "rewards/accuracies": 0.25, "rewards/chosen": -42.29172134399414, "rewards/margins": -0.15056991577148438, "rewards/rejected": -42.141151428222656, "step": 4914 }, { "epoch": 0.6692538126361656, "grad_norm": 37.57371979198855, "learning_rate": 2.382960389191048e-07, "logits/chosen": 13.138779640197754, "logits/rejected": 14.219529151916504, "logps/chosen": -4.137749195098877, "logps/rejected": -4.705190658569336, "loss": 3.7517, "rewards/accuracies": 0.75, "rewards/chosen": -41.37749099731445, "rewards/margins": 5.674415588378906, "rewards/rejected": -47.051910400390625, "step": 4915 }, { "epoch": 0.6693899782135077, "grad_norm": 50.585633744466655, "learning_rate": 2.3812214650580622e-07, "logits/chosen": 13.55774211883545, "logits/rejected": 13.511030197143555, "logps/chosen": -4.143789291381836, "logps/rejected": -4.208835601806641, "loss": 4.3351, "rewards/accuracies": 0.5, "rewards/chosen": -41.437896728515625, "rewards/margins": 0.6504611968994141, "rewards/rejected": -42.088356018066406, "step": 4916 }, { "epoch": 0.6695261437908496, "grad_norm": 38.60165685918704, "learning_rate": 2.3794829067014671e-07, "logits/chosen": 12.741212844848633, "logits/rejected": 12.93967056274414, "logps/chosen": -4.03692626953125, "logps/rejected": -4.225869178771973, "loss": 3.6151, "rewards/accuracies": 0.5, "rewards/chosen": -40.3692626953125, "rewards/margins": 1.8894319534301758, "rewards/rejected": -42.25869369506836, "step": 4917 }, { "epoch": 0.6696623093681917, "grad_norm": 37.11190387334164, "learning_rate": 2.377744714514103e-07, "logits/chosen": 13.227774620056152, "logits/rejected": 13.528042793273926, "logps/chosen": -3.8923110961914062, "logps/rejected": -4.218276023864746, "loss": 3.9034, "rewards/accuracies": 0.5, "rewards/chosen": -38.92311096191406, "rewards/margins": 3.259648323059082, "rewards/rejected": -42.18275833129883, "step": 4918 }, { "epoch": 0.6697984749455338, "grad_norm": 37.87525951359434, "learning_rate": 2.3760068888887322e-07, "logits/chosen": 13.090852737426758, "logits/rejected": 13.51506519317627, "logps/chosen": -3.9368083477020264, "logps/rejected": -4.4446187019348145, "loss": 4.056, "rewards/accuracies": 0.75, "rewards/chosen": -39.368080139160156, "rewards/margins": 5.078104019165039, "rewards/rejected": -44.44618606567383, "step": 4919 }, { "epoch": 0.6699346405228758, "grad_norm": 35.31600344365926, "learning_rate": 2.3742694302180274e-07, "logits/chosen": 13.300487518310547, "logits/rejected": 13.085716247558594, "logps/chosen": -4.018539905548096, "logps/rejected": -4.165175437927246, "loss": 3.3232, "rewards/accuracies": 0.75, "rewards/chosen": -40.18539810180664, "rewards/margins": 1.4663562774658203, "rewards/rejected": -41.651756286621094, "step": 4920 }, { "epoch": 0.6700708061002179, "grad_norm": 38.4465471384416, "learning_rate": 2.3725323388945843e-07, "logits/chosen": 13.34798526763916, "logits/rejected": 13.191123962402344, "logps/chosen": -4.459898471832275, "logps/rejected": -4.317099571228027, "loss": 3.9487, "rewards/accuracies": 0.5, "rewards/chosen": -44.59898376464844, "rewards/margins": -1.4279890060424805, "rewards/rejected": -43.170997619628906, "step": 4921 }, { "epoch": 0.6702069716775599, "grad_norm": 38.78842013027334, "learning_rate": 2.3707956153109124e-07, "logits/chosen": 13.7020902633667, "logits/rejected": 14.196673393249512, "logps/chosen": -4.464520454406738, "logps/rejected": -4.815299034118652, "loss": 4.133, "rewards/accuracies": 1.0, "rewards/chosen": -44.64520263671875, "rewards/margins": 3.507786750793457, "rewards/rejected": -48.152992248535156, "step": 4922 }, { "epoch": 0.6703431372549019, "grad_norm": 38.65513918195577, "learning_rate": 2.369059259859437e-07, "logits/chosen": 13.19868278503418, "logits/rejected": 13.618021011352539, "logps/chosen": -3.844817876815796, "logps/rejected": -4.263188362121582, "loss": 3.6996, "rewards/accuracies": 0.75, "rewards/chosen": -38.44818115234375, "rewards/margins": 4.18370246887207, "rewards/rejected": -42.63188171386719, "step": 4923 }, { "epoch": 0.670479302832244, "grad_norm": 36.95930245345908, "learning_rate": 2.3673232729325043e-07, "logits/chosen": 13.522207260131836, "logits/rejected": 13.798013687133789, "logps/chosen": -4.390097618103027, "logps/rejected": -4.675011157989502, "loss": 3.6544, "rewards/accuracies": 0.75, "rewards/chosen": -43.900978088378906, "rewards/margins": 2.8491315841674805, "rewards/rejected": -46.7501106262207, "step": 4924 }, { "epoch": 0.6706154684095861, "grad_norm": 39.67294349122605, "learning_rate": 2.365587654922374e-07, "logits/chosen": 13.45833683013916, "logits/rejected": 13.53831672668457, "logps/chosen": -4.156896591186523, "logps/rejected": -4.183753967285156, "loss": 3.9182, "rewards/accuracies": 0.5, "rewards/chosen": -41.56896209716797, "rewards/margins": 0.26857471466064453, "rewards/rejected": -41.83753967285156, "step": 4925 }, { "epoch": 0.670751633986928, "grad_norm": 42.91060708874554, "learning_rate": 2.3638524062212223e-07, "logits/chosen": 12.902877807617188, "logits/rejected": 12.8323392868042, "logps/chosen": -4.122302055358887, "logps/rejected": -4.225234508514404, "loss": 3.9084, "rewards/accuracies": 0.75, "rewards/chosen": -41.22301483154297, "rewards/margins": 1.029327392578125, "rewards/rejected": -42.252342224121094, "step": 4926 }, { "epoch": 0.6708877995642701, "grad_norm": 44.25374522102714, "learning_rate": 2.3621175272211443e-07, "logits/chosen": 12.824222564697266, "logits/rejected": 13.269699096679688, "logps/chosen": -3.9469215869903564, "logps/rejected": -4.242016315460205, "loss": 4.2768, "rewards/accuracies": 1.0, "rewards/chosen": -39.46921920776367, "rewards/margins": 2.950946807861328, "rewards/rejected": -42.420166015625, "step": 4927 }, { "epoch": 0.6710239651416122, "grad_norm": 41.34210547865493, "learning_rate": 2.3603830183141516e-07, "logits/chosen": 13.019346237182617, "logits/rejected": 13.450075149536133, "logps/chosen": -4.359204292297363, "logps/rejected": -4.574577331542969, "loss": 4.604, "rewards/accuracies": 0.75, "rewards/chosen": -43.592041015625, "rewards/margins": 2.1537303924560547, "rewards/rejected": -45.74577331542969, "step": 4928 }, { "epoch": 0.6711601307189542, "grad_norm": 38.53123842194273, "learning_rate": 2.3586488798921665e-07, "logits/chosen": 13.314544677734375, "logits/rejected": 13.902623176574707, "logps/chosen": -4.26654577255249, "logps/rejected": -4.484552383422852, "loss": 3.9706, "rewards/accuracies": 0.75, "rewards/chosen": -42.66545867919922, "rewards/margins": 2.180069923400879, "rewards/rejected": -44.84552764892578, "step": 4929 }, { "epoch": 0.6712962962962963, "grad_norm": 39.23817007283048, "learning_rate": 2.3569151123470356e-07, "logits/chosen": 13.44552230834961, "logits/rejected": 13.802602767944336, "logps/chosen": -4.768191337585449, "logps/rejected": -4.896937370300293, "loss": 3.9709, "rewards/accuracies": 0.75, "rewards/chosen": -47.681915283203125, "rewards/margins": 1.287461280822754, "rewards/rejected": -48.96937561035156, "step": 4930 }, { "epoch": 0.6714324618736384, "grad_norm": 38.60459325956744, "learning_rate": 2.3551817160705183e-07, "logits/chosen": 12.75454330444336, "logits/rejected": 13.505912780761719, "logps/chosen": -3.7871644496917725, "logps/rejected": -4.112949371337891, "loss": 3.9028, "rewards/accuracies": 0.75, "rewards/chosen": -37.87164306640625, "rewards/margins": 3.2578468322753906, "rewards/rejected": -41.129493713378906, "step": 4931 }, { "epoch": 0.6715686274509803, "grad_norm": 39.33726334230609, "learning_rate": 2.3534486914542867e-07, "logits/chosen": 13.702520370483398, "logits/rejected": 13.94614315032959, "logps/chosen": -4.201504230499268, "logps/rejected": -4.422873497009277, "loss": 3.6478, "rewards/accuracies": 1.0, "rewards/chosen": -42.01504135131836, "rewards/margins": 2.213695526123047, "rewards/rejected": -44.228736877441406, "step": 4932 }, { "epoch": 0.6717047930283224, "grad_norm": 36.84265151155385, "learning_rate": 2.3517160388899334e-07, "logits/chosen": 12.8364839553833, "logits/rejected": 14.10670280456543, "logps/chosen": -4.011865615844727, "logps/rejected": -4.504824161529541, "loss": 3.7806, "rewards/accuracies": 0.75, "rewards/chosen": -40.11865234375, "rewards/margins": 4.929590225219727, "rewards/rejected": -45.048240661621094, "step": 4933 }, { "epoch": 0.6718409586056645, "grad_norm": 37.949155399204884, "learning_rate": 2.3499837587689685e-07, "logits/chosen": 12.583972930908203, "logits/rejected": 14.121094703674316, "logps/chosen": -4.102088928222656, "logps/rejected": -4.625492095947266, "loss": 3.8647, "rewards/accuracies": 1.0, "rewards/chosen": -41.02088928222656, "rewards/margins": 5.2340288162231445, "rewards/rejected": -46.254920959472656, "step": 4934 }, { "epoch": 0.6719771241830066, "grad_norm": 41.33060872689552, "learning_rate": 2.3482518514828103e-07, "logits/chosen": 14.118288040161133, "logits/rejected": 14.609037399291992, "logps/chosen": -3.920020580291748, "logps/rejected": -4.3050537109375, "loss": 4.4077, "rewards/accuracies": 1.0, "rewards/chosen": -39.2002067565918, "rewards/margins": 3.8503265380859375, "rewards/rejected": -43.050533294677734, "step": 4935 }, { "epoch": 0.6721132897603486, "grad_norm": 35.090612619837, "learning_rate": 2.3465203174228e-07, "logits/chosen": 13.922847747802734, "logits/rejected": 13.202428817749023, "logps/chosen": -4.685177803039551, "logps/rejected": -4.529363632202148, "loss": 3.7132, "rewards/accuracies": 0.25, "rewards/chosen": -46.851776123046875, "rewards/margins": -1.5581378936767578, "rewards/rejected": -45.29364013671875, "step": 4936 }, { "epoch": 0.6722494553376906, "grad_norm": 39.60468595052896, "learning_rate": 2.3447891569801929e-07, "logits/chosen": 13.270246505737305, "logits/rejected": 13.0640869140625, "logps/chosen": -4.255782127380371, "logps/rejected": -4.128293991088867, "loss": 3.8941, "rewards/accuracies": 0.5, "rewards/chosen": -42.557823181152344, "rewards/margins": -1.2748823165893555, "rewards/rejected": -41.28293991088867, "step": 4937 }, { "epoch": 0.6723856209150327, "grad_norm": 34.541056920490206, "learning_rate": 2.3430583705461565e-07, "logits/chosen": 12.807723999023438, "logits/rejected": 14.15431022644043, "logps/chosen": -4.038887977600098, "logps/rejected": -4.571255683898926, "loss": 3.5505, "rewards/accuracies": 1.0, "rewards/chosen": -40.388877868652344, "rewards/margins": 5.323683738708496, "rewards/rejected": -45.712562561035156, "step": 4938 }, { "epoch": 0.6725217864923747, "grad_norm": 36.94427927701725, "learning_rate": 2.341327958511778e-07, "logits/chosen": 13.028043746948242, "logits/rejected": 13.315391540527344, "logps/chosen": -4.029421806335449, "logps/rejected": -4.347196578979492, "loss": 4.0579, "rewards/accuracies": 0.5, "rewards/chosen": -40.294219970703125, "rewards/margins": 3.1777496337890625, "rewards/rejected": -43.47196960449219, "step": 4939 }, { "epoch": 0.6726579520697168, "grad_norm": 37.80401382610564, "learning_rate": 2.3395979212680594e-07, "logits/chosen": 12.646081924438477, "logits/rejected": 12.581881523132324, "logps/chosen": -4.04886531829834, "logps/rejected": -4.078723907470703, "loss": 4.3094, "rewards/accuracies": 0.5, "rewards/chosen": -40.488651275634766, "rewards/margins": 0.29859066009521484, "rewards/rejected": -40.7872428894043, "step": 4940 }, { "epoch": 0.6727941176470589, "grad_norm": 40.80431682309297, "learning_rate": 2.3378682592059138e-07, "logits/chosen": 13.290840148925781, "logits/rejected": 13.381490707397461, "logps/chosen": -4.1877241134643555, "logps/rejected": -4.2580060958862305, "loss": 4.1973, "rewards/accuracies": 0.5, "rewards/chosen": -41.87724304199219, "rewards/margins": 0.702815055847168, "rewards/rejected": -42.58005905151367, "step": 4941 }, { "epoch": 0.6729302832244008, "grad_norm": 40.682361135459885, "learning_rate": 2.3361389727161743e-07, "logits/chosen": 13.193601608276367, "logits/rejected": 13.82681941986084, "logps/chosen": -4.068792343139648, "logps/rejected": -4.3578386306762695, "loss": 4.3186, "rewards/accuracies": 0.75, "rewards/chosen": -40.687923431396484, "rewards/margins": 2.8904638290405273, "rewards/rejected": -43.57838439941406, "step": 4942 }, { "epoch": 0.6730664488017429, "grad_norm": 38.29177744511326, "learning_rate": 2.3344100621895894e-07, "logits/chosen": 12.56491756439209, "logits/rejected": 12.595775604248047, "logps/chosen": -3.8525469303131104, "logps/rejected": -3.9445345401763916, "loss": 3.8121, "rewards/accuracies": 0.5, "rewards/chosen": -38.52546691894531, "rewards/margins": 0.9198765754699707, "rewards/rejected": -39.445343017578125, "step": 4943 }, { "epoch": 0.673202614379085, "grad_norm": 40.30478545971053, "learning_rate": 2.3326815280168168e-07, "logits/chosen": 12.851924896240234, "logits/rejected": 13.886427879333496, "logps/chosen": -3.791264295578003, "logps/rejected": -4.352321624755859, "loss": 3.7568, "rewards/accuracies": 1.0, "rewards/chosen": -37.91264343261719, "rewards/margins": 5.610569953918457, "rewards/rejected": -43.52321243286133, "step": 4944 }, { "epoch": 0.673338779956427, "grad_norm": 36.312771337243994, "learning_rate": 2.3309533705884355e-07, "logits/chosen": 13.72088623046875, "logits/rejected": 14.115150451660156, "logps/chosen": -4.314032554626465, "logps/rejected": -4.547185897827148, "loss": 3.0998, "rewards/accuracies": 1.0, "rewards/chosen": -43.140323638916016, "rewards/margins": 2.3315324783325195, "rewards/rejected": -45.471858978271484, "step": 4945 }, { "epoch": 0.6734749455337691, "grad_norm": 42.42409896091213, "learning_rate": 2.3292255902949384e-07, "logits/chosen": 13.32950210571289, "logits/rejected": 13.551198959350586, "logps/chosen": -4.201405048370361, "logps/rejected": -4.324412822723389, "loss": 3.6445, "rewards/accuracies": 0.5, "rewards/chosen": -42.01405334472656, "rewards/margins": 1.2300777435302734, "rewards/rejected": -43.2441291809082, "step": 4946 }, { "epoch": 0.6736111111111112, "grad_norm": 37.570818935580675, "learning_rate": 2.327498187526728e-07, "logits/chosen": 13.714485168457031, "logits/rejected": 13.634023666381836, "logps/chosen": -4.3316264152526855, "logps/rejected": -4.347712516784668, "loss": 3.9017, "rewards/accuracies": 0.5, "rewards/chosen": -43.31626510620117, "rewards/margins": 0.1608600616455078, "rewards/rejected": -43.47712326049805, "step": 4947 }, { "epoch": 0.6737472766884531, "grad_norm": 41.85336862923013, "learning_rate": 2.3257711626741275e-07, "logits/chosen": 12.746244430541992, "logits/rejected": 12.807329177856445, "logps/chosen": -4.1616902351379395, "logps/rejected": -4.249116897583008, "loss": 4.1234, "rewards/accuracies": 0.5, "rewards/chosen": -41.61690139770508, "rewards/margins": 0.874267578125, "rewards/rejected": -42.49116897583008, "step": 4948 }, { "epoch": 0.6738834422657952, "grad_norm": 37.42477483895268, "learning_rate": 2.3240445161273735e-07, "logits/chosen": 13.589892387390137, "logits/rejected": 13.053264617919922, "logps/chosen": -4.285503387451172, "logps/rejected": -4.212114334106445, "loss": 4.0085, "rewards/accuracies": 0.25, "rewards/chosen": -42.85503387451172, "rewards/margins": -0.7338905334472656, "rewards/rejected": -42.12114715576172, "step": 4949 }, { "epoch": 0.6740196078431373, "grad_norm": 40.24862889249953, "learning_rate": 2.322318248276613e-07, "logits/chosen": 12.868224143981934, "logits/rejected": 13.794648170471191, "logps/chosen": -3.946333408355713, "logps/rejected": -4.4402055740356445, "loss": 4.5871, "rewards/accuracies": 0.75, "rewards/chosen": -39.46333312988281, "rewards/margins": 4.938720703125, "rewards/rejected": -44.40205383300781, "step": 4950 }, { "epoch": 0.6741557734204793, "grad_norm": 42.33036490574692, "learning_rate": 2.3205923595119122e-07, "logits/chosen": 13.132465362548828, "logits/rejected": 13.797626495361328, "logps/chosen": -3.9140400886535645, "logps/rejected": -4.3677520751953125, "loss": 4.3932, "rewards/accuracies": 1.0, "rewards/chosen": -39.14039993286133, "rewards/margins": 4.5371198654174805, "rewards/rejected": -43.677520751953125, "step": 4951 }, { "epoch": 0.6742919389978214, "grad_norm": 38.65863046668932, "learning_rate": 2.3188668502232515e-07, "logits/chosen": 12.721505165100098, "logits/rejected": 13.815732955932617, "logps/chosen": -3.810096263885498, "logps/rejected": -4.384868621826172, "loss": 3.8279, "rewards/accuracies": 1.0, "rewards/chosen": -38.1009635925293, "rewards/margins": 5.747722625732422, "rewards/rejected": -43.84868621826172, "step": 4952 }, { "epoch": 0.6744281045751634, "grad_norm": 38.445963142787726, "learning_rate": 2.3171417208005207e-07, "logits/chosen": 12.956640243530273, "logits/rejected": 13.284492492675781, "logps/chosen": -3.8424072265625, "logps/rejected": -3.9871623516082764, "loss": 4.0825, "rewards/accuracies": 0.75, "rewards/chosen": -38.424072265625, "rewards/margins": 1.4475507736206055, "rewards/rejected": -39.871620178222656, "step": 4953 }, { "epoch": 0.6745642701525054, "grad_norm": 38.57591597194426, "learning_rate": 2.315416971633529e-07, "logits/chosen": 13.466562271118164, "logits/rejected": 13.348563194274902, "logps/chosen": -4.034516334533691, "logps/rejected": -4.309122562408447, "loss": 4.0819, "rewards/accuracies": 0.75, "rewards/chosen": -40.34516525268555, "rewards/margins": 2.746058464050293, "rewards/rejected": -43.091224670410156, "step": 4954 }, { "epoch": 0.6747004357298475, "grad_norm": 37.96961364564256, "learning_rate": 2.313692603111999e-07, "logits/chosen": 14.012765884399414, "logits/rejected": 12.862920761108398, "logps/chosen": -4.390127182006836, "logps/rejected": -4.180395603179932, "loss": 4.1301, "rewards/accuracies": 0.5, "rewards/chosen": -43.90127182006836, "rewards/margins": -2.097315788269043, "rewards/rejected": -41.803955078125, "step": 4955 }, { "epoch": 0.6748366013071896, "grad_norm": 38.077334931216924, "learning_rate": 2.3119686156255622e-07, "logits/chosen": 13.84618091583252, "logits/rejected": 13.823923110961914, "logps/chosen": -4.423742294311523, "logps/rejected": -4.48220157623291, "loss": 4.285, "rewards/accuracies": 0.75, "rewards/chosen": -44.237422943115234, "rewards/margins": 0.5845947265625, "rewards/rejected": -44.822021484375, "step": 4956 }, { "epoch": 0.6749727668845316, "grad_norm": 37.8058924433653, "learning_rate": 2.3102450095637712e-07, "logits/chosen": 12.869550704956055, "logits/rejected": 13.998485565185547, "logps/chosen": -4.139325141906738, "logps/rejected": -4.3200225830078125, "loss": 3.6435, "rewards/accuracies": 0.5, "rewards/chosen": -41.39324951171875, "rewards/margins": 1.8069725036621094, "rewards/rejected": -43.20022201538086, "step": 4957 }, { "epoch": 0.6751089324618736, "grad_norm": 38.115104044654785, "learning_rate": 2.3085217853160888e-07, "logits/chosen": 13.013557434082031, "logits/rejected": 13.581707000732422, "logps/chosen": -4.269559383392334, "logps/rejected": -4.24217414855957, "loss": 3.647, "rewards/accuracies": 0.5, "rewards/chosen": -42.695594787597656, "rewards/margins": -0.2738494873046875, "rewards/rejected": -42.4217414855957, "step": 4958 }, { "epoch": 0.6752450980392157, "grad_norm": 36.27604281624898, "learning_rate": 2.3067989432718896e-07, "logits/chosen": 12.608892440795898, "logits/rejected": 13.986117362976074, "logps/chosen": -4.183938503265381, "logps/rejected": -4.548959732055664, "loss": 4.0606, "rewards/accuracies": 0.75, "rewards/chosen": -41.839385986328125, "rewards/margins": 3.6502084732055664, "rewards/rejected": -45.489593505859375, "step": 4959 }, { "epoch": 0.6753812636165577, "grad_norm": 39.566889399842744, "learning_rate": 2.3050764838204652e-07, "logits/chosen": 13.680245399475098, "logits/rejected": 13.357431411743164, "logps/chosen": -4.411849021911621, "logps/rejected": -3.9784514904022217, "loss": 3.9611, "rewards/accuracies": 0.25, "rewards/chosen": -44.11848831176758, "rewards/margins": -4.3339738845825195, "rewards/rejected": -39.784515380859375, "step": 4960 }, { "epoch": 0.6755174291938998, "grad_norm": 39.95772241272317, "learning_rate": 2.3033544073510213e-07, "logits/chosen": 14.288445472717285, "logits/rejected": 13.817296981811523, "logps/chosen": -4.395822048187256, "logps/rejected": -4.38028621673584, "loss": 3.6531, "rewards/accuracies": 0.5, "rewards/chosen": -43.958221435546875, "rewards/margins": -0.15535736083984375, "rewards/rejected": -43.80286407470703, "step": 4961 }, { "epoch": 0.6756535947712419, "grad_norm": 38.525053389779266, "learning_rate": 2.301632714252672e-07, "logits/chosen": 12.847063064575195, "logits/rejected": 13.848566055297852, "logps/chosen": -4.162755012512207, "logps/rejected": -4.6717963218688965, "loss": 4.0449, "rewards/accuracies": 1.0, "rewards/chosen": -41.62754821777344, "rewards/margins": 5.090417861938477, "rewards/rejected": -46.71796417236328, "step": 4962 }, { "epoch": 0.6757897603485838, "grad_norm": 40.84401699036456, "learning_rate": 2.29991140491445e-07, "logits/chosen": 13.421256065368652, "logits/rejected": 12.48569107055664, "logps/chosen": -4.307842254638672, "logps/rejected": -4.048992156982422, "loss": 4.3681, "rewards/accuracies": 0.25, "rewards/chosen": -43.07842254638672, "rewards/margins": -2.5884971618652344, "rewards/rejected": -40.48992156982422, "step": 4963 }, { "epoch": 0.6759259259259259, "grad_norm": 44.00340767457676, "learning_rate": 2.2981904797253002e-07, "logits/chosen": 13.401823997497559, "logits/rejected": 13.822525978088379, "logps/chosen": -4.297903060913086, "logps/rejected": -4.52322244644165, "loss": 4.2308, "rewards/accuracies": 0.5, "rewards/chosen": -42.979026794433594, "rewards/margins": 2.2531967163085938, "rewards/rejected": -45.23222351074219, "step": 4964 }, { "epoch": 0.676062091503268, "grad_norm": 43.235226603065236, "learning_rate": 2.296469939074078e-07, "logits/chosen": 14.333459854125977, "logits/rejected": 13.949462890625, "logps/chosen": -4.383719444274902, "logps/rejected": -4.410735607147217, "loss": 4.4834, "rewards/accuracies": 0.25, "rewards/chosen": -43.837196350097656, "rewards/margins": 0.2701568603515625, "rewards/rejected": -44.10735321044922, "step": 4965 }, { "epoch": 0.67619825708061, "grad_norm": 37.079638227953495, "learning_rate": 2.294749783349554e-07, "logits/chosen": 13.726045608520508, "logits/rejected": 14.147699356079102, "logps/chosen": -4.294762134552002, "logps/rejected": -4.66843843460083, "loss": 3.7999, "rewards/accuracies": 1.0, "rewards/chosen": -42.94762420654297, "rewards/margins": 3.7367610931396484, "rewards/rejected": -46.68437957763672, "step": 4966 }, { "epoch": 0.6763344226579521, "grad_norm": 44.211332903641264, "learning_rate": 2.2930300129404138e-07, "logits/chosen": 13.64992904663086, "logits/rejected": 13.539363861083984, "logps/chosen": -4.519554138183594, "logps/rejected": -4.572309970855713, "loss": 3.7589, "rewards/accuracies": 0.5, "rewards/chosen": -45.19554138183594, "rewards/margins": 0.5275564193725586, "rewards/rejected": -45.72309875488281, "step": 4967 }, { "epoch": 0.6764705882352942, "grad_norm": 36.73329280615194, "learning_rate": 2.2913106282352506e-07, "logits/chosen": 13.081182479858398, "logits/rejected": 13.253957748413086, "logps/chosen": -4.168489456176758, "logps/rejected": -4.1564435958862305, "loss": 3.7825, "rewards/accuracies": 0.5, "rewards/chosen": -41.68489074707031, "rewards/margins": -0.12045574188232422, "rewards/rejected": -41.56443405151367, "step": 4968 }, { "epoch": 0.6766067538126361, "grad_norm": 40.29488155845811, "learning_rate": 2.2895916296225755e-07, "logits/chosen": 14.546598434448242, "logits/rejected": 14.574928283691406, "logps/chosen": -4.554875373840332, "logps/rejected": -4.842704772949219, "loss": 4.2657, "rewards/accuracies": 1.0, "rewards/chosen": -45.54875564575195, "rewards/margins": 2.878293991088867, "rewards/rejected": -48.42704772949219, "step": 4969 }, { "epoch": 0.6767429193899782, "grad_norm": 41.174273205331694, "learning_rate": 2.2878730174908116e-07, "logits/chosen": 12.921098709106445, "logits/rejected": 12.210588455200195, "logps/chosen": -3.966479778289795, "logps/rejected": -3.9754796028137207, "loss": 4.4049, "rewards/accuracies": 0.5, "rewards/chosen": -39.664794921875, "rewards/margins": 0.08999824523925781, "rewards/rejected": -39.754798889160156, "step": 4970 }, { "epoch": 0.6768790849673203, "grad_norm": 38.51683179767007, "learning_rate": 2.28615479222829e-07, "logits/chosen": 13.72456169128418, "logits/rejected": 14.240532875061035, "logps/chosen": -4.560673713684082, "logps/rejected": -4.88822603225708, "loss": 4.2064, "rewards/accuracies": 1.0, "rewards/chosen": -45.60673904418945, "rewards/margins": 3.2755212783813477, "rewards/rejected": -48.88226318359375, "step": 4971 }, { "epoch": 0.6770152505446623, "grad_norm": 41.21215463435957, "learning_rate": 2.2844369542232598e-07, "logits/chosen": 13.332682609558105, "logits/rejected": 12.826804161071777, "logps/chosen": -4.288166046142578, "logps/rejected": -4.286161422729492, "loss": 3.7048, "rewards/accuracies": 0.75, "rewards/chosen": -42.88166046142578, "rewards/margins": -0.02004718780517578, "rewards/rejected": -42.861610412597656, "step": 4972 }, { "epoch": 0.6771514161220044, "grad_norm": 40.222582782353, "learning_rate": 2.2827195038638826e-07, "logits/chosen": 13.575287818908691, "logits/rejected": 14.126911163330078, "logps/chosen": -4.358053207397461, "logps/rejected": -4.729905128479004, "loss": 3.8199, "rewards/accuracies": 0.75, "rewards/chosen": -43.58053207397461, "rewards/margins": 3.7185192108154297, "rewards/rejected": -47.29905319213867, "step": 4973 }, { "epoch": 0.6772875816993464, "grad_norm": 40.7054476084585, "learning_rate": 2.2810024415382271e-07, "logits/chosen": 13.3892822265625, "logits/rejected": 14.465702056884766, "logps/chosen": -4.232492446899414, "logps/rejected": -4.686868190765381, "loss": 4.0958, "rewards/accuracies": 1.0, "rewards/chosen": -42.324928283691406, "rewards/margins": 4.543753623962402, "rewards/rejected": -46.868682861328125, "step": 4974 }, { "epoch": 0.6774237472766884, "grad_norm": 45.675618018964336, "learning_rate": 2.2792857676342794e-07, "logits/chosen": 12.890312194824219, "logits/rejected": 13.303117752075195, "logps/chosen": -3.743856906890869, "logps/rejected": -4.228359699249268, "loss": 4.2716, "rewards/accuracies": 0.75, "rewards/chosen": -37.438568115234375, "rewards/margins": 4.845026016235352, "rewards/rejected": -42.28359603881836, "step": 4975 }, { "epoch": 0.6775599128540305, "grad_norm": 36.938790594611184, "learning_rate": 2.2775694825399375e-07, "logits/chosen": 13.061237335205078, "logits/rejected": 13.785886764526367, "logps/chosen": -4.008164405822754, "logps/rejected": -4.19135856628418, "loss": 3.9258, "rewards/accuracies": 0.75, "rewards/chosen": -40.08164596557617, "rewards/margins": 1.8319416046142578, "rewards/rejected": -41.9135856628418, "step": 4976 }, { "epoch": 0.6776960784313726, "grad_norm": 37.10133424816249, "learning_rate": 2.2758535866430074e-07, "logits/chosen": 14.104854583740234, "logits/rejected": 13.841049194335938, "logps/chosen": -4.4067182540893555, "logps/rejected": -4.285238742828369, "loss": 4.0124, "rewards/accuracies": 0.5, "rewards/chosen": -44.06718063354492, "rewards/margins": -1.2147932052612305, "rewards/rejected": -42.852386474609375, "step": 4977 }, { "epoch": 0.6778322440087146, "grad_norm": 39.761906772641716, "learning_rate": 2.2741380803312115e-07, "logits/chosen": 13.531610488891602, "logits/rejected": 14.065025329589844, "logps/chosen": -4.452895164489746, "logps/rejected": -4.772008895874023, "loss": 4.2408, "rewards/accuracies": 0.75, "rewards/chosen": -44.52894973754883, "rewards/margins": 3.191143035888672, "rewards/rejected": -47.7200927734375, "step": 4978 }, { "epoch": 0.6779684095860566, "grad_norm": 43.747568198408885, "learning_rate": 2.2724229639921836e-07, "logits/chosen": 13.533609390258789, "logits/rejected": 13.463663101196289, "logps/chosen": -4.381803035736084, "logps/rejected": -4.501583576202393, "loss": 4.1755, "rewards/accuracies": 0.75, "rewards/chosen": -43.818031311035156, "rewards/margins": 1.1978044509887695, "rewards/rejected": -45.015838623046875, "step": 4979 }, { "epoch": 0.6781045751633987, "grad_norm": 41.21474463546016, "learning_rate": 2.2707082380134656e-07, "logits/chosen": 13.665870666503906, "logits/rejected": 14.034829139709473, "logps/chosen": -4.181507587432861, "logps/rejected": -4.397177219390869, "loss": 4.4366, "rewards/accuracies": 0.5, "rewards/chosen": -41.81507873535156, "rewards/margins": 2.156693458557129, "rewards/rejected": -43.971771240234375, "step": 4980 }, { "epoch": 0.6782407407407407, "grad_norm": 38.206057859108874, "learning_rate": 2.2689939027825163e-07, "logits/chosen": 13.581478118896484, "logits/rejected": 14.627066612243652, "logps/chosen": -4.009920597076416, "logps/rejected": -4.532353401184082, "loss": 3.5802, "rewards/accuracies": 0.75, "rewards/chosen": -40.099205017089844, "rewards/margins": 5.22432804107666, "rewards/rejected": -45.32353591918945, "step": 4981 }, { "epoch": 0.6783769063180828, "grad_norm": 38.490709952399854, "learning_rate": 2.2672799586867043e-07, "logits/chosen": 14.277299880981445, "logits/rejected": 13.858455657958984, "logps/chosen": -4.619071960449219, "logps/rejected": -4.550715923309326, "loss": 4.2897, "rewards/accuracies": 0.25, "rewards/chosen": -46.19071960449219, "rewards/margins": -0.683558464050293, "rewards/rejected": -45.50716018676758, "step": 4982 }, { "epoch": 0.6785130718954249, "grad_norm": 40.11423691937173, "learning_rate": 2.265566406113307e-07, "logits/chosen": 14.070667266845703, "logits/rejected": 14.367620468139648, "logps/chosen": -4.299476146697998, "logps/rejected": -4.48367977142334, "loss": 3.8117, "rewards/accuracies": 0.75, "rewards/chosen": -42.9947624206543, "rewards/margins": 1.8420333862304688, "rewards/rejected": -44.836795806884766, "step": 4983 }, { "epoch": 0.6786492374727668, "grad_norm": 37.46558772915579, "learning_rate": 2.2638532454495176e-07, "logits/chosen": 13.138294219970703, "logits/rejected": 13.326571464538574, "logps/chosen": -4.381597518920898, "logps/rejected": -4.463193893432617, "loss": 4.1183, "rewards/accuracies": 0.5, "rewards/chosen": -43.81597137451172, "rewards/margins": 0.8159675598144531, "rewards/rejected": -44.63193893432617, "step": 4984 }, { "epoch": 0.6787854030501089, "grad_norm": 67.23218415160886, "learning_rate": 2.2621404770824398e-07, "logits/chosen": 13.656267166137695, "logits/rejected": 14.31346321105957, "logps/chosen": -4.248865127563477, "logps/rejected": -4.72747802734375, "loss": 4.0863, "rewards/accuracies": 1.0, "rewards/chosen": -42.4886474609375, "rewards/margins": 4.786129951477051, "rewards/rejected": -47.2747802734375, "step": 4985 }, { "epoch": 0.678921568627451, "grad_norm": 41.62689708797565, "learning_rate": 2.2604281013990846e-07, "logits/chosen": 13.764545440673828, "logits/rejected": 13.75579833984375, "logps/chosen": -4.149874210357666, "logps/rejected": -4.593227386474609, "loss": 4.4826, "rewards/accuracies": 1.0, "rewards/chosen": -41.498741149902344, "rewards/margins": 4.43353271484375, "rewards/rejected": -45.932273864746094, "step": 4986 }, { "epoch": 0.679057734204793, "grad_norm": 41.89114439933292, "learning_rate": 2.25871611878638e-07, "logits/chosen": 13.49289321899414, "logits/rejected": 13.435308456420898, "logps/chosen": -4.215784549713135, "logps/rejected": -4.097024917602539, "loss": 3.7158, "rewards/accuracies": 0.25, "rewards/chosen": -42.15784454345703, "rewards/margins": -1.1875982284545898, "rewards/rejected": -40.97024917602539, "step": 4987 }, { "epoch": 0.6791938997821351, "grad_norm": 37.76187268078374, "learning_rate": 2.2570045296311613e-07, "logits/chosen": 13.158077239990234, "logits/rejected": 13.149697303771973, "logps/chosen": -4.239227294921875, "logps/rejected": -4.372925758361816, "loss": 3.906, "rewards/accuracies": 0.5, "rewards/chosen": -42.392269134521484, "rewards/margins": 1.336984634399414, "rewards/rejected": -43.72925567626953, "step": 4988 }, { "epoch": 0.6793300653594772, "grad_norm": 37.031011607070525, "learning_rate": 2.2552933343201796e-07, "logits/chosen": 12.841787338256836, "logits/rejected": 13.446521759033203, "logps/chosen": -4.011806488037109, "logps/rejected": -4.350944519042969, "loss": 4.0119, "rewards/accuracies": 0.75, "rewards/chosen": -40.118064880371094, "rewards/margins": 3.391383171081543, "rewards/rejected": -43.50944900512695, "step": 4989 }, { "epoch": 0.6794662309368191, "grad_norm": 39.066465114972495, "learning_rate": 2.253582533240088e-07, "logits/chosen": 13.265604019165039, "logits/rejected": 13.471429824829102, "logps/chosen": -4.167520523071289, "logps/rejected": -4.498297691345215, "loss": 3.8967, "rewards/accuracies": 0.75, "rewards/chosen": -41.67520523071289, "rewards/margins": 3.307771682739258, "rewards/rejected": -44.98297882080078, "step": 4990 }, { "epoch": 0.6796023965141612, "grad_norm": 38.46686949385782, "learning_rate": 2.2518721267774597e-07, "logits/chosen": 13.489778518676758, "logits/rejected": 14.06750202178955, "logps/chosen": -4.158685684204102, "logps/rejected": -4.284387588500977, "loss": 4.3572, "rewards/accuracies": 0.75, "rewards/chosen": -41.58685302734375, "rewards/margins": 1.2570180892944336, "rewards/rejected": -42.8438720703125, "step": 4991 }, { "epoch": 0.6797385620915033, "grad_norm": 37.200513818051604, "learning_rate": 2.2501621153187762e-07, "logits/chosen": 13.186548233032227, "logits/rejected": 13.757600784301758, "logps/chosen": -4.255725383758545, "logps/rejected": -4.439255714416504, "loss": 4.1156, "rewards/accuracies": 0.75, "rewards/chosen": -42.557254791259766, "rewards/margins": 1.8353052139282227, "rewards/rejected": -44.39256286621094, "step": 4992 }, { "epoch": 0.6798747276688453, "grad_norm": 40.287967558548495, "learning_rate": 2.2484524992504251e-07, "logits/chosen": 13.727678298950195, "logits/rejected": 13.681447982788086, "logps/chosen": -4.334414482116699, "logps/rejected": -4.333764553070068, "loss": 4.2264, "rewards/accuracies": 0.5, "rewards/chosen": -43.34414291381836, "rewards/margins": -0.006497383117675781, "rewards/rejected": -43.337646484375, "step": 4993 }, { "epoch": 0.6800108932461874, "grad_norm": 39.372514580313805, "learning_rate": 2.2467432789587103e-07, "logits/chosen": 13.744820594787598, "logits/rejected": 13.591926574707031, "logps/chosen": -4.198310375213623, "logps/rejected": -4.295980453491211, "loss": 3.7949, "rewards/accuracies": 0.5, "rewards/chosen": -41.98310089111328, "rewards/margins": 0.9767026901245117, "rewards/rejected": -42.959808349609375, "step": 4994 }, { "epoch": 0.6801470588235294, "grad_norm": 42.18162031896227, "learning_rate": 2.2450344548298444e-07, "logits/chosen": 13.75855541229248, "logits/rejected": 13.14894962310791, "logps/chosen": -4.133901596069336, "logps/rejected": -4.283002853393555, "loss": 4.6661, "rewards/accuracies": 0.75, "rewards/chosen": -41.339019775390625, "rewards/margins": 1.4910097122192383, "rewards/rejected": -42.83003234863281, "step": 4995 }, { "epoch": 0.6802832244008714, "grad_norm": 40.69330314619433, "learning_rate": 2.2433260272499513e-07, "logits/chosen": 13.152807235717773, "logits/rejected": 13.351385116577148, "logps/chosen": -4.540722846984863, "logps/rejected": -4.450883865356445, "loss": 4.1758, "rewards/accuracies": 0.25, "rewards/chosen": -45.4072265625, "rewards/margins": -0.8983869552612305, "rewards/rejected": -44.50883483886719, "step": 4996 }, { "epoch": 0.6804193899782135, "grad_norm": 40.523749155542205, "learning_rate": 2.241617996605062e-07, "logits/chosen": 13.891581535339355, "logits/rejected": 13.762735366821289, "logps/chosen": -4.382202625274658, "logps/rejected": -4.745565891265869, "loss": 4.3215, "rewards/accuracies": 0.75, "rewards/chosen": -43.82202911376953, "rewards/margins": 3.633631706237793, "rewards/rejected": -47.455657958984375, "step": 4997 }, { "epoch": 0.6805555555555556, "grad_norm": 41.032099766170546, "learning_rate": 2.2399103632811206e-07, "logits/chosen": 13.359865188598633, "logits/rejected": 13.182570457458496, "logps/chosen": -3.9578073024749756, "logps/rejected": -4.031169891357422, "loss": 3.677, "rewards/accuracies": 0.5, "rewards/chosen": -39.57807540893555, "rewards/margins": 0.7336273193359375, "rewards/rejected": -40.31169891357422, "step": 4998 }, { "epoch": 0.6806917211328976, "grad_norm": 37.85332720279121, "learning_rate": 2.2382031276639842e-07, "logits/chosen": 14.450433731079102, "logits/rejected": 14.257366180419922, "logps/chosen": -4.2658843994140625, "logps/rejected": -4.547750473022461, "loss": 3.7661, "rewards/accuracies": 0.75, "rewards/chosen": -42.658843994140625, "rewards/margins": 2.818661689758301, "rewards/rejected": -45.47750473022461, "step": 4999 }, { "epoch": 0.6808278867102396, "grad_norm": 41.2182891145825, "learning_rate": 2.2364962901394123e-07, "logits/chosen": 13.993817329406738, "logits/rejected": 14.251441955566406, "logps/chosen": -4.3696136474609375, "logps/rejected": -4.298367023468018, "loss": 4.3506, "rewards/accuracies": 0.5, "rewards/chosen": -43.69613265991211, "rewards/margins": -0.7124624252319336, "rewards/rejected": -42.983673095703125, "step": 5000 }, { "epoch": 0.6809640522875817, "grad_norm": 39.14153327380705, "learning_rate": 2.234789851093081e-07, "logits/chosen": 13.315296173095703, "logits/rejected": 13.857022285461426, "logps/chosen": -4.401683807373047, "logps/rejected": -4.620563507080078, "loss": 4.1774, "rewards/accuracies": 0.75, "rewards/chosen": -44.0168342590332, "rewards/margins": 2.188800811767578, "rewards/rejected": -46.20563507080078, "step": 5001 }, { "epoch": 0.6811002178649237, "grad_norm": 47.939143628707626, "learning_rate": 2.2330838109105737e-07, "logits/chosen": 12.929569244384766, "logits/rejected": 13.525854110717773, "logps/chosen": -4.179147720336914, "logps/rejected": -4.623818397521973, "loss": 3.666, "rewards/accuracies": 1.0, "rewards/chosen": -41.791481018066406, "rewards/margins": 4.446707725524902, "rewards/rejected": -46.23818588256836, "step": 5002 }, { "epoch": 0.6812363834422658, "grad_norm": 38.00798450116685, "learning_rate": 2.231378169977387e-07, "logits/chosen": 13.491222381591797, "logits/rejected": 13.929513931274414, "logps/chosen": -4.354231834411621, "logps/rejected": -4.504958152770996, "loss": 3.9388, "rewards/accuracies": 0.75, "rewards/chosen": -43.54231262207031, "rewards/margins": 1.5072650909423828, "rewards/rejected": -45.04957962036133, "step": 5003 }, { "epoch": 0.6813725490196079, "grad_norm": 40.19004861333976, "learning_rate": 2.2296729286789207e-07, "logits/chosen": 13.031463623046875, "logits/rejected": 13.076131820678711, "logps/chosen": -4.109193801879883, "logps/rejected": -4.167816162109375, "loss": 3.6483, "rewards/accuracies": 0.5, "rewards/chosen": -41.09193801879883, "rewards/margins": 0.5862236022949219, "rewards/rejected": -41.67816162109375, "step": 5004 }, { "epoch": 0.6815087145969498, "grad_norm": 37.62272472660577, "learning_rate": 2.2279680874004895e-07, "logits/chosen": 12.955437660217285, "logits/rejected": 12.840771675109863, "logps/chosen": -4.034949779510498, "logps/rejected": -3.832881212234497, "loss": 3.9735, "rewards/accuracies": 0.25, "rewards/chosen": -40.34949493408203, "rewards/margins": -2.020686149597168, "rewards/rejected": -38.32881164550781, "step": 5005 }, { "epoch": 0.6816448801742919, "grad_norm": 37.41838194021685, "learning_rate": 2.2262636465273187e-07, "logits/chosen": 13.116691589355469, "logits/rejected": 13.984855651855469, "logps/chosen": -4.123475074768066, "logps/rejected": -4.410501480102539, "loss": 4.0801, "rewards/accuracies": 0.75, "rewards/chosen": -41.23474884033203, "rewards/margins": 2.87026309967041, "rewards/rejected": -44.105010986328125, "step": 5006 }, { "epoch": 0.681781045751634, "grad_norm": 38.657092350230975, "learning_rate": 2.224559606444537e-07, "logits/chosen": 13.865559577941895, "logits/rejected": 14.011590003967285, "logps/chosen": -4.171965599060059, "logps/rejected": -4.623730182647705, "loss": 4.1524, "rewards/accuracies": 0.75, "rewards/chosen": -41.71965408325195, "rewards/margins": 4.517648696899414, "rewards/rejected": -46.2373046875, "step": 5007 }, { "epoch": 0.681917211328976, "grad_norm": 39.073354881014474, "learning_rate": 2.222855967537188e-07, "logits/chosen": 14.259031295776367, "logits/rejected": 13.856268882751465, "logps/chosen": -4.7399444580078125, "logps/rejected": -4.7085723876953125, "loss": 4.0595, "rewards/accuracies": 0.25, "rewards/chosen": -47.399444580078125, "rewards/margins": -0.3137216567993164, "rewards/rejected": -47.085723876953125, "step": 5008 }, { "epoch": 0.6820533769063181, "grad_norm": 38.90836745391573, "learning_rate": 2.2211527301902252e-07, "logits/chosen": 13.952967643737793, "logits/rejected": 14.327670097351074, "logps/chosen": -4.357396602630615, "logps/rejected": -4.47529411315918, "loss": 4.2237, "rewards/accuracies": 0.5, "rewards/chosen": -43.57396697998047, "rewards/margins": 1.1789751052856445, "rewards/rejected": -44.7529411315918, "step": 5009 }, { "epoch": 0.6821895424836601, "grad_norm": 42.08167346594988, "learning_rate": 2.2194498947885055e-07, "logits/chosen": 13.118541717529297, "logits/rejected": 13.647377014160156, "logps/chosen": -4.258277893066406, "logps/rejected": -4.394826889038086, "loss": 3.8537, "rewards/accuracies": 0.75, "rewards/chosen": -42.58277893066406, "rewards/margins": 1.3654861450195312, "rewards/rejected": -43.94826889038086, "step": 5010 }, { "epoch": 0.6823257080610022, "grad_norm": 39.51510933267412, "learning_rate": 2.2177474617168e-07, "logits/chosen": 13.641788482666016, "logits/rejected": 13.726428031921387, "logps/chosen": -4.176939010620117, "logps/rejected": -4.396330833435059, "loss": 4.2457, "rewards/accuracies": 0.5, "rewards/chosen": -41.769386291503906, "rewards/margins": 2.1939220428466797, "rewards/rejected": -43.96331024169922, "step": 5011 }, { "epoch": 0.6824618736383442, "grad_norm": 40.730414855864005, "learning_rate": 2.216045431359789e-07, "logits/chosen": 14.482207298278809, "logits/rejected": 13.872542381286621, "logps/chosen": -4.720890045166016, "logps/rejected": -4.4878644943237305, "loss": 4.3534, "rewards/accuracies": 0.25, "rewards/chosen": -47.208900451660156, "rewards/margins": -2.3302555084228516, "rewards/rejected": -44.87864303588867, "step": 5012 }, { "epoch": 0.6825980392156863, "grad_norm": 42.4475798367322, "learning_rate": 2.214343804102058e-07, "logits/chosen": 13.968061447143555, "logits/rejected": 13.837713241577148, "logps/chosen": -4.61461067199707, "logps/rejected": -4.521609306335449, "loss": 4.0538, "rewards/accuracies": 0.25, "rewards/chosen": -46.14611053466797, "rewards/margins": -0.9300174713134766, "rewards/rejected": -45.216087341308594, "step": 5013 }, { "epoch": 0.6827342047930284, "grad_norm": 39.53957775761008, "learning_rate": 2.2126425803281048e-07, "logits/chosen": 13.488215446472168, "logits/rejected": 13.705069541931152, "logps/chosen": -4.218360900878906, "logps/rejected": -4.216750144958496, "loss": 4.0048, "rewards/accuracies": 0.75, "rewards/chosen": -42.18360900878906, "rewards/margins": -0.016106605529785156, "rewards/rejected": -42.167503356933594, "step": 5014 }, { "epoch": 0.6828703703703703, "grad_norm": 41.563302514019306, "learning_rate": 2.2109417604223366e-07, "logits/chosen": 13.47268295288086, "logits/rejected": 13.241342544555664, "logps/chosen": -4.422366619110107, "logps/rejected": -4.254859447479248, "loss": 4.3242, "rewards/accuracies": 0.5, "rewards/chosen": -44.223663330078125, "rewards/margins": -1.6750659942626953, "rewards/rejected": -42.54859924316406, "step": 5015 }, { "epoch": 0.6830065359477124, "grad_norm": 40.351341090100256, "learning_rate": 2.2092413447690643e-07, "logits/chosen": 13.918268203735352, "logits/rejected": 14.432722091674805, "logps/chosen": -4.423618793487549, "logps/rejected": -4.541496276855469, "loss": 4.1924, "rewards/accuracies": 0.75, "rewards/chosen": -44.23619079589844, "rewards/margins": 1.1787729263305664, "rewards/rejected": -45.41496276855469, "step": 5016 }, { "epoch": 0.6831427015250545, "grad_norm": 39.380100272285354, "learning_rate": 2.2075413337525132e-07, "logits/chosen": 13.634376525878906, "logits/rejected": 14.137292861938477, "logps/chosen": -4.366642475128174, "logps/rejected": -4.7054877281188965, "loss": 4.0943, "rewards/accuracies": 0.75, "rewards/chosen": -43.66642379760742, "rewards/margins": 3.38845157623291, "rewards/rejected": -47.05487823486328, "step": 5017 }, { "epoch": 0.6832788671023965, "grad_norm": 39.15997031466706, "learning_rate": 2.2058417277568157e-07, "logits/chosen": 14.054794311523438, "logits/rejected": 14.145118713378906, "logps/chosen": -4.5085015296936035, "logps/rejected": -4.7973480224609375, "loss": 4.4287, "rewards/accuracies": 1.0, "rewards/chosen": -45.08501434326172, "rewards/margins": 2.8884620666503906, "rewards/rejected": -47.973480224609375, "step": 5018 }, { "epoch": 0.6834150326797386, "grad_norm": 43.53520005843306, "learning_rate": 2.2041425271660085e-07, "logits/chosen": 13.050012588500977, "logits/rejected": 13.301427841186523, "logps/chosen": -4.17169189453125, "logps/rejected": -4.464792728424072, "loss": 3.4043, "rewards/accuracies": 1.0, "rewards/chosen": -41.716915130615234, "rewards/margins": 2.931009292602539, "rewards/rejected": -44.647926330566406, "step": 5019 }, { "epoch": 0.6835511982570807, "grad_norm": 41.86912081677285, "learning_rate": 2.2024437323640427e-07, "logits/chosen": 13.518144607543945, "logits/rejected": 13.531888961791992, "logps/chosen": -4.2252020835876465, "logps/rejected": -4.575495719909668, "loss": 3.7797, "rewards/accuracies": 0.75, "rewards/chosen": -42.25202178955078, "rewards/margins": 3.5029401779174805, "rewards/rejected": -45.75495910644531, "step": 5020 }, { "epoch": 0.6836873638344226, "grad_norm": 42.82758359465621, "learning_rate": 2.2007453437347757e-07, "logits/chosen": 13.926349639892578, "logits/rejected": 13.987655639648438, "logps/chosen": -4.204654693603516, "logps/rejected": -4.383811950683594, "loss": 3.9902, "rewards/accuracies": 0.75, "rewards/chosen": -42.046546936035156, "rewards/margins": 1.791574478149414, "rewards/rejected": -43.8381233215332, "step": 5021 }, { "epoch": 0.6838235294117647, "grad_norm": 40.42373771053524, "learning_rate": 2.199047361661969e-07, "logits/chosen": 13.318455696105957, "logits/rejected": 13.40045166015625, "logps/chosen": -4.015239715576172, "logps/rejected": -4.418717384338379, "loss": 3.8965, "rewards/accuracies": 0.75, "rewards/chosen": -40.15239715576172, "rewards/margins": 4.034775733947754, "rewards/rejected": -44.18717575073242, "step": 5022 }, { "epoch": 0.6839596949891068, "grad_norm": 36.47791628233479, "learning_rate": 2.1973497865292984e-07, "logits/chosen": 13.535608291625977, "logits/rejected": 13.879993438720703, "logps/chosen": -4.130778789520264, "logps/rejected": -4.601337432861328, "loss": 3.85, "rewards/accuracies": 1.0, "rewards/chosen": -41.30778503417969, "rewards/margins": 4.705585479736328, "rewards/rejected": -46.013370513916016, "step": 5023 }, { "epoch": 0.6840958605664488, "grad_norm": 39.167793451315006, "learning_rate": 2.1956526187203454e-07, "logits/chosen": 14.933722496032715, "logits/rejected": 14.696575164794922, "logps/chosen": -4.752525329589844, "logps/rejected": -4.90363883972168, "loss": 4.1223, "rewards/accuracies": 0.5, "rewards/chosen": -47.52525329589844, "rewards/margins": 1.511134147644043, "rewards/rejected": -49.03639221191406, "step": 5024 }, { "epoch": 0.6842320261437909, "grad_norm": 37.42753695054362, "learning_rate": 2.193955858618597e-07, "logits/chosen": 12.68875503540039, "logits/rejected": 12.712322235107422, "logps/chosen": -4.378776550292969, "logps/rejected": -4.109373092651367, "loss": 4.0378, "rewards/accuracies": 0.25, "rewards/chosen": -43.78776931762695, "rewards/margins": -2.6940345764160156, "rewards/rejected": -41.09373474121094, "step": 5025 }, { "epoch": 0.684368191721133, "grad_norm": 36.98959901362154, "learning_rate": 2.192259506607451e-07, "logits/chosen": 12.839412689208984, "logits/rejected": 13.885951042175293, "logps/chosen": -4.321671485900879, "logps/rejected": -4.629119873046875, "loss": 4.108, "rewards/accuracies": 0.75, "rewards/chosen": -43.216712951660156, "rewards/margins": 3.0744848251342773, "rewards/rejected": -46.29119873046875, "step": 5026 }, { "epoch": 0.6845043572984749, "grad_norm": 40.51895705338232, "learning_rate": 2.190563563070214e-07, "logits/chosen": 13.986565589904785, "logits/rejected": 13.995365142822266, "logps/chosen": -4.650404930114746, "logps/rejected": -4.534981727600098, "loss": 3.9426, "rewards/accuracies": 0.25, "rewards/chosen": -46.504051208496094, "rewards/margins": -1.1542377471923828, "rewards/rejected": -45.349815368652344, "step": 5027 }, { "epoch": 0.684640522875817, "grad_norm": 40.99825987622521, "learning_rate": 2.1888680283900952e-07, "logits/chosen": 13.376726150512695, "logits/rejected": 14.02424430847168, "logps/chosen": -4.353873252868652, "logps/rejected": -4.597479820251465, "loss": 3.7493, "rewards/accuracies": 0.75, "rewards/chosen": -43.53873062133789, "rewards/margins": 2.4360647201538086, "rewards/rejected": -45.974796295166016, "step": 5028 }, { "epoch": 0.6847766884531591, "grad_norm": 40.48789120605081, "learning_rate": 2.1871729029502166e-07, "logits/chosen": 13.691600799560547, "logits/rejected": 13.984077453613281, "logps/chosen": -4.293684005737305, "logps/rejected": -4.400207996368408, "loss": 3.9265, "rewards/accuracies": 0.75, "rewards/chosen": -42.93684005737305, "rewards/margins": 1.065237045288086, "rewards/rejected": -44.0020751953125, "step": 5029 }, { "epoch": 0.6849128540305011, "grad_norm": 41.61070659342836, "learning_rate": 2.185478187133607e-07, "logits/chosen": 13.614520072937012, "logits/rejected": 14.826154708862305, "logps/chosen": -4.358434200286865, "logps/rejected": -4.7392473220825195, "loss": 4.0848, "rewards/accuracies": 0.75, "rewards/chosen": -43.58434295654297, "rewards/margins": 3.808131217956543, "rewards/rejected": -47.39247131347656, "step": 5030 }, { "epoch": 0.6850490196078431, "grad_norm": 35.55938066400783, "learning_rate": 2.1837838813231984e-07, "logits/chosen": 13.036423683166504, "logits/rejected": 13.578446388244629, "logps/chosen": -4.122106075286865, "logps/rejected": -4.560116291046143, "loss": 3.8384, "rewards/accuracies": 1.0, "rewards/chosen": -41.22106170654297, "rewards/margins": 4.38010311126709, "rewards/rejected": -45.601165771484375, "step": 5031 }, { "epoch": 0.6851851851851852, "grad_norm": 42.2502012384439, "learning_rate": 2.182089985901835e-07, "logits/chosen": 13.40878677368164, "logits/rejected": 13.713956832885742, "logps/chosen": -4.253888130187988, "logps/rejected": -4.292304039001465, "loss": 4.3705, "rewards/accuracies": 0.5, "rewards/chosen": -42.53887939453125, "rewards/margins": 0.38416290283203125, "rewards/rejected": -42.92304229736328, "step": 5032 }, { "epoch": 0.6853213507625272, "grad_norm": 39.926950666458964, "learning_rate": 2.180396501252268e-07, "logits/chosen": 13.59848403930664, "logits/rejected": 13.662155151367188, "logps/chosen": -4.27988338470459, "logps/rejected": -4.571267604827881, "loss": 3.8587, "rewards/accuracies": 0.75, "rewards/chosen": -42.79883575439453, "rewards/margins": 2.91384220123291, "rewards/rejected": -45.712677001953125, "step": 5033 }, { "epoch": 0.6854575163398693, "grad_norm": 40.338495846062784, "learning_rate": 2.17870342775715e-07, "logits/chosen": 12.531594276428223, "logits/rejected": 12.890243530273438, "logps/chosen": -4.240568161010742, "logps/rejected": -4.356138229370117, "loss": 3.4567, "rewards/accuracies": 0.75, "rewards/chosen": -42.40568161010742, "rewards/margins": 1.155695915222168, "rewards/rejected": -43.561378479003906, "step": 5034 }, { "epoch": 0.6855936819172114, "grad_norm": 35.87077145834212, "learning_rate": 2.1770107657990486e-07, "logits/chosen": 14.370407104492188, "logits/rejected": 14.463908195495605, "logps/chosen": -4.686976909637451, "logps/rejected": -4.874799728393555, "loss": 3.574, "rewards/accuracies": 0.75, "rewards/chosen": -46.86976623535156, "rewards/margins": 1.8782272338867188, "rewards/rejected": -48.74799346923828, "step": 5035 }, { "epoch": 0.6857298474945533, "grad_norm": 39.12489561047164, "learning_rate": 2.175318515760435e-07, "logits/chosen": 12.918832778930664, "logits/rejected": 13.968973159790039, "logps/chosen": -3.768709182739258, "logps/rejected": -4.398289680480957, "loss": 3.5903, "rewards/accuracies": 1.0, "rewards/chosen": -37.68709182739258, "rewards/margins": 6.295801162719727, "rewards/rejected": -43.98289489746094, "step": 5036 }, { "epoch": 0.6858660130718954, "grad_norm": 41.74413993359043, "learning_rate": 2.173626678023684e-07, "logits/chosen": 13.540460586547852, "logits/rejected": 13.330751419067383, "logps/chosen": -4.473898410797119, "logps/rejected": -4.502078056335449, "loss": 3.654, "rewards/accuracies": 0.5, "rewards/chosen": -44.738983154296875, "rewards/margins": 0.2817964553833008, "rewards/rejected": -45.02077865600586, "step": 5037 }, { "epoch": 0.6860021786492375, "grad_norm": 37.5007331523921, "learning_rate": 2.1719352529710817e-07, "logits/chosen": 13.855670928955078, "logits/rejected": 14.227656364440918, "logps/chosen": -4.482041358947754, "logps/rejected": -4.625223636627197, "loss": 3.6856, "rewards/accuracies": 0.5, "rewards/chosen": -44.82041549682617, "rewards/margins": 1.431818962097168, "rewards/rejected": -46.252235412597656, "step": 5038 }, { "epoch": 0.6861383442265795, "grad_norm": 41.055044514652195, "learning_rate": 2.1702442409848217e-07, "logits/chosen": 12.971515655517578, "logits/rejected": 13.559062004089355, "logps/chosen": -4.011073589324951, "logps/rejected": -4.586760520935059, "loss": 3.432, "rewards/accuracies": 1.0, "rewards/chosen": -40.11073684692383, "rewards/margins": 5.756866455078125, "rewards/rejected": -45.86760711669922, "step": 5039 }, { "epoch": 0.6862745098039216, "grad_norm": 43.80961834716902, "learning_rate": 2.1685536424469992e-07, "logits/chosen": 14.547357559204102, "logits/rejected": 14.802654266357422, "logps/chosen": -5.06403923034668, "logps/rejected": -4.929516792297363, "loss": 3.9679, "rewards/accuracies": 0.25, "rewards/chosen": -50.64038848876953, "rewards/margins": -1.345224380493164, "rewards/rejected": -49.295166015625, "step": 5040 }, { "epoch": 0.6864106753812637, "grad_norm": 44.22407758350806, "learning_rate": 2.1668634577396198e-07, "logits/chosen": 13.296409606933594, "logits/rejected": 13.413585662841797, "logps/chosen": -4.327343940734863, "logps/rejected": -4.446374893188477, "loss": 3.546, "rewards/accuracies": 0.75, "rewards/chosen": -43.2734375, "rewards/margins": 1.1903095245361328, "rewards/rejected": -44.4637451171875, "step": 5041 }, { "epoch": 0.6865468409586056, "grad_norm": 41.905448938825565, "learning_rate": 2.1651736872445965e-07, "logits/chosen": 13.519182205200195, "logits/rejected": 15.08736801147461, "logps/chosen": -4.150845527648926, "logps/rejected": -4.9727630615234375, "loss": 4.2661, "rewards/accuracies": 1.0, "rewards/chosen": -41.508453369140625, "rewards/margins": 8.219178199768066, "rewards/rejected": -49.727630615234375, "step": 5042 }, { "epoch": 0.6866830065359477, "grad_norm": 38.986250628116075, "learning_rate": 2.1634843313437437e-07, "logits/chosen": 13.302345275878906, "logits/rejected": 14.049736976623535, "logps/chosen": -4.264213562011719, "logps/rejected": -4.754871845245361, "loss": 3.6447, "rewards/accuracies": 0.75, "rewards/chosen": -42.64213562011719, "rewards/margins": 4.906582832336426, "rewards/rejected": -47.5487174987793, "step": 5043 }, { "epoch": 0.6868191721132898, "grad_norm": 41.43963160576243, "learning_rate": 2.1617953904187875e-07, "logits/chosen": 14.015668869018555, "logits/rejected": 13.914632797241211, "logps/chosen": -4.415635585784912, "logps/rejected": -4.415077209472656, "loss": 4.4754, "rewards/accuracies": 0.5, "rewards/chosen": -44.15635681152344, "rewards/margins": -0.005585670471191406, "rewards/rejected": -44.15077209472656, "step": 5044 }, { "epoch": 0.6869553376906318, "grad_norm": 41.42453187285882, "learning_rate": 2.1601068648513588e-07, "logits/chosen": 13.416875839233398, "logits/rejected": 14.152803421020508, "logps/chosen": -4.34463357925415, "logps/rejected": -4.626467704772949, "loss": 3.9024, "rewards/accuracies": 0.75, "rewards/chosen": -43.44633865356445, "rewards/margins": 2.8183412551879883, "rewards/rejected": -46.264678955078125, "step": 5045 }, { "epoch": 0.6870915032679739, "grad_norm": 42.547638699440384, "learning_rate": 2.158418755022991e-07, "logits/chosen": 14.266857147216797, "logits/rejected": 14.325002670288086, "logps/chosen": -4.7712082862854, "logps/rejected": -4.512879371643066, "loss": 4.2968, "rewards/accuracies": 0.5, "rewards/chosen": -47.71208190917969, "rewards/margins": -2.583287239074707, "rewards/rejected": -45.1287956237793, "step": 5046 }, { "epoch": 0.6872276688453159, "grad_norm": 36.33742439285385, "learning_rate": 2.1567310613151287e-07, "logits/chosen": 12.625694274902344, "logits/rejected": 14.308486938476562, "logps/chosen": -4.004900932312012, "logps/rejected": -4.695990562438965, "loss": 3.332, "rewards/accuracies": 1.0, "rewards/chosen": -40.04900360107422, "rewards/margins": 6.910897254943848, "rewards/rejected": -46.959903717041016, "step": 5047 }, { "epoch": 0.6873638344226579, "grad_norm": 38.062929829020945, "learning_rate": 2.1550437841091206e-07, "logits/chosen": 13.050477981567383, "logits/rejected": 14.595656394958496, "logps/chosen": -4.040771961212158, "logps/rejected": -4.549335479736328, "loss": 3.7408, "rewards/accuracies": 0.75, "rewards/chosen": -40.407718658447266, "rewards/margins": 5.085635185241699, "rewards/rejected": -45.49335479736328, "step": 5048 }, { "epoch": 0.6875, "grad_norm": 38.38820804171776, "learning_rate": 2.1533569237862186e-07, "logits/chosen": 12.521825790405273, "logits/rejected": 13.542545318603516, "logps/chosen": -4.010828018188477, "logps/rejected": -4.413381099700928, "loss": 3.7902, "rewards/accuracies": 1.0, "rewards/chosen": -40.108280181884766, "rewards/margins": 4.025531768798828, "rewards/rejected": -44.133811950683594, "step": 5049 }, { "epoch": 0.6876361655773421, "grad_norm": 44.29289101538668, "learning_rate": 2.151670480727585e-07, "logits/chosen": 14.584162712097168, "logits/rejected": 14.631885528564453, "logps/chosen": -4.4877753257751465, "logps/rejected": -4.494235515594482, "loss": 3.8791, "rewards/accuracies": 0.5, "rewards/chosen": -44.87775421142578, "rewards/margins": 0.06460380554199219, "rewards/rejected": -44.94235610961914, "step": 5050 }, { "epoch": 0.6877723311546841, "grad_norm": 39.397161643258244, "learning_rate": 2.1499844553142855e-07, "logits/chosen": 13.695842742919922, "logits/rejected": 14.165885925292969, "logps/chosen": -4.39158296585083, "logps/rejected": -4.4695820808410645, "loss": 4.1872, "rewards/accuracies": 0.75, "rewards/chosen": -43.91583251953125, "rewards/margins": 0.7799911499023438, "rewards/rejected": -44.69581985473633, "step": 5051 }, { "epoch": 0.6879084967320261, "grad_norm": 35.92665347251774, "learning_rate": 2.1482988479272893e-07, "logits/chosen": 13.772439956665039, "logits/rejected": 13.699932098388672, "logps/chosen": -4.3884453773498535, "logps/rejected": -4.57930326461792, "loss": 3.7809, "rewards/accuracies": 0.5, "rewards/chosen": -43.88445281982422, "rewards/margins": 1.9085798263549805, "rewards/rejected": -45.793033599853516, "step": 5052 }, { "epoch": 0.6880446623093682, "grad_norm": 42.3711856466267, "learning_rate": 2.1466136589474747e-07, "logits/chosen": 13.832405090332031, "logits/rejected": 12.903850555419922, "logps/chosen": -4.243586540222168, "logps/rejected": -4.020169258117676, "loss": 3.9917, "rewards/accuracies": 0.25, "rewards/chosen": -42.43586349487305, "rewards/margins": -2.23416805267334, "rewards/rejected": -40.20169448852539, "step": 5053 }, { "epoch": 0.6881808278867102, "grad_norm": 38.700195296605706, "learning_rate": 2.1449288887556256e-07, "logits/chosen": 14.295101165771484, "logits/rejected": 14.081643104553223, "logps/chosen": -4.718104362487793, "logps/rejected": -4.572268486022949, "loss": 3.6167, "rewards/accuracies": 0.5, "rewards/chosen": -47.18104553222656, "rewards/margins": -1.4583654403686523, "rewards/rejected": -45.722679138183594, "step": 5054 }, { "epoch": 0.6883169934640523, "grad_norm": 38.62764074812554, "learning_rate": 2.1432445377324268e-07, "logits/chosen": 13.382574081420898, "logits/rejected": 13.666784286499023, "logps/chosen": -4.190792083740234, "logps/rejected": -4.785477161407471, "loss": 4.1945, "rewards/accuracies": 0.75, "rewards/chosen": -41.90791702270508, "rewards/margins": 5.946855545043945, "rewards/rejected": -47.85477066040039, "step": 5055 }, { "epoch": 0.6884531590413944, "grad_norm": 40.50749111618203, "learning_rate": 2.1415606062584727e-07, "logits/chosen": 12.626447677612305, "logits/rejected": 13.738018035888672, "logps/chosen": -4.038938999176025, "logps/rejected": -4.3202128410339355, "loss": 3.6985, "rewards/accuracies": 0.75, "rewards/chosen": -40.38938903808594, "rewards/margins": 2.8127365112304688, "rewards/rejected": -43.202125549316406, "step": 5056 }, { "epoch": 0.6885893246187363, "grad_norm": 45.36966814888738, "learning_rate": 2.1398770947142632e-07, "logits/chosen": 13.71196174621582, "logits/rejected": 13.744626998901367, "logps/chosen": -3.9670965671539307, "logps/rejected": -4.1369476318359375, "loss": 4.4352, "rewards/accuracies": 1.0, "rewards/chosen": -39.67096710205078, "rewards/margins": 1.6985111236572266, "rewards/rejected": -41.369476318359375, "step": 5057 }, { "epoch": 0.6887254901960784, "grad_norm": 48.80460830128169, "learning_rate": 2.1381940034801986e-07, "logits/chosen": 12.971404075622559, "logits/rejected": 13.091668128967285, "logps/chosen": -4.141975402832031, "logps/rejected": -4.242392539978027, "loss": 4.5644, "rewards/accuracies": 0.75, "rewards/chosen": -41.41975021362305, "rewards/margins": 1.0041723251342773, "rewards/rejected": -42.42392349243164, "step": 5058 }, { "epoch": 0.6888616557734205, "grad_norm": 39.2788526511265, "learning_rate": 2.136511332936589e-07, "logits/chosen": 12.152881622314453, "logits/rejected": 14.11141586303711, "logps/chosen": -4.060329437255859, "logps/rejected": -4.573972702026367, "loss": 3.8823, "rewards/accuracies": 1.0, "rewards/chosen": -40.603294372558594, "rewards/margins": 5.136431694030762, "rewards/rejected": -45.73973083496094, "step": 5059 }, { "epoch": 0.6889978213507625, "grad_norm": 40.028130573292785, "learning_rate": 2.1348290834636492e-07, "logits/chosen": 14.187246322631836, "logits/rejected": 13.94156265258789, "logps/chosen": -4.302502155303955, "logps/rejected": -4.699012756347656, "loss": 4.1863, "rewards/accuracies": 0.75, "rewards/chosen": -43.0250244140625, "rewards/margins": 3.965104103088379, "rewards/rejected": -46.99012756347656, "step": 5060 }, { "epoch": 0.6891339869281046, "grad_norm": 40.31862235299814, "learning_rate": 2.1331472554414933e-07, "logits/chosen": 13.478516578674316, "logits/rejected": 13.547574996948242, "logps/chosen": -4.723428726196289, "logps/rejected": -4.765753269195557, "loss": 4.1966, "rewards/accuracies": 0.5, "rewards/chosen": -47.234291076660156, "rewards/margins": 0.42324161529541016, "rewards/rejected": -47.65753173828125, "step": 5061 }, { "epoch": 0.6892701525054467, "grad_norm": 36.874722247018184, "learning_rate": 2.131465849250147e-07, "logits/chosen": 13.753011703491211, "logits/rejected": 13.973971366882324, "logps/chosen": -4.384876728057861, "logps/rejected": -4.4835357666015625, "loss": 3.9654, "rewards/accuracies": 0.75, "rewards/chosen": -43.84877014160156, "rewards/margins": 0.9865894317626953, "rewards/rejected": -44.835357666015625, "step": 5062 }, { "epoch": 0.6894063180827886, "grad_norm": 39.197698167541105, "learning_rate": 2.1297848652695395e-07, "logits/chosen": 13.405345916748047, "logits/rejected": 13.968631744384766, "logps/chosen": -4.548349380493164, "logps/rejected": -4.7334303855896, "loss": 3.7588, "rewards/accuracies": 0.75, "rewards/chosen": -45.483497619628906, "rewards/margins": 1.8508062362670898, "rewards/rejected": -47.33430480957031, "step": 5063 }, { "epoch": 0.6895424836601307, "grad_norm": 40.77606095602148, "learning_rate": 2.128104303879499e-07, "logits/chosen": 13.773035049438477, "logits/rejected": 13.854598999023438, "logps/chosen": -4.7415361404418945, "logps/rejected": -4.731759071350098, "loss": 4.2898, "rewards/accuracies": 0.25, "rewards/chosen": -47.41536331176758, "rewards/margins": -0.09777450561523438, "rewards/rejected": -47.317588806152344, "step": 5064 }, { "epoch": 0.6896786492374728, "grad_norm": 40.201231143791, "learning_rate": 2.126424165459764e-07, "logits/chosen": 12.973766326904297, "logits/rejected": 13.787008285522461, "logps/chosen": -4.08463191986084, "logps/rejected": -4.501404762268066, "loss": 4.0282, "rewards/accuracies": 1.0, "rewards/chosen": -40.846317291259766, "rewards/margins": 4.167731285095215, "rewards/rejected": -45.01404571533203, "step": 5065 }, { "epoch": 0.6898148148148148, "grad_norm": 42.012874876491395, "learning_rate": 2.124744450389978e-07, "logits/chosen": 13.115781784057617, "logits/rejected": 14.145967483520508, "logps/chosen": -4.489529132843018, "logps/rejected": -4.892233848571777, "loss": 4.0276, "rewards/accuracies": 1.0, "rewards/chosen": -44.89529037475586, "rewards/margins": 4.027047157287598, "rewards/rejected": -48.92233657836914, "step": 5066 }, { "epoch": 0.6899509803921569, "grad_norm": 39.320761478551, "learning_rate": 2.1230651590496826e-07, "logits/chosen": 13.4715576171875, "logits/rejected": 13.3380126953125, "logps/chosen": -4.434619903564453, "logps/rejected": -4.462268352508545, "loss": 4.0453, "rewards/accuracies": 0.75, "rewards/chosen": -44.34619903564453, "rewards/margins": 0.27648353576660156, "rewards/rejected": -44.622684478759766, "step": 5067 }, { "epoch": 0.6900871459694989, "grad_norm": 38.56069829271397, "learning_rate": 2.1213862918183296e-07, "logits/chosen": 12.666022300720215, "logits/rejected": 13.309820175170898, "logps/chosen": -4.091804027557373, "logps/rejected": -4.387993812561035, "loss": 3.712, "rewards/accuracies": 0.75, "rewards/chosen": -40.91804122924805, "rewards/margins": 2.961897850036621, "rewards/rejected": -43.879940032958984, "step": 5068 }, { "epoch": 0.6902233115468409, "grad_norm": 36.90036952990655, "learning_rate": 2.119707849075274e-07, "logits/chosen": 13.779170989990234, "logits/rejected": 14.054308891296387, "logps/chosen": -4.5529022216796875, "logps/rejected": -4.570451259613037, "loss": 4.1879, "rewards/accuracies": 0.25, "rewards/chosen": -45.529022216796875, "rewards/margins": 0.1754894256591797, "rewards/rejected": -45.70451354980469, "step": 5069 }, { "epoch": 0.690359477124183, "grad_norm": 44.104225943927766, "learning_rate": 2.1180298311997716e-07, "logits/chosen": 13.657942771911621, "logits/rejected": 13.39828109741211, "logps/chosen": -4.505480766296387, "logps/rejected": -4.422237396240234, "loss": 3.9324, "rewards/accuracies": 0.5, "rewards/chosen": -45.054805755615234, "rewards/margins": -0.8324317932128906, "rewards/rejected": -44.222373962402344, "step": 5070 }, { "epoch": 0.6904956427015251, "grad_norm": 48.38707789284869, "learning_rate": 2.1163522385709852e-07, "logits/chosen": 12.905149459838867, "logits/rejected": 13.052987098693848, "logps/chosen": -4.226322174072266, "logps/rejected": -4.566833019256592, "loss": 3.9213, "rewards/accuracies": 1.0, "rewards/chosen": -42.26322555541992, "rewards/margins": 3.405109405517578, "rewards/rejected": -45.6683349609375, "step": 5071 }, { "epoch": 0.690631808278867, "grad_norm": 42.753626313334756, "learning_rate": 2.1146750715679822e-07, "logits/chosen": 13.604279518127441, "logits/rejected": 12.967686653137207, "logps/chosen": -4.033572673797607, "logps/rejected": -4.058935642242432, "loss": 4.0181, "rewards/accuracies": 0.5, "rewards/chosen": -40.335723876953125, "rewards/margins": 0.2536287307739258, "rewards/rejected": -40.58935546875, "step": 5072 }, { "epoch": 0.6907679738562091, "grad_norm": 36.32437528953042, "learning_rate": 2.1129983305697294e-07, "logits/chosen": 13.471662521362305, "logits/rejected": 13.766321182250977, "logps/chosen": -4.346612453460693, "logps/rejected": -4.531155586242676, "loss": 3.3999, "rewards/accuracies": 0.75, "rewards/chosen": -43.46612548828125, "rewards/margins": 1.8454322814941406, "rewards/rejected": -45.311553955078125, "step": 5073 }, { "epoch": 0.6909041394335512, "grad_norm": 38.480282860695674, "learning_rate": 2.1113220159551025e-07, "logits/chosen": 13.00357437133789, "logits/rejected": 13.266532897949219, "logps/chosen": -4.228598594665527, "logps/rejected": -4.4062347412109375, "loss": 4.0251, "rewards/accuracies": 0.5, "rewards/chosen": -42.285980224609375, "rewards/margins": 1.7763681411743164, "rewards/rejected": -44.062347412109375, "step": 5074 }, { "epoch": 0.6910403050108932, "grad_norm": 42.280991173950135, "learning_rate": 2.109646128102879e-07, "logits/chosen": 14.259641647338867, "logits/rejected": 13.997842788696289, "logps/chosen": -4.070560455322266, "logps/rejected": -4.561738967895508, "loss": 4.2596, "rewards/accuracies": 0.75, "rewards/chosen": -40.705604553222656, "rewards/margins": 4.911787033081055, "rewards/rejected": -45.617393493652344, "step": 5075 }, { "epoch": 0.6911764705882353, "grad_norm": 38.613310198679024, "learning_rate": 2.1079706673917374e-07, "logits/chosen": 13.240327835083008, "logits/rejected": 13.775201797485352, "logps/chosen": -4.1620025634765625, "logps/rejected": -4.376728057861328, "loss": 4.3401, "rewards/accuracies": 0.75, "rewards/chosen": -41.620033264160156, "rewards/margins": 2.1472463607788086, "rewards/rejected": -43.767276763916016, "step": 5076 }, { "epoch": 0.6913126361655774, "grad_norm": 39.467726711383484, "learning_rate": 2.106295634200263e-07, "logits/chosen": 13.106302261352539, "logits/rejected": 13.34740924835205, "logps/chosen": -4.356480598449707, "logps/rejected": -4.508414268493652, "loss": 4.256, "rewards/accuracies": 0.75, "rewards/chosen": -43.5648078918457, "rewards/margins": 1.5193328857421875, "rewards/rejected": -45.08414077758789, "step": 5077 }, { "epoch": 0.6914488017429193, "grad_norm": 39.54001761721204, "learning_rate": 2.104621028906945e-07, "logits/chosen": 13.43777084350586, "logits/rejected": 14.201343536376953, "logps/chosen": -4.3150739669799805, "logps/rejected": -4.481050968170166, "loss": 3.9811, "rewards/accuracies": 0.75, "rewards/chosen": -43.15073776245117, "rewards/margins": 1.6597719192504883, "rewards/rejected": -44.810508728027344, "step": 5078 }, { "epoch": 0.6915849673202614, "grad_norm": 38.64049016042035, "learning_rate": 2.102946851890172e-07, "logits/chosen": 13.373954772949219, "logits/rejected": 14.20352554321289, "logps/chosen": -4.657707214355469, "logps/rejected": -4.922489166259766, "loss": 3.9068, "rewards/accuracies": 0.75, "rewards/chosen": -46.57707595825195, "rewards/margins": 2.6478147506713867, "rewards/rejected": -49.224891662597656, "step": 5079 }, { "epoch": 0.6917211328976035, "grad_norm": 39.56546511880596, "learning_rate": 2.1012731035282382e-07, "logits/chosen": 12.820098876953125, "logits/rejected": 13.619216918945312, "logps/chosen": -4.095576763153076, "logps/rejected": -4.331718444824219, "loss": 3.2493, "rewards/accuracies": 1.0, "rewards/chosen": -40.95576477050781, "rewards/margins": 2.361417770385742, "rewards/rejected": -43.31718444824219, "step": 5080 }, { "epoch": 0.6918572984749455, "grad_norm": 37.57375791342435, "learning_rate": 2.0995997841993435e-07, "logits/chosen": 13.408679962158203, "logits/rejected": 14.196884155273438, "logps/chosen": -4.232433319091797, "logps/rejected": -4.345610618591309, "loss": 3.8146, "rewards/accuracies": 0.75, "rewards/chosen": -42.32433319091797, "rewards/margins": 1.131770133972168, "rewards/rejected": -43.45610809326172, "step": 5081 }, { "epoch": 0.6919934640522876, "grad_norm": 38.140626459492054, "learning_rate": 2.097926894281585e-07, "logits/chosen": 13.580757141113281, "logits/rejected": 12.99094009399414, "logps/chosen": -4.084359169006348, "logps/rejected": -4.301429748535156, "loss": 4.0326, "rewards/accuracies": 1.0, "rewards/chosen": -40.843589782714844, "rewards/margins": 2.1707077026367188, "rewards/rejected": -43.01429748535156, "step": 5082 }, { "epoch": 0.6921296296296297, "grad_norm": 36.868871540820045, "learning_rate": 2.0962544341529678e-07, "logits/chosen": 12.872196197509766, "logits/rejected": 14.118925094604492, "logps/chosen": -4.589765548706055, "logps/rejected": -4.867189407348633, "loss": 3.7388, "rewards/accuracies": 0.75, "rewards/chosen": -45.89765167236328, "rewards/margins": 2.7742443084716797, "rewards/rejected": -48.671897888183594, "step": 5083 }, { "epoch": 0.6922657952069716, "grad_norm": 38.31957541309553, "learning_rate": 2.0945824041913985e-07, "logits/chosen": 13.260990142822266, "logits/rejected": 12.993643760681152, "logps/chosen": -4.299395561218262, "logps/rejected": -4.235053062438965, "loss": 3.945, "rewards/accuracies": 0.25, "rewards/chosen": -42.99395751953125, "rewards/margins": -0.6434249877929688, "rewards/rejected": -42.35053253173828, "step": 5084 }, { "epoch": 0.6924019607843137, "grad_norm": 35.30689097428245, "learning_rate": 2.0929108047746839e-07, "logits/chosen": 14.389842987060547, "logits/rejected": 13.789209365844727, "logps/chosen": -4.482004642486572, "logps/rejected": -4.4548845291137695, "loss": 3.7945, "rewards/accuracies": 0.5, "rewards/chosen": -44.820045471191406, "rewards/margins": -0.2712059020996094, "rewards/rejected": -44.5488395690918, "step": 5085 }, { "epoch": 0.6925381263616558, "grad_norm": 40.60834438000853, "learning_rate": 2.0912396362805377e-07, "logits/chosen": 14.499579429626465, "logits/rejected": 13.676612854003906, "logps/chosen": -4.535195350646973, "logps/rejected": -4.656301498413086, "loss": 3.5262, "rewards/accuracies": 0.5, "rewards/chosen": -45.351951599121094, "rewards/margins": 1.2110605239868164, "rewards/rejected": -46.563011169433594, "step": 5086 }, { "epoch": 0.6926742919389978, "grad_norm": 39.70328065390223, "learning_rate": 2.0895688990865735e-07, "logits/chosen": 13.413020133972168, "logits/rejected": 13.961522102355957, "logps/chosen": -4.372249603271484, "logps/rejected": -4.5308427810668945, "loss": 3.3869, "rewards/accuracies": 0.75, "rewards/chosen": -43.722496032714844, "rewards/margins": 1.5859289169311523, "rewards/rejected": -45.30842590332031, "step": 5087 }, { "epoch": 0.6928104575163399, "grad_norm": 42.78352466779943, "learning_rate": 2.0878985935703092e-07, "logits/chosen": 13.8380765914917, "logits/rejected": 13.709436416625977, "logps/chosen": -4.756457328796387, "logps/rejected": -4.503605842590332, "loss": 3.7964, "rewards/accuracies": 0.25, "rewards/chosen": -47.5645751953125, "rewards/margins": -2.528517723083496, "rewards/rejected": -45.03605651855469, "step": 5088 }, { "epoch": 0.6929466230936819, "grad_norm": 37.51598292162736, "learning_rate": 2.0862287201091626e-07, "logits/chosen": 12.89134407043457, "logits/rejected": 13.190160751342773, "logps/chosen": -4.313179969787598, "logps/rejected": -4.427896976470947, "loss": 4.2028, "rewards/accuracies": 0.5, "rewards/chosen": -43.131797790527344, "rewards/margins": 1.147170066833496, "rewards/rejected": -44.278968811035156, "step": 5089 }, { "epoch": 0.693082788671024, "grad_norm": 37.910338929381496, "learning_rate": 2.084559279080456e-07, "logits/chosen": 13.220922470092773, "logits/rejected": 12.979711532592773, "logps/chosen": -4.030872821807861, "logps/rejected": -4.086403846740723, "loss": 4.1775, "rewards/accuracies": 0.5, "rewards/chosen": -40.3087272644043, "rewards/margins": 0.5553140640258789, "rewards/rejected": -40.86404037475586, "step": 5090 }, { "epoch": 0.693218954248366, "grad_norm": 38.64257031954763, "learning_rate": 2.0828902708614144e-07, "logits/chosen": 12.802217483520508, "logits/rejected": 13.722006797790527, "logps/chosen": -4.25602912902832, "logps/rejected": -4.379354476928711, "loss": 4.135, "rewards/accuracies": 0.75, "rewards/chosen": -42.5602912902832, "rewards/margins": 1.2332572937011719, "rewards/rejected": -43.793548583984375, "step": 5091 }, { "epoch": 0.6933551198257081, "grad_norm": 38.69898419167205, "learning_rate": 2.081221695829162e-07, "logits/chosen": 13.544507026672363, "logits/rejected": 13.613484382629395, "logps/chosen": -4.524534225463867, "logps/rejected": -4.059294700622559, "loss": 3.7619, "rewards/accuracies": 0.25, "rewards/chosen": -45.245338439941406, "rewards/margins": -4.65239143371582, "rewards/rejected": -40.59294891357422, "step": 5092 }, { "epoch": 0.6934912854030502, "grad_norm": 38.542648281272335, "learning_rate": 2.079553554360728e-07, "logits/chosen": 13.738131523132324, "logits/rejected": 13.455058097839355, "logps/chosen": -4.312030792236328, "logps/rejected": -4.330740928649902, "loss": 3.8025, "rewards/accuracies": 0.25, "rewards/chosen": -43.12030792236328, "rewards/margins": 0.1871042251586914, "rewards/rejected": -43.307411193847656, "step": 5093 }, { "epoch": 0.6936274509803921, "grad_norm": 42.1444966065303, "learning_rate": 2.077885846833043e-07, "logits/chosen": 13.498115539550781, "logits/rejected": 13.553848266601562, "logps/chosen": -4.198816299438477, "logps/rejected": -4.1770524978637695, "loss": 3.8399, "rewards/accuracies": 0.75, "rewards/chosen": -41.98816680908203, "rewards/margins": -0.21763896942138672, "rewards/rejected": -41.77052688598633, "step": 5094 }, { "epoch": 0.6937636165577342, "grad_norm": 38.87128793477116, "learning_rate": 2.0762185736229409e-07, "logits/chosen": 13.650476455688477, "logits/rejected": 13.344253540039062, "logps/chosen": -4.2995381355285645, "logps/rejected": -4.293870449066162, "loss": 3.9441, "rewards/accuracies": 0.5, "rewards/chosen": -42.995384216308594, "rewards/margins": -0.05667877197265625, "rewards/rejected": -42.93870544433594, "step": 5095 }, { "epoch": 0.6938997821350763, "grad_norm": 35.60803319044975, "learning_rate": 2.0745517351071528e-07, "logits/chosen": 13.489133834838867, "logits/rejected": 14.639524459838867, "logps/chosen": -4.3782196044921875, "logps/rejected": -4.762901306152344, "loss": 3.5312, "rewards/accuracies": 0.5, "rewards/chosen": -43.782196044921875, "rewards/margins": 3.8468189239501953, "rewards/rejected": -47.6290168762207, "step": 5096 }, { "epoch": 0.6940359477124183, "grad_norm": 42.90620940594998, "learning_rate": 2.0728853316623162e-07, "logits/chosen": 14.073200225830078, "logits/rejected": 13.552586555480957, "logps/chosen": -4.199348449707031, "logps/rejected": -4.320382118225098, "loss": 4.7294, "rewards/accuracies": 0.5, "rewards/chosen": -41.99348449707031, "rewards/margins": 1.2103357315063477, "rewards/rejected": -43.203819274902344, "step": 5097 }, { "epoch": 0.6941721132897604, "grad_norm": 39.73357970750803, "learning_rate": 2.0712193636649697e-07, "logits/chosen": 14.294288635253906, "logits/rejected": 13.812255859375, "logps/chosen": -4.417448043823242, "logps/rejected": -4.401244163513184, "loss": 4.2564, "rewards/accuracies": 0.5, "rewards/chosen": -44.174476623535156, "rewards/margins": -0.16203975677490234, "rewards/rejected": -44.01243591308594, "step": 5098 }, { "epoch": 0.6943082788671024, "grad_norm": 37.76850944517999, "learning_rate": 2.0695538314915501e-07, "logits/chosen": 13.949347496032715, "logits/rejected": 13.972448348999023, "logps/chosen": -4.433678150177002, "logps/rejected": -4.6456708908081055, "loss": 4.0065, "rewards/accuracies": 0.75, "rewards/chosen": -44.3367805480957, "rewards/margins": 2.119929313659668, "rewards/rejected": -46.45671081542969, "step": 5099 }, { "epoch": 0.6944444444444444, "grad_norm": 39.17336379521555, "learning_rate": 2.0678887355183998e-07, "logits/chosen": 13.886926651000977, "logits/rejected": 13.87466812133789, "logps/chosen": -4.287412166595459, "logps/rejected": -4.51446008682251, "loss": 4.1491, "rewards/accuracies": 0.75, "rewards/chosen": -42.874122619628906, "rewards/margins": 2.270482063293457, "rewards/rejected": -45.14459991455078, "step": 5100 }, { "epoch": 0.6945806100217865, "grad_norm": 39.561750482738695, "learning_rate": 2.0662240761217605e-07, "logits/chosen": 12.83111572265625, "logits/rejected": 13.139691352844238, "logps/chosen": -4.276802062988281, "logps/rejected": -4.300348281860352, "loss": 4.2301, "rewards/accuracies": 0.75, "rewards/chosen": -42.76802062988281, "rewards/margins": 0.23546218872070312, "rewards/rejected": -43.00348663330078, "step": 5101 }, { "epoch": 0.6947167755991286, "grad_norm": 36.396709984236146, "learning_rate": 2.0645598536777774e-07, "logits/chosen": 12.627059936523438, "logits/rejected": 14.119946479797363, "logps/chosen": -4.071237564086914, "logps/rejected": -4.508131980895996, "loss": 3.8511, "rewards/accuracies": 1.0, "rewards/chosen": -40.712379455566406, "rewards/margins": 4.368943214416504, "rewards/rejected": -45.081321716308594, "step": 5102 }, { "epoch": 0.6948529411764706, "grad_norm": 37.48249846325334, "learning_rate": 2.062896068562492e-07, "logits/chosen": 13.661323547363281, "logits/rejected": 14.328166007995605, "logps/chosen": -4.109047889709473, "logps/rejected": -4.606332302093506, "loss": 3.3031, "rewards/accuracies": 0.75, "rewards/chosen": -41.090476989746094, "rewards/margins": 4.972846031188965, "rewards/rejected": -46.063323974609375, "step": 5103 }, { "epoch": 0.6949891067538126, "grad_norm": 38.843635129133425, "learning_rate": 2.0612327211518524e-07, "logits/chosen": 12.434555053710938, "logits/rejected": 13.156925201416016, "logps/chosen": -4.138948440551758, "logps/rejected": -4.539813995361328, "loss": 4.1521, "rewards/accuracies": 0.75, "rewards/chosen": -41.38948059082031, "rewards/margins": 4.008659362792969, "rewards/rejected": -45.39813995361328, "step": 5104 }, { "epoch": 0.6951252723311547, "grad_norm": 46.68538215060089, "learning_rate": 2.0595698118217072e-07, "logits/chosen": 14.116056442260742, "logits/rejected": 13.083209991455078, "logps/chosen": -4.441139221191406, "logps/rejected": -4.072765827178955, "loss": 4.6056, "rewards/accuracies": 0.0, "rewards/chosen": -44.41139221191406, "rewards/margins": -3.683734893798828, "rewards/rejected": -40.7276611328125, "step": 5105 }, { "epoch": 0.6952614379084967, "grad_norm": 44.01563884132969, "learning_rate": 2.057907340947801e-07, "logits/chosen": 13.502754211425781, "logits/rejected": 13.726007461547852, "logps/chosen": -4.40440559387207, "logps/rejected": -4.500838279724121, "loss": 3.6826, "rewards/accuracies": 0.5, "rewards/chosen": -44.04405212402344, "rewards/margins": 0.964324951171875, "rewards/rejected": -45.00837707519531, "step": 5106 }, { "epoch": 0.6953976034858388, "grad_norm": 37.16495056552234, "learning_rate": 2.056245308905785e-07, "logits/chosen": 13.90030288696289, "logits/rejected": 14.097043991088867, "logps/chosen": -4.175010681152344, "logps/rejected": -4.663534164428711, "loss": 4.2538, "rewards/accuracies": 1.0, "rewards/chosen": -41.75010681152344, "rewards/margins": 4.885235786437988, "rewards/rejected": -46.63534164428711, "step": 5107 }, { "epoch": 0.6955337690631809, "grad_norm": 38.25583842094225, "learning_rate": 2.0545837160712098e-07, "logits/chosen": 13.245870590209961, "logits/rejected": 13.289324760437012, "logps/chosen": -3.850165367126465, "logps/rejected": -4.094611167907715, "loss": 4.0658, "rewards/accuracies": 1.0, "rewards/chosen": -38.50165557861328, "rewards/margins": 2.4444589614868164, "rewards/rejected": -40.94611358642578, "step": 5108 }, { "epoch": 0.6956699346405228, "grad_norm": 37.816896373418736, "learning_rate": 2.0529225628195235e-07, "logits/chosen": 12.728599548339844, "logits/rejected": 13.311216354370117, "logps/chosen": -4.288878440856934, "logps/rejected": -4.260190010070801, "loss": 3.9632, "rewards/accuracies": 0.25, "rewards/chosen": -42.88878631591797, "rewards/margins": -0.2868824005126953, "rewards/rejected": -42.60190200805664, "step": 5109 }, { "epoch": 0.6958061002178649, "grad_norm": 39.058727403970046, "learning_rate": 2.051261849526079e-07, "logits/chosen": 13.298072814941406, "logits/rejected": 13.807806015014648, "logps/chosen": -4.121884822845459, "logps/rejected": -4.523576259613037, "loss": 3.947, "rewards/accuracies": 1.0, "rewards/chosen": -41.218849182128906, "rewards/margins": 4.016914367675781, "rewards/rejected": -45.23576354980469, "step": 5110 }, { "epoch": 0.695942265795207, "grad_norm": 39.61922651168836, "learning_rate": 2.0496015765661294e-07, "logits/chosen": 12.946435928344727, "logits/rejected": 13.912372589111328, "logps/chosen": -4.079537391662598, "logps/rejected": -4.362336158752441, "loss": 3.9322, "rewards/accuracies": 0.75, "rewards/chosen": -40.795379638671875, "rewards/margins": 2.8279857635498047, "rewards/rejected": -43.62336349487305, "step": 5111 }, { "epoch": 0.696078431372549, "grad_norm": 44.004425591725074, "learning_rate": 2.0479417443148247e-07, "logits/chosen": 13.19286823272705, "logits/rejected": 13.724157333374023, "logps/chosen": -4.235901832580566, "logps/rejected": -4.396440029144287, "loss": 3.8547, "rewards/accuracies": 0.5, "rewards/chosen": -42.35901641845703, "rewards/margins": 1.6053838729858398, "rewards/rejected": -43.96440124511719, "step": 5112 }, { "epoch": 0.6962145969498911, "grad_norm": 41.3595503125648, "learning_rate": 2.0462823531472177e-07, "logits/chosen": 13.928114891052246, "logits/rejected": 14.194326400756836, "logps/chosen": -4.617372512817383, "logps/rejected": -4.6885175704956055, "loss": 3.8934, "rewards/accuracies": 0.75, "rewards/chosen": -46.173728942871094, "rewards/margins": 0.7114505767822266, "rewards/rejected": -46.88517761230469, "step": 5113 }, { "epoch": 0.6963507625272332, "grad_norm": 38.69496665951196, "learning_rate": 2.044623403438265e-07, "logits/chosen": 12.969883918762207, "logits/rejected": 14.02122688293457, "logps/chosen": -4.2245917320251465, "logps/rejected": -4.6271843910217285, "loss": 3.6179, "rewards/accuracies": 1.0, "rewards/chosen": -42.245914459228516, "rewards/margins": 4.0259294509887695, "rewards/rejected": -46.27184295654297, "step": 5114 }, { "epoch": 0.6964869281045751, "grad_norm": 43.59550726298821, "learning_rate": 2.0429648955628157e-07, "logits/chosen": 13.657180786132812, "logits/rejected": 13.961130142211914, "logps/chosen": -4.3033318519592285, "logps/rejected": -4.528029441833496, "loss": 4.0582, "rewards/accuracies": 0.5, "rewards/chosen": -43.03331756591797, "rewards/margins": 2.246973991394043, "rewards/rejected": -45.28029251098633, "step": 5115 }, { "epoch": 0.6966230936819172, "grad_norm": 41.77186654133659, "learning_rate": 2.0413068298956255e-07, "logits/chosen": 13.383626937866211, "logits/rejected": 13.395957946777344, "logps/chosen": -3.7528700828552246, "logps/rejected": -4.18802547454834, "loss": 3.6808, "rewards/accuracies": 0.5, "rewards/chosen": -37.52870178222656, "rewards/margins": 4.351555824279785, "rewards/rejected": -41.88025665283203, "step": 5116 }, { "epoch": 0.6967592592592593, "grad_norm": 42.397216223130364, "learning_rate": 2.0396492068113492e-07, "logits/chosen": 13.528997421264648, "logits/rejected": 13.648028373718262, "logps/chosen": -4.064450263977051, "logps/rejected": -4.297560691833496, "loss": 3.7799, "rewards/accuracies": 0.75, "rewards/chosen": -40.644508361816406, "rewards/margins": 2.3311004638671875, "rewards/rejected": -42.97560501098633, "step": 5117 }, { "epoch": 0.6968954248366013, "grad_norm": 41.16542695885064, "learning_rate": 2.0379920266845376e-07, "logits/chosen": 13.26787281036377, "logits/rejected": 13.3870267868042, "logps/chosen": -4.4066619873046875, "logps/rejected": -4.524715423583984, "loss": 3.7802, "rewards/accuracies": 0.75, "rewards/chosen": -44.066619873046875, "rewards/margins": 1.1805334091186523, "rewards/rejected": -45.247154235839844, "step": 5118 }, { "epoch": 0.6970315904139434, "grad_norm": 42.45208924152671, "learning_rate": 2.0363352898896458e-07, "logits/chosen": 13.921204566955566, "logits/rejected": 13.929887771606445, "logps/chosen": -4.539616584777832, "logps/rejected": -4.497953414916992, "loss": 4.1885, "rewards/accuracies": 0.25, "rewards/chosen": -45.39616394042969, "rewards/margins": -0.41663265228271484, "rewards/rejected": -44.979530334472656, "step": 5119 }, { "epoch": 0.6971677559912854, "grad_norm": 43.32200552269895, "learning_rate": 2.0346789968010283e-07, "logits/chosen": 13.460939407348633, "logits/rejected": 13.716808319091797, "logps/chosen": -4.208390235900879, "logps/rejected": -4.383354187011719, "loss": 3.7053, "rewards/accuracies": 0.5, "rewards/chosen": -42.08390426635742, "rewards/margins": 1.7496404647827148, "rewards/rejected": -43.83354187011719, "step": 5120 }, { "epoch": 0.6973039215686274, "grad_norm": 42.64008882935128, "learning_rate": 2.0330231477929356e-07, "logits/chosen": 13.129406929016113, "logits/rejected": 12.938698768615723, "logps/chosen": -3.924868106842041, "logps/rejected": -3.9420864582061768, "loss": 4.199, "rewards/accuracies": 0.5, "rewards/chosen": -39.248680114746094, "rewards/margins": 0.17218685150146484, "rewards/rejected": -39.42086410522461, "step": 5121 }, { "epoch": 0.6974400871459695, "grad_norm": 42.52266195599789, "learning_rate": 2.0313677432395217e-07, "logits/chosen": 12.729791641235352, "logits/rejected": 13.242042541503906, "logps/chosen": -4.293476104736328, "logps/rejected": -4.368725776672363, "loss": 3.8041, "rewards/accuracies": 0.5, "rewards/chosen": -42.93476104736328, "rewards/margins": 0.7524929046630859, "rewards/rejected": -43.687255859375, "step": 5122 }, { "epoch": 0.6975762527233116, "grad_norm": 42.311300172204085, "learning_rate": 2.0297127835148408e-07, "logits/chosen": 13.056344985961914, "logits/rejected": 13.832625389099121, "logps/chosen": -4.270934104919434, "logps/rejected": -4.495896816253662, "loss": 4.1599, "rewards/accuracies": 1.0, "rewards/chosen": -42.709346771240234, "rewards/margins": 2.2496252059936523, "rewards/rejected": -44.95896911621094, "step": 5123 }, { "epoch": 0.6977124183006536, "grad_norm": 45.27425549063458, "learning_rate": 2.0280582689928419e-07, "logits/chosen": 13.728252410888672, "logits/rejected": 13.904824256896973, "logps/chosen": -4.193385124206543, "logps/rejected": -4.581623554229736, "loss": 3.5703, "rewards/accuracies": 1.0, "rewards/chosen": -41.93385314941406, "rewards/margins": 3.882380485534668, "rewards/rejected": -45.81623077392578, "step": 5124 }, { "epoch": 0.6978485838779956, "grad_norm": 39.08944700318666, "learning_rate": 2.0264042000473768e-07, "logits/chosen": 13.12820053100586, "logits/rejected": 13.489065170288086, "logps/chosen": -4.3385396003723145, "logps/rejected": -4.755695343017578, "loss": 3.7066, "rewards/accuracies": 1.0, "rewards/chosen": -43.38539505004883, "rewards/margins": 4.171555519104004, "rewards/rejected": -47.55695343017578, "step": 5125 }, { "epoch": 0.6979847494553377, "grad_norm": 39.27170684593788, "learning_rate": 2.024750577052198e-07, "logits/chosen": 12.350537300109863, "logits/rejected": 12.595584869384766, "logps/chosen": -4.09684944152832, "logps/rejected": -4.359516143798828, "loss": 3.9747, "rewards/accuracies": 0.75, "rewards/chosen": -40.9684944152832, "rewards/margins": 2.6266679763793945, "rewards/rejected": -43.59516143798828, "step": 5126 }, { "epoch": 0.6981209150326797, "grad_norm": 37.45960539313818, "learning_rate": 2.0230974003809528e-07, "logits/chosen": 12.781574249267578, "logits/rejected": 13.413076400756836, "logps/chosen": -4.232390403747559, "logps/rejected": -4.069972038269043, "loss": 3.9225, "rewards/accuracies": 0.25, "rewards/chosen": -42.32390213012695, "rewards/margins": -1.6241827011108398, "rewards/rejected": -40.69972229003906, "step": 5127 }, { "epoch": 0.6982570806100218, "grad_norm": 39.54839068027705, "learning_rate": 2.0214446704071908e-07, "logits/chosen": 13.228950500488281, "logits/rejected": 12.896495819091797, "logps/chosen": -4.381579399108887, "logps/rejected": -4.099713325500488, "loss": 3.9926, "rewards/accuracies": 0.0, "rewards/chosen": -43.8157958984375, "rewards/margins": -2.818661689758301, "rewards/rejected": -40.99713134765625, "step": 5128 }, { "epoch": 0.6983932461873639, "grad_norm": 40.16107453241221, "learning_rate": 2.0197923875043625e-07, "logits/chosen": 13.30756950378418, "logits/rejected": 13.550365447998047, "logps/chosen": -4.090828895568848, "logps/rejected": -4.220236778259277, "loss": 4.1372, "rewards/accuracies": 0.75, "rewards/chosen": -40.908287048339844, "rewards/margins": 1.294081687927246, "rewards/rejected": -42.202369689941406, "step": 5129 }, { "epoch": 0.6985294117647058, "grad_norm": 42.34727150036864, "learning_rate": 2.0181405520458106e-07, "logits/chosen": 14.100132942199707, "logits/rejected": 13.430322647094727, "logps/chosen": -4.544897079467773, "logps/rejected": -4.406019687652588, "loss": 3.7333, "rewards/accuracies": 0.25, "rewards/chosen": -45.44896697998047, "rewards/margins": -1.3887691497802734, "rewards/rejected": -44.06019592285156, "step": 5130 }, { "epoch": 0.6986655773420479, "grad_norm": 41.33403970603909, "learning_rate": 2.0164891644047838e-07, "logits/chosen": 13.90578842163086, "logits/rejected": 12.879951477050781, "logps/chosen": -4.170355796813965, "logps/rejected": -4.02611780166626, "loss": 3.9881, "rewards/accuracies": 0.25, "rewards/chosen": -41.70355987548828, "rewards/margins": -1.4423809051513672, "rewards/rejected": -40.26117706298828, "step": 5131 }, { "epoch": 0.69880174291939, "grad_norm": 61.15409187159347, "learning_rate": 2.0148382249544275e-07, "logits/chosen": 13.510875701904297, "logits/rejected": 13.872871398925781, "logps/chosen": -4.279130935668945, "logps/rejected": -4.642852783203125, "loss": 3.6231, "rewards/accuracies": 1.0, "rewards/chosen": -42.79130554199219, "rewards/margins": 3.6372241973876953, "rewards/rejected": -46.42852783203125, "step": 5132 }, { "epoch": 0.698937908496732, "grad_norm": 40.7773840018214, "learning_rate": 2.0131877340677818e-07, "logits/chosen": 13.731179237365723, "logits/rejected": 14.577947616577148, "logps/chosen": -4.625234603881836, "logps/rejected": -4.594245910644531, "loss": 3.8268, "rewards/accuracies": 0.25, "rewards/chosen": -46.252342224121094, "rewards/margins": -0.30988407135009766, "rewards/rejected": -45.94246292114258, "step": 5133 }, { "epoch": 0.6990740740740741, "grad_norm": 40.08902721383591, "learning_rate": 2.0115376921177916e-07, "logits/chosen": 14.228360176086426, "logits/rejected": 14.037528038024902, "logps/chosen": -4.3741583824157715, "logps/rejected": -4.671998977661133, "loss": 3.7046, "rewards/accuracies": 0.75, "rewards/chosen": -43.74158477783203, "rewards/margins": 2.978404998779297, "rewards/rejected": -46.71998977661133, "step": 5134 }, { "epoch": 0.6992102396514162, "grad_norm": 38.33875039285389, "learning_rate": 2.0098880994772976e-07, "logits/chosen": 13.066904067993164, "logits/rejected": 13.154605865478516, "logps/chosen": -4.352512359619141, "logps/rejected": -4.373033046722412, "loss": 3.9651, "rewards/accuracies": 0.5, "rewards/chosen": -43.52512741088867, "rewards/margins": 0.20520305633544922, "rewards/rejected": -43.73033142089844, "step": 5135 }, { "epoch": 0.6993464052287581, "grad_norm": 43.40571075598741, "learning_rate": 2.0082389565190368e-07, "logits/chosen": 13.466208457946777, "logits/rejected": 13.506184577941895, "logps/chosen": -4.312685966491699, "logps/rejected": -4.410557746887207, "loss": 4.0279, "rewards/accuracies": 0.5, "rewards/chosen": -43.12685775756836, "rewards/margins": 0.9787206649780273, "rewards/rejected": -44.10557556152344, "step": 5136 }, { "epoch": 0.6994825708061002, "grad_norm": 37.0210711168665, "learning_rate": 2.0065902636156476e-07, "logits/chosen": 12.916738510131836, "logits/rejected": 12.968768119812012, "logps/chosen": -3.9266653060913086, "logps/rejected": -4.087886333465576, "loss": 3.8468, "rewards/accuracies": 0.5, "rewards/chosen": -39.26665496826172, "rewards/margins": 1.612208366394043, "rewards/rejected": -40.87886428833008, "step": 5137 }, { "epoch": 0.6996187363834423, "grad_norm": 40.34337809084139, "learning_rate": 2.0049420211396676e-07, "logits/chosen": 13.655550003051758, "logits/rejected": 14.187780380249023, "logps/chosen": -4.184724807739258, "logps/rejected": -4.50202751159668, "loss": 3.6016, "rewards/accuracies": 0.5, "rewards/chosen": -41.84724426269531, "rewards/margins": 3.1730308532714844, "rewards/rejected": -45.02027893066406, "step": 5138 }, { "epoch": 0.6997549019607843, "grad_norm": 45.135351125050406, "learning_rate": 2.0032942294635276e-07, "logits/chosen": 13.126056671142578, "logits/rejected": 13.280105590820312, "logps/chosen": -4.3618879318237305, "logps/rejected": -4.502769947052002, "loss": 3.7528, "rewards/accuracies": 0.75, "rewards/chosen": -43.61888122558594, "rewards/margins": 1.4088211059570312, "rewards/rejected": -45.02770233154297, "step": 5139 }, { "epoch": 0.6998910675381264, "grad_norm": 43.072291624240755, "learning_rate": 2.0016468889595611e-07, "logits/chosen": 13.348194122314453, "logits/rejected": 12.22671890258789, "logps/chosen": -4.27002477645874, "logps/rejected": -4.089560031890869, "loss": 4.272, "rewards/accuracies": 0.5, "rewards/chosen": -42.70024871826172, "rewards/margins": -1.8046493530273438, "rewards/rejected": -40.895599365234375, "step": 5140 }, { "epoch": 0.7000272331154684, "grad_norm": 44.52126341963172, "learning_rate": 2.0000000000000007e-07, "logits/chosen": 13.095069885253906, "logits/rejected": 13.571948051452637, "logps/chosen": -4.165031433105469, "logps/rejected": -4.533298492431641, "loss": 4.0828, "rewards/accuracies": 0.75, "rewards/chosen": -41.65031433105469, "rewards/margins": 3.6826725006103516, "rewards/rejected": -45.33298873901367, "step": 5141 }, { "epoch": 0.7001633986928104, "grad_norm": 39.21186691703361, "learning_rate": 1.9983535629569707e-07, "logits/chosen": 12.885648727416992, "logits/rejected": 13.144519805908203, "logps/chosen": -4.114560604095459, "logps/rejected": -4.470950126647949, "loss": 3.8835, "rewards/accuracies": 0.75, "rewards/chosen": -41.14560317993164, "rewards/margins": 3.563897132873535, "rewards/rejected": -44.709503173828125, "step": 5142 }, { "epoch": 0.7002995642701525, "grad_norm": 41.67233083934594, "learning_rate": 1.9967075782024988e-07, "logits/chosen": 13.243278503417969, "logits/rejected": 12.925467491149902, "logps/chosen": -4.336586952209473, "logps/rejected": -4.3235650062561035, "loss": 3.8969, "rewards/accuracies": 0.5, "rewards/chosen": -43.365867614746094, "rewards/margins": -0.1302194595336914, "rewards/rejected": -43.23564910888672, "step": 5143 }, { "epoch": 0.7004357298474946, "grad_norm": 39.942244689947984, "learning_rate": 1.995062046108511e-07, "logits/chosen": 13.516656875610352, "logits/rejected": 13.966930389404297, "logps/chosen": -4.217154502868652, "logps/rejected": -4.529127597808838, "loss": 3.3547, "rewards/accuracies": 0.75, "rewards/chosen": -42.171546936035156, "rewards/margins": 3.1197290420532227, "rewards/rejected": -45.29127502441406, "step": 5144 }, { "epoch": 0.7005718954248366, "grad_norm": 42.72292306503307, "learning_rate": 1.9934169670468252e-07, "logits/chosen": 13.754435539245605, "logits/rejected": 13.55134105682373, "logps/chosen": -4.4577836990356445, "logps/rejected": -4.213104724884033, "loss": 4.0389, "rewards/accuracies": 0.0, "rewards/chosen": -44.57783508300781, "rewards/margins": -2.4467878341674805, "rewards/rejected": -42.13105010986328, "step": 5145 }, { "epoch": 0.7007080610021786, "grad_norm": 36.69430201046644, "learning_rate": 1.991772341389162e-07, "logits/chosen": 13.478212356567383, "logits/rejected": 13.538125991821289, "logps/chosen": -4.232283115386963, "logps/rejected": -4.3842973709106445, "loss": 3.597, "rewards/accuracies": 0.75, "rewards/chosen": -42.32283020019531, "rewards/margins": 1.5201377868652344, "rewards/rejected": -43.84297180175781, "step": 5146 }, { "epoch": 0.7008442265795207, "grad_norm": 39.17314256743365, "learning_rate": 1.9901281695071397e-07, "logits/chosen": 13.667421340942383, "logits/rejected": 13.759598731994629, "logps/chosen": -4.086797714233398, "logps/rejected": -4.445780277252197, "loss": 3.7085, "rewards/accuracies": 1.0, "rewards/chosen": -40.867977142333984, "rewards/margins": 3.5898265838623047, "rewards/rejected": -44.457801818847656, "step": 5147 }, { "epoch": 0.7009803921568627, "grad_norm": 40.241571263871094, "learning_rate": 1.9884844517722704e-07, "logits/chosen": 13.505046844482422, "logits/rejected": 13.70449447631836, "logps/chosen": -4.03066349029541, "logps/rejected": -4.084979057312012, "loss": 4.1297, "rewards/accuracies": 0.5, "rewards/chosen": -40.30663299560547, "rewards/margins": 0.5431537628173828, "rewards/rejected": -40.849788665771484, "step": 5148 }, { "epoch": 0.7011165577342048, "grad_norm": 43.39029114955207, "learning_rate": 1.986841188555966e-07, "logits/chosen": 14.158156394958496, "logits/rejected": 12.454906463623047, "logps/chosen": -4.4570207595825195, "logps/rejected": -3.9501962661743164, "loss": 4.2687, "rewards/accuracies": 0.0, "rewards/chosen": -44.57020568847656, "rewards/margins": -5.068241119384766, "rewards/rejected": -39.5019645690918, "step": 5149 }, { "epoch": 0.7012527233115469, "grad_norm": 41.1181414287654, "learning_rate": 1.985198380229538e-07, "logits/chosen": 14.063739776611328, "logits/rejected": 13.866357803344727, "logps/chosen": -4.74574613571167, "logps/rejected": -4.397122383117676, "loss": 3.9659, "rewards/accuracies": 0.0, "rewards/chosen": -47.457462310791016, "rewards/margins": -3.4862375259399414, "rewards/rejected": -43.971221923828125, "step": 5150 }, { "epoch": 0.7013888888888888, "grad_norm": 36.59856535500695, "learning_rate": 1.9835560271641887e-07, "logits/chosen": 12.62891960144043, "logits/rejected": 13.717235565185547, "logps/chosen": -4.02811861038208, "logps/rejected": -4.307214736938477, "loss": 3.5203, "rewards/accuracies": 0.75, "rewards/chosen": -40.281185150146484, "rewards/margins": 2.790961265563965, "rewards/rejected": -43.072147369384766, "step": 5151 }, { "epoch": 0.7015250544662309, "grad_norm": 42.46575718149766, "learning_rate": 1.9819141297310233e-07, "logits/chosen": 12.982412338256836, "logits/rejected": 13.579341888427734, "logps/chosen": -4.119619846343994, "logps/rejected": -4.393478870391846, "loss": 4.5162, "rewards/accuracies": 0.75, "rewards/chosen": -41.196197509765625, "rewards/margins": 2.738591194152832, "rewards/rejected": -43.93478775024414, "step": 5152 }, { "epoch": 0.701661220043573, "grad_norm": 40.02212639937827, "learning_rate": 1.9802726883010435e-07, "logits/chosen": 13.657591819763184, "logits/rejected": 13.959936141967773, "logps/chosen": -4.030641078948975, "logps/rejected": -4.298126220703125, "loss": 3.6093, "rewards/accuracies": 0.75, "rewards/chosen": -40.30641174316406, "rewards/margins": 2.6748523712158203, "rewards/rejected": -42.98126220703125, "step": 5153 }, { "epoch": 0.701797385620915, "grad_norm": 37.45187411875401, "learning_rate": 1.9786317032451435e-07, "logits/chosen": 12.94189453125, "logits/rejected": 12.44813346862793, "logps/chosen": -4.058906555175781, "logps/rejected": -4.133084774017334, "loss": 3.7074, "rewards/accuracies": 0.5, "rewards/chosen": -40.58906555175781, "rewards/margins": 0.7417821884155273, "rewards/rejected": -41.330848693847656, "step": 5154 }, { "epoch": 0.7019335511982571, "grad_norm": 42.36253044131882, "learning_rate": 1.9769911749341186e-07, "logits/chosen": 13.295236587524414, "logits/rejected": 13.634693145751953, "logps/chosen": -4.353341102600098, "logps/rejected": -4.434907913208008, "loss": 3.8765, "rewards/accuracies": 0.5, "rewards/chosen": -43.533409118652344, "rewards/margins": 0.8156681060791016, "rewards/rejected": -44.34907531738281, "step": 5155 }, { "epoch": 0.7020697167755992, "grad_norm": 42.48945265557904, "learning_rate": 1.9753511037386619e-07, "logits/chosen": 13.056706428527832, "logits/rejected": 13.91481876373291, "logps/chosen": -4.008641719818115, "logps/rejected": -4.04142951965332, "loss": 4.3653, "rewards/accuracies": 0.5, "rewards/chosen": -40.08641815185547, "rewards/margins": 0.3278799057006836, "rewards/rejected": -40.41429901123047, "step": 5156 }, { "epoch": 0.7022058823529411, "grad_norm": 37.88049552509063, "learning_rate": 1.9737114900293578e-07, "logits/chosen": 12.700931549072266, "logits/rejected": 12.972396850585938, "logps/chosen": -3.905294179916382, "logps/rejected": -4.206721782684326, "loss": 3.9116, "rewards/accuracies": 0.75, "rewards/chosen": -39.052940368652344, "rewards/margins": 3.014277458190918, "rewards/rejected": -42.06721878051758, "step": 5157 }, { "epoch": 0.7023420479302832, "grad_norm": 41.29292200781668, "learning_rate": 1.972072334176692e-07, "logits/chosen": 13.050403594970703, "logits/rejected": 13.825616836547852, "logps/chosen": -4.3333539962768555, "logps/rejected": -4.446414947509766, "loss": 3.7087, "rewards/accuracies": 0.75, "rewards/chosen": -43.33354187011719, "rewards/margins": 1.130605697631836, "rewards/rejected": -44.46414566040039, "step": 5158 }, { "epoch": 0.7024782135076253, "grad_norm": 39.245603264957786, "learning_rate": 1.9704336365510464e-07, "logits/chosen": 13.745481491088867, "logits/rejected": 13.812477111816406, "logps/chosen": -4.221010208129883, "logps/rejected": -4.537377834320068, "loss": 3.7844, "rewards/accuracies": 0.75, "rewards/chosen": -42.21010208129883, "rewards/margins": 3.1636762619018555, "rewards/rejected": -45.373775482177734, "step": 5159 }, { "epoch": 0.7026143790849673, "grad_norm": 40.27565561237212, "learning_rate": 1.968795397522696e-07, "logits/chosen": 14.063709259033203, "logits/rejected": 13.898418426513672, "logps/chosen": -4.221341609954834, "logps/rejected": -4.229895114898682, "loss": 4.0522, "rewards/accuracies": 0.5, "rewards/chosen": -42.213417053222656, "rewards/margins": 0.08553600311279297, "rewards/rejected": -42.2989501953125, "step": 5160 }, { "epoch": 0.7027505446623094, "grad_norm": 38.76810084816598, "learning_rate": 1.9671576174618156e-07, "logits/chosen": 13.383108139038086, "logits/rejected": 13.934578895568848, "logps/chosen": -4.326953887939453, "logps/rejected": -4.450483798980713, "loss": 3.6963, "rewards/accuracies": 0.5, "rewards/chosen": -43.26953887939453, "rewards/margins": 1.2353010177612305, "rewards/rejected": -44.50484085083008, "step": 5161 }, { "epoch": 0.7028867102396514, "grad_norm": 42.223399230242045, "learning_rate": 1.9655202967384766e-07, "logits/chosen": 12.90224838256836, "logits/rejected": 13.624810218811035, "logps/chosen": -4.3377366065979, "logps/rejected": -4.553841590881348, "loss": 3.9007, "rewards/accuracies": 0.75, "rewards/chosen": -43.37736511230469, "rewards/margins": 2.161052703857422, "rewards/rejected": -45.53841781616211, "step": 5162 }, { "epoch": 0.7030228758169934, "grad_norm": 36.142500335658156, "learning_rate": 1.9638834357226425e-07, "logits/chosen": 13.056343078613281, "logits/rejected": 13.092612266540527, "logps/chosen": -3.9943227767944336, "logps/rejected": -4.112364768981934, "loss": 3.4626, "rewards/accuracies": 0.5, "rewards/chosen": -39.94322967529297, "rewards/margins": 1.1804170608520508, "rewards/rejected": -41.12364196777344, "step": 5163 }, { "epoch": 0.7031590413943355, "grad_norm": 40.97198430794119, "learning_rate": 1.9622470347841764e-07, "logits/chosen": 13.102176666259766, "logits/rejected": 13.309304237365723, "logps/chosen": -4.285585403442383, "logps/rejected": -4.307920932769775, "loss": 4.356, "rewards/accuracies": 0.5, "rewards/chosen": -42.85585403442383, "rewards/margins": 0.2233572006225586, "rewards/rejected": -43.0792121887207, "step": 5164 }, { "epoch": 0.7032952069716776, "grad_norm": 38.058965168377256, "learning_rate": 1.960611094292839e-07, "logits/chosen": 14.417192459106445, "logits/rejected": 14.482965469360352, "logps/chosen": -4.436529159545898, "logps/rejected": -5.069827079772949, "loss": 3.6082, "rewards/accuracies": 1.0, "rewards/chosen": -44.36529541015625, "rewards/margins": 6.332980155944824, "rewards/rejected": -50.698272705078125, "step": 5165 }, { "epoch": 0.7034313725490197, "grad_norm": 41.372960794937015, "learning_rate": 1.9589756146182809e-07, "logits/chosen": 13.402458190917969, "logits/rejected": 14.096978187561035, "logps/chosen": -4.116424560546875, "logps/rejected": -4.271852016448975, "loss": 3.768, "rewards/accuracies": 1.0, "rewards/chosen": -41.16424560546875, "rewards/margins": 1.5542707443237305, "rewards/rejected": -42.7185173034668, "step": 5166 }, { "epoch": 0.7035675381263616, "grad_norm": 37.63689781613687, "learning_rate": 1.957340596130054e-07, "logits/chosen": 13.5678071975708, "logits/rejected": 13.326887130737305, "logps/chosen": -4.293967247009277, "logps/rejected": -4.090143203735352, "loss": 4.1473, "rewards/accuracies": 0.5, "rewards/chosen": -42.93967056274414, "rewards/margins": -2.0382375717163086, "rewards/rejected": -40.901432037353516, "step": 5167 }, { "epoch": 0.7037037037037037, "grad_norm": 41.031815580615124, "learning_rate": 1.9557060391976053e-07, "logits/chosen": 12.966439247131348, "logits/rejected": 13.352987289428711, "logps/chosen": -4.087362289428711, "logps/rejected": -4.242397308349609, "loss": 3.9184, "rewards/accuracies": 0.5, "rewards/chosen": -40.87362289428711, "rewards/margins": 1.5503520965576172, "rewards/rejected": -42.423973083496094, "step": 5168 }, { "epoch": 0.7038398692810458, "grad_norm": 36.38073191270635, "learning_rate": 1.9540719441902742e-07, "logits/chosen": 14.27264404296875, "logits/rejected": 13.780519485473633, "logps/chosen": -4.426687240600586, "logps/rejected": -4.341365814208984, "loss": 3.7456, "rewards/accuracies": 0.5, "rewards/chosen": -44.266876220703125, "rewards/margins": -0.8532133102416992, "rewards/rejected": -43.41366195678711, "step": 5169 }, { "epoch": 0.7039760348583878, "grad_norm": 63.255651668126035, "learning_rate": 1.9524383114772992e-07, "logits/chosen": 13.348808288574219, "logits/rejected": 13.489977836608887, "logps/chosen": -4.35283899307251, "logps/rejected": -4.6322455406188965, "loss": 4.4926, "rewards/accuracies": 0.75, "rewards/chosen": -43.52838897705078, "rewards/margins": 2.794064521789551, "rewards/rejected": -46.32245635986328, "step": 5170 }, { "epoch": 0.7041122004357299, "grad_norm": 43.98899841660054, "learning_rate": 1.9508051414278147e-07, "logits/chosen": 13.06148910522461, "logits/rejected": 13.468671798706055, "logps/chosen": -4.141324520111084, "logps/rejected": -4.419071197509766, "loss": 3.2761, "rewards/accuracies": 0.5, "rewards/chosen": -41.413246154785156, "rewards/margins": 2.7774658203125, "rewards/rejected": -44.19071578979492, "step": 5171 }, { "epoch": 0.704248366013072, "grad_norm": 42.37790369213316, "learning_rate": 1.9491724344108452e-07, "logits/chosen": 13.17090129852295, "logits/rejected": 13.314254760742188, "logps/chosen": -4.120504379272461, "logps/rejected": -4.199687957763672, "loss": 3.5522, "rewards/accuracies": 0.5, "rewards/chosen": -41.205039978027344, "rewards/margins": 0.791839599609375, "rewards/rejected": -41.996883392333984, "step": 5172 }, { "epoch": 0.7043845315904139, "grad_norm": 37.87525629851816, "learning_rate": 1.947540190795317e-07, "logits/chosen": 13.892192840576172, "logits/rejected": 13.99758243560791, "logps/chosen": -4.532140731811523, "logps/rejected": -4.662797927856445, "loss": 3.7249, "rewards/accuracies": 0.75, "rewards/chosen": -45.321407318115234, "rewards/margins": 1.3065729141235352, "rewards/rejected": -46.62797927856445, "step": 5173 }, { "epoch": 0.704520697167756, "grad_norm": 38.88227427137344, "learning_rate": 1.9459084109500497e-07, "logits/chosen": 13.24289321899414, "logits/rejected": 12.770988464355469, "logps/chosen": -4.334042549133301, "logps/rejected": -4.164857864379883, "loss": 4.3158, "rewards/accuracies": 0.5, "rewards/chosen": -43.340423583984375, "rewards/margins": -1.6918439865112305, "rewards/rejected": -41.64857864379883, "step": 5174 }, { "epoch": 0.7046568627450981, "grad_norm": 42.426680913740206, "learning_rate": 1.9442770952437547e-07, "logits/chosen": 13.824549674987793, "logits/rejected": 14.253477096557617, "logps/chosen": -4.663713455200195, "logps/rejected": -4.772258281707764, "loss": 3.7052, "rewards/accuracies": 0.5, "rewards/chosen": -46.63713836669922, "rewards/margins": 1.0854463577270508, "rewards/rejected": -47.72257995605469, "step": 5175 }, { "epoch": 0.7047930283224401, "grad_norm": 40.552292657113846, "learning_rate": 1.942646244045043e-07, "logits/chosen": 13.430370330810547, "logits/rejected": 13.549002647399902, "logps/chosen": -4.463382720947266, "logps/rejected": -4.606252670288086, "loss": 3.8937, "rewards/accuracies": 0.5, "rewards/chosen": -44.633827209472656, "rewards/margins": 1.428696632385254, "rewards/rejected": -46.06252670288086, "step": 5176 }, { "epoch": 0.7049291938997821, "grad_norm": 39.567432789561174, "learning_rate": 1.9410158577224203e-07, "logits/chosen": 13.747124671936035, "logits/rejected": 14.128755569458008, "logps/chosen": -4.311914443969727, "logps/rejected": -4.409697532653809, "loss": 4.1111, "rewards/accuracies": 0.75, "rewards/chosen": -43.119140625, "rewards/margins": 0.9778289794921875, "rewards/rejected": -44.09696960449219, "step": 5177 }, { "epoch": 0.7050653594771242, "grad_norm": 39.952721721135475, "learning_rate": 1.9393859366442827e-07, "logits/chosen": 13.307243347167969, "logits/rejected": 13.204225540161133, "logps/chosen": -4.209743976593018, "logps/rejected": -4.448459625244141, "loss": 4.1625, "rewards/accuracies": 0.75, "rewards/chosen": -42.09743881225586, "rewards/margins": 2.3871583938598633, "rewards/rejected": -44.484596252441406, "step": 5178 }, { "epoch": 0.7052015250544662, "grad_norm": 36.615771378615065, "learning_rate": 1.9377564811789258e-07, "logits/chosen": 14.153722763061523, "logits/rejected": 14.369144439697266, "logps/chosen": -4.236005783081055, "logps/rejected": -4.639937400817871, "loss": 3.8486, "rewards/accuracies": 1.0, "rewards/chosen": -42.36005783081055, "rewards/margins": 4.039311408996582, "rewards/rejected": -46.39937210083008, "step": 5179 }, { "epoch": 0.7053376906318083, "grad_norm": 44.25236900185209, "learning_rate": 1.9361274916945401e-07, "logits/chosen": 13.355768203735352, "logits/rejected": 12.684846878051758, "logps/chosen": -4.3101396560668945, "logps/rejected": -3.972381591796875, "loss": 4.4378, "rewards/accuracies": 0.25, "rewards/chosen": -43.10139846801758, "rewards/margins": -3.377582550048828, "rewards/rejected": -39.72381591796875, "step": 5180 }, { "epoch": 0.7054738562091504, "grad_norm": 39.73923546815276, "learning_rate": 1.9344989685592065e-07, "logits/chosen": 12.767376899719238, "logits/rejected": 13.352434158325195, "logps/chosen": -4.046738624572754, "logps/rejected": -4.310219764709473, "loss": 3.7351, "rewards/accuracies": 0.75, "rewards/chosen": -40.467384338378906, "rewards/margins": 2.634817123413086, "rewards/rejected": -43.102203369140625, "step": 5181 }, { "epoch": 0.7056100217864923, "grad_norm": 38.00817113314495, "learning_rate": 1.9328709121409042e-07, "logits/chosen": 13.063551902770996, "logits/rejected": 13.511639595031738, "logps/chosen": -4.230790138244629, "logps/rejected": -4.498754024505615, "loss": 4.1315, "rewards/accuracies": 0.75, "rewards/chosen": -42.30790328979492, "rewards/margins": 2.679636001586914, "rewards/rejected": -44.9875373840332, "step": 5182 }, { "epoch": 0.7057461873638344, "grad_norm": 40.08146473063446, "learning_rate": 1.9312433228075083e-07, "logits/chosen": 13.580448150634766, "logits/rejected": 13.782491683959961, "logps/chosen": -4.036105155944824, "logps/rejected": -4.246120929718018, "loss": 4.1329, "rewards/accuracies": 0.5, "rewards/chosen": -40.36104965209961, "rewards/margins": 2.1001577377319336, "rewards/rejected": -42.461204528808594, "step": 5183 }, { "epoch": 0.7058823529411765, "grad_norm": 40.282616105822, "learning_rate": 1.9296162009267824e-07, "logits/chosen": 13.647216796875, "logits/rejected": 13.337196350097656, "logps/chosen": -4.4189453125, "logps/rejected": -4.347502708435059, "loss": 3.7202, "rewards/accuracies": 0.5, "rewards/chosen": -44.189453125, "rewards/margins": -0.7144269943237305, "rewards/rejected": -43.47502899169922, "step": 5184 }, { "epoch": 0.7060185185185185, "grad_norm": 38.39492892269187, "learning_rate": 1.92798954686639e-07, "logits/chosen": 12.343807220458984, "logits/rejected": 12.02707290649414, "logps/chosen": -3.950258731842041, "logps/rejected": -3.8057165145874023, "loss": 3.9077, "rewards/accuracies": 0.5, "rewards/chosen": -39.502586364746094, "rewards/margins": -1.4454221725463867, "rewards/rejected": -38.057167053222656, "step": 5185 }, { "epoch": 0.7061546840958606, "grad_norm": 40.48565525137759, "learning_rate": 1.926363360993887e-07, "logits/chosen": 13.832670211791992, "logits/rejected": 14.14150619506836, "logps/chosen": -4.161772727966309, "logps/rejected": -4.451186180114746, "loss": 3.7944, "rewards/accuracies": 1.0, "rewards/chosen": -41.61772918701172, "rewards/margins": 2.8941287994384766, "rewards/rejected": -44.511863708496094, "step": 5186 }, { "epoch": 0.7062908496732027, "grad_norm": 40.78521459701278, "learning_rate": 1.9247376436767246e-07, "logits/chosen": 13.510978698730469, "logits/rejected": 13.813138961791992, "logps/chosen": -3.922783851623535, "logps/rejected": -4.543000221252441, "loss": 4.1432, "rewards/accuracies": 1.0, "rewards/chosen": -39.22783660888672, "rewards/margins": 6.2021636962890625, "rewards/rejected": -45.43000030517578, "step": 5187 }, { "epoch": 0.7064270152505446, "grad_norm": 36.676298683099155, "learning_rate": 1.9231123952822444e-07, "logits/chosen": 14.315540313720703, "logits/rejected": 14.075420379638672, "logps/chosen": -4.300210475921631, "logps/rejected": -4.297695636749268, "loss": 3.7989, "rewards/accuracies": 0.5, "rewards/chosen": -43.002105712890625, "rewards/margins": -0.02514934539794922, "rewards/rejected": -42.97695541381836, "step": 5188 }, { "epoch": 0.7065631808278867, "grad_norm": 37.18054026741525, "learning_rate": 1.9214876161776865e-07, "logits/chosen": 12.959413528442383, "logits/rejected": 14.007789611816406, "logps/chosen": -4.048581123352051, "logps/rejected": -4.765748977661133, "loss": 3.2293, "rewards/accuracies": 1.0, "rewards/chosen": -40.48581314086914, "rewards/margins": 7.1716766357421875, "rewards/rejected": -47.65748977661133, "step": 5189 }, { "epoch": 0.7066993464052288, "grad_norm": 35.983732005766555, "learning_rate": 1.919863306730184e-07, "logits/chosen": 13.143033027648926, "logits/rejected": 13.775803565979004, "logps/chosen": -3.9956369400024414, "logps/rejected": -4.381626605987549, "loss": 3.7256, "rewards/accuracies": 0.75, "rewards/chosen": -39.95636749267578, "rewards/margins": 3.859898567199707, "rewards/rejected": -43.81626510620117, "step": 5190 }, { "epoch": 0.7068355119825708, "grad_norm": 38.46787025829177, "learning_rate": 1.918239467306761e-07, "logits/chosen": 12.501482009887695, "logits/rejected": 13.371826171875, "logps/chosen": -3.935291290283203, "logps/rejected": -4.347182273864746, "loss": 4.1516, "rewards/accuracies": 0.75, "rewards/chosen": -39.35291290283203, "rewards/margins": 4.118910789489746, "rewards/rejected": -43.471824645996094, "step": 5191 }, { "epoch": 0.7069716775599129, "grad_norm": 38.343882059119764, "learning_rate": 1.9166160982743382e-07, "logits/chosen": 13.362257957458496, "logits/rejected": 14.052244186401367, "logps/chosen": -4.150585651397705, "logps/rejected": -4.589376449584961, "loss": 3.7705, "rewards/accuracies": 1.0, "rewards/chosen": -41.505855560302734, "rewards/margins": 4.387906074523926, "rewards/rejected": -45.893760681152344, "step": 5192 }, { "epoch": 0.7071078431372549, "grad_norm": 39.819802364531306, "learning_rate": 1.914993199999729e-07, "logits/chosen": 14.381552696228027, "logits/rejected": 14.359329223632812, "logps/chosen": -4.635268688201904, "logps/rejected": -4.826478004455566, "loss": 4.1082, "rewards/accuracies": 0.5, "rewards/chosen": -46.352684020996094, "rewards/margins": 1.912093162536621, "rewards/rejected": -48.2647819519043, "step": 5193 }, { "epoch": 0.7072440087145969, "grad_norm": 38.99207960034575, "learning_rate": 1.9133707728496428e-07, "logits/chosen": 13.899908065795898, "logits/rejected": 13.793083190917969, "logps/chosen": -4.82456636428833, "logps/rejected": -4.5380024909973145, "loss": 4.1833, "rewards/accuracies": 0.25, "rewards/chosen": -48.245662689208984, "rewards/margins": -2.86563777923584, "rewards/rejected": -45.38002395629883, "step": 5194 }, { "epoch": 0.707380174291939, "grad_norm": 39.87820193267735, "learning_rate": 1.9117488171906774e-07, "logits/chosen": 12.748136520385742, "logits/rejected": 13.871063232421875, "logps/chosen": -4.4346771240234375, "logps/rejected": -4.683080673217773, "loss": 4.3567, "rewards/accuracies": 0.5, "rewards/chosen": -44.346771240234375, "rewards/margins": 2.4840335845947266, "rewards/rejected": -46.830806732177734, "step": 5195 }, { "epoch": 0.7075163398692811, "grad_norm": 41.75870698783387, "learning_rate": 1.9101273333893285e-07, "logits/chosen": 13.822101593017578, "logits/rejected": 13.859003067016602, "logps/chosen": -4.417499542236328, "logps/rejected": -4.313600540161133, "loss": 4.316, "rewards/accuracies": 0.75, "rewards/chosen": -44.17499542236328, "rewards/margins": -1.0389862060546875, "rewards/rejected": -43.136009216308594, "step": 5196 }, { "epoch": 0.7076525054466231, "grad_norm": 39.3929777901619, "learning_rate": 1.9085063218119851e-07, "logits/chosen": 13.310616493225098, "logits/rejected": 14.683441162109375, "logps/chosen": -4.241864204406738, "logps/rejected": -4.6399688720703125, "loss": 4.1609, "rewards/accuracies": 1.0, "rewards/chosen": -42.41864776611328, "rewards/margins": 3.981045722961426, "rewards/rejected": -46.399688720703125, "step": 5197 }, { "epoch": 0.7077886710239651, "grad_norm": 39.416617707488214, "learning_rate": 1.9068857828249253e-07, "logits/chosen": 12.942225456237793, "logits/rejected": 13.678321838378906, "logps/chosen": -4.2079620361328125, "logps/rejected": -4.385230541229248, "loss": 4.0836, "rewards/accuracies": 1.0, "rewards/chosen": -42.079620361328125, "rewards/margins": 1.7726850509643555, "rewards/rejected": -43.8523063659668, "step": 5198 }, { "epoch": 0.7079248366013072, "grad_norm": 43.59330583499739, "learning_rate": 1.9052657167943242e-07, "logits/chosen": 13.647114753723145, "logits/rejected": 13.857673645019531, "logps/chosen": -4.164309501647949, "logps/rejected": -4.427174091339111, "loss": 3.8732, "rewards/accuracies": 0.75, "rewards/chosen": -41.64309310913086, "rewards/margins": 2.628647804260254, "rewards/rejected": -44.2717399597168, "step": 5199 }, { "epoch": 0.7080610021786492, "grad_norm": 39.46872545143744, "learning_rate": 1.9036461240862502e-07, "logits/chosen": 13.645305633544922, "logits/rejected": 15.163490295410156, "logps/chosen": -4.126845359802246, "logps/rejected": -4.65456485748291, "loss": 3.5989, "rewards/accuracies": 0.75, "rewards/chosen": -41.26845169067383, "rewards/margins": 5.277199745178223, "rewards/rejected": -46.545654296875, "step": 5200 }, { "epoch": 0.7081971677559913, "grad_norm": 36.21600859695451, "learning_rate": 1.902027005066664e-07, "logits/chosen": 14.036975860595703, "logits/rejected": 14.550114631652832, "logps/chosen": -4.497014999389648, "logps/rejected": -4.966921329498291, "loss": 3.8439, "rewards/accuracies": 1.0, "rewards/chosen": -44.97015380859375, "rewards/margins": 4.699063301086426, "rewards/rejected": -49.669212341308594, "step": 5201 }, { "epoch": 0.7083333333333334, "grad_norm": 42.606680743517636, "learning_rate": 1.9004083601014173e-07, "logits/chosen": 13.345321655273438, "logits/rejected": 14.192811012268066, "logps/chosen": -4.397952079772949, "logps/rejected": -4.461149215698242, "loss": 3.6935, "rewards/accuracies": 0.75, "rewards/chosen": -43.979522705078125, "rewards/margins": 0.6319742202758789, "rewards/rejected": -44.61149597167969, "step": 5202 }, { "epoch": 0.7084694989106753, "grad_norm": 36.61383803062621, "learning_rate": 1.8987901895562568e-07, "logits/chosen": 13.924899101257324, "logits/rejected": 14.079514503479004, "logps/chosen": -4.053579330444336, "logps/rejected": -4.392596244812012, "loss": 3.7799, "rewards/accuracies": 1.0, "rewards/chosen": -40.535797119140625, "rewards/margins": 3.390170097351074, "rewards/rejected": -43.925968170166016, "step": 5203 }, { "epoch": 0.7086056644880174, "grad_norm": 36.66651738419624, "learning_rate": 1.8971724937968231e-07, "logits/chosen": 13.459794998168945, "logits/rejected": 14.429069519042969, "logps/chosen": -4.423531532287598, "logps/rejected": -4.739205360412598, "loss": 4.3084, "rewards/accuracies": 1.0, "rewards/chosen": -44.235313415527344, "rewards/margins": 3.1567459106445312, "rewards/rejected": -47.392059326171875, "step": 5204 }, { "epoch": 0.7087418300653595, "grad_norm": 41.715988125659806, "learning_rate": 1.8955552731886453e-07, "logits/chosen": 12.433999061584473, "logits/rejected": 12.416643142700195, "logps/chosen": -4.330565452575684, "logps/rejected": -4.069182395935059, "loss": 3.9354, "rewards/accuracies": 0.5, "rewards/chosen": -43.305660247802734, "rewards/margins": -2.6138343811035156, "rewards/rejected": -40.69182586669922, "step": 5205 }, { "epoch": 0.7088779956427015, "grad_norm": 36.83059327401044, "learning_rate": 1.8939385280971485e-07, "logits/chosen": 13.737825393676758, "logits/rejected": 13.642255783081055, "logps/chosen": -4.334880352020264, "logps/rejected": -4.392523765563965, "loss": 3.7384, "rewards/accuracies": 0.75, "rewards/chosen": -43.34880065917969, "rewards/margins": 0.5764389038085938, "rewards/rejected": -43.92523956298828, "step": 5206 }, { "epoch": 0.7090141612200436, "grad_norm": 40.34703423681002, "learning_rate": 1.892322258887652e-07, "logits/chosen": 13.620344161987305, "logits/rejected": 13.495060920715332, "logps/chosen": -4.101471900939941, "logps/rejected": -4.124916076660156, "loss": 4.6594, "rewards/accuracies": 0.5, "rewards/chosen": -41.01471710205078, "rewards/margins": 0.23444652557373047, "rewards/rejected": -41.24916458129883, "step": 5207 }, { "epoch": 0.7091503267973857, "grad_norm": 39.45109765676241, "learning_rate": 1.890706465925362e-07, "logits/chosen": 13.034003257751465, "logits/rejected": 13.256983757019043, "logps/chosen": -4.124434471130371, "logps/rejected": -4.2887773513793945, "loss": 3.3308, "rewards/accuracies": 0.5, "rewards/chosen": -41.244346618652344, "rewards/margins": 1.6434259414672852, "rewards/rejected": -42.88777160644531, "step": 5208 }, { "epoch": 0.7092864923747276, "grad_norm": 43.27328652490942, "learning_rate": 1.8890911495753814e-07, "logits/chosen": 13.756007194519043, "logits/rejected": 13.448324203491211, "logps/chosen": -4.066926956176758, "logps/rejected": -4.592180252075195, "loss": 4.7134, "rewards/accuracies": 0.75, "rewards/chosen": -40.66926574707031, "rewards/margins": 5.252531051635742, "rewards/rejected": -45.92179870605469, "step": 5209 }, { "epoch": 0.7094226579520697, "grad_norm": 40.687404267076424, "learning_rate": 1.887476310202706e-07, "logits/chosen": 13.160778999328613, "logits/rejected": 13.461669921875, "logps/chosen": -4.05453634262085, "logps/rejected": -4.426636695861816, "loss": 4.2452, "rewards/accuracies": 0.75, "rewards/chosen": -40.54536437988281, "rewards/margins": 3.721001625061035, "rewards/rejected": -44.26636505126953, "step": 5210 }, { "epoch": 0.7095588235294118, "grad_norm": 39.491909587530934, "learning_rate": 1.8858619481722195e-07, "logits/chosen": 13.861040115356445, "logits/rejected": 13.707550048828125, "logps/chosen": -3.9900424480438232, "logps/rejected": -4.365396499633789, "loss": 4.12, "rewards/accuracies": 1.0, "rewards/chosen": -39.90042495727539, "rewards/margins": 3.7535390853881836, "rewards/rejected": -43.653961181640625, "step": 5211 }, { "epoch": 0.7096949891067538, "grad_norm": 37.00882972024986, "learning_rate": 1.8842480638487007e-07, "logits/chosen": 13.306130409240723, "logits/rejected": 13.136911392211914, "logps/chosen": -4.386534690856934, "logps/rejected": -4.526584625244141, "loss": 3.8153, "rewards/accuracies": 0.25, "rewards/chosen": -43.8653450012207, "rewards/margins": 1.4004974365234375, "rewards/rejected": -45.26584243774414, "step": 5212 }, { "epoch": 0.7098311546840959, "grad_norm": 37.319893601547435, "learning_rate": 1.882634657596823e-07, "logits/chosen": 13.45899486541748, "logits/rejected": 14.076007843017578, "logps/chosen": -4.072545528411865, "logps/rejected": -4.359705924987793, "loss": 3.3268, "rewards/accuracies": 0.75, "rewards/chosen": -40.7254524230957, "rewards/margins": 2.871601104736328, "rewards/rejected": -43.59705352783203, "step": 5213 }, { "epoch": 0.7099673202614379, "grad_norm": 38.547330838564896, "learning_rate": 1.881021729781145e-07, "logits/chosen": 13.10940933227539, "logits/rejected": 13.613800048828125, "logps/chosen": -4.28587532043457, "logps/rejected": -4.671677589416504, "loss": 3.3359, "rewards/accuracies": 1.0, "rewards/chosen": -42.8587532043457, "rewards/margins": 3.8580188751220703, "rewards/rejected": -46.716773986816406, "step": 5214 }, { "epoch": 0.7101034858387799, "grad_norm": 44.92648842610128, "learning_rate": 1.879409280766123e-07, "logits/chosen": 13.79930305480957, "logits/rejected": 13.438993453979492, "logps/chosen": -4.340378761291504, "logps/rejected": -4.463798522949219, "loss": 3.4034, "rewards/accuracies": 0.75, "rewards/chosen": -43.40378952026367, "rewards/margins": 1.2341947555541992, "rewards/rejected": -44.63798522949219, "step": 5215 }, { "epoch": 0.710239651416122, "grad_norm": 36.61353826751292, "learning_rate": 1.8777973109161046e-07, "logits/chosen": 14.006731033325195, "logits/rejected": 14.39303207397461, "logps/chosen": -4.411237716674805, "logps/rejected": -4.767653942108154, "loss": 3.9417, "rewards/accuracies": 1.0, "rewards/chosen": -44.11238098144531, "rewards/margins": 3.5641584396362305, "rewards/rejected": -47.676536560058594, "step": 5216 }, { "epoch": 0.7103758169934641, "grad_norm": 37.55629217920406, "learning_rate": 1.8761858205953241e-07, "logits/chosen": 14.279711723327637, "logits/rejected": 14.136837005615234, "logps/chosen": -4.37972354888916, "logps/rejected": -4.276217937469482, "loss": 4.0231, "rewards/accuracies": 0.25, "rewards/chosen": -43.79723358154297, "rewards/margins": -1.0350542068481445, "rewards/rejected": -42.76218032836914, "step": 5217 }, { "epoch": 0.710511982570806, "grad_norm": 33.28963993544823, "learning_rate": 1.874574810167913e-07, "logits/chosen": 13.295751571655273, "logits/rejected": 13.911094665527344, "logps/chosen": -4.407388210296631, "logps/rejected": -4.545717239379883, "loss": 3.5199, "rewards/accuracies": 0.5, "rewards/chosen": -44.073883056640625, "rewards/margins": 1.3832921981811523, "rewards/rejected": -45.457176208496094, "step": 5218 }, { "epoch": 0.7106481481481481, "grad_norm": 38.51433375933589, "learning_rate": 1.8729642799978946e-07, "logits/chosen": 12.907268524169922, "logits/rejected": 13.901182174682617, "logps/chosen": -3.9773764610290527, "logps/rejected": -4.3046112060546875, "loss": 3.5498, "rewards/accuracies": 0.75, "rewards/chosen": -39.773765563964844, "rewards/margins": 3.2723422050476074, "rewards/rejected": -43.04610824584961, "step": 5219 }, { "epoch": 0.7107843137254902, "grad_norm": 41.17510510318449, "learning_rate": 1.8713542304491777e-07, "logits/chosen": 14.009912490844727, "logits/rejected": 13.50356674194336, "logps/chosen": -4.388021945953369, "logps/rejected": -4.2442708015441895, "loss": 4.2869, "rewards/accuracies": 0.5, "rewards/chosen": -43.88022232055664, "rewards/margins": -1.437516212463379, "rewards/rejected": -42.44270324707031, "step": 5220 }, { "epoch": 0.7109204793028322, "grad_norm": 39.73900608452733, "learning_rate": 1.869744661885568e-07, "logits/chosen": 13.246295928955078, "logits/rejected": 13.320674896240234, "logps/chosen": -4.057569980621338, "logps/rejected": -4.312324523925781, "loss": 4.2484, "rewards/accuracies": 1.0, "rewards/chosen": -40.57569885253906, "rewards/margins": 2.547541618347168, "rewards/rejected": -43.12324523925781, "step": 5221 }, { "epoch": 0.7110566448801743, "grad_norm": 40.32534311158532, "learning_rate": 1.868135574670762e-07, "logits/chosen": 13.643522262573242, "logits/rejected": 13.82141399383545, "logps/chosen": -4.468140602111816, "logps/rejected": -4.413589954376221, "loss": 4.3551, "rewards/accuracies": 0.5, "rewards/chosen": -44.68140411376953, "rewards/margins": -0.5455083847045898, "rewards/rejected": -44.13589859008789, "step": 5222 }, { "epoch": 0.7111928104575164, "grad_norm": 38.43994764246059, "learning_rate": 1.8665269691683437e-07, "logits/chosen": 12.84105110168457, "logits/rejected": 13.85953140258789, "logps/chosen": -3.8746554851531982, "logps/rejected": -4.327812671661377, "loss": 3.675, "rewards/accuracies": 0.75, "rewards/chosen": -38.746559143066406, "rewards/margins": 4.5315704345703125, "rewards/rejected": -43.27812957763672, "step": 5223 }, { "epoch": 0.7113289760348583, "grad_norm": 39.273842515370916, "learning_rate": 1.8649188457417923e-07, "logits/chosen": 14.041807174682617, "logits/rejected": 14.912229537963867, "logps/chosen": -4.458721160888672, "logps/rejected": -4.542911529541016, "loss": 4.2364, "rewards/accuracies": 0.5, "rewards/chosen": -44.58721160888672, "rewards/margins": 0.8419017791748047, "rewards/rejected": -45.429115295410156, "step": 5224 }, { "epoch": 0.7114651416122004, "grad_norm": 44.27900309980207, "learning_rate": 1.8633112047544776e-07, "logits/chosen": 13.407577514648438, "logits/rejected": 13.518234252929688, "logps/chosen": -4.457774639129639, "logps/rejected": -4.343740940093994, "loss": 4.1706, "rewards/accuracies": 0.5, "rewards/chosen": -44.5777473449707, "rewards/margins": -1.1403369903564453, "rewards/rejected": -43.437408447265625, "step": 5225 }, { "epoch": 0.7116013071895425, "grad_norm": 37.3889395834444, "learning_rate": 1.8617040465696573e-07, "logits/chosen": 13.564525604248047, "logits/rejected": 13.161002159118652, "logps/chosen": -4.20144510269165, "logps/rejected": -4.341948509216309, "loss": 3.6223, "rewards/accuracies": 0.75, "rewards/chosen": -42.01445007324219, "rewards/margins": 1.4050321578979492, "rewards/rejected": -43.41947937011719, "step": 5226 }, { "epoch": 0.7117374727668845, "grad_norm": 38.414324478023566, "learning_rate": 1.8600973715504828e-07, "logits/chosen": 13.454061508178711, "logits/rejected": 13.291719436645508, "logps/chosen": -4.312784194946289, "logps/rejected": -4.556402206420898, "loss": 3.868, "rewards/accuracies": 0.75, "rewards/chosen": -43.12784194946289, "rewards/margins": 2.4361772537231445, "rewards/rejected": -45.56401824951172, "step": 5227 }, { "epoch": 0.7118736383442266, "grad_norm": 39.062147396314806, "learning_rate": 1.8584911800599974e-07, "logits/chosen": 13.36857795715332, "logits/rejected": 13.641401290893555, "logps/chosen": -4.177271366119385, "logps/rejected": -4.491184711456299, "loss": 4.1278, "rewards/accuracies": 0.75, "rewards/chosen": -41.77271270751953, "rewards/margins": 3.1391334533691406, "rewards/rejected": -44.91184997558594, "step": 5228 }, { "epoch": 0.7120098039215687, "grad_norm": 36.79981713009841, "learning_rate": 1.8568854724611298e-07, "logits/chosen": 13.496856689453125, "logits/rejected": 13.594605445861816, "logps/chosen": -4.181085586547852, "logps/rejected": -4.122048854827881, "loss": 3.5604, "rewards/accuracies": 0.25, "rewards/chosen": -41.810855865478516, "rewards/margins": -0.5903663635253906, "rewards/rejected": -41.220489501953125, "step": 5229 }, { "epoch": 0.7121459694989106, "grad_norm": 40.93791701390754, "learning_rate": 1.8552802491167053e-07, "logits/chosen": 12.123180389404297, "logits/rejected": 12.577692031860352, "logps/chosen": -3.5654585361480713, "logps/rejected": -3.7748265266418457, "loss": 4.2008, "rewards/accuracies": 0.5, "rewards/chosen": -35.65458679199219, "rewards/margins": 2.0936813354492188, "rewards/rejected": -37.748268127441406, "step": 5230 }, { "epoch": 0.7122821350762527, "grad_norm": 39.693314389328876, "learning_rate": 1.853675510389438e-07, "logits/chosen": 12.992786407470703, "logits/rejected": 12.658373832702637, "logps/chosen": -4.342020034790039, "logps/rejected": -4.386687278747559, "loss": 4.3145, "rewards/accuracies": 0.5, "rewards/chosen": -43.420196533203125, "rewards/margins": 0.44667720794677734, "rewards/rejected": -43.86687469482422, "step": 5231 }, { "epoch": 0.7124183006535948, "grad_norm": 38.02760416642388, "learning_rate": 1.85207125664193e-07, "logits/chosen": 13.901017189025879, "logits/rejected": 14.224992752075195, "logps/chosen": -4.321666240692139, "logps/rejected": -4.645052433013916, "loss": 4.178, "rewards/accuracies": 0.75, "rewards/chosen": -43.21665954589844, "rewards/margins": 3.2338666915893555, "rewards/rejected": -46.45052719116211, "step": 5232 }, { "epoch": 0.7125544662309368, "grad_norm": 40.953082635755045, "learning_rate": 1.8504674882366758e-07, "logits/chosen": 13.470491409301758, "logits/rejected": 13.762592315673828, "logps/chosen": -4.411699295043945, "logps/rejected": -4.67968225479126, "loss": 4.1657, "rewards/accuracies": 1.0, "rewards/chosen": -44.11699676513672, "rewards/margins": 2.679821014404297, "rewards/rejected": -46.79682159423828, "step": 5233 }, { "epoch": 0.7126906318082789, "grad_norm": 40.598819282129405, "learning_rate": 1.848864205536063e-07, "logits/chosen": 13.279441833496094, "logits/rejected": 14.203194618225098, "logps/chosen": -4.066963195800781, "logps/rejected": -4.385710716247559, "loss": 3.9616, "rewards/accuracies": 0.75, "rewards/chosen": -40.66963195800781, "rewards/margins": 3.18747615814209, "rewards/rejected": -43.85710906982422, "step": 5234 }, { "epoch": 0.7128267973856209, "grad_norm": 39.184774212451735, "learning_rate": 1.8472614089023625e-07, "logits/chosen": 13.63405990600586, "logits/rejected": 13.581842422485352, "logps/chosen": -4.20613956451416, "logps/rejected": -4.16203498840332, "loss": 3.7964, "rewards/accuracies": 0.75, "rewards/chosen": -42.06139373779297, "rewards/margins": -0.44104862213134766, "rewards/rejected": -41.62034606933594, "step": 5235 }, { "epoch": 0.7129629629629629, "grad_norm": 36.27518130447168, "learning_rate": 1.845659098697742e-07, "logits/chosen": 13.031105041503906, "logits/rejected": 13.451393127441406, "logps/chosen": -4.362968921661377, "logps/rejected": -4.427959442138672, "loss": 4.1276, "rewards/accuracies": 0.5, "rewards/chosen": -43.62968826293945, "rewards/margins": 0.64990234375, "rewards/rejected": -44.27959442138672, "step": 5236 }, { "epoch": 0.713099128540305, "grad_norm": 41.47805921247827, "learning_rate": 1.844057275284257e-07, "logits/chosen": 13.724651336669922, "logits/rejected": 14.41351318359375, "logps/chosen": -4.3111395835876465, "logps/rejected": -4.65314245223999, "loss": 4.2277, "rewards/accuracies": 1.0, "rewards/chosen": -43.11139678955078, "rewards/margins": 3.420027732849121, "rewards/rejected": -46.53142547607422, "step": 5237 }, { "epoch": 0.7132352941176471, "grad_norm": 40.38708222687642, "learning_rate": 1.8424559390238504e-07, "logits/chosen": 13.68043327331543, "logits/rejected": 14.413521766662598, "logps/chosen": -4.191500663757324, "logps/rejected": -4.199321746826172, "loss": 4.4123, "rewards/accuracies": 0.75, "rewards/chosen": -41.91500473022461, "rewards/margins": 0.07821273803710938, "rewards/rejected": -41.99321746826172, "step": 5238 }, { "epoch": 0.713371459694989, "grad_norm": 36.29707223372644, "learning_rate": 1.8408550902783588e-07, "logits/chosen": 13.520596504211426, "logits/rejected": 14.027580261230469, "logps/chosen": -4.2548604011535645, "logps/rejected": -4.832738876342773, "loss": 3.5066, "rewards/accuracies": 0.75, "rewards/chosen": -42.54860305786133, "rewards/margins": 5.778785705566406, "rewards/rejected": -48.327388763427734, "step": 5239 }, { "epoch": 0.7135076252723311, "grad_norm": 38.55300614518803, "learning_rate": 1.8392547294095092e-07, "logits/chosen": 13.497713088989258, "logits/rejected": 14.154812812805176, "logps/chosen": -4.476661205291748, "logps/rejected": -4.502315521240234, "loss": 4.4908, "rewards/accuracies": 0.5, "rewards/chosen": -44.7666130065918, "rewards/margins": 0.25654029846191406, "rewards/rejected": -45.023155212402344, "step": 5240 }, { "epoch": 0.7136437908496732, "grad_norm": 39.75017502940492, "learning_rate": 1.8376548567789123e-07, "logits/chosen": 13.616632461547852, "logits/rejected": 13.554938316345215, "logps/chosen": -4.326718330383301, "logps/rejected": -4.59598445892334, "loss": 3.819, "rewards/accuracies": 0.75, "rewards/chosen": -43.267181396484375, "rewards/margins": 2.6926651000976562, "rewards/rejected": -45.95984649658203, "step": 5241 }, { "epoch": 0.7137799564270153, "grad_norm": 37.26369578956372, "learning_rate": 1.8360554727480749e-07, "logits/chosen": 13.574361801147461, "logits/rejected": 14.301064491271973, "logps/chosen": -4.213891506195068, "logps/rejected": -4.538454532623291, "loss": 3.7501, "rewards/accuracies": 1.0, "rewards/chosen": -42.138916015625, "rewards/margins": 3.245631217956543, "rewards/rejected": -45.384544372558594, "step": 5242 }, { "epoch": 0.7139161220043573, "grad_norm": 39.36113579640574, "learning_rate": 1.834456577678392e-07, "logits/chosen": 13.550565719604492, "logits/rejected": 14.09556770324707, "logps/chosen": -4.114770889282227, "logps/rejected": -4.633426666259766, "loss": 3.8485, "rewards/accuracies": 1.0, "rewards/chosen": -41.147705078125, "rewards/margins": 5.1865644454956055, "rewards/rejected": -46.33427047729492, "step": 5243 }, { "epoch": 0.7140522875816994, "grad_norm": 35.948170265921995, "learning_rate": 1.832858171931145e-07, "logits/chosen": 13.355037689208984, "logits/rejected": 14.223627090454102, "logps/chosen": -4.123199462890625, "logps/rejected": -4.5770416259765625, "loss": 3.6961, "rewards/accuracies": 0.75, "rewards/chosen": -41.23199462890625, "rewards/margins": 4.538425445556641, "rewards/rejected": -45.770416259765625, "step": 5244 }, { "epoch": 0.7141884531590414, "grad_norm": 36.05471196856021, "learning_rate": 1.8312602558675074e-07, "logits/chosen": 13.797922134399414, "logits/rejected": 13.605143547058105, "logps/chosen": -4.342990875244141, "logps/rejected": -4.1805291175842285, "loss": 3.7913, "rewards/accuracies": 0.25, "rewards/chosen": -43.42990493774414, "rewards/margins": -1.624612808227539, "rewards/rejected": -41.80529022216797, "step": 5245 }, { "epoch": 0.7143246187363834, "grad_norm": 39.85109258510514, "learning_rate": 1.8296628298485436e-07, "logits/chosen": 14.28114128112793, "logits/rejected": 13.85866928100586, "logps/chosen": -4.3916215896606445, "logps/rejected": -4.205328464508057, "loss": 3.7626, "rewards/accuracies": 0.25, "rewards/chosen": -43.91621398925781, "rewards/margins": -1.862929344177246, "rewards/rejected": -42.05328369140625, "step": 5246 }, { "epoch": 0.7144607843137255, "grad_norm": 38.89158146638643, "learning_rate": 1.8280658942352017e-07, "logits/chosen": 13.57724380493164, "logits/rejected": 13.37601089477539, "logps/chosen": -4.00611686706543, "logps/rejected": -4.0059814453125, "loss": 4.0068, "rewards/accuracies": 0.25, "rewards/chosen": -40.06116485595703, "rewards/margins": -0.00135040283203125, "rewards/rejected": -40.059814453125, "step": 5247 }, { "epoch": 0.7145969498910676, "grad_norm": 37.58136710552268, "learning_rate": 1.8264694493883251e-07, "logits/chosen": 13.24301528930664, "logits/rejected": 13.436083793640137, "logps/chosen": -4.0421671867370605, "logps/rejected": -4.195518493652344, "loss": 3.9631, "rewards/accuracies": 0.5, "rewards/chosen": -40.421669006347656, "rewards/margins": 1.5335149765014648, "rewards/rejected": -41.95518493652344, "step": 5248 }, { "epoch": 0.7147331154684096, "grad_norm": 37.830593988801375, "learning_rate": 1.824873495668644e-07, "logits/chosen": 13.449233055114746, "logits/rejected": 13.865972518920898, "logps/chosen": -4.320499420166016, "logps/rejected": -4.604494094848633, "loss": 4.1483, "rewards/accuracies": 1.0, "rewards/chosen": -43.204994201660156, "rewards/margins": 2.8399505615234375, "rewards/rejected": -46.044944763183594, "step": 5249 }, { "epoch": 0.7148692810457516, "grad_norm": 34.518188651659, "learning_rate": 1.8232780334367752e-07, "logits/chosen": 13.909854888916016, "logits/rejected": 14.243125915527344, "logps/chosen": -4.54454231262207, "logps/rejected": -4.53615140914917, "loss": 4.0012, "rewards/accuracies": 0.5, "rewards/chosen": -45.4454231262207, "rewards/margins": -0.08390998840332031, "rewards/rejected": -45.36151123046875, "step": 5250 }, { "epoch": 0.7150054466230937, "grad_norm": 37.12681284192182, "learning_rate": 1.8216830630532276e-07, "logits/chosen": 13.594266891479492, "logits/rejected": 14.226253509521484, "logps/chosen": -4.021280288696289, "logps/rejected": -4.486023902893066, "loss": 3.9229, "rewards/accuracies": 1.0, "rewards/chosen": -40.212799072265625, "rewards/margins": 4.647436141967773, "rewards/rejected": -44.86023712158203, "step": 5251 }, { "epoch": 0.7151416122004357, "grad_norm": 42.50679950347023, "learning_rate": 1.820088584878399e-07, "logits/chosen": 12.88691520690918, "logits/rejected": 13.39756965637207, "logps/chosen": -4.076801776885986, "logps/rejected": -4.120840549468994, "loss": 4.3962, "rewards/accuracies": 0.5, "rewards/chosen": -40.76802062988281, "rewards/margins": 0.44038867950439453, "rewards/rejected": -41.208404541015625, "step": 5252 }, { "epoch": 0.7152777777777778, "grad_norm": 38.954244291007385, "learning_rate": 1.8184945992725732e-07, "logits/chosen": 14.016752243041992, "logits/rejected": 13.663456916809082, "logps/chosen": -4.884530067443848, "logps/rejected": -4.687869071960449, "loss": 4.2768, "rewards/accuracies": 0.25, "rewards/chosen": -48.845298767089844, "rewards/margins": -1.9666099548339844, "rewards/rejected": -46.878692626953125, "step": 5253 }, { "epoch": 0.7154139433551199, "grad_norm": 34.52541303106965, "learning_rate": 1.816901106595925e-07, "logits/chosen": 14.27787971496582, "logits/rejected": 13.623194694519043, "logps/chosen": -4.719343185424805, "logps/rejected": -4.626049041748047, "loss": 3.7052, "rewards/accuracies": 0.25, "rewards/chosen": -47.19343566894531, "rewards/margins": -0.9329414367675781, "rewards/rejected": -46.260494232177734, "step": 5254 }, { "epoch": 0.7155501089324618, "grad_norm": 37.714777248733455, "learning_rate": 1.815308107208519e-07, "logits/chosen": 13.281667709350586, "logits/rejected": 14.01255989074707, "logps/chosen": -4.111664772033691, "logps/rejected": -4.343132019042969, "loss": 3.6227, "rewards/accuracies": 0.5, "rewards/chosen": -41.11665344238281, "rewards/margins": 2.314668655395508, "rewards/rejected": -43.43132019042969, "step": 5255 }, { "epoch": 0.7156862745098039, "grad_norm": 37.04064645659624, "learning_rate": 1.8137156014703034e-07, "logits/chosen": 12.76887321472168, "logits/rejected": 13.360734939575195, "logps/chosen": -4.04597806930542, "logps/rejected": -4.186528205871582, "loss": 4.157, "rewards/accuracies": 0.75, "rewards/chosen": -40.45977783203125, "rewards/margins": 1.4055061340332031, "rewards/rejected": -41.86528396606445, "step": 5256 }, { "epoch": 0.715822440087146, "grad_norm": 38.87966088724265, "learning_rate": 1.8121235897411195e-07, "logits/chosen": 12.564496994018555, "logits/rejected": 13.145882606506348, "logps/chosen": -4.079677581787109, "logps/rejected": -4.350567817687988, "loss": 3.9028, "rewards/accuracies": 0.75, "rewards/chosen": -40.79677200317383, "rewards/margins": 2.7089014053344727, "rewards/rejected": -43.50567626953125, "step": 5257 }, { "epoch": 0.715958605664488, "grad_norm": 36.15139392707922, "learning_rate": 1.810532072380697e-07, "logits/chosen": 13.470824241638184, "logits/rejected": 13.4661865234375, "logps/chosen": -4.59113883972168, "logps/rejected": -4.527742385864258, "loss": 3.8099, "rewards/accuracies": 0.25, "rewards/chosen": -45.91139221191406, "rewards/margins": -0.633967399597168, "rewards/rejected": -45.27742385864258, "step": 5258 }, { "epoch": 0.7160947712418301, "grad_norm": 36.62370118807706, "learning_rate": 1.8089410497486503e-07, "logits/chosen": 13.037546157836914, "logits/rejected": 13.082052230834961, "logps/chosen": -4.208929538726807, "logps/rejected": -4.509535312652588, "loss": 3.812, "rewards/accuracies": 0.75, "rewards/chosen": -42.08929443359375, "rewards/margins": 3.0060577392578125, "rewards/rejected": -45.09535217285156, "step": 5259 }, { "epoch": 0.7162309368191722, "grad_norm": 40.93602576404211, "learning_rate": 1.8073505222044844e-07, "logits/chosen": 13.245416641235352, "logits/rejected": 14.245688438415527, "logps/chosen": -3.9483537673950195, "logps/rejected": -4.223426818847656, "loss": 4.2103, "rewards/accuracies": 0.5, "rewards/chosen": -39.48353576660156, "rewards/margins": 2.750734329223633, "rewards/rejected": -42.23426818847656, "step": 5260 }, { "epoch": 0.7163671023965141, "grad_norm": 49.3713826109516, "learning_rate": 1.8057604901075942e-07, "logits/chosen": 13.839300155639648, "logits/rejected": 13.252645492553711, "logps/chosen": -4.426486968994141, "logps/rejected": -4.356073379516602, "loss": 4.2497, "rewards/accuracies": 0.5, "rewards/chosen": -44.26486587524414, "rewards/margins": -0.7041358947753906, "rewards/rejected": -43.56072998046875, "step": 5261 }, { "epoch": 0.7165032679738562, "grad_norm": 37.63718209330773, "learning_rate": 1.8041709538172577e-07, "logits/chosen": 13.419727325439453, "logits/rejected": 14.247419357299805, "logps/chosen": -4.095516204833984, "logps/rejected": -4.6161603927612305, "loss": 3.4322, "rewards/accuracies": 0.75, "rewards/chosen": -40.955162048339844, "rewards/margins": 5.20644474029541, "rewards/rejected": -46.1616096496582, "step": 5262 }, { "epoch": 0.7166394335511983, "grad_norm": 38.51511306673422, "learning_rate": 1.802581913692645e-07, "logits/chosen": 13.56242561340332, "logits/rejected": 13.28704833984375, "logps/chosen": -4.119684219360352, "logps/rejected": -4.266791820526123, "loss": 3.3777, "rewards/accuracies": 0.75, "rewards/chosen": -41.196842193603516, "rewards/margins": 1.4710769653320312, "rewards/rejected": -42.66791915893555, "step": 5263 }, { "epoch": 0.7167755991285403, "grad_norm": 37.33171237474862, "learning_rate": 1.8009933700928142e-07, "logits/chosen": 13.23481273651123, "logits/rejected": 13.278863906860352, "logps/chosen": -4.3710222244262695, "logps/rejected": -4.46769380569458, "loss": 3.9256, "rewards/accuracies": 0.5, "rewards/chosen": -43.71022415161133, "rewards/margins": 0.9667177200317383, "rewards/rejected": -44.67694091796875, "step": 5264 }, { "epoch": 0.7169117647058824, "grad_norm": 36.19542783854546, "learning_rate": 1.7994053233767072e-07, "logits/chosen": 13.490152359008789, "logits/rejected": 13.689277648925781, "logps/chosen": -4.518523216247559, "logps/rejected": -4.654942035675049, "loss": 3.8544, "rewards/accuracies": 0.75, "rewards/chosen": -45.18523406982422, "rewards/margins": 1.3641891479492188, "rewards/rejected": -46.54942321777344, "step": 5265 }, { "epoch": 0.7170479302832244, "grad_norm": 40.71848691630247, "learning_rate": 1.7978177739031577e-07, "logits/chosen": 13.737394332885742, "logits/rejected": 13.350296974182129, "logps/chosen": -4.1377973556518555, "logps/rejected": -4.419515609741211, "loss": 3.347, "rewards/accuracies": 0.75, "rewards/chosen": -41.37797164916992, "rewards/margins": 2.8171844482421875, "rewards/rejected": -44.195152282714844, "step": 5266 }, { "epoch": 0.7171840958605664, "grad_norm": 38.133615225940204, "learning_rate": 1.7962307220308874e-07, "logits/chosen": 13.308826446533203, "logits/rejected": 13.384626388549805, "logps/chosen": -4.232486248016357, "logps/rejected": -4.1753950119018555, "loss": 3.7481, "rewards/accuracies": 0.5, "rewards/chosen": -42.32486343383789, "rewards/margins": -0.5709123611450195, "rewards/rejected": -41.75395202636719, "step": 5267 }, { "epoch": 0.7173202614379085, "grad_norm": 39.28047647417207, "learning_rate": 1.7946441681185003e-07, "logits/chosen": 13.315287590026855, "logits/rejected": 13.521636009216309, "logps/chosen": -4.340226173400879, "logps/rejected": -4.347965240478516, "loss": 4.4838, "rewards/accuracies": 0.5, "rewards/chosen": -43.40226364135742, "rewards/margins": 0.07738971710205078, "rewards/rejected": -43.479652404785156, "step": 5268 }, { "epoch": 0.7174564270152506, "grad_norm": 36.37322345284151, "learning_rate": 1.793058112524493e-07, "logits/chosen": 13.321779251098633, "logits/rejected": 14.36819076538086, "logps/chosen": -4.2349066734313965, "logps/rejected": -4.787091255187988, "loss": 4.0313, "rewards/accuracies": 1.0, "rewards/chosen": -42.349063873291016, "rewards/margins": 5.521847724914551, "rewards/rejected": -47.87091064453125, "step": 5269 }, { "epoch": 0.7175925925925926, "grad_norm": 40.45613180782884, "learning_rate": 1.7914725556072491e-07, "logits/chosen": 13.385724067687988, "logits/rejected": 13.654035568237305, "logps/chosen": -4.25969123840332, "logps/rejected": -4.394413948059082, "loss": 4.1752, "rewards/accuracies": 0.75, "rewards/chosen": -42.5969123840332, "rewards/margins": 1.3472309112548828, "rewards/rejected": -43.94414138793945, "step": 5270 }, { "epoch": 0.7177287581699346, "grad_norm": 45.03702242346411, "learning_rate": 1.7898874977250363e-07, "logits/chosen": 14.295324325561523, "logits/rejected": 14.047357559204102, "logps/chosen": -4.2520294189453125, "logps/rejected": -4.4756669998168945, "loss": 4.035, "rewards/accuracies": 0.75, "rewards/chosen": -42.52029037475586, "rewards/margins": 2.2363786697387695, "rewards/rejected": -44.75666809082031, "step": 5271 }, { "epoch": 0.7178649237472767, "grad_norm": 39.908494014123185, "learning_rate": 1.7883029392360123e-07, "logits/chosen": 13.920914649963379, "logits/rejected": 14.096439361572266, "logps/chosen": -4.13395357131958, "logps/rejected": -4.460721969604492, "loss": 3.9382, "rewards/accuracies": 0.75, "rewards/chosen": -41.339534759521484, "rewards/margins": 3.2676897048950195, "rewards/rejected": -44.60722351074219, "step": 5272 }, { "epoch": 0.7180010893246187, "grad_norm": 42.456313480127825, "learning_rate": 1.7867188804982223e-07, "logits/chosen": 14.097311019897461, "logits/rejected": 15.044742584228516, "logps/chosen": -4.3594512939453125, "logps/rejected": -4.761150360107422, "loss": 4.0284, "rewards/accuracies": 0.5, "rewards/chosen": -43.594512939453125, "rewards/margins": 4.0169878005981445, "rewards/rejected": -47.61150360107422, "step": 5273 }, { "epoch": 0.7181372549019608, "grad_norm": 46.548469080153986, "learning_rate": 1.7851353218695952e-07, "logits/chosen": 13.8590087890625, "logits/rejected": 14.302462577819824, "logps/chosen": -4.445607662200928, "logps/rejected": -5.002684593200684, "loss": 4.0998, "rewards/accuracies": 1.0, "rewards/chosen": -44.456077575683594, "rewards/margins": 5.570767402648926, "rewards/rejected": -50.0268440246582, "step": 5274 }, { "epoch": 0.7182734204793029, "grad_norm": 36.742145269271624, "learning_rate": 1.7835522637079504e-07, "logits/chosen": 13.748876571655273, "logits/rejected": 13.823854446411133, "logps/chosen": -4.448446273803711, "logps/rejected": -4.547637939453125, "loss": 4.3252, "rewards/accuracies": 0.75, "rewards/chosen": -44.48446273803711, "rewards/margins": 0.9919147491455078, "rewards/rejected": -45.47637939453125, "step": 5275 }, { "epoch": 0.7184095860566448, "grad_norm": 42.63246824972856, "learning_rate": 1.7819697063709942e-07, "logits/chosen": 12.992483139038086, "logits/rejected": 13.70673942565918, "logps/chosen": -4.122784614562988, "logps/rejected": -4.274435043334961, "loss": 4.5119, "rewards/accuracies": 0.5, "rewards/chosen": -41.22784423828125, "rewards/margins": 1.5165090560913086, "rewards/rejected": -42.744354248046875, "step": 5276 }, { "epoch": 0.7185457516339869, "grad_norm": 43.22638943669343, "learning_rate": 1.780387650216316e-07, "logits/chosen": 14.042798042297363, "logits/rejected": 13.985877990722656, "logps/chosen": -4.12749719619751, "logps/rejected": -4.431864261627197, "loss": 4.6157, "rewards/accuracies": 0.75, "rewards/chosen": -41.27497100830078, "rewards/margins": 3.043670654296875, "rewards/rejected": -44.318641662597656, "step": 5277 }, { "epoch": 0.718681917211329, "grad_norm": 40.751925984469786, "learning_rate": 1.778806095601396e-07, "logits/chosen": 13.182352066040039, "logits/rejected": 13.959774017333984, "logps/chosen": -3.9314279556274414, "logps/rejected": -4.452472686767578, "loss": 3.1423, "rewards/accuracies": 1.0, "rewards/chosen": -39.31427764892578, "rewards/margins": 5.210452079772949, "rewards/rejected": -44.52473068237305, "step": 5278 }, { "epoch": 0.718818082788671, "grad_norm": 37.86065450184794, "learning_rate": 1.7772250428836002e-07, "logits/chosen": 13.778448104858398, "logits/rejected": 14.585265159606934, "logps/chosen": -4.16370153427124, "logps/rejected": -4.768520832061768, "loss": 3.4494, "rewards/accuracies": 1.0, "rewards/chosen": -41.63701629638672, "rewards/margins": 6.048192977905273, "rewards/rejected": -47.68520736694336, "step": 5279 }, { "epoch": 0.7189542483660131, "grad_norm": 44.39945117269189, "learning_rate": 1.7756444924201786e-07, "logits/chosen": 13.668950080871582, "logits/rejected": 14.087104797363281, "logps/chosen": -4.096561908721924, "logps/rejected": -4.368743419647217, "loss": 3.9196, "rewards/accuracies": 0.75, "rewards/chosen": -40.96561813354492, "rewards/margins": 2.7218151092529297, "rewards/rejected": -43.68743133544922, "step": 5280 }, { "epoch": 0.7190904139433552, "grad_norm": 44.73140696815867, "learning_rate": 1.7740644445682701e-07, "logits/chosen": 13.76528549194336, "logits/rejected": 14.578125953674316, "logps/chosen": -4.2249908447265625, "logps/rejected": -4.729568004608154, "loss": 3.9713, "rewards/accuracies": 1.0, "rewards/chosen": -42.249908447265625, "rewards/margins": 5.045774459838867, "rewards/rejected": -47.295684814453125, "step": 5281 }, { "epoch": 0.7192265795206971, "grad_norm": 39.35896936454477, "learning_rate": 1.772484899684902e-07, "logits/chosen": 14.409988403320312, "logits/rejected": 14.41226577758789, "logps/chosen": -4.384588241577148, "logps/rejected": -4.440824508666992, "loss": 3.5691, "rewards/accuracies": 0.75, "rewards/chosen": -43.84588623046875, "rewards/margins": 0.5623598098754883, "rewards/rejected": -44.40824508666992, "step": 5282 }, { "epoch": 0.7193627450980392, "grad_norm": 39.60875821537268, "learning_rate": 1.770905858126982e-07, "logits/chosen": 14.140752792358398, "logits/rejected": 14.711127281188965, "logps/chosen": -4.459033966064453, "logps/rejected": -4.814726829528809, "loss": 4.0611, "rewards/accuracies": 0.75, "rewards/chosen": -44.5903434753418, "rewards/margins": 3.5569257736206055, "rewards/rejected": -48.14726638793945, "step": 5283 }, { "epoch": 0.7194989106753813, "grad_norm": 40.11678467424444, "learning_rate": 1.7693273202513096e-07, "logits/chosen": 13.63180160522461, "logits/rejected": 13.954015731811523, "logps/chosen": -4.334884166717529, "logps/rejected": -4.651087760925293, "loss": 3.4716, "rewards/accuracies": 0.75, "rewards/chosen": -43.34884262084961, "rewards/margins": 3.1620311737060547, "rewards/rejected": -46.5108757019043, "step": 5284 }, { "epoch": 0.7196350762527233, "grad_norm": 41.58878770861021, "learning_rate": 1.7677492864145678e-07, "logits/chosen": 14.271673202514648, "logits/rejected": 13.926475524902344, "logps/chosen": -4.121633529663086, "logps/rejected": -4.275425434112549, "loss": 4.1114, "rewards/accuracies": 0.75, "rewards/chosen": -41.216339111328125, "rewards/margins": 1.537917137145996, "rewards/rejected": -42.75425720214844, "step": 5285 }, { "epoch": 0.7197712418300654, "grad_norm": 39.948183518574794, "learning_rate": 1.7661717569733284e-07, "logits/chosen": 14.395204544067383, "logits/rejected": 13.846086502075195, "logps/chosen": -4.6486735343933105, "logps/rejected": -4.661042213439941, "loss": 3.9377, "rewards/accuracies": 0.25, "rewards/chosen": -46.48673629760742, "rewards/margins": 0.12368202209472656, "rewards/rejected": -46.61042022705078, "step": 5286 }, { "epoch": 0.7199074074074074, "grad_norm": 38.68972589433881, "learning_rate": 1.7645947322840437e-07, "logits/chosen": 13.896003723144531, "logits/rejected": 14.212688446044922, "logps/chosen": -4.356585502624512, "logps/rejected": -4.570980072021484, "loss": 3.5909, "rewards/accuracies": 0.75, "rewards/chosen": -43.56584930419922, "rewards/margins": 2.14394474029541, "rewards/rejected": -45.70979690551758, "step": 5287 }, { "epoch": 0.7200435729847494, "grad_norm": 43.796644416354795, "learning_rate": 1.7630182127030576e-07, "logits/chosen": 13.772363662719727, "logits/rejected": 14.020488739013672, "logps/chosen": -4.372942924499512, "logps/rejected": -4.347577095031738, "loss": 4.5216, "rewards/accuracies": 0.25, "rewards/chosen": -43.72943115234375, "rewards/margins": -0.25365638732910156, "rewards/rejected": -43.47576904296875, "step": 5288 }, { "epoch": 0.7201797385620915, "grad_norm": 38.32717531114032, "learning_rate": 1.7614421985865984e-07, "logits/chosen": 14.402667999267578, "logits/rejected": 13.790016174316406, "logps/chosen": -4.523467063903809, "logps/rejected": -4.455845832824707, "loss": 3.8161, "rewards/accuracies": 0.75, "rewards/chosen": -45.23467254638672, "rewards/margins": -0.6762180328369141, "rewards/rejected": -44.55845642089844, "step": 5289 }, { "epoch": 0.7203159041394336, "grad_norm": 39.8669332839089, "learning_rate": 1.7598666902907776e-07, "logits/chosen": 13.757298469543457, "logits/rejected": 14.317595481872559, "logps/chosen": -4.277918815612793, "logps/rejected": -4.821055889129639, "loss": 4.1123, "rewards/accuracies": 1.0, "rewards/chosen": -42.77919006347656, "rewards/margins": 5.431367874145508, "rewards/rejected": -48.2105598449707, "step": 5290 }, { "epoch": 0.7204520697167756, "grad_norm": 39.52142056179599, "learning_rate": 1.758291688171595e-07, "logits/chosen": 13.380620956420898, "logits/rejected": 13.390384674072266, "logps/chosen": -4.0703606605529785, "logps/rejected": -4.179445743560791, "loss": 4.1783, "rewards/accuracies": 0.75, "rewards/chosen": -40.70360565185547, "rewards/margins": 1.0908517837524414, "rewards/rejected": -41.794456481933594, "step": 5291 }, { "epoch": 0.7205882352941176, "grad_norm": 42.59012162866563, "learning_rate": 1.7567171925849354e-07, "logits/chosen": 13.372644424438477, "logits/rejected": 13.87704849243164, "logps/chosen": -4.37910270690918, "logps/rejected": -4.485930919647217, "loss": 3.9022, "rewards/accuracies": 0.25, "rewards/chosen": -43.7910270690918, "rewards/margins": 1.068282127380371, "rewards/rejected": -44.85930633544922, "step": 5292 }, { "epoch": 0.7207244008714597, "grad_norm": 38.73254931434997, "learning_rate": 1.7551432038865714e-07, "logits/chosen": 13.432474136352539, "logits/rejected": 14.351064682006836, "logps/chosen": -4.197003364562988, "logps/rejected": -4.713120460510254, "loss": 3.427, "rewards/accuracies": 0.75, "rewards/chosen": -41.97003173828125, "rewards/margins": 5.161167144775391, "rewards/rejected": -47.131202697753906, "step": 5293 }, { "epoch": 0.7208605664488017, "grad_norm": 42.167496998067826, "learning_rate": 1.7535697224321546e-07, "logits/chosen": 13.359333038330078, "logits/rejected": 13.317361831665039, "logps/chosen": -4.415389060974121, "logps/rejected": -4.386659145355225, "loss": 4.0266, "rewards/accuracies": 0.5, "rewards/chosen": -44.153892517089844, "rewards/margins": -0.28729915618896484, "rewards/rejected": -43.86659240722656, "step": 5294 }, { "epoch": 0.7209967320261438, "grad_norm": 44.41581327612599, "learning_rate": 1.7519967485772286e-07, "logits/chosen": 13.836820602416992, "logits/rejected": 14.15213680267334, "logps/chosen": -4.554731845855713, "logps/rejected": -4.905880928039551, "loss": 3.2437, "rewards/accuracies": 1.0, "rewards/chosen": -45.54732131958008, "rewards/margins": 3.511488914489746, "rewards/rejected": -49.05881118774414, "step": 5295 }, { "epoch": 0.7211328976034859, "grad_norm": 37.26835005025409, "learning_rate": 1.7504242826772208e-07, "logits/chosen": 13.623611450195312, "logits/rejected": 14.160235404968262, "logps/chosen": -4.450505256652832, "logps/rejected": -4.851777076721191, "loss": 3.5399, "rewards/accuracies": 1.0, "rewards/chosen": -44.50505828857422, "rewards/margins": 4.012712478637695, "rewards/rejected": -48.51776885986328, "step": 5296 }, { "epoch": 0.7212690631808278, "grad_norm": 39.311357358217926, "learning_rate": 1.74885232508744e-07, "logits/chosen": 14.135232925415039, "logits/rejected": 14.623618125915527, "logps/chosen": -4.237855911254883, "logps/rejected": -4.475363731384277, "loss": 3.6086, "rewards/accuracies": 1.0, "rewards/chosen": -42.378562927246094, "rewards/margins": 2.3750743865966797, "rewards/rejected": -44.753639221191406, "step": 5297 }, { "epoch": 0.7214052287581699, "grad_norm": 45.389546484915776, "learning_rate": 1.7472808761630845e-07, "logits/chosen": 14.27563762664795, "logits/rejected": 14.963302612304688, "logps/chosen": -4.693572044372559, "logps/rejected": -4.751769065856934, "loss": 4.2385, "rewards/accuracies": 0.5, "rewards/chosen": -46.93572235107422, "rewards/margins": 0.5819711685180664, "rewards/rejected": -47.51769256591797, "step": 5298 }, { "epoch": 0.721541394335512, "grad_norm": 40.953307448906465, "learning_rate": 1.745709936259236e-07, "logits/chosen": 14.594070434570312, "logits/rejected": 15.119372367858887, "logps/chosen": -4.469764232635498, "logps/rejected": -4.631598472595215, "loss": 3.9671, "rewards/accuracies": 0.75, "rewards/chosen": -44.6976432800293, "rewards/margins": 1.6183404922485352, "rewards/rejected": -46.315982818603516, "step": 5299 }, { "epoch": 0.721677559912854, "grad_norm": 39.92007580632734, "learning_rate": 1.7441395057308634e-07, "logits/chosen": 14.561527252197266, "logits/rejected": 14.491929054260254, "logps/chosen": -4.809345245361328, "logps/rejected": -4.983954429626465, "loss": 3.6165, "rewards/accuracies": 0.75, "rewards/chosen": -48.09345626831055, "rewards/margins": 1.7460908889770508, "rewards/rejected": -49.83954620361328, "step": 5300 }, { "epoch": 0.7218137254901961, "grad_norm": 43.94423539450717, "learning_rate": 1.742569584932815e-07, "logits/chosen": 14.383089065551758, "logits/rejected": 15.208414077758789, "logps/chosen": -4.244617462158203, "logps/rejected": -4.566301345825195, "loss": 3.9108, "rewards/accuracies": 0.75, "rewards/chosen": -42.44617462158203, "rewards/margins": 3.2168378829956055, "rewards/rejected": -45.66301345825195, "step": 5301 }, { "epoch": 0.7219498910675382, "grad_norm": 41.61894728060178, "learning_rate": 1.7410001742198288e-07, "logits/chosen": 13.593363761901855, "logits/rejected": 13.926898956298828, "logps/chosen": -4.305217742919922, "logps/rejected": -4.347611427307129, "loss": 3.9753, "rewards/accuracies": 0.5, "rewards/chosen": -43.05217742919922, "rewards/margins": 0.42393970489501953, "rewards/rejected": -43.47611618041992, "step": 5302 }, { "epoch": 0.7220860566448801, "grad_norm": 41.091917324977935, "learning_rate": 1.7394312739465282e-07, "logits/chosen": 14.432790756225586, "logits/rejected": 13.750204086303711, "logps/chosen": -4.75048828125, "logps/rejected": -4.5340118408203125, "loss": 3.985, "rewards/accuracies": 0.25, "rewards/chosen": -47.5048828125, "rewards/margins": -2.164767265319824, "rewards/rejected": -45.34011459350586, "step": 5303 }, { "epoch": 0.7222222222222222, "grad_norm": 46.815421176268366, "learning_rate": 1.7378628844674154e-07, "logits/chosen": 13.751337051391602, "logits/rejected": 13.981904983520508, "logps/chosen": -4.547189712524414, "logps/rejected": -4.597448348999023, "loss": 3.5992, "rewards/accuracies": 0.75, "rewards/chosen": -45.47189712524414, "rewards/margins": 0.5025854110717773, "rewards/rejected": -45.97447967529297, "step": 5304 }, { "epoch": 0.7223583877995643, "grad_norm": 41.16574721031467, "learning_rate": 1.736295006136883e-07, "logits/chosen": 13.949495315551758, "logits/rejected": 13.759418487548828, "logps/chosen": -4.454538345336914, "logps/rejected": -4.444957256317139, "loss": 3.9074, "rewards/accuracies": 0.5, "rewards/chosen": -44.545387268066406, "rewards/margins": -0.0958108901977539, "rewards/rejected": -44.44957733154297, "step": 5305 }, { "epoch": 0.7224945533769063, "grad_norm": 43.163908647775635, "learning_rate": 1.7347276393092076e-07, "logits/chosen": 14.394142150878906, "logits/rejected": 14.298407554626465, "logps/chosen": -4.390780925750732, "logps/rejected": -4.593761444091797, "loss": 4.4266, "rewards/accuracies": 0.75, "rewards/chosen": -43.907806396484375, "rewards/margins": 2.0298023223876953, "rewards/rejected": -45.9376106262207, "step": 5306 }, { "epoch": 0.7226307189542484, "grad_norm": 37.41044482427421, "learning_rate": 1.7331607843385454e-07, "logits/chosen": 13.573875427246094, "logits/rejected": 13.923763275146484, "logps/chosen": -4.2721405029296875, "logps/rejected": -4.543745040893555, "loss": 3.6773, "rewards/accuracies": 1.0, "rewards/chosen": -42.72140884399414, "rewards/margins": 2.7160444259643555, "rewards/rejected": -45.43745422363281, "step": 5307 }, { "epoch": 0.7227668845315904, "grad_norm": 39.58475192009733, "learning_rate": 1.731594441578942e-07, "logits/chosen": 14.317742347717285, "logits/rejected": 14.800070762634277, "logps/chosen": -4.693855285644531, "logps/rejected": -4.608596324920654, "loss": 3.8951, "rewards/accuracies": 0.25, "rewards/chosen": -46.93855667114258, "rewards/margins": -0.8525934219360352, "rewards/rejected": -46.085960388183594, "step": 5308 }, { "epoch": 0.7229030501089324, "grad_norm": 40.47099156388713, "learning_rate": 1.7300286113843266e-07, "logits/chosen": 12.656148910522461, "logits/rejected": 13.260242462158203, "logps/chosen": -3.974816083908081, "logps/rejected": -4.393404960632324, "loss": 3.8914, "rewards/accuracies": 0.75, "rewards/chosen": -39.74816131591797, "rewards/margins": 4.185888290405273, "rewards/rejected": -43.93404769897461, "step": 5309 }, { "epoch": 0.7230392156862745, "grad_norm": 40.57846983027119, "learning_rate": 1.728463294108509e-07, "logits/chosen": 13.967252731323242, "logits/rejected": 13.764625549316406, "logps/chosen": -4.553577423095703, "logps/rejected": -4.680356502532959, "loss": 3.8326, "rewards/accuracies": 0.5, "rewards/chosen": -45.535770416259766, "rewards/margins": 1.2677946090698242, "rewards/rejected": -46.803565979003906, "step": 5310 }, { "epoch": 0.7231753812636166, "grad_norm": 45.16696184870268, "learning_rate": 1.726898490105187e-07, "logits/chosen": 13.656841278076172, "logits/rejected": 13.934621810913086, "logps/chosen": -4.571840286254883, "logps/rejected": -4.62833309173584, "loss": 3.8963, "rewards/accuracies": 0.75, "rewards/chosen": -45.71839904785156, "rewards/margins": 0.5649318695068359, "rewards/rejected": -46.28333282470703, "step": 5311 }, { "epoch": 0.7233115468409586, "grad_norm": 38.74191438273607, "learning_rate": 1.725334199727942e-07, "logits/chosen": 13.030096054077148, "logits/rejected": 14.377866744995117, "logps/chosen": -4.2470479011535645, "logps/rejected": -4.600725173950195, "loss": 3.8047, "rewards/accuracies": 0.75, "rewards/chosen": -42.47047805786133, "rewards/margins": 3.536773681640625, "rewards/rejected": -46.00725173950195, "step": 5312 }, { "epoch": 0.7234477124183006, "grad_norm": 38.23376396177733, "learning_rate": 1.7237704233302353e-07, "logits/chosen": 14.086050033569336, "logits/rejected": 14.124160766601562, "logps/chosen": -4.355698585510254, "logps/rejected": -4.379729270935059, "loss": 3.2186, "rewards/accuracies": 0.5, "rewards/chosen": -43.556983947753906, "rewards/margins": 0.24030780792236328, "rewards/rejected": -43.79729080200195, "step": 5313 }, { "epoch": 0.7235838779956427, "grad_norm": 40.94391959933858, "learning_rate": 1.7222071612654174e-07, "logits/chosen": 14.119171142578125, "logits/rejected": 14.855941772460938, "logps/chosen": -4.4756035804748535, "logps/rejected": -4.477415084838867, "loss": 4.1812, "rewards/accuracies": 0.5, "rewards/chosen": -44.75603485107422, "rewards/margins": 0.018115997314453125, "rewards/rejected": -44.77415466308594, "step": 5314 }, { "epoch": 0.7237200435729847, "grad_norm": 42.11620798327522, "learning_rate": 1.720644413886721e-07, "logits/chosen": 13.825103759765625, "logits/rejected": 14.531824111938477, "logps/chosen": -4.489262580871582, "logps/rejected": -4.888883590698242, "loss": 4.5029, "rewards/accuracies": 0.75, "rewards/chosen": -44.89262390136719, "rewards/margins": 3.9962081909179688, "rewards/rejected": -48.88883590698242, "step": 5315 }, { "epoch": 0.7238562091503268, "grad_norm": 42.85410863402914, "learning_rate": 1.7190821815472595e-07, "logits/chosen": 13.624652862548828, "logits/rejected": 13.603315353393555, "logps/chosen": -4.421253204345703, "logps/rejected": -4.5376176834106445, "loss": 3.6552, "rewards/accuracies": 0.5, "rewards/chosen": -44.21253204345703, "rewards/margins": 1.163644790649414, "rewards/rejected": -45.37617874145508, "step": 5316 }, { "epoch": 0.7239923747276689, "grad_norm": 44.71828368283522, "learning_rate": 1.717520464600033e-07, "logits/chosen": 14.118568420410156, "logits/rejected": 13.710359573364258, "logps/chosen": -4.467672348022461, "logps/rejected": -4.6451520919799805, "loss": 4.2652, "rewards/accuracies": 0.75, "rewards/chosen": -44.676727294921875, "rewards/margins": 1.7747974395751953, "rewards/rejected": -46.45152282714844, "step": 5317 }, { "epoch": 0.724128540305011, "grad_norm": 43.68520898272241, "learning_rate": 1.7159592633979263e-07, "logits/chosen": 13.293949127197266, "logits/rejected": 13.93215560913086, "logps/chosen": -4.200082778930664, "logps/rejected": -4.222140789031982, "loss": 4.0183, "rewards/accuracies": 0.5, "rewards/chosen": -42.000823974609375, "rewards/margins": 0.22058486938476562, "rewards/rejected": -42.22140884399414, "step": 5318 }, { "epoch": 0.7242647058823529, "grad_norm": 39.52000323177116, "learning_rate": 1.7143985782937026e-07, "logits/chosen": 13.639881134033203, "logits/rejected": 14.69310188293457, "logps/chosen": -4.1834716796875, "logps/rejected": -4.804802894592285, "loss": 3.4187, "rewards/accuracies": 1.0, "rewards/chosen": -41.834716796875, "rewards/margins": 6.213313102722168, "rewards/rejected": -48.04802703857422, "step": 5319 }, { "epoch": 0.724400871459695, "grad_norm": 40.52377364467854, "learning_rate": 1.7128384096400136e-07, "logits/chosen": 13.810139656066895, "logits/rejected": 14.340068817138672, "logps/chosen": -4.223726272583008, "logps/rejected": -4.439934253692627, "loss": 4.3464, "rewards/accuracies": 0.75, "rewards/chosen": -42.23726272583008, "rewards/margins": 2.1620826721191406, "rewards/rejected": -44.39934158325195, "step": 5320 }, { "epoch": 0.7245370370370371, "grad_norm": 38.72868160103529, "learning_rate": 1.711278757789393e-07, "logits/chosen": 12.946178436279297, "logits/rejected": 13.352394104003906, "logps/chosen": -4.2389607429504395, "logps/rejected": -4.5612688064575195, "loss": 3.6301, "rewards/accuracies": 0.75, "rewards/chosen": -42.38960647583008, "rewards/margins": 3.223081588745117, "rewards/rejected": -45.61268615722656, "step": 5321 }, { "epoch": 0.7246732026143791, "grad_norm": 39.413541127175435, "learning_rate": 1.7097196230942542e-07, "logits/chosen": 13.944467544555664, "logits/rejected": 14.312068939208984, "logps/chosen": -4.3175225257873535, "logps/rejected": -4.402750015258789, "loss": 3.3554, "rewards/accuracies": 0.25, "rewards/chosen": -43.175228118896484, "rewards/margins": 0.8522739410400391, "rewards/rejected": -44.027503967285156, "step": 5322 }, { "epoch": 0.7248093681917211, "grad_norm": 37.41414287600041, "learning_rate": 1.708161005906898e-07, "logits/chosen": 13.600336074829102, "logits/rejected": 13.188512802124023, "logps/chosen": -4.086445331573486, "logps/rejected": -4.466186046600342, "loss": 3.5667, "rewards/accuracies": 0.75, "rewards/chosen": -40.86445236206055, "rewards/margins": 3.7974090576171875, "rewards/rejected": -44.661861419677734, "step": 5323 }, { "epoch": 0.7249455337690632, "grad_norm": 40.06519440846429, "learning_rate": 1.7066029065795088e-07, "logits/chosen": 14.171215057373047, "logits/rejected": 14.389854431152344, "logps/chosen": -4.637645244598389, "logps/rejected": -4.620676040649414, "loss": 4.2018, "rewards/accuracies": 0.5, "rewards/chosen": -46.37645721435547, "rewards/margins": -0.1696929931640625, "rewards/rejected": -46.20676040649414, "step": 5324 }, { "epoch": 0.7250816993464052, "grad_norm": 42.11290605387176, "learning_rate": 1.705045325464149e-07, "logits/chosen": 13.838277816772461, "logits/rejected": 13.954244613647461, "logps/chosen": -4.24603271484375, "logps/rejected": -4.480013847351074, "loss": 3.7443, "rewards/accuracies": 0.5, "rewards/chosen": -42.460323333740234, "rewards/margins": 2.339811325073242, "rewards/rejected": -44.80013656616211, "step": 5325 }, { "epoch": 0.7252178649237473, "grad_norm": 35.68559173361203, "learning_rate": 1.703488262912768e-07, "logits/chosen": 13.783573150634766, "logits/rejected": 14.674337387084961, "logps/chosen": -4.392226219177246, "logps/rejected": -4.499256610870361, "loss": 3.2029, "rewards/accuracies": 0.75, "rewards/chosen": -43.922264099121094, "rewards/margins": 1.0703058242797852, "rewards/rejected": -44.99256896972656, "step": 5326 }, { "epoch": 0.7253540305010894, "grad_norm": 44.36382469319832, "learning_rate": 1.7019317192771988e-07, "logits/chosen": 13.998148918151855, "logits/rejected": 14.558667182922363, "logps/chosen": -4.279669761657715, "logps/rejected": -4.713833808898926, "loss": 4.6352, "rewards/accuracies": 0.75, "rewards/chosen": -42.79669952392578, "rewards/margins": 4.34163761138916, "rewards/rejected": -47.138336181640625, "step": 5327 }, { "epoch": 0.7254901960784313, "grad_norm": 40.89160581428982, "learning_rate": 1.7003756949091518e-07, "logits/chosen": 13.129535675048828, "logits/rejected": 13.436159133911133, "logps/chosen": -4.022641181945801, "logps/rejected": -4.458831787109375, "loss": 4.1132, "rewards/accuracies": 1.0, "rewards/chosen": -40.22641372680664, "rewards/margins": 4.361903190612793, "rewards/rejected": -44.58831787109375, "step": 5328 }, { "epoch": 0.7256263616557734, "grad_norm": 50.96113804532442, "learning_rate": 1.6988201901602258e-07, "logits/chosen": 13.688905715942383, "logits/rejected": 14.186742782592773, "logps/chosen": -4.264374732971191, "logps/rejected": -4.573559284210205, "loss": 4.4605, "rewards/accuracies": 0.75, "rewards/chosen": -42.64374923706055, "rewards/margins": 3.0918455123901367, "rewards/rejected": -45.735595703125, "step": 5329 }, { "epoch": 0.7257625272331155, "grad_norm": 39.25896987479908, "learning_rate": 1.6972652053819004e-07, "logits/chosen": 15.082693099975586, "logits/rejected": 14.607536315917969, "logps/chosen": -4.554288864135742, "logps/rejected": -4.940080165863037, "loss": 3.5261, "rewards/accuracies": 0.75, "rewards/chosen": -45.54288864135742, "rewards/margins": 3.8579111099243164, "rewards/rejected": -49.40080261230469, "step": 5330 }, { "epoch": 0.7258986928104575, "grad_norm": 42.40427333096051, "learning_rate": 1.6957107409255355e-07, "logits/chosen": 13.576107025146484, "logits/rejected": 12.804834365844727, "logps/chosen": -4.610451698303223, "logps/rejected": -4.46906852722168, "loss": 4.2005, "rewards/accuracies": 0.25, "rewards/chosen": -46.104515075683594, "rewards/margins": -1.4138307571411133, "rewards/rejected": -44.6906852722168, "step": 5331 }, { "epoch": 0.7260348583877996, "grad_norm": 40.35520311443304, "learning_rate": 1.694156797142376e-07, "logits/chosen": 14.046051025390625, "logits/rejected": 14.395206451416016, "logps/chosen": -4.811518669128418, "logps/rejected": -4.603376865386963, "loss": 3.4904, "rewards/accuracies": 0.25, "rewards/chosen": -48.11518478393555, "rewards/margins": -2.081418037414551, "rewards/rejected": -46.03376770019531, "step": 5332 }, { "epoch": 0.7261710239651417, "grad_norm": 43.198277729357415, "learning_rate": 1.6926033743835503e-07, "logits/chosen": 14.054119110107422, "logits/rejected": 13.903935432434082, "logps/chosen": -4.375255584716797, "logps/rejected": -4.47501277923584, "loss": 3.3785, "rewards/accuracies": 0.75, "rewards/chosen": -43.75255584716797, "rewards/margins": 0.9975709915161133, "rewards/rejected": -44.75012969970703, "step": 5333 }, { "epoch": 0.7263071895424836, "grad_norm": 39.74537210595247, "learning_rate": 1.6910504730000635e-07, "logits/chosen": 14.12202262878418, "logits/rejected": 14.13705062866211, "logps/chosen": -4.369760513305664, "logps/rejected": -4.384034633636475, "loss": 3.8477, "rewards/accuracies": 0.75, "rewards/chosen": -43.69760513305664, "rewards/margins": 0.14274120330810547, "rewards/rejected": -43.84034729003906, "step": 5334 }, { "epoch": 0.7264433551198257, "grad_norm": 39.58060877988468, "learning_rate": 1.6894980933428085e-07, "logits/chosen": 13.954996109008789, "logits/rejected": 14.346397399902344, "logps/chosen": -4.231977462768555, "logps/rejected": -4.743593215942383, "loss": 3.6832, "rewards/accuracies": 0.75, "rewards/chosen": -42.31977462768555, "rewards/margins": 5.116157531738281, "rewards/rejected": -47.43593215942383, "step": 5335 }, { "epoch": 0.7265795206971678, "grad_norm": 42.03692255858366, "learning_rate": 1.6879462357625592e-07, "logits/chosen": 13.710368156433105, "logits/rejected": 14.706161499023438, "logps/chosen": -4.222752571105957, "logps/rejected": -4.699319839477539, "loss": 3.6819, "rewards/accuracies": 0.75, "rewards/chosen": -42.2275276184082, "rewards/margins": 4.7656707763671875, "rewards/rejected": -46.99319839477539, "step": 5336 }, { "epoch": 0.7267156862745098, "grad_norm": 41.21014573315123, "learning_rate": 1.6863949006099684e-07, "logits/chosen": 14.500325202941895, "logits/rejected": 14.578434944152832, "logps/chosen": -4.735601425170898, "logps/rejected": -4.685805320739746, "loss": 3.6637, "rewards/accuracies": 0.25, "rewards/chosen": -47.35601806640625, "rewards/margins": -0.49796485900878906, "rewards/rejected": -46.858055114746094, "step": 5337 }, { "epoch": 0.7268518518518519, "grad_norm": 45.80213032779978, "learning_rate": 1.6848440882355744e-07, "logits/chosen": 13.291422843933105, "logits/rejected": 13.799189567565918, "logps/chosen": -3.906358003616333, "logps/rejected": -4.354409217834473, "loss": 3.9425, "rewards/accuracies": 1.0, "rewards/chosen": -39.06357955932617, "rewards/margins": 4.480511665344238, "rewards/rejected": -43.544090270996094, "step": 5338 }, { "epoch": 0.726988017429194, "grad_norm": 42.05344805102445, "learning_rate": 1.6832937989897967e-07, "logits/chosen": 14.892117500305176, "logits/rejected": 14.605493545532227, "logps/chosen": -4.773000717163086, "logps/rejected": -4.704817771911621, "loss": 3.8717, "rewards/accuracies": 0.25, "rewards/chosen": -47.73000717163086, "rewards/margins": -0.6818313598632812, "rewards/rejected": -47.04817581176758, "step": 5339 }, { "epoch": 0.7271241830065359, "grad_norm": 42.61489402639697, "learning_rate": 1.6817440332229346e-07, "logits/chosen": 14.126998901367188, "logits/rejected": 13.268610000610352, "logps/chosen": -4.392580032348633, "logps/rejected": -3.9743106365203857, "loss": 3.8961, "rewards/accuracies": 0.0, "rewards/chosen": -43.92579650878906, "rewards/margins": -4.18269157409668, "rewards/rejected": -39.74310302734375, "step": 5340 }, { "epoch": 0.727260348583878, "grad_norm": 39.83616761329271, "learning_rate": 1.6801947912851703e-07, "logits/chosen": 14.471580505371094, "logits/rejected": 14.037025451660156, "logps/chosen": -4.434363842010498, "logps/rejected": -4.58000373840332, "loss": 3.6803, "rewards/accuracies": 0.75, "rewards/chosen": -44.3436393737793, "rewards/margins": 1.4563941955566406, "rewards/rejected": -45.80003356933594, "step": 5341 }, { "epoch": 0.7273965141612201, "grad_norm": 39.60662024422968, "learning_rate": 1.6786460735265706e-07, "logits/chosen": 13.960771560668945, "logits/rejected": 14.733766555786133, "logps/chosen": -4.516231536865234, "logps/rejected": -5.033801078796387, "loss": 4.0698, "rewards/accuracies": 1.0, "rewards/chosen": -45.162315368652344, "rewards/margins": 5.1757001876831055, "rewards/rejected": -50.3380126953125, "step": 5342 }, { "epoch": 0.7275326797385621, "grad_norm": 39.92837395258977, "learning_rate": 1.6770978802970776e-07, "logits/chosen": 12.820672035217285, "logits/rejected": 14.142887115478516, "logps/chosen": -3.972249984741211, "logps/rejected": -4.371034622192383, "loss": 3.5626, "rewards/accuracies": 1.0, "rewards/chosen": -39.72249984741211, "rewards/margins": 3.9878482818603516, "rewards/rejected": -43.710350036621094, "step": 5343 }, { "epoch": 0.7276688453159041, "grad_norm": 41.970961101754504, "learning_rate": 1.6755502119465197e-07, "logits/chosen": 14.405073165893555, "logits/rejected": 15.128850936889648, "logps/chosen": -4.7121171951293945, "logps/rejected": -5.0237507820129395, "loss": 4.1758, "rewards/accuracies": 1.0, "rewards/chosen": -47.12117004394531, "rewards/margins": 3.116335868835449, "rewards/rejected": -50.23750686645508, "step": 5344 }, { "epoch": 0.7278050108932462, "grad_norm": 42.71901509924737, "learning_rate": 1.674003068824607e-07, "logits/chosen": 13.374438285827637, "logits/rejected": 13.771909713745117, "logps/chosen": -4.2887163162231445, "logps/rejected": -4.500553131103516, "loss": 4.3779, "rewards/accuracies": 0.5, "rewards/chosen": -42.88716125488281, "rewards/margins": 2.1183691024780273, "rewards/rejected": -45.005531311035156, "step": 5345 }, { "epoch": 0.7279411764705882, "grad_norm": 43.88977427719134, "learning_rate": 1.6724564512809266e-07, "logits/chosen": 13.857912063598633, "logits/rejected": 15.304317474365234, "logps/chosen": -4.4029541015625, "logps/rejected": -5.207023620605469, "loss": 3.434, "rewards/accuracies": 1.0, "rewards/chosen": -44.029541015625, "rewards/margins": 8.040694236755371, "rewards/rejected": -52.07023620605469, "step": 5346 }, { "epoch": 0.7280773420479303, "grad_norm": 43.0571425831431, "learning_rate": 1.6709103596649502e-07, "logits/chosen": 12.624239921569824, "logits/rejected": 13.82234001159668, "logps/chosen": -4.092823028564453, "logps/rejected": -4.570530414581299, "loss": 4.1783, "rewards/accuracies": 1.0, "rewards/chosen": -40.928226470947266, "rewards/margins": 4.7770795822143555, "rewards/rejected": -45.70530700683594, "step": 5347 }, { "epoch": 0.7282135076252724, "grad_norm": 41.388489407906235, "learning_rate": 1.6693647943260323e-07, "logits/chosen": 13.027088165283203, "logits/rejected": 14.092461585998535, "logps/chosen": -4.167001724243164, "logps/rejected": -4.68600606918335, "loss": 3.5179, "rewards/accuracies": 1.0, "rewards/chosen": -41.67001724243164, "rewards/margins": 5.190044403076172, "rewards/rejected": -46.86006164550781, "step": 5348 }, { "epoch": 0.7283496732026143, "grad_norm": 41.531891111450015, "learning_rate": 1.667819755613403e-07, "logits/chosen": 14.174455642700195, "logits/rejected": 14.157655715942383, "logps/chosen": -4.515721797943115, "logps/rejected": -4.695714950561523, "loss": 3.8791, "rewards/accuracies": 0.5, "rewards/chosen": -45.15721893310547, "rewards/margins": 1.7999267578125, "rewards/rejected": -46.95714569091797, "step": 5349 }, { "epoch": 0.7284858387799564, "grad_norm": 40.171259127313114, "learning_rate": 1.6662752438761776e-07, "logits/chosen": 14.179994583129883, "logits/rejected": 14.31795883178711, "logps/chosen": -4.820887565612793, "logps/rejected": -4.673914432525635, "loss": 3.8426, "rewards/accuracies": 0.5, "rewards/chosen": -48.2088737487793, "rewards/margins": -1.4697294235229492, "rewards/rejected": -46.73914337158203, "step": 5350 }, { "epoch": 0.7286220043572985, "grad_norm": 40.776328942090196, "learning_rate": 1.6647312594633532e-07, "logits/chosen": 14.484428405761719, "logits/rejected": 14.816064834594727, "logps/chosen": -4.618357181549072, "logps/rejected": -4.813589096069336, "loss": 4.0923, "rewards/accuracies": 0.5, "rewards/chosen": -46.18357467651367, "rewards/margins": 1.952315330505371, "rewards/rejected": -48.135887145996094, "step": 5351 }, { "epoch": 0.7287581699346405, "grad_norm": 37.9718637272598, "learning_rate": 1.6631878027238027e-07, "logits/chosen": 14.256725311279297, "logits/rejected": 14.160528182983398, "logps/chosen": -4.498255252838135, "logps/rejected": -4.547944068908691, "loss": 3.7765, "rewards/accuracies": 0.5, "rewards/chosen": -44.98255157470703, "rewards/margins": 0.4968910217285156, "rewards/rejected": -45.47944641113281, "step": 5352 }, { "epoch": 0.7288943355119826, "grad_norm": 43.79692771300294, "learning_rate": 1.6616448740062845e-07, "logits/chosen": 13.69202995300293, "logits/rejected": 13.747817993164062, "logps/chosen": -4.538578033447266, "logps/rejected": -4.369492530822754, "loss": 4.3851, "rewards/accuracies": 0.25, "rewards/chosen": -45.385780334472656, "rewards/margins": -1.6908540725708008, "rewards/rejected": -43.694923400878906, "step": 5353 }, { "epoch": 0.7290305010893247, "grad_norm": 44.89863042413564, "learning_rate": 1.6601024736594376e-07, "logits/chosen": 14.147443771362305, "logits/rejected": 14.103006362915039, "logps/chosen": -4.784233093261719, "logps/rejected": -4.615768909454346, "loss": 4.0431, "rewards/accuracies": 0.25, "rewards/chosen": -47.84233093261719, "rewards/margins": -1.6846437454223633, "rewards/rejected": -46.157684326171875, "step": 5354 }, { "epoch": 0.7291666666666666, "grad_norm": 40.89850981462882, "learning_rate": 1.6585606020317772e-07, "logits/chosen": 14.092523574829102, "logits/rejected": 15.144055366516113, "logps/chosen": -4.436549663543701, "logps/rejected": -5.090799331665039, "loss": 3.6198, "rewards/accuracies": 1.0, "rewards/chosen": -44.36549377441406, "rewards/margins": 6.542498588562012, "rewards/rejected": -50.907997131347656, "step": 5355 }, { "epoch": 0.7293028322440087, "grad_norm": 41.91618046340408, "learning_rate": 1.6570192594717032e-07, "logits/chosen": 13.44073486328125, "logits/rejected": 13.845537185668945, "logps/chosen": -4.181537628173828, "logps/rejected": -4.47907018661499, "loss": 3.9974, "rewards/accuracies": 0.75, "rewards/chosen": -41.81538009643555, "rewards/margins": 2.975320816040039, "rewards/rejected": -44.79070281982422, "step": 5356 }, { "epoch": 0.7294389978213508, "grad_norm": 40.48620435747992, "learning_rate": 1.655478446327496e-07, "logits/chosen": 14.066051483154297, "logits/rejected": 13.987800598144531, "logps/chosen": -4.055468559265137, "logps/rejected": -4.085967063903809, "loss": 4.3207, "rewards/accuracies": 0.5, "rewards/chosen": -40.554683685302734, "rewards/margins": 0.30498409271240234, "rewards/rejected": -40.85966491699219, "step": 5357 }, { "epoch": 0.7295751633986928, "grad_norm": 40.3996898517443, "learning_rate": 1.653938162947313e-07, "logits/chosen": 14.239920616149902, "logits/rejected": 14.22822380065918, "logps/chosen": -4.61219596862793, "logps/rejected": -4.543841361999512, "loss": 4.0149, "rewards/accuracies": 0.25, "rewards/chosen": -46.12196350097656, "rewards/margins": -0.6835451126098633, "rewards/rejected": -45.43841552734375, "step": 5358 }, { "epoch": 0.7297113289760349, "grad_norm": 45.23243344448026, "learning_rate": 1.6523984096791944e-07, "logits/chosen": 14.273237228393555, "logits/rejected": 14.544867515563965, "logps/chosen": -4.45986270904541, "logps/rejected": -4.803410530090332, "loss": 4.0961, "rewards/accuracies": 0.75, "rewards/chosen": -44.59862518310547, "rewards/margins": 3.435481071472168, "rewards/rejected": -48.03410339355469, "step": 5359 }, { "epoch": 0.7298474945533769, "grad_norm": 39.76360660963221, "learning_rate": 1.650859186871062e-07, "logits/chosen": 13.631519317626953, "logits/rejected": 14.642250061035156, "logps/chosen": -4.372625350952148, "logps/rejected": -4.712333679199219, "loss": 4.1093, "rewards/accuracies": 0.75, "rewards/chosen": -43.726253509521484, "rewards/margins": 3.397080421447754, "rewards/rejected": -47.12333679199219, "step": 5360 }, { "epoch": 0.7299836601307189, "grad_norm": 46.31235471309552, "learning_rate": 1.6493204948707132e-07, "logits/chosen": 14.17933177947998, "logits/rejected": 13.86094856262207, "logps/chosen": -4.2441606521606445, "logps/rejected": -4.20511531829834, "loss": 3.6806, "rewards/accuracies": 0.5, "rewards/chosen": -42.44160461425781, "rewards/margins": -0.3904542922973633, "rewards/rejected": -42.0511474609375, "step": 5361 }, { "epoch": 0.730119825708061, "grad_norm": 41.78862727869835, "learning_rate": 1.6477823340258295e-07, "logits/chosen": 13.631805419921875, "logits/rejected": 13.878899574279785, "logps/chosen": -4.032090663909912, "logps/rejected": -4.342225074768066, "loss": 4.1117, "rewards/accuracies": 0.75, "rewards/chosen": -40.32090759277344, "rewards/margins": 3.101346015930176, "rewards/rejected": -43.42224884033203, "step": 5362 }, { "epoch": 0.7302559912854031, "grad_norm": 44.61647252943174, "learning_rate": 1.6462447046839727e-07, "logits/chosen": 14.476011276245117, "logits/rejected": 14.513590812683105, "logps/chosen": -4.589071273803711, "logps/rejected": -4.390916347503662, "loss": 4.0281, "rewards/accuracies": 0.0, "rewards/chosen": -45.890716552734375, "rewards/margins": -1.9815492630004883, "rewards/rejected": -43.90916442871094, "step": 5363 }, { "epoch": 0.7303921568627451, "grad_norm": 41.03075721415224, "learning_rate": 1.6447076071925792e-07, "logits/chosen": 14.55583667755127, "logits/rejected": 13.620830535888672, "logps/chosen": -4.57224178314209, "logps/rejected": -4.2793707847595215, "loss": 3.9923, "rewards/accuracies": 0.25, "rewards/chosen": -45.722415924072266, "rewards/margins": -2.928708076477051, "rewards/rejected": -42.79370880126953, "step": 5364 }, { "epoch": 0.7305283224400871, "grad_norm": 41.927015159676614, "learning_rate": 1.6431710418989715e-07, "logits/chosen": 15.10248851776123, "logits/rejected": 15.318363189697266, "logps/chosen": -4.769428253173828, "logps/rejected": -4.95658016204834, "loss": 3.9552, "rewards/accuracies": 0.75, "rewards/chosen": -47.69428253173828, "rewards/margins": 1.87152099609375, "rewards/rejected": -49.56580352783203, "step": 5365 }, { "epoch": 0.7306644880174292, "grad_norm": 38.8069933840965, "learning_rate": 1.6416350091503498e-07, "logits/chosen": 14.291339874267578, "logits/rejected": 15.125746726989746, "logps/chosen": -4.400602340698242, "logps/rejected": -4.743197917938232, "loss": 4.1245, "rewards/accuracies": 0.75, "rewards/chosen": -44.00602340698242, "rewards/margins": 3.425954818725586, "rewards/rejected": -47.43198013305664, "step": 5366 }, { "epoch": 0.7308006535947712, "grad_norm": 38.92739943160761, "learning_rate": 1.6400995092937908e-07, "logits/chosen": 13.959678649902344, "logits/rejected": 14.128491401672363, "logps/chosen": -4.568151473999023, "logps/rejected": -4.46478796005249, "loss": 4.2787, "rewards/accuracies": 0.5, "rewards/chosen": -45.681514739990234, "rewards/margins": -1.033635139465332, "rewards/rejected": -44.64787673950195, "step": 5367 }, { "epoch": 0.7309368191721133, "grad_norm": 38.19248229098538, "learning_rate": 1.6385645426762547e-07, "logits/chosen": 14.725858688354492, "logits/rejected": 14.446474075317383, "logps/chosen": -4.5147480964660645, "logps/rejected": -4.50663948059082, "loss": 4.0145, "rewards/accuracies": 0.25, "rewards/chosen": -45.14748001098633, "rewards/margins": -0.08108997344970703, "rewards/rejected": -45.06639099121094, "step": 5368 }, { "epoch": 0.7310729847494554, "grad_norm": 38.196159736112925, "learning_rate": 1.6370301096445816e-07, "logits/chosen": 13.160429954528809, "logits/rejected": 14.087366104125977, "logps/chosen": -4.380285263061523, "logps/rejected": -4.53155517578125, "loss": 3.7132, "rewards/accuracies": 1.0, "rewards/chosen": -43.802852630615234, "rewards/margins": 1.5126962661743164, "rewards/rejected": -45.3155517578125, "step": 5369 }, { "epoch": 0.7312091503267973, "grad_norm": 41.87451393061184, "learning_rate": 1.635496210545486e-07, "logits/chosen": 14.692481994628906, "logits/rejected": 14.955074310302734, "logps/chosen": -4.636279106140137, "logps/rejected": -4.8466620445251465, "loss": 3.3087, "rewards/accuracies": 0.5, "rewards/chosen": -46.36279296875, "rewards/margins": 2.1038293838500977, "rewards/rejected": -48.46662139892578, "step": 5370 }, { "epoch": 0.7313453159041394, "grad_norm": 52.53008448280534, "learning_rate": 1.6339628457255673e-07, "logits/chosen": 14.907188415527344, "logits/rejected": 14.362582206726074, "logps/chosen": -5.016029357910156, "logps/rejected": -4.347195625305176, "loss": 4.3995, "rewards/accuracies": 0.5, "rewards/chosen": -50.16029357910156, "rewards/margins": -6.688333511352539, "rewards/rejected": -43.471961975097656, "step": 5371 }, { "epoch": 0.7314814814814815, "grad_norm": 40.379224188871724, "learning_rate": 1.6324300155313025e-07, "logits/chosen": 13.808588027954102, "logits/rejected": 14.103326797485352, "logps/chosen": -4.469199180603027, "logps/rejected": -4.5041351318359375, "loss": 4.0796, "rewards/accuracies": 0.5, "rewards/chosen": -44.691993713378906, "rewards/margins": 0.34935569763183594, "rewards/rejected": -45.041351318359375, "step": 5372 }, { "epoch": 0.7316176470588235, "grad_norm": 41.99943517475982, "learning_rate": 1.6308977203090453e-07, "logits/chosen": 13.640035629272461, "logits/rejected": 13.857917785644531, "logps/chosen": -4.587734222412109, "logps/rejected": -4.539674758911133, "loss": 4.1666, "rewards/accuracies": 0.5, "rewards/chosen": -45.87734603881836, "rewards/margins": -0.4805946350097656, "rewards/rejected": -45.396751403808594, "step": 5373 }, { "epoch": 0.7317538126361656, "grad_norm": 39.630077064132124, "learning_rate": 1.629365960405031e-07, "logits/chosen": 13.657222747802734, "logits/rejected": 14.161109924316406, "logps/chosen": -4.154797554016113, "logps/rejected": -4.630744934082031, "loss": 4.1185, "rewards/accuracies": 0.75, "rewards/chosen": -41.547977447509766, "rewards/margins": 4.759469985961914, "rewards/rejected": -46.30744934082031, "step": 5374 }, { "epoch": 0.7318899782135077, "grad_norm": 38.405657761038384, "learning_rate": 1.6278347361653753e-07, "logits/chosen": 14.148662567138672, "logits/rejected": 14.624296188354492, "logps/chosen": -4.577272415161133, "logps/rejected": -4.720898628234863, "loss": 3.8198, "rewards/accuracies": 0.75, "rewards/chosen": -45.77272415161133, "rewards/margins": 1.4362602233886719, "rewards/rejected": -47.208984375, "step": 5375 }, { "epoch": 0.7320261437908496, "grad_norm": 42.83645640438666, "learning_rate": 1.6263040479360682e-07, "logits/chosen": 13.720307350158691, "logits/rejected": 14.24502944946289, "logps/chosen": -4.182628631591797, "logps/rejected": -4.466805458068848, "loss": 4.2198, "rewards/accuracies": 0.75, "rewards/chosen": -41.82628631591797, "rewards/margins": 2.841764450073242, "rewards/rejected": -44.66804885864258, "step": 5376 }, { "epoch": 0.7321623093681917, "grad_norm": 40.295779156856206, "learning_rate": 1.6247738960629823e-07, "logits/chosen": 13.83529281616211, "logits/rejected": 13.75523567199707, "logps/chosen": -4.301419734954834, "logps/rejected": -4.439092636108398, "loss": 3.7928, "rewards/accuracies": 0.5, "rewards/chosen": -43.014198303222656, "rewards/margins": 1.3767280578613281, "rewards/rejected": -44.390926361083984, "step": 5377 }, { "epoch": 0.7322984749455338, "grad_norm": 37.58748338769811, "learning_rate": 1.6232442808918702e-07, "logits/chosen": 14.514511108398438, "logits/rejected": 14.489995956420898, "logps/chosen": -4.401180267333984, "logps/rejected": -4.378692626953125, "loss": 3.7126, "rewards/accuracies": 0.5, "rewards/chosen": -44.011802673339844, "rewards/margins": -0.22487640380859375, "rewards/rejected": -43.786930084228516, "step": 5378 }, { "epoch": 0.7324346405228758, "grad_norm": 40.38132548453939, "learning_rate": 1.6217152027683576e-07, "logits/chosen": 13.778438568115234, "logits/rejected": 14.207784652709961, "logps/chosen": -4.295648574829102, "logps/rejected": -4.25691556930542, "loss": 4.2749, "rewards/accuracies": 0.5, "rewards/chosen": -42.956485748291016, "rewards/margins": -0.3873291015625, "rewards/rejected": -42.56915283203125, "step": 5379 }, { "epoch": 0.7325708061002179, "grad_norm": 40.44388208947847, "learning_rate": 1.620186662037954e-07, "logits/chosen": 13.983582496643066, "logits/rejected": 14.815038681030273, "logps/chosen": -4.58779764175415, "logps/rejected": -4.966434478759766, "loss": 3.7569, "rewards/accuracies": 0.5, "rewards/chosen": -45.87797546386719, "rewards/margins": 3.7863645553588867, "rewards/rejected": -49.66434097290039, "step": 5380 }, { "epoch": 0.7327069716775599, "grad_norm": 42.939129073861736, "learning_rate": 1.6186586590460473e-07, "logits/chosen": 14.738370895385742, "logits/rejected": 14.138063430786133, "logps/chosen": -4.789432048797607, "logps/rejected": -4.747581481933594, "loss": 4.3628, "rewards/accuracies": 0.25, "rewards/chosen": -47.894317626953125, "rewards/margins": -0.41850757598876953, "rewards/rejected": -47.47581100463867, "step": 5381 }, { "epoch": 0.7328431372549019, "grad_norm": 40.2399206004605, "learning_rate": 1.6171311941379e-07, "logits/chosen": 14.447591781616211, "logits/rejected": 15.09742546081543, "logps/chosen": -4.303218841552734, "logps/rejected": -4.793941020965576, "loss": 3.932, "rewards/accuracies": 0.75, "rewards/chosen": -43.03218460083008, "rewards/margins": 4.907223701477051, "rewards/rejected": -47.93941116333008, "step": 5382 }, { "epoch": 0.732979302832244, "grad_norm": 38.490980107495, "learning_rate": 1.615604267658656e-07, "logits/chosen": 13.337902069091797, "logits/rejected": 13.983789443969727, "logps/chosen": -4.029269695281982, "logps/rejected": -4.537616729736328, "loss": 3.6068, "rewards/accuracies": 0.75, "rewards/chosen": -40.29269790649414, "rewards/margins": 5.083469390869141, "rewards/rejected": -45.37616729736328, "step": 5383 }, { "epoch": 0.7331154684095861, "grad_norm": 42.457159181289725, "learning_rate": 1.6140778799533373e-07, "logits/chosen": 13.687484741210938, "logits/rejected": 13.416183471679688, "logps/chosen": -4.145240306854248, "logps/rejected": -4.1756486892700195, "loss": 4.6028, "rewards/accuracies": 0.5, "rewards/chosen": -41.45240020751953, "rewards/margins": 0.30408668518066406, "rewards/rejected": -41.75648498535156, "step": 5384 }, { "epoch": 0.733251633986928, "grad_norm": 40.4397189976198, "learning_rate": 1.6125520313668456e-07, "logits/chosen": 14.033370971679688, "logits/rejected": 14.010139465332031, "logps/chosen": -4.5536909103393555, "logps/rejected": -4.669971466064453, "loss": 4.2804, "rewards/accuracies": 0.5, "rewards/chosen": -45.53691101074219, "rewards/margins": 1.1627998352050781, "rewards/rejected": -46.699710845947266, "step": 5385 }, { "epoch": 0.7333877995642701, "grad_norm": 44.78528962426643, "learning_rate": 1.611026722243955e-07, "logits/chosen": 14.445927619934082, "logits/rejected": 14.548897743225098, "logps/chosen": -4.565645217895508, "logps/rejected": -4.3965959548950195, "loss": 4.5759, "rewards/accuracies": 0.25, "rewards/chosen": -45.656455993652344, "rewards/margins": -1.6904964447021484, "rewards/rejected": -43.96595764160156, "step": 5386 }, { "epoch": 0.7335239651416122, "grad_norm": 39.30495684023495, "learning_rate": 1.609501952929325e-07, "logits/chosen": 14.019488334655762, "logits/rejected": 13.997920989990234, "logps/chosen": -4.600563049316406, "logps/rejected": -4.553535461425781, "loss": 4.0929, "rewards/accuracies": 0.5, "rewards/chosen": -46.00563049316406, "rewards/margins": -0.4702768325805664, "rewards/rejected": -45.53535461425781, "step": 5387 }, { "epoch": 0.7336601307189542, "grad_norm": 44.87075898182854, "learning_rate": 1.6079777237674895e-07, "logits/chosen": 14.94034194946289, "logits/rejected": 14.803842544555664, "logps/chosen": -4.680522918701172, "logps/rejected": -4.79024076461792, "loss": 4.2644, "rewards/accuracies": 0.75, "rewards/chosen": -46.80522918701172, "rewards/margins": 1.0971765518188477, "rewards/rejected": -47.90240478515625, "step": 5388 }, { "epoch": 0.7337962962962963, "grad_norm": 37.89014305631895, "learning_rate": 1.606454035102859e-07, "logits/chosen": 14.093965530395508, "logits/rejected": 14.28470230102539, "logps/chosen": -4.325179576873779, "logps/rejected": -4.406465530395508, "loss": 4.0801, "rewards/accuracies": 0.5, "rewards/chosen": -43.251800537109375, "rewards/margins": 0.8128595352172852, "rewards/rejected": -44.064659118652344, "step": 5389 }, { "epoch": 0.7339324618736384, "grad_norm": 37.62078197928715, "learning_rate": 1.6049308872797242e-07, "logits/chosen": 13.973400115966797, "logits/rejected": 15.10644245147705, "logps/chosen": -4.3614091873168945, "logps/rejected": -4.916630744934082, "loss": 3.6731, "rewards/accuracies": 1.0, "rewards/chosen": -43.61408996582031, "rewards/margins": 5.552215576171875, "rewards/rejected": -49.16630554199219, "step": 5390 }, { "epoch": 0.7340686274509803, "grad_norm": 40.49250263910585, "learning_rate": 1.6034082806422532e-07, "logits/chosen": 14.408928871154785, "logits/rejected": 14.49293327331543, "logps/chosen": -4.375360488891602, "logps/rejected": -4.2732133865356445, "loss": 3.5414, "rewards/accuracies": 0.25, "rewards/chosen": -43.75360870361328, "rewards/margins": -1.0214757919311523, "rewards/rejected": -42.73212814331055, "step": 5391 }, { "epoch": 0.7342047930283224, "grad_norm": 43.242500215900264, "learning_rate": 1.6018862155344932e-07, "logits/chosen": 13.800310134887695, "logits/rejected": 14.802053451538086, "logps/chosen": -4.381217002868652, "logps/rejected": -4.679465293884277, "loss": 3.7112, "rewards/accuracies": 0.75, "rewards/chosen": -43.812164306640625, "rewards/margins": 2.9824838638305664, "rewards/rejected": -46.794647216796875, "step": 5392 }, { "epoch": 0.7343409586056645, "grad_norm": 35.98033758560982, "learning_rate": 1.6003646923003644e-07, "logits/chosen": 13.526991844177246, "logits/rejected": 14.08818531036377, "logps/chosen": -4.290378570556641, "logps/rejected": -4.5824785232543945, "loss": 3.7397, "rewards/accuracies": 0.5, "rewards/chosen": -42.90378189086914, "rewards/margins": 2.9210004806518555, "rewards/rejected": -45.82478332519531, "step": 5393 }, { "epoch": 0.7344771241830066, "grad_norm": 46.782701317888844, "learning_rate": 1.5988437112836692e-07, "logits/chosen": 14.51107120513916, "logits/rejected": 14.558839797973633, "logps/chosen": -4.412644386291504, "logps/rejected": -4.568583011627197, "loss": 3.6017, "rewards/accuracies": 0.5, "rewards/chosen": -44.126441955566406, "rewards/margins": 1.5593891143798828, "rewards/rejected": -45.685829162597656, "step": 5394 }, { "epoch": 0.7346132897603486, "grad_norm": 40.70724421701686, "learning_rate": 1.5973232728280864e-07, "logits/chosen": 13.42581558227539, "logits/rejected": 14.170060157775879, "logps/chosen": -4.24899959564209, "logps/rejected": -4.671056747436523, "loss": 3.8146, "rewards/accuracies": 1.0, "rewards/chosen": -42.489994049072266, "rewards/margins": 4.220571517944336, "rewards/rejected": -46.71056365966797, "step": 5395 }, { "epoch": 0.7347494553376906, "grad_norm": 45.628962912354226, "learning_rate": 1.5958033772771698e-07, "logits/chosen": 13.485459327697754, "logits/rejected": 13.85263442993164, "logps/chosen": -4.305342674255371, "logps/rejected": -4.277000427246094, "loss": 4.0171, "rewards/accuracies": 0.5, "rewards/chosen": -43.05342483520508, "rewards/margins": -0.28342437744140625, "rewards/rejected": -42.77000045776367, "step": 5396 }, { "epoch": 0.7348856209150327, "grad_norm": 40.850289868890655, "learning_rate": 1.5942840249743536e-07, "logits/chosen": 14.219584465026855, "logits/rejected": 14.483333587646484, "logps/chosen": -4.535248756408691, "logps/rejected": -4.712924957275391, "loss": 4.1548, "rewards/accuracies": 0.5, "rewards/chosen": -45.35248947143555, "rewards/margins": 1.7767610549926758, "rewards/rejected": -47.12925338745117, "step": 5397 }, { "epoch": 0.7350217864923747, "grad_norm": 42.76894678085603, "learning_rate": 1.5927652162629475e-07, "logits/chosen": 14.668988227844238, "logits/rejected": 14.609090805053711, "logps/chosen": -4.298703193664551, "logps/rejected": -4.433775901794434, "loss": 3.8379, "rewards/accuracies": 0.75, "rewards/chosen": -42.987030029296875, "rewards/margins": 1.350733757019043, "rewards/rejected": -44.33776092529297, "step": 5398 }, { "epoch": 0.7351579520697168, "grad_norm": 40.73418469540988, "learning_rate": 1.591246951486141e-07, "logits/chosen": 13.645601272583008, "logits/rejected": 14.211183547973633, "logps/chosen": -4.274609565734863, "logps/rejected": -4.640320301055908, "loss": 3.761, "rewards/accuracies": 0.75, "rewards/chosen": -42.746097564697266, "rewards/margins": 3.6571083068847656, "rewards/rejected": -46.40320587158203, "step": 5399 }, { "epoch": 0.7352941176470589, "grad_norm": 40.17638259399757, "learning_rate": 1.589729230986995e-07, "logits/chosen": 13.914270401000977, "logits/rejected": 13.956929206848145, "logps/chosen": -4.255617141723633, "logps/rejected": -4.419473648071289, "loss": 3.8837, "rewards/accuracies": 0.5, "rewards/chosen": -42.55616760253906, "rewards/margins": 1.6385679244995117, "rewards/rejected": -44.194740295410156, "step": 5400 }, { "epoch": 0.7354302832244008, "grad_norm": 40.02271696639956, "learning_rate": 1.5882120551084527e-07, "logits/chosen": 13.821788787841797, "logits/rejected": 14.05147933959961, "logps/chosen": -4.440351963043213, "logps/rejected": -4.6873064041137695, "loss": 3.7399, "rewards/accuracies": 0.75, "rewards/chosen": -44.40351867675781, "rewards/margins": 2.469540596008301, "rewards/rejected": -46.87306213378906, "step": 5401 }, { "epoch": 0.7355664488017429, "grad_norm": 59.98516959109687, "learning_rate": 1.5866954241933344e-07, "logits/chosen": 14.444242477416992, "logits/rejected": 14.447427749633789, "logps/chosen": -4.629824638366699, "logps/rejected": -4.5894880294799805, "loss": 4.1004, "rewards/accuracies": 0.5, "rewards/chosen": -46.29824447631836, "rewards/margins": -0.4033632278442383, "rewards/rejected": -45.89488220214844, "step": 5402 }, { "epoch": 0.735702614379085, "grad_norm": 39.64073771854152, "learning_rate": 1.5851793385843318e-07, "logits/chosen": 14.024295806884766, "logits/rejected": 14.191973686218262, "logps/chosen": -4.110504150390625, "logps/rejected": -4.316722869873047, "loss": 3.8085, "rewards/accuracies": 0.75, "rewards/chosen": -41.10504150390625, "rewards/margins": 2.062190055847168, "rewards/rejected": -43.16722869873047, "step": 5403 }, { "epoch": 0.735838779956427, "grad_norm": 39.95014164244795, "learning_rate": 1.5836637986240189e-07, "logits/chosen": 14.175851821899414, "logits/rejected": 14.718191146850586, "logps/chosen": -4.148403167724609, "logps/rejected": -4.431456089019775, "loss": 3.799, "rewards/accuracies": 0.75, "rewards/chosen": -41.484031677246094, "rewards/margins": 2.830526351928711, "rewards/rejected": -44.31455993652344, "step": 5404 }, { "epoch": 0.7359749455337691, "grad_norm": 43.32614502608855, "learning_rate": 1.5821488046548455e-07, "logits/chosen": 14.49118423461914, "logits/rejected": 13.97883415222168, "logps/chosen": -4.334232807159424, "logps/rejected": -4.273695945739746, "loss": 4.1026, "rewards/accuracies": 0.25, "rewards/chosen": -43.34232711791992, "rewards/margins": -0.6053676605224609, "rewards/rejected": -42.736961364746094, "step": 5405 }, { "epoch": 0.7361111111111112, "grad_norm": 44.57922904596702, "learning_rate": 1.5806343570191346e-07, "logits/chosen": 14.139532089233398, "logits/rejected": 14.26388931274414, "logps/chosen": -4.343944549560547, "logps/rejected": -4.598374366760254, "loss": 4.367, "rewards/accuracies": 0.75, "rewards/chosen": -43.43944549560547, "rewards/margins": 2.5442991256713867, "rewards/rejected": -45.983741760253906, "step": 5406 }, { "epoch": 0.7362472766884531, "grad_norm": 40.89463268293768, "learning_rate": 1.5791204560590897e-07, "logits/chosen": 13.168973922729492, "logits/rejected": 14.329541206359863, "logps/chosen": -4.274178981781006, "logps/rejected": -4.606632232666016, "loss": 4.1033, "rewards/accuracies": 0.75, "rewards/chosen": -42.741790771484375, "rewards/margins": 3.3245325088500977, "rewards/rejected": -46.066322326660156, "step": 5407 }, { "epoch": 0.7363834422657952, "grad_norm": 41.32392469296506, "learning_rate": 1.57760710211679e-07, "logits/chosen": 14.031517028808594, "logits/rejected": 14.29990005493164, "logps/chosen": -4.341182708740234, "logps/rejected": -4.403570175170898, "loss": 4.1742, "rewards/accuracies": 0.5, "rewards/chosen": -43.411827087402344, "rewards/margins": 0.6238737106323242, "rewards/rejected": -44.03569793701172, "step": 5408 }, { "epoch": 0.7365196078431373, "grad_norm": 39.10741474223677, "learning_rate": 1.5760942955341876e-07, "logits/chosen": 13.844087600708008, "logits/rejected": 14.171424865722656, "logps/chosen": -4.533311367034912, "logps/rejected": -4.616574287414551, "loss": 3.688, "rewards/accuracies": 0.75, "rewards/chosen": -45.33311462402344, "rewards/margins": 0.8326282501220703, "rewards/rejected": -46.165740966796875, "step": 5409 }, { "epoch": 0.7366557734204793, "grad_norm": 42.24185718635573, "learning_rate": 1.5745820366531159e-07, "logits/chosen": 14.114627838134766, "logits/rejected": 14.038814544677734, "logps/chosen": -4.412532806396484, "logps/rejected": -4.441715240478516, "loss": 3.961, "rewards/accuracies": 0.5, "rewards/chosen": -44.12532424926758, "rewards/margins": 0.29182910919189453, "rewards/rejected": -44.417152404785156, "step": 5410 }, { "epoch": 0.7367919389978214, "grad_norm": 41.4930195542966, "learning_rate": 1.573070325815283e-07, "logits/chosen": 14.820207595825195, "logits/rejected": 14.856788635253906, "logps/chosen": -4.40817928314209, "logps/rejected": -4.748902320861816, "loss": 3.9641, "rewards/accuracies": 0.75, "rewards/chosen": -44.081790924072266, "rewards/margins": 3.407231330871582, "rewards/rejected": -47.48902130126953, "step": 5411 }, { "epoch": 0.7369281045751634, "grad_norm": 41.697201516563084, "learning_rate": 1.5715591633622697e-07, "logits/chosen": 14.657740592956543, "logits/rejected": 14.73974323272705, "logps/chosen": -4.622347354888916, "logps/rejected": -5.018564701080322, "loss": 3.6186, "rewards/accuracies": 0.75, "rewards/chosen": -46.223472595214844, "rewards/margins": 3.9621734619140625, "rewards/rejected": -50.185646057128906, "step": 5412 }, { "epoch": 0.7370642701525054, "grad_norm": 44.445897246587755, "learning_rate": 1.5700485496355368e-07, "logits/chosen": 14.6220703125, "logits/rejected": 14.695819854736328, "logps/chosen": -4.27292537689209, "logps/rejected": -4.116621971130371, "loss": 4.9477, "rewards/accuracies": 0.25, "rewards/chosen": -42.72925567626953, "rewards/margins": -1.563033103942871, "rewards/rejected": -41.166221618652344, "step": 5413 }, { "epoch": 0.7372004357298475, "grad_norm": 38.96620435947676, "learning_rate": 1.5685384849764222e-07, "logits/chosen": 14.384921073913574, "logits/rejected": 14.027824401855469, "logps/chosen": -4.559041976928711, "logps/rejected": -4.311374664306641, "loss": 3.9479, "rewards/accuracies": 0.5, "rewards/chosen": -45.590423583984375, "rewards/margins": -2.476675033569336, "rewards/rejected": -43.113746643066406, "step": 5414 }, { "epoch": 0.7373366013071896, "grad_norm": 37.961620909115254, "learning_rate": 1.567028969726134e-07, "logits/chosen": 14.189664840698242, "logits/rejected": 14.013494491577148, "logps/chosen": -4.582662582397461, "logps/rejected": -4.6254754066467285, "loss": 3.5631, "rewards/accuracies": 0.5, "rewards/chosen": -45.826629638671875, "rewards/margins": 0.42812633514404297, "rewards/rejected": -46.25475311279297, "step": 5415 }, { "epoch": 0.7374727668845316, "grad_norm": 40.67283903232821, "learning_rate": 1.5655200042257612e-07, "logits/chosen": 13.741864204406738, "logits/rejected": 14.367607116699219, "logps/chosen": -4.385015487670898, "logps/rejected": -4.797294616699219, "loss": 3.8231, "rewards/accuracies": 1.0, "rewards/chosen": -43.850154876708984, "rewards/margins": 4.122793197631836, "rewards/rejected": -47.97294616699219, "step": 5416 }, { "epoch": 0.7376089324618736, "grad_norm": 43.36340452811232, "learning_rate": 1.5640115888162687e-07, "logits/chosen": 13.900989532470703, "logits/rejected": 14.135597229003906, "logps/chosen": -4.20751953125, "logps/rejected": -4.642962455749512, "loss": 3.7409, "rewards/accuracies": 0.75, "rewards/chosen": -42.0751953125, "rewards/margins": 4.354434967041016, "rewards/rejected": -46.42962646484375, "step": 5417 }, { "epoch": 0.7377450980392157, "grad_norm": 36.11879996109049, "learning_rate": 1.5625037238384922e-07, "logits/chosen": 14.009578704833984, "logits/rejected": 14.39544677734375, "logps/chosen": -4.348851203918457, "logps/rejected": -4.467324256896973, "loss": 3.8163, "rewards/accuracies": 0.5, "rewards/chosen": -43.48851013183594, "rewards/margins": 1.1847333908081055, "rewards/rejected": -44.673240661621094, "step": 5418 }, { "epoch": 0.7378812636165577, "grad_norm": 39.05558407357143, "learning_rate": 1.5609964096331481e-07, "logits/chosen": 14.448687553405762, "logits/rejected": 13.911623001098633, "logps/chosen": -4.2418718338012695, "logps/rejected": -4.3765692710876465, "loss": 4.1784, "rewards/accuracies": 1.0, "rewards/chosen": -42.41872024536133, "rewards/margins": 1.3469696044921875, "rewards/rejected": -43.76569366455078, "step": 5419 }, { "epoch": 0.7380174291938998, "grad_norm": 40.97468129898579, "learning_rate": 1.5594896465408272e-07, "logits/chosen": 13.724201202392578, "logits/rejected": 14.35239028930664, "logps/chosen": -3.9390745162963867, "logps/rejected": -4.334627151489258, "loss": 4.2325, "rewards/accuracies": 1.0, "rewards/chosen": -39.3907470703125, "rewards/margins": 3.9555234909057617, "rewards/rejected": -43.34626770019531, "step": 5420 }, { "epoch": 0.7381535947712419, "grad_norm": 41.530355252777696, "learning_rate": 1.557983434901993e-07, "logits/chosen": 13.886401176452637, "logits/rejected": 14.754037857055664, "logps/chosen": -4.365132808685303, "logps/rejected": -4.8392014503479, "loss": 3.9431, "rewards/accuracies": 0.75, "rewards/chosen": -43.651329040527344, "rewards/margins": 4.740686416625977, "rewards/rejected": -48.39201354980469, "step": 5421 }, { "epoch": 0.7382897603485838, "grad_norm": 39.90898745284686, "learning_rate": 1.5564777750569876e-07, "logits/chosen": 15.006534576416016, "logits/rejected": 14.647334098815918, "logps/chosen": -4.532147407531738, "logps/rejected": -4.25081729888916, "loss": 3.8107, "rewards/accuracies": 0.0, "rewards/chosen": -45.32147216796875, "rewards/margins": -2.8132972717285156, "rewards/rejected": -42.50817108154297, "step": 5422 }, { "epoch": 0.7384259259259259, "grad_norm": 48.696781586563915, "learning_rate": 1.5549726673460284e-07, "logits/chosen": 14.056015968322754, "logits/rejected": 14.316596984863281, "logps/chosen": -4.690119743347168, "logps/rejected": -4.730058670043945, "loss": 3.6688, "rewards/accuracies": 0.5, "rewards/chosen": -46.90119934082031, "rewards/margins": 0.3993873596191406, "rewards/rejected": -47.30058670043945, "step": 5423 }, { "epoch": 0.738562091503268, "grad_norm": 39.543522116992456, "learning_rate": 1.5534681121092047e-07, "logits/chosen": 13.98658275604248, "logits/rejected": 13.952463150024414, "logps/chosen": -4.578756332397461, "logps/rejected": -4.54179573059082, "loss": 3.9893, "rewards/accuracies": 0.5, "rewards/chosen": -45.78756332397461, "rewards/margins": -0.36960792541503906, "rewards/rejected": -45.4179573059082, "step": 5424 }, { "epoch": 0.73869825708061, "grad_norm": 44.96038459973471, "learning_rate": 1.5519641096864842e-07, "logits/chosen": 14.104183197021484, "logits/rejected": 14.093838691711426, "logps/chosen": -4.545564651489258, "logps/rejected": -4.663051605224609, "loss": 4.368, "rewards/accuracies": 0.75, "rewards/chosen": -45.45564651489258, "rewards/margins": 1.1748676300048828, "rewards/rejected": -46.63051223754883, "step": 5425 }, { "epoch": 0.7388344226579521, "grad_norm": 37.509252018087764, "learning_rate": 1.5504606604177103e-07, "logits/chosen": 13.87192440032959, "logits/rejected": 13.543234825134277, "logps/chosen": -4.1256537437438965, "logps/rejected": -4.218846321105957, "loss": 3.7667, "rewards/accuracies": 0.5, "rewards/chosen": -41.256534576416016, "rewards/margins": 0.9319276809692383, "rewards/rejected": -42.18846130371094, "step": 5426 }, { "epoch": 0.7389705882352942, "grad_norm": 43.61913386340588, "learning_rate": 1.5489577646425968e-07, "logits/chosen": 14.555781364440918, "logits/rejected": 14.369854927062988, "logps/chosen": -4.814764022827148, "logps/rejected": -4.625581741333008, "loss": 3.9128, "rewards/accuracies": 0.5, "rewards/chosen": -48.14764404296875, "rewards/margins": -1.8918256759643555, "rewards/rejected": -46.255821228027344, "step": 5427 }, { "epoch": 0.7391067538126361, "grad_norm": 40.78818362696834, "learning_rate": 1.5474554227007368e-07, "logits/chosen": 14.429277420043945, "logits/rejected": 14.600032806396484, "logps/chosen": -4.805044174194336, "logps/rejected": -4.762691497802734, "loss": 3.746, "rewards/accuracies": 0.5, "rewards/chosen": -48.05044174194336, "rewards/margins": -0.4235267639160156, "rewards/rejected": -47.626914978027344, "step": 5428 }, { "epoch": 0.7392429193899782, "grad_norm": 45.921028724860555, "learning_rate": 1.5459536349315988e-07, "logits/chosen": 13.523577690124512, "logits/rejected": 13.653493881225586, "logps/chosen": -4.428714752197266, "logps/rejected": -4.308926582336426, "loss": 3.8593, "rewards/accuracies": 0.25, "rewards/chosen": -44.287147521972656, "rewards/margins": -1.1978816986083984, "rewards/rejected": -43.089263916015625, "step": 5429 }, { "epoch": 0.7393790849673203, "grad_norm": 42.15549357440847, "learning_rate": 1.5444524016745204e-07, "logits/chosen": 14.943849563598633, "logits/rejected": 14.128530502319336, "logps/chosen": -4.8252153396606445, "logps/rejected": -4.701065540313721, "loss": 3.5575, "rewards/accuracies": 0.5, "rewards/chosen": -48.25215148925781, "rewards/margins": -1.2414960861206055, "rewards/rejected": -47.010658264160156, "step": 5430 }, { "epoch": 0.7395152505446623, "grad_norm": 43.73068678871546, "learning_rate": 1.5429517232687198e-07, "logits/chosen": 14.790626525878906, "logits/rejected": 14.31230354309082, "logps/chosen": -4.672145843505859, "logps/rejected": -4.569009780883789, "loss": 4.6457, "rewards/accuracies": 0.25, "rewards/chosen": -46.721458435058594, "rewards/margins": -1.0313644409179688, "rewards/rejected": -45.690093994140625, "step": 5431 }, { "epoch": 0.7396514161220044, "grad_norm": 37.70708973925435, "learning_rate": 1.541451600053289e-07, "logits/chosen": 13.65144157409668, "logits/rejected": 14.811817169189453, "logps/chosen": -4.501922130584717, "logps/rejected": -4.999514579772949, "loss": 3.8452, "rewards/accuracies": 1.0, "rewards/chosen": -45.019222259521484, "rewards/margins": 4.975924491882324, "rewards/rejected": -49.995147705078125, "step": 5432 }, { "epoch": 0.7397875816993464, "grad_norm": 40.20299230496511, "learning_rate": 1.5399520323671902e-07, "logits/chosen": 14.340902328491211, "logits/rejected": 14.481683731079102, "logps/chosen": -4.294057846069336, "logps/rejected": -4.569280624389648, "loss": 3.5183, "rewards/accuracies": 0.75, "rewards/chosen": -42.94057846069336, "rewards/margins": 2.752227783203125, "rewards/rejected": -45.692806243896484, "step": 5433 }, { "epoch": 0.7399237472766884, "grad_norm": 41.34783377709085, "learning_rate": 1.5384530205492648e-07, "logits/chosen": 13.86604118347168, "logits/rejected": 13.997230529785156, "logps/chosen": -4.274916172027588, "logps/rejected": -4.528693199157715, "loss": 3.7379, "rewards/accuracies": 0.75, "rewards/chosen": -42.74916076660156, "rewards/margins": 2.5377683639526367, "rewards/rejected": -45.286930084228516, "step": 5434 }, { "epoch": 0.7400599128540305, "grad_norm": 39.92900370821005, "learning_rate": 1.5369545649382282e-07, "logits/chosen": 14.376277923583984, "logits/rejected": 14.798484802246094, "logps/chosen": -4.354211807250977, "logps/rejected": -4.931464672088623, "loss": 3.6544, "rewards/accuracies": 0.75, "rewards/chosen": -43.54212188720703, "rewards/margins": 5.772526741027832, "rewards/rejected": -49.31465148925781, "step": 5435 }, { "epoch": 0.7401960784313726, "grad_norm": 39.264402418070866, "learning_rate": 1.5354566658726657e-07, "logits/chosen": 13.504302978515625, "logits/rejected": 13.78708553314209, "logps/chosen": -4.199512481689453, "logps/rejected": -4.474221229553223, "loss": 3.7777, "rewards/accuracies": 0.75, "rewards/chosen": -41.99512481689453, "rewards/margins": 2.7470884323120117, "rewards/rejected": -44.742210388183594, "step": 5436 }, { "epoch": 0.7403322440087146, "grad_norm": 38.29662157402092, "learning_rate": 1.5339593236910419e-07, "logits/chosen": 14.478837966918945, "logits/rejected": 14.260461807250977, "logps/chosen": -4.4740118980407715, "logps/rejected": -4.4059858322143555, "loss": 3.9802, "rewards/accuracies": 0.5, "rewards/chosen": -44.74011993408203, "rewards/margins": -0.6802635192871094, "rewards/rejected": -44.059852600097656, "step": 5437 }, { "epoch": 0.7404684095860566, "grad_norm": 38.8911706032393, "learning_rate": 1.5324625387316948e-07, "logits/chosen": 14.326896667480469, "logits/rejected": 14.147706985473633, "logps/chosen": -4.776092529296875, "logps/rejected": -4.699304580688477, "loss": 4.0909, "rewards/accuracies": 0.5, "rewards/chosen": -47.760921478271484, "rewards/margins": -0.7678728103637695, "rewards/rejected": -46.99304962158203, "step": 5438 }, { "epoch": 0.7406045751633987, "grad_norm": 38.15922986404777, "learning_rate": 1.5309663113328325e-07, "logits/chosen": 14.0315523147583, "logits/rejected": 14.038337707519531, "logps/chosen": -4.456642150878906, "logps/rejected": -4.607774257659912, "loss": 3.7999, "rewards/accuracies": 0.75, "rewards/chosen": -44.56642150878906, "rewards/margins": 1.5113191604614258, "rewards/rejected": -46.07774353027344, "step": 5439 }, { "epoch": 0.7407407407407407, "grad_norm": 39.01829598090733, "learning_rate": 1.5294706418325412e-07, "logits/chosen": 13.65469741821289, "logits/rejected": 14.302906036376953, "logps/chosen": -4.295842170715332, "logps/rejected": -4.638830184936523, "loss": 3.7097, "rewards/accuracies": 1.0, "rewards/chosen": -42.95841979980469, "rewards/margins": 3.4298810958862305, "rewards/rejected": -46.388301849365234, "step": 5440 }, { "epoch": 0.7408769063180828, "grad_norm": 38.925960929019716, "learning_rate": 1.527975530568782e-07, "logits/chosen": 13.890435218811035, "logits/rejected": 14.974264144897461, "logps/chosen": -4.233552932739258, "logps/rejected": -4.361661911010742, "loss": 3.4922, "rewards/accuracies": 0.5, "rewards/chosen": -42.33552932739258, "rewards/margins": 1.281092643737793, "rewards/rejected": -43.61662292480469, "step": 5441 }, { "epoch": 0.7410130718954249, "grad_norm": 41.11742504900593, "learning_rate": 1.5264809778793836e-07, "logits/chosen": 13.850400924682617, "logits/rejected": 14.268632888793945, "logps/chosen": -4.747110366821289, "logps/rejected": -4.690202713012695, "loss": 4.3862, "rewards/accuracies": 0.5, "rewards/chosen": -47.47110366821289, "rewards/margins": -0.5690736770629883, "rewards/rejected": -46.90203094482422, "step": 5442 }, { "epoch": 0.7411492374727668, "grad_norm": 42.59574075987188, "learning_rate": 1.5249869841020547e-07, "logits/chosen": 14.567211151123047, "logits/rejected": 15.018768310546875, "logps/chosen": -4.558210849761963, "logps/rejected": -4.827840805053711, "loss": 4.2367, "rewards/accuracies": 0.5, "rewards/chosen": -45.58210754394531, "rewards/margins": 2.6963024139404297, "rewards/rejected": -48.27840805053711, "step": 5443 }, { "epoch": 0.7412854030501089, "grad_norm": 53.894421058668456, "learning_rate": 1.5234935495743768e-07, "logits/chosen": 15.010543823242188, "logits/rejected": 14.684823036193848, "logps/chosen": -4.907450199127197, "logps/rejected": -4.873033046722412, "loss": 4.8414, "rewards/accuracies": 0.5, "rewards/chosen": -49.074501037597656, "rewards/margins": -0.34416961669921875, "rewards/rejected": -48.73033142089844, "step": 5444 }, { "epoch": 0.741421568627451, "grad_norm": 44.6461024901466, "learning_rate": 1.522000674633801e-07, "logits/chosen": 14.13558578491211, "logits/rejected": 14.672779083251953, "logps/chosen": -4.1870574951171875, "logps/rejected": -4.500458717346191, "loss": 3.6823, "rewards/accuracies": 1.0, "rewards/chosen": -41.870574951171875, "rewards/margins": 3.134012222290039, "rewards/rejected": -45.00458908081055, "step": 5445 }, { "epoch": 0.741557734204793, "grad_norm": 39.792951216143784, "learning_rate": 1.5205083596176565e-07, "logits/chosen": 13.34848403930664, "logits/rejected": 13.675562858581543, "logps/chosen": -4.140754699707031, "logps/rejected": -4.526982307434082, "loss": 4.1155, "rewards/accuracies": 0.75, "rewards/chosen": -41.40754699707031, "rewards/margins": 3.862276077270508, "rewards/rejected": -45.26982498168945, "step": 5446 }, { "epoch": 0.7416938997821351, "grad_norm": 40.315613040924084, "learning_rate": 1.5190166048631445e-07, "logits/chosen": 14.418697357177734, "logits/rejected": 14.114325523376465, "logps/chosen": -4.726959228515625, "logps/rejected": -4.690286159515381, "loss": 4.7219, "rewards/accuracies": 0.5, "rewards/chosen": -47.269588470458984, "rewards/margins": -0.3667278289794922, "rewards/rejected": -46.902862548828125, "step": 5447 }, { "epoch": 0.7418300653594772, "grad_norm": 43.481891198364835, "learning_rate": 1.517525410707338e-07, "logits/chosen": 14.223335266113281, "logits/rejected": 14.449493408203125, "logps/chosen": -4.471738815307617, "logps/rejected": -4.7871174812316895, "loss": 4.492, "rewards/accuracies": 0.5, "rewards/chosen": -44.71739196777344, "rewards/margins": 3.15378475189209, "rewards/rejected": -47.87117385864258, "step": 5448 }, { "epoch": 0.7419662309368191, "grad_norm": 41.552971060352796, "learning_rate": 1.5160347774871846e-07, "logits/chosen": 13.779897689819336, "logits/rejected": 14.564764022827148, "logps/chosen": -4.4592766761779785, "logps/rejected": -4.846889019012451, "loss": 3.6609, "rewards/accuracies": 0.75, "rewards/chosen": -44.59276580810547, "rewards/margins": 3.8761215209960938, "rewards/rejected": -48.46888732910156, "step": 5449 }, { "epoch": 0.7421023965141612, "grad_norm": 45.902848260016455, "learning_rate": 1.5145447055395074e-07, "logits/chosen": 14.089750289916992, "logits/rejected": 14.791555404663086, "logps/chosen": -4.345073699951172, "logps/rejected": -5.019902229309082, "loss": 3.9508, "rewards/accuracies": 1.0, "rewards/chosen": -43.45073699951172, "rewards/margins": 6.748286247253418, "rewards/rejected": -50.19902038574219, "step": 5450 }, { "epoch": 0.7422385620915033, "grad_norm": 39.46918098081322, "learning_rate": 1.513055195200998e-07, "logits/chosen": 13.996210098266602, "logits/rejected": 14.062023162841797, "logps/chosen": -4.299202919006348, "logps/rejected": -4.5461530685424805, "loss": 3.5583, "rewards/accuracies": 0.75, "rewards/chosen": -42.99203109741211, "rewards/margins": 2.4695005416870117, "rewards/rejected": -45.46153259277344, "step": 5451 }, { "epoch": 0.7423747276688453, "grad_norm": 43.17145206452721, "learning_rate": 1.5115662468082247e-07, "logits/chosen": 12.764341354370117, "logits/rejected": 13.937503814697266, "logps/chosen": -4.26533842086792, "logps/rejected": -4.581270694732666, "loss": 3.6077, "rewards/accuracies": 0.75, "rewards/chosen": -42.65338134765625, "rewards/margins": 3.1593246459960938, "rewards/rejected": -45.81270980834961, "step": 5452 }, { "epoch": 0.7425108932461874, "grad_norm": 39.971883453583196, "learning_rate": 1.5100778606976287e-07, "logits/chosen": 14.028959274291992, "logits/rejected": 14.279522895812988, "logps/chosen": -4.602898597717285, "logps/rejected": -4.641006946563721, "loss": 3.68, "rewards/accuracies": 0.75, "rewards/chosen": -46.02898406982422, "rewards/margins": 0.38108253479003906, "rewards/rejected": -46.410072326660156, "step": 5453 }, { "epoch": 0.7426470588235294, "grad_norm": 36.03700612627634, "learning_rate": 1.5085900372055203e-07, "logits/chosen": 13.315621376037598, "logits/rejected": 14.629590034484863, "logps/chosen": -4.153554916381836, "logps/rejected": -4.829477310180664, "loss": 3.3438, "rewards/accuracies": 1.0, "rewards/chosen": -41.53554916381836, "rewards/margins": 6.759223937988281, "rewards/rejected": -48.294769287109375, "step": 5454 }, { "epoch": 0.7427832244008714, "grad_norm": 39.561071131376195, "learning_rate": 1.5071027766680872e-07, "logits/chosen": 14.51431655883789, "logits/rejected": 13.928010940551758, "logps/chosen": -4.363954544067383, "logps/rejected": -4.484673500061035, "loss": 3.9533, "rewards/accuracies": 0.5, "rewards/chosen": -43.639549255371094, "rewards/margins": 1.2071895599365234, "rewards/rejected": -44.84674072265625, "step": 5455 }, { "epoch": 0.7429193899782135, "grad_norm": 43.57702800451054, "learning_rate": 1.5056160794213897e-07, "logits/chosen": 13.803438186645508, "logits/rejected": 14.905853271484375, "logps/chosen": -4.257835388183594, "logps/rejected": -4.568784236907959, "loss": 4.3282, "rewards/accuracies": 0.75, "rewards/chosen": -42.57835388183594, "rewards/margins": 3.1094846725463867, "rewards/rejected": -45.687843322753906, "step": 5456 }, { "epoch": 0.7430555555555556, "grad_norm": 40.29585608582027, "learning_rate": 1.5041299458013566e-07, "logits/chosen": 14.357579231262207, "logits/rejected": 14.979636192321777, "logps/chosen": -4.743508338928223, "logps/rejected": -4.746370792388916, "loss": 4.1497, "rewards/accuracies": 0.25, "rewards/chosen": -47.435081481933594, "rewards/margins": 0.028629302978515625, "rewards/rejected": -47.463706970214844, "step": 5457 }, { "epoch": 0.7431917211328976, "grad_norm": 39.1763147518629, "learning_rate": 1.502644376143793e-07, "logits/chosen": 15.256316184997559, "logits/rejected": 14.760458946228027, "logps/chosen": -4.452227592468262, "logps/rejected": -4.499111175537109, "loss": 3.9946, "rewards/accuracies": 0.25, "rewards/chosen": -44.52227783203125, "rewards/margins": 0.46883296966552734, "rewards/rejected": -44.991111755371094, "step": 5458 }, { "epoch": 0.7433278867102396, "grad_norm": 38.63609076564304, "learning_rate": 1.5011593707843777e-07, "logits/chosen": 13.817800521850586, "logits/rejected": 14.582319259643555, "logps/chosen": -4.49489164352417, "logps/rejected": -4.737060546875, "loss": 3.6997, "rewards/accuracies": 0.75, "rewards/chosen": -44.94891357421875, "rewards/margins": 2.421689033508301, "rewards/rejected": -47.370601654052734, "step": 5459 }, { "epoch": 0.7434640522875817, "grad_norm": 40.77157023969433, "learning_rate": 1.4996749300586567e-07, "logits/chosen": 14.264906883239746, "logits/rejected": 14.486918449401855, "logps/chosen": -4.302215576171875, "logps/rejected": -4.496750831604004, "loss": 3.914, "rewards/accuracies": 0.5, "rewards/chosen": -43.02215576171875, "rewards/margins": 1.945352554321289, "rewards/rejected": -44.967506408691406, "step": 5460 }, { "epoch": 0.7436002178649237, "grad_norm": 40.913423831072095, "learning_rate": 1.4981910543020532e-07, "logits/chosen": 14.959413528442383, "logits/rejected": 14.62983512878418, "logps/chosen": -4.789700508117676, "logps/rejected": -4.573635101318359, "loss": 3.8986, "rewards/accuracies": 0.25, "rewards/chosen": -47.89700698852539, "rewards/margins": -2.1606550216674805, "rewards/rejected": -45.736351013183594, "step": 5461 }, { "epoch": 0.7437363834422658, "grad_norm": 39.620648451741104, "learning_rate": 1.4967077438498623e-07, "logits/chosen": 13.975259780883789, "logits/rejected": 14.471649169921875, "logps/chosen": -4.518828392028809, "logps/rejected": -4.886079788208008, "loss": 3.9877, "rewards/accuracies": 0.5, "rewards/chosen": -45.18828201293945, "rewards/margins": 3.6725149154663086, "rewards/rejected": -48.86079788208008, "step": 5462 }, { "epoch": 0.7438725490196079, "grad_norm": 41.05897440742215, "learning_rate": 1.4952249990372477e-07, "logits/chosen": 14.334524154663086, "logits/rejected": 14.568500518798828, "logps/chosen": -4.46463680267334, "logps/rejected": -4.476943492889404, "loss": 3.7924, "rewards/accuracies": 0.5, "rewards/chosen": -44.64636993408203, "rewards/margins": 0.1230611801147461, "rewards/rejected": -44.769432067871094, "step": 5463 }, { "epoch": 0.7440087145969498, "grad_norm": 41.46029233995478, "learning_rate": 1.4937428201992496e-07, "logits/chosen": 14.85123062133789, "logits/rejected": 14.84890079498291, "logps/chosen": -4.416190147399902, "logps/rejected": -4.530500411987305, "loss": 4.0504, "rewards/accuracies": 0.5, "rewards/chosen": -44.161903381347656, "rewards/margins": 1.1430978775024414, "rewards/rejected": -45.30500030517578, "step": 5464 }, { "epoch": 0.7441448801742919, "grad_norm": 42.43816256069886, "learning_rate": 1.4922612076707796e-07, "logits/chosen": 14.524377822875977, "logits/rejected": 14.200477600097656, "logps/chosen": -4.322498798370361, "logps/rejected": -4.58349609375, "loss": 3.8757, "rewards/accuracies": 0.75, "rewards/chosen": -43.2249870300293, "rewards/margins": 2.6099700927734375, "rewards/rejected": -45.8349609375, "step": 5465 }, { "epoch": 0.744281045751634, "grad_norm": 39.6466631512133, "learning_rate": 1.4907801617866173e-07, "logits/chosen": 13.307937622070312, "logits/rejected": 15.260871887207031, "logps/chosen": -4.212528228759766, "logps/rejected": -4.85319709777832, "loss": 3.9461, "rewards/accuracies": 0.75, "rewards/chosen": -42.12528610229492, "rewards/margins": 6.406688690185547, "rewards/rejected": -48.53197479248047, "step": 5466 }, { "epoch": 0.744417211328976, "grad_norm": 43.164059809644115, "learning_rate": 1.4892996828814188e-07, "logits/chosen": 13.911947250366211, "logits/rejected": 14.350330352783203, "logps/chosen": -4.315585613250732, "logps/rejected": -4.632193565368652, "loss": 4.3192, "rewards/accuracies": 0.75, "rewards/chosen": -43.15585708618164, "rewards/margins": 3.1660842895507812, "rewards/rejected": -46.32194137573242, "step": 5467 }, { "epoch": 0.7445533769063181, "grad_norm": 52.4640080650968, "learning_rate": 1.487819771289712e-07, "logits/chosen": 14.145822525024414, "logits/rejected": 14.15225887298584, "logps/chosen": -4.654589653015137, "logps/rejected": -4.666712760925293, "loss": 4.3136, "rewards/accuracies": 0.25, "rewards/chosen": -46.545902252197266, "rewards/margins": 0.1212301254272461, "rewards/rejected": -46.66712951660156, "step": 5468 }, { "epoch": 0.7446895424836601, "grad_norm": 43.59268627409482, "learning_rate": 1.4863404273458927e-07, "logits/chosen": 14.064834594726562, "logits/rejected": 14.273398399353027, "logps/chosen": -4.431260108947754, "logps/rejected": -4.754168510437012, "loss": 3.7942, "rewards/accuracies": 1.0, "rewards/chosen": -44.31260299682617, "rewards/margins": 3.229084014892578, "rewards/rejected": -47.54168701171875, "step": 5469 }, { "epoch": 0.7448257080610022, "grad_norm": 44.203725511394694, "learning_rate": 1.4848616513842317e-07, "logits/chosen": 13.995613098144531, "logits/rejected": 14.417478561401367, "logps/chosen": -4.7733869552612305, "logps/rejected": -4.9536237716674805, "loss": 3.7734, "rewards/accuracies": 0.5, "rewards/chosen": -47.73386764526367, "rewards/margins": 1.8023700714111328, "rewards/rejected": -49.53623962402344, "step": 5470 }, { "epoch": 0.7449618736383442, "grad_norm": 41.00519919211116, "learning_rate": 1.4833834437388722e-07, "logits/chosen": 14.404182434082031, "logits/rejected": 14.383390426635742, "logps/chosen": -4.844201564788818, "logps/rejected": -4.690946578979492, "loss": 4.4191, "rewards/accuracies": 0.25, "rewards/chosen": -48.4420166015625, "rewards/margins": -1.5325508117675781, "rewards/rejected": -46.90946578979492, "step": 5471 }, { "epoch": 0.7450980392156863, "grad_norm": 44.856162768598466, "learning_rate": 1.4819058047438251e-07, "logits/chosen": 14.575142860412598, "logits/rejected": 14.3739013671875, "logps/chosen": -4.681434631347656, "logps/rejected": -4.838815689086914, "loss": 4.6622, "rewards/accuracies": 0.75, "rewards/chosen": -46.81434631347656, "rewards/margins": 1.5738067626953125, "rewards/rejected": -48.388153076171875, "step": 5472 }, { "epoch": 0.7452342047930284, "grad_norm": 42.043173875094894, "learning_rate": 1.480428734732976e-07, "logits/chosen": 13.89484977722168, "logits/rejected": 14.297378540039062, "logps/chosen": -4.393206596374512, "logps/rejected": -4.5849761962890625, "loss": 3.5231, "rewards/accuracies": 0.75, "rewards/chosen": -43.93206787109375, "rewards/margins": 1.917694091796875, "rewards/rejected": -45.849761962890625, "step": 5473 }, { "epoch": 0.7453703703703703, "grad_norm": 40.86100556479041, "learning_rate": 1.4789522340400825e-07, "logits/chosen": 13.281518936157227, "logits/rejected": 14.086811065673828, "logps/chosen": -4.352956771850586, "logps/rejected": -4.680125713348389, "loss": 3.5265, "rewards/accuracies": 0.75, "rewards/chosen": -43.52956771850586, "rewards/margins": 3.2716922760009766, "rewards/rejected": -46.8012580871582, "step": 5474 }, { "epoch": 0.7455065359477124, "grad_norm": 43.76235260533669, "learning_rate": 1.4774763029987697e-07, "logits/chosen": 14.475574493408203, "logits/rejected": 15.061220169067383, "logps/chosen": -4.59256649017334, "logps/rejected": -4.983519554138184, "loss": 3.8851, "rewards/accuracies": 0.75, "rewards/chosen": -45.92566680908203, "rewards/margins": 3.909529685974121, "rewards/rejected": -49.83519744873047, "step": 5475 }, { "epoch": 0.7456427015250545, "grad_norm": 42.57958001613074, "learning_rate": 1.4760009419425377e-07, "logits/chosen": 13.846297264099121, "logits/rejected": 14.305641174316406, "logps/chosen": -4.612957954406738, "logps/rejected": -4.781691551208496, "loss": 3.9439, "rewards/accuracies": 0.5, "rewards/chosen": -46.12958526611328, "rewards/margins": 1.6873350143432617, "rewards/rejected": -47.816917419433594, "step": 5476 }, { "epoch": 0.7457788671023965, "grad_norm": 40.904747755675686, "learning_rate": 1.474526151204758e-07, "logits/chosen": 13.867679595947266, "logits/rejected": 13.997485160827637, "logps/chosen": -4.212551593780518, "logps/rejected": -4.419744491577148, "loss": 3.5811, "rewards/accuracies": 0.75, "rewards/chosen": -42.12551498413086, "rewards/margins": 2.0719261169433594, "rewards/rejected": -44.19744110107422, "step": 5477 }, { "epoch": 0.7459150326797386, "grad_norm": 41.934784586315914, "learning_rate": 1.4730519311186681e-07, "logits/chosen": 13.67491340637207, "logits/rejected": 14.193788528442383, "logps/chosen": -4.1614484786987305, "logps/rejected": -4.492530345916748, "loss": 3.6087, "rewards/accuracies": 0.5, "rewards/chosen": -41.61448669433594, "rewards/margins": 3.3108158111572266, "rewards/rejected": -44.92530059814453, "step": 5478 }, { "epoch": 0.7460511982570807, "grad_norm": 43.68776413767741, "learning_rate": 1.4715782820173832e-07, "logits/chosen": 13.245182991027832, "logits/rejected": 13.478450775146484, "logps/chosen": -4.2332024574279785, "logps/rejected": -4.349666118621826, "loss": 4.4251, "rewards/accuracies": 0.5, "rewards/chosen": -42.33202362060547, "rewards/margins": 1.1646394729614258, "rewards/rejected": -43.49666213989258, "step": 5479 }, { "epoch": 0.7461873638344226, "grad_norm": 41.140100320619524, "learning_rate": 1.4701052042338865e-07, "logits/chosen": 13.838552474975586, "logits/rejected": 14.423589706420898, "logps/chosen": -4.150210380554199, "logps/rejected": -4.4113264083862305, "loss": 3.8291, "rewards/accuracies": 0.75, "rewards/chosen": -41.50210189819336, "rewards/margins": 2.611164093017578, "rewards/rejected": -44.11326599121094, "step": 5480 }, { "epoch": 0.7463235294117647, "grad_norm": 41.83334955423377, "learning_rate": 1.4686326981010303e-07, "logits/chosen": 14.46352767944336, "logits/rejected": 14.652922630310059, "logps/chosen": -4.603726387023926, "logps/rejected": -4.709992408752441, "loss": 3.7518, "rewards/accuracies": 0.75, "rewards/chosen": -46.037261962890625, "rewards/margins": 1.0626583099365234, "rewards/rejected": -47.09992218017578, "step": 5481 }, { "epoch": 0.7464596949891068, "grad_norm": 67.89952477177037, "learning_rate": 1.4671607639515399e-07, "logits/chosen": 13.675872802734375, "logits/rejected": 13.809614181518555, "logps/chosen": -4.560294151306152, "logps/rejected": -4.847890377044678, "loss": 3.9153, "rewards/accuracies": 1.0, "rewards/chosen": -45.60293960571289, "rewards/margins": 2.875965118408203, "rewards/rejected": -48.478904724121094, "step": 5482 }, { "epoch": 0.7465958605664488, "grad_norm": 40.17327432750419, "learning_rate": 1.4656894021180116e-07, "logits/chosen": 13.596738815307617, "logits/rejected": 13.908065795898438, "logps/chosen": -4.530068397521973, "logps/rejected": -4.598305702209473, "loss": 3.7608, "rewards/accuracies": 0.5, "rewards/chosen": -45.300682067871094, "rewards/margins": 0.6823768615722656, "rewards/rejected": -45.98305892944336, "step": 5483 }, { "epoch": 0.7467320261437909, "grad_norm": 39.64231587291956, "learning_rate": 1.4642186129329134e-07, "logits/chosen": 14.083690643310547, "logits/rejected": 15.32321548461914, "logps/chosen": -4.387611389160156, "logps/rejected": -5.012521743774414, "loss": 4.0385, "rewards/accuracies": 0.75, "rewards/chosen": -43.87611389160156, "rewards/margins": 6.249102592468262, "rewards/rejected": -50.125213623046875, "step": 5484 }, { "epoch": 0.746868191721133, "grad_norm": 40.02834215535601, "learning_rate": 1.462748396728579e-07, "logits/chosen": 13.732662200927734, "logits/rejected": 13.513957977294922, "logps/chosen": -4.496959209442139, "logps/rejected": -4.387691497802734, "loss": 3.9452, "rewards/accuracies": 0.25, "rewards/chosen": -44.96958923339844, "rewards/margins": -1.0926742553710938, "rewards/rejected": -43.876914978027344, "step": 5485 }, { "epoch": 0.7470043572984749, "grad_norm": 42.19533871963231, "learning_rate": 1.4612787538372175e-07, "logits/chosen": 14.226810455322266, "logits/rejected": 14.487493515014648, "logps/chosen": -4.381974697113037, "logps/rejected": -4.870886325836182, "loss": 4.1478, "rewards/accuracies": 0.75, "rewards/chosen": -43.81974792480469, "rewards/margins": 4.889113426208496, "rewards/rejected": -48.7088623046875, "step": 5486 }, { "epoch": 0.747140522875817, "grad_norm": 39.33478737065601, "learning_rate": 1.4598096845909086e-07, "logits/chosen": 13.512158393859863, "logits/rejected": 14.954951286315918, "logps/chosen": -4.390745162963867, "logps/rejected": -4.652350902557373, "loss": 3.9157, "rewards/accuracies": 0.75, "rewards/chosen": -43.907447814941406, "rewards/margins": 2.616060256958008, "rewards/rejected": -46.52350997924805, "step": 5487 }, { "epoch": 0.7472766884531591, "grad_norm": 44.737185512700165, "learning_rate": 1.458341189321597e-07, "logits/chosen": 14.307289123535156, "logits/rejected": 13.825435638427734, "logps/chosen": -4.522227764129639, "logps/rejected": -4.53463077545166, "loss": 4.3278, "rewards/accuracies": 0.5, "rewards/chosen": -45.22228240966797, "rewards/margins": 0.12402725219726562, "rewards/rejected": -45.34630584716797, "step": 5488 }, { "epoch": 0.7474128540305011, "grad_norm": 38.58675468630578, "learning_rate": 1.4568732683611034e-07, "logits/chosen": 13.406570434570312, "logits/rejected": 13.859930038452148, "logps/chosen": -3.9043617248535156, "logps/rejected": -4.230447769165039, "loss": 3.8642, "rewards/accuracies": 0.75, "rewards/chosen": -39.043617248535156, "rewards/margins": 3.260859489440918, "rewards/rejected": -42.30447769165039, "step": 5489 }, { "epoch": 0.7475490196078431, "grad_norm": 42.157404583831145, "learning_rate": 1.4554059220411167e-07, "logits/chosen": 13.617074966430664, "logits/rejected": 13.817005157470703, "logps/chosen": -4.337679862976074, "logps/rejected": -4.474129676818848, "loss": 3.7129, "rewards/accuracies": 0.5, "rewards/chosen": -43.37679672241211, "rewards/margins": 1.3644990921020508, "rewards/rejected": -44.741294860839844, "step": 5490 }, { "epoch": 0.7476851851851852, "grad_norm": 41.8394641546543, "learning_rate": 1.4539391506931971e-07, "logits/chosen": 14.86436653137207, "logits/rejected": 14.178144454956055, "logps/chosen": -4.660725116729736, "logps/rejected": -4.662013053894043, "loss": 4.1708, "rewards/accuracies": 0.25, "rewards/chosen": -46.60725402832031, "rewards/margins": 0.012876510620117188, "rewards/rejected": -46.62012481689453, "step": 5491 }, { "epoch": 0.7478213507625272, "grad_norm": 40.861602339781804, "learning_rate": 1.4524729546487708e-07, "logits/chosen": 15.116802215576172, "logits/rejected": 14.85765266418457, "logps/chosen": -4.402488708496094, "logps/rejected": -4.586627960205078, "loss": 4.4001, "rewards/accuracies": 0.75, "rewards/chosen": -44.02488708496094, "rewards/margins": 1.8413972854614258, "rewards/rejected": -45.86628341674805, "step": 5492 }, { "epoch": 0.7479575163398693, "grad_norm": 39.86867701484123, "learning_rate": 1.4510073342391387e-07, "logits/chosen": 14.915151596069336, "logits/rejected": 14.151102066040039, "logps/chosen": -4.490964412689209, "logps/rejected": -4.331984043121338, "loss": 3.7022, "rewards/accuracies": 0.5, "rewards/chosen": -44.909645080566406, "rewards/margins": -1.5898017883300781, "rewards/rejected": -43.31984329223633, "step": 5493 }, { "epoch": 0.7480936819172114, "grad_norm": 39.32501536514187, "learning_rate": 1.4495422897954707e-07, "logits/chosen": 13.850617408752441, "logits/rejected": 15.145637512207031, "logps/chosen": -4.317248344421387, "logps/rejected": -4.943749904632568, "loss": 4.0399, "rewards/accuracies": 1.0, "rewards/chosen": -43.172489166259766, "rewards/margins": 6.265012741088867, "rewards/rejected": -49.4375, "step": 5494 }, { "epoch": 0.7482298474945533, "grad_norm": 42.884324222859036, "learning_rate": 1.4480778216488032e-07, "logits/chosen": 15.015640258789062, "logits/rejected": 15.12234115600586, "logps/chosen": -4.230620384216309, "logps/rejected": -4.596403121948242, "loss": 3.9191, "rewards/accuracies": 0.75, "rewards/chosen": -42.30620574951172, "rewards/margins": 3.657827377319336, "rewards/rejected": -45.964027404785156, "step": 5495 }, { "epoch": 0.7483660130718954, "grad_norm": 39.033217023742964, "learning_rate": 1.4466139301300468e-07, "logits/chosen": 14.593673706054688, "logits/rejected": 13.930654525756836, "logps/chosen": -4.448700904846191, "logps/rejected": -4.322912693023682, "loss": 3.9081, "rewards/accuracies": 0.25, "rewards/chosen": -44.48700714111328, "rewards/margins": -1.257878303527832, "rewards/rejected": -43.2291259765625, "step": 5496 }, { "epoch": 0.7485021786492375, "grad_norm": 38.95949133868023, "learning_rate": 1.4451506155699788e-07, "logits/chosen": 14.477167129516602, "logits/rejected": 14.248964309692383, "logps/chosen": -4.445303916931152, "logps/rejected": -4.359190940856934, "loss": 3.4695, "rewards/accuracies": 0.25, "rewards/chosen": -44.453041076660156, "rewards/margins": -0.8611316680908203, "rewards/rejected": -43.59191131591797, "step": 5497 }, { "epoch": 0.7486383442265795, "grad_norm": 39.79550733729155, "learning_rate": 1.4436878782992496e-07, "logits/chosen": 14.272640228271484, "logits/rejected": 15.045780181884766, "logps/chosen": -4.500863075256348, "logps/rejected": -4.890792369842529, "loss": 3.6594, "rewards/accuracies": 0.75, "rewards/chosen": -45.008628845214844, "rewards/margins": 3.899293899536133, "rewards/rejected": -48.907928466796875, "step": 5498 }, { "epoch": 0.7487745098039216, "grad_norm": 40.88187422143812, "learning_rate": 1.4422257186483733e-07, "logits/chosen": 14.812686920166016, "logits/rejected": 14.586309432983398, "logps/chosen": -4.483715057373047, "logps/rejected": -4.656651496887207, "loss": 3.6229, "rewards/accuracies": 0.25, "rewards/chosen": -44.83715057373047, "rewards/margins": 1.7293634414672852, "rewards/rejected": -46.56651306152344, "step": 5499 }, { "epoch": 0.7489106753812637, "grad_norm": 39.485960898674705, "learning_rate": 1.440764136947739e-07, "logits/chosen": 14.372482299804688, "logits/rejected": 14.785295486450195, "logps/chosen": -4.000829696655273, "logps/rejected": -4.485873222351074, "loss": 3.6036, "rewards/accuracies": 1.0, "rewards/chosen": -40.00829315185547, "rewards/margins": 4.85044002532959, "rewards/rejected": -44.858734130859375, "step": 5500 }, { "epoch": 0.7490468409586056, "grad_norm": 43.777710217087325, "learning_rate": 1.4393031335276039e-07, "logits/chosen": 13.278608322143555, "logits/rejected": 13.94806957244873, "logps/chosen": -4.200552940368652, "logps/rejected": -4.629427909851074, "loss": 4.5264, "rewards/accuracies": 0.75, "rewards/chosen": -42.00552749633789, "rewards/margins": 4.288753509521484, "rewards/rejected": -46.294281005859375, "step": 5501 }, { "epoch": 0.7491830065359477, "grad_norm": 33.867837445791075, "learning_rate": 1.4378427087180915e-07, "logits/chosen": 13.123775482177734, "logits/rejected": 13.415956497192383, "logps/chosen": -4.230567932128906, "logps/rejected": -4.332413673400879, "loss": 3.4242, "rewards/accuracies": 0.5, "rewards/chosen": -42.3056755065918, "rewards/margins": 1.018458366394043, "rewards/rejected": -43.324134826660156, "step": 5502 }, { "epoch": 0.7493191721132898, "grad_norm": 41.96574093352921, "learning_rate": 1.4363828628491982e-07, "logits/chosen": 14.251585006713867, "logits/rejected": 14.183704376220703, "logps/chosen": -4.825356960296631, "logps/rejected": -4.506234169006348, "loss": 3.9893, "rewards/accuracies": 0.25, "rewards/chosen": -48.253570556640625, "rewards/margins": -3.191229820251465, "rewards/rejected": -45.062339782714844, "step": 5503 }, { "epoch": 0.7494553376906318, "grad_norm": 40.05083836879676, "learning_rate": 1.434923596250789e-07, "logits/chosen": 13.68835735321045, "logits/rejected": 13.782779693603516, "logps/chosen": -4.269153594970703, "logps/rejected": -4.195443153381348, "loss": 4.1759, "rewards/accuracies": 0.75, "rewards/chosen": -42.69153594970703, "rewards/margins": -0.7371015548706055, "rewards/rejected": -41.954437255859375, "step": 5504 }, { "epoch": 0.7495915032679739, "grad_norm": 39.600524863083834, "learning_rate": 1.4334649092525956e-07, "logits/chosen": 14.598625183105469, "logits/rejected": 14.755960464477539, "logps/chosen": -4.467144966125488, "logps/rejected": -4.716283798217773, "loss": 4.0018, "rewards/accuracies": 0.75, "rewards/chosen": -44.671451568603516, "rewards/margins": 2.4913835525512695, "rewards/rejected": -47.16283416748047, "step": 5505 }, { "epoch": 0.7497276688453159, "grad_norm": 39.73723071216584, "learning_rate": 1.4320068021842207e-07, "logits/chosen": 14.574586868286133, "logits/rejected": 14.740768432617188, "logps/chosen": -4.394405364990234, "logps/rejected": -4.709287166595459, "loss": 3.6067, "rewards/accuracies": 0.5, "rewards/chosen": -43.944053649902344, "rewards/margins": 3.1488208770751953, "rewards/rejected": -47.092872619628906, "step": 5506 }, { "epoch": 0.7498638344226579, "grad_norm": 38.42243243844884, "learning_rate": 1.4305492753751377e-07, "logits/chosen": 14.249143600463867, "logits/rejected": 14.569601058959961, "logps/chosen": -4.550336837768555, "logps/rejected": -4.87770938873291, "loss": 3.9277, "rewards/accuracies": 0.75, "rewards/chosen": -45.50336837768555, "rewards/margins": 3.2737245559692383, "rewards/rejected": -48.777095794677734, "step": 5507 }, { "epoch": 0.75, "grad_norm": 36.709489361462936, "learning_rate": 1.4290923291546836e-07, "logits/chosen": 13.192573547363281, "logits/rejected": 13.357521057128906, "logps/chosen": -4.054897308349609, "logps/rejected": -4.324382305145264, "loss": 3.7811, "rewards/accuracies": 1.0, "rewards/chosen": -40.548973083496094, "rewards/margins": 2.6948471069335938, "rewards/rejected": -43.24382019042969, "step": 5508 }, { "epoch": 0.7501361655773421, "grad_norm": 43.44456258213446, "learning_rate": 1.4276359638520693e-07, "logits/chosen": 14.327085494995117, "logits/rejected": 13.591224670410156, "logps/chosen": -4.3548712730407715, "logps/rejected": -4.276066303253174, "loss": 4.0243, "rewards/accuracies": 0.5, "rewards/chosen": -43.54871368408203, "rewards/margins": -0.7880487442016602, "rewards/rejected": -42.76066589355469, "step": 5509 }, { "epoch": 0.7502723311546841, "grad_norm": 44.82541902674374, "learning_rate": 1.4261801797963725e-07, "logits/chosen": 14.428001403808594, "logits/rejected": 13.85183334350586, "logps/chosen": -4.571253776550293, "logps/rejected": -4.484854698181152, "loss": 4.5189, "rewards/accuracies": 0.25, "rewards/chosen": -45.7125358581543, "rewards/margins": -0.8639888763427734, "rewards/rejected": -44.84854507446289, "step": 5510 }, { "epoch": 0.7504084967320261, "grad_norm": 35.87448990880821, "learning_rate": 1.4247249773165405e-07, "logits/chosen": 14.207711219787598, "logits/rejected": 14.748113632202148, "logps/chosen": -4.473525047302246, "logps/rejected": -4.663022518157959, "loss": 3.4798, "rewards/accuracies": 0.75, "rewards/chosen": -44.735252380371094, "rewards/margins": 1.894974708557129, "rewards/rejected": -46.63022232055664, "step": 5511 }, { "epoch": 0.7505446623093682, "grad_norm": 46.27308375021961, "learning_rate": 1.4232703567413862e-07, "logits/chosen": 13.438047409057617, "logits/rejected": 14.264745712280273, "logps/chosen": -4.3133087158203125, "logps/rejected": -4.773672580718994, "loss": 4.0586, "rewards/accuracies": 1.0, "rewards/chosen": -43.133087158203125, "rewards/margins": 4.603636741638184, "rewards/rejected": -47.736724853515625, "step": 5512 }, { "epoch": 0.7506808278867102, "grad_norm": 37.332271423509795, "learning_rate": 1.4218163183995938e-07, "logits/chosen": 14.443656921386719, "logits/rejected": 15.497520446777344, "logps/chosen": -4.727343559265137, "logps/rejected": -5.074961185455322, "loss": 3.7962, "rewards/accuracies": 1.0, "rewards/chosen": -47.2734375, "rewards/margins": 3.476177215576172, "rewards/rejected": -50.74961471557617, "step": 5513 }, { "epoch": 0.7508169934640523, "grad_norm": 39.2585446466546, "learning_rate": 1.4203628626197177e-07, "logits/chosen": 14.153648376464844, "logits/rejected": 14.458501815795898, "logps/chosen": -4.329824924468994, "logps/rejected": -4.6001763343811035, "loss": 4.0708, "rewards/accuracies": 0.75, "rewards/chosen": -43.298248291015625, "rewards/margins": 2.7035140991210938, "rewards/rejected": -46.00176239013672, "step": 5514 }, { "epoch": 0.7509531590413944, "grad_norm": 39.29504407154993, "learning_rate": 1.4189099897301743e-07, "logits/chosen": 15.28030014038086, "logits/rejected": 14.634967803955078, "logps/chosen": -5.04612922668457, "logps/rejected": -4.994806289672852, "loss": 3.9199, "rewards/accuracies": 0.5, "rewards/chosen": -50.4612922668457, "rewards/margins": -0.5132293701171875, "rewards/rejected": -49.948062896728516, "step": 5515 }, { "epoch": 0.7510893246187363, "grad_norm": 42.58160261384557, "learning_rate": 1.4174577000592546e-07, "logits/chosen": 14.152233123779297, "logits/rejected": 13.526845932006836, "logps/chosen": -4.186313152313232, "logps/rejected": -4.1715497970581055, "loss": 4.2324, "rewards/accuracies": 0.5, "rewards/chosen": -41.863128662109375, "rewards/margins": -0.1476278305053711, "rewards/rejected": -41.71549987792969, "step": 5516 }, { "epoch": 0.7512254901960784, "grad_norm": 37.054310140252554, "learning_rate": 1.4160059939351165e-07, "logits/chosen": 13.761383056640625, "logits/rejected": 13.964272499084473, "logps/chosen": -4.078879356384277, "logps/rejected": -4.471101760864258, "loss": 3.6365, "rewards/accuracies": 0.75, "rewards/chosen": -40.78879165649414, "rewards/margins": 3.922226905822754, "rewards/rejected": -44.71101760864258, "step": 5517 }, { "epoch": 0.7513616557734205, "grad_norm": 38.42598460350479, "learning_rate": 1.4145548716857826e-07, "logits/chosen": 13.553022384643555, "logits/rejected": 14.41085147857666, "logps/chosen": -4.291335582733154, "logps/rejected": -4.630054473876953, "loss": 4.1539, "rewards/accuracies": 1.0, "rewards/chosen": -42.91335678100586, "rewards/margins": 3.387190818786621, "rewards/rejected": -46.30054473876953, "step": 5518 }, { "epoch": 0.7514978213507625, "grad_norm": 38.31003171029563, "learning_rate": 1.4131043336391462e-07, "logits/chosen": 14.439321517944336, "logits/rejected": 14.197421073913574, "logps/chosen": -4.7654218673706055, "logps/rejected": -4.730215549468994, "loss": 4.2016, "rewards/accuracies": 0.25, "rewards/chosen": -47.65421676635742, "rewards/margins": -0.35206127166748047, "rewards/rejected": -47.302154541015625, "step": 5519 }, { "epoch": 0.7516339869281046, "grad_norm": 46.61855134380285, "learning_rate": 1.4116543801229707e-07, "logits/chosen": 13.574975967407227, "logits/rejected": 13.371366500854492, "logps/chosen": -4.0654826164245605, "logps/rejected": -4.118636131286621, "loss": 4.3299, "rewards/accuracies": 0.75, "rewards/chosen": -40.65482711791992, "rewards/margins": 0.5315370559692383, "rewards/rejected": -41.186363220214844, "step": 5520 }, { "epoch": 0.7517701525054467, "grad_norm": 44.56644420391031, "learning_rate": 1.4102050114648823e-07, "logits/chosen": 14.521347045898438, "logits/rejected": 14.342111587524414, "logps/chosen": -4.42546272277832, "logps/rejected": -4.258014678955078, "loss": 4.3341, "rewards/accuracies": 0.25, "rewards/chosen": -44.25463104248047, "rewards/margins": -1.6744804382324219, "rewards/rejected": -42.58014678955078, "step": 5521 }, { "epoch": 0.7519063180827886, "grad_norm": 46.455367914458826, "learning_rate": 1.408756227992379e-07, "logits/chosen": 13.155077934265137, "logits/rejected": 14.294160842895508, "logps/chosen": -4.088178634643555, "logps/rejected": -4.726064682006836, "loss": 3.7146, "rewards/accuracies": 1.0, "rewards/chosen": -40.88178634643555, "rewards/margins": 6.3788604736328125, "rewards/rejected": -47.260650634765625, "step": 5522 }, { "epoch": 0.7520424836601307, "grad_norm": 38.47637380009571, "learning_rate": 1.407308030032827e-07, "logits/chosen": 14.827608108520508, "logits/rejected": 14.797348022460938, "logps/chosen": -4.596731185913086, "logps/rejected": -5.0178303718566895, "loss": 4.2708, "rewards/accuracies": 0.75, "rewards/chosen": -45.967315673828125, "rewards/margins": 4.210989952087402, "rewards/rejected": -50.178306579589844, "step": 5523 }, { "epoch": 0.7521786492374728, "grad_norm": 42.96725761391438, "learning_rate": 1.405860417913455e-07, "logits/chosen": 13.853424072265625, "logits/rejected": 14.453384399414062, "logps/chosen": -4.157449245452881, "logps/rejected": -4.510708808898926, "loss": 4.1573, "rewards/accuracies": 0.75, "rewards/chosen": -41.57448959350586, "rewards/margins": 3.5325984954833984, "rewards/rejected": -45.107086181640625, "step": 5524 }, { "epoch": 0.7523148148148148, "grad_norm": 39.35738349906755, "learning_rate": 1.4044133919613653e-07, "logits/chosen": 13.658827781677246, "logits/rejected": 14.263239860534668, "logps/chosen": -4.335976600646973, "logps/rejected": -4.595341205596924, "loss": 4.3898, "rewards/accuracies": 0.75, "rewards/chosen": -43.359764099121094, "rewards/margins": 2.5936460494995117, "rewards/rejected": -45.95341110229492, "step": 5525 }, { "epoch": 0.7524509803921569, "grad_norm": 40.29563720863122, "learning_rate": 1.4029669525035264e-07, "logits/chosen": 13.429285049438477, "logits/rejected": 13.60908317565918, "logps/chosen": -4.513812065124512, "logps/rejected": -4.456014633178711, "loss": 4.2528, "rewards/accuracies": 0.5, "rewards/chosen": -45.13812255859375, "rewards/margins": -0.577977180480957, "rewards/rejected": -44.56014633178711, "step": 5526 }, { "epoch": 0.7525871459694989, "grad_norm": 40.06618776833967, "learning_rate": 1.4015210998667707e-07, "logits/chosen": 13.872400283813477, "logits/rejected": 14.689756393432617, "logps/chosen": -4.360696792602539, "logps/rejected": -4.793100833892822, "loss": 3.822, "rewards/accuracies": 1.0, "rewards/chosen": -43.60696792602539, "rewards/margins": 4.324040412902832, "rewards/rejected": -47.931007385253906, "step": 5527 }, { "epoch": 0.7527233115468409, "grad_norm": 39.578742851811704, "learning_rate": 1.4000758343778015e-07, "logits/chosen": 14.563797950744629, "logits/rejected": 14.590227127075195, "logps/chosen": -4.496066093444824, "logps/rejected": -4.705878257751465, "loss": 4.0346, "rewards/accuracies": 0.75, "rewards/chosen": -44.960662841796875, "rewards/margins": 2.0981197357177734, "rewards/rejected": -47.05878448486328, "step": 5528 }, { "epoch": 0.752859477124183, "grad_norm": 38.80976990997057, "learning_rate": 1.3986311563631903e-07, "logits/chosen": 14.562171936035156, "logits/rejected": 14.47193717956543, "logps/chosen": -4.100503444671631, "logps/rejected": -4.497201442718506, "loss": 4.1901, "rewards/accuracies": 1.0, "rewards/chosen": -41.005035400390625, "rewards/margins": 3.9669809341430664, "rewards/rejected": -44.972015380859375, "step": 5529 }, { "epoch": 0.7529956427015251, "grad_norm": 37.97700867541894, "learning_rate": 1.397187066149371e-07, "logits/chosen": 14.894495964050293, "logits/rejected": 14.687222480773926, "logps/chosen": -4.659625053405762, "logps/rejected": -4.672845840454102, "loss": 4.1792, "rewards/accuracies": 0.25, "rewards/chosen": -46.59625244140625, "rewards/margins": 0.13220787048339844, "rewards/rejected": -46.72845458984375, "step": 5530 }, { "epoch": 0.753131808278867, "grad_norm": 42.90343769066371, "learning_rate": 1.395743564062649e-07, "logits/chosen": 13.622163772583008, "logits/rejected": 14.590326309204102, "logps/chosen": -4.16663932800293, "logps/rejected": -4.872332572937012, "loss": 4.0704, "rewards/accuracies": 0.75, "rewards/chosen": -41.66638946533203, "rewards/margins": 7.056931495666504, "rewards/rejected": -48.72332000732422, "step": 5531 }, { "epoch": 0.7532679738562091, "grad_norm": 45.78838666374832, "learning_rate": 1.3943006504291968e-07, "logits/chosen": 14.682191848754883, "logits/rejected": 14.87435531616211, "logps/chosen": -4.604864120483398, "logps/rejected": -4.662632942199707, "loss": 4.6221, "rewards/accuracies": 0.75, "rewards/chosen": -46.048641204833984, "rewards/margins": 0.5776901245117188, "rewards/rejected": -46.62632751464844, "step": 5532 }, { "epoch": 0.7534041394335512, "grad_norm": 40.21688487595041, "learning_rate": 1.392858325575051e-07, "logits/chosen": 13.175207138061523, "logits/rejected": 14.110506057739258, "logps/chosen": -4.145374298095703, "logps/rejected": -4.461649417877197, "loss": 3.9702, "rewards/accuracies": 1.0, "rewards/chosen": -41.45374298095703, "rewards/margins": 3.162750244140625, "rewards/rejected": -44.616493225097656, "step": 5533 }, { "epoch": 0.7535403050108932, "grad_norm": 43.475789139926704, "learning_rate": 1.3914165898261168e-07, "logits/chosen": 14.115148544311523, "logits/rejected": 14.647530555725098, "logps/chosen": -4.6144328117370605, "logps/rejected": -4.448105812072754, "loss": 3.7104, "rewards/accuracies": 0.25, "rewards/chosen": -46.14432907104492, "rewards/margins": -1.6632671356201172, "rewards/rejected": -44.48106384277344, "step": 5534 }, { "epoch": 0.7536764705882353, "grad_norm": 35.660357680150994, "learning_rate": 1.3899754435081685e-07, "logits/chosen": 13.762713432312012, "logits/rejected": 13.963878631591797, "logps/chosen": -4.296594142913818, "logps/rejected": -4.596307754516602, "loss": 3.9963, "rewards/accuracies": 0.75, "rewards/chosen": -42.965938568115234, "rewards/margins": 2.9971389770507812, "rewards/rejected": -45.96308135986328, "step": 5535 }, { "epoch": 0.7538126361655774, "grad_norm": 45.26975712282914, "learning_rate": 1.388534886946842e-07, "logits/chosen": 13.525140762329102, "logits/rejected": 14.041519165039062, "logps/chosen": -4.285723686218262, "logps/rejected": -4.63303804397583, "loss": 4.2862, "rewards/accuracies": 0.75, "rewards/chosen": -42.857242584228516, "rewards/margins": 3.473142623901367, "rewards/rejected": -46.33038330078125, "step": 5536 }, { "epoch": 0.7539488017429193, "grad_norm": 42.31633531917794, "learning_rate": 1.387094920467644e-07, "logits/chosen": 14.535494804382324, "logits/rejected": 14.928598403930664, "logps/chosen": -4.304498672485352, "logps/rejected": -4.626731872558594, "loss": 3.4786, "rewards/accuracies": 0.75, "rewards/chosen": -43.04498291015625, "rewards/margins": 3.2223339080810547, "rewards/rejected": -46.26731872558594, "step": 5537 }, { "epoch": 0.7540849673202614, "grad_norm": 40.20399858145811, "learning_rate": 1.385655544395949e-07, "logits/chosen": 14.291690826416016, "logits/rejected": 15.306032180786133, "logps/chosen": -4.3918375968933105, "logps/rejected": -4.838470458984375, "loss": 3.4688, "rewards/accuracies": 0.75, "rewards/chosen": -43.91837692260742, "rewards/margins": 4.466329574584961, "rewards/rejected": -48.38470458984375, "step": 5538 }, { "epoch": 0.7542211328976035, "grad_norm": 37.610621609716226, "learning_rate": 1.384216759056993e-07, "logits/chosen": 13.635034561157227, "logits/rejected": 14.399577140808105, "logps/chosen": -4.514514446258545, "logps/rejected": -4.674550533294678, "loss": 3.6536, "rewards/accuracies": 0.75, "rewards/chosen": -45.145145416259766, "rewards/margins": 1.600358009338379, "rewards/rejected": -46.745506286621094, "step": 5539 }, { "epoch": 0.7543572984749455, "grad_norm": 37.78325320116517, "learning_rate": 1.382778564775882e-07, "logits/chosen": 14.800235748291016, "logits/rejected": 14.942176818847656, "logps/chosen": -4.604762077331543, "logps/rejected": -4.824490547180176, "loss": 4.1975, "rewards/accuracies": 0.75, "rewards/chosen": -46.04762268066406, "rewards/margins": 2.1972827911376953, "rewards/rejected": -48.244903564453125, "step": 5540 }, { "epoch": 0.7544934640522876, "grad_norm": 38.742242421202, "learning_rate": 1.3813409618775903e-07, "logits/chosen": 14.04092025756836, "logits/rejected": 13.532662391662598, "logps/chosen": -4.7499284744262695, "logps/rejected": -4.470659255981445, "loss": 4.1155, "rewards/accuracies": 0.5, "rewards/chosen": -47.49928283691406, "rewards/margins": -2.7926902770996094, "rewards/rejected": -44.70659637451172, "step": 5541 }, { "epoch": 0.7546296296296297, "grad_norm": 43.87458773402077, "learning_rate": 1.3799039506869528e-07, "logits/chosen": 14.277127265930176, "logits/rejected": 15.052114486694336, "logps/chosen": -4.673418998718262, "logps/rejected": -4.936436653137207, "loss": 3.4527, "rewards/accuracies": 0.75, "rewards/chosen": -46.73418426513672, "rewards/margins": 2.630176544189453, "rewards/rejected": -49.36436462402344, "step": 5542 }, { "epoch": 0.7547657952069716, "grad_norm": 39.71454210705974, "learning_rate": 1.3784675315286754e-07, "logits/chosen": 14.50777530670166, "logits/rejected": 14.619012832641602, "logps/chosen": -4.570746421813965, "logps/rejected": -4.4710187911987305, "loss": 4.307, "rewards/accuracies": 0.5, "rewards/chosen": -45.707462310791016, "rewards/margins": -0.9972772598266602, "rewards/rejected": -44.71018600463867, "step": 5543 }, { "epoch": 0.7549019607843137, "grad_norm": 37.9442436155312, "learning_rate": 1.3770317047273307e-07, "logits/chosen": 14.051642417907715, "logits/rejected": 14.384977340698242, "logps/chosen": -4.364198684692383, "logps/rejected": -4.357058048248291, "loss": 3.7297, "rewards/accuracies": 0.5, "rewards/chosen": -43.64198303222656, "rewards/margins": -0.07140159606933594, "rewards/rejected": -43.570579528808594, "step": 5544 }, { "epoch": 0.7550381263616558, "grad_norm": 36.76797981781835, "learning_rate": 1.3755964706073524e-07, "logits/chosen": 13.783113479614258, "logits/rejected": 14.291996002197266, "logps/chosen": -4.340537071228027, "logps/rejected": -4.591108322143555, "loss": 3.5195, "rewards/accuracies": 0.75, "rewards/chosen": -43.40536880493164, "rewards/margins": 2.5057125091552734, "rewards/rejected": -45.91107940673828, "step": 5545 }, { "epoch": 0.7551742919389978, "grad_norm": 39.976909212518066, "learning_rate": 1.3741618294930452e-07, "logits/chosen": 14.860204696655273, "logits/rejected": 14.746339797973633, "logps/chosen": -4.830165863037109, "logps/rejected": -4.811246395111084, "loss": 3.6954, "rewards/accuracies": 0.5, "rewards/chosen": -48.301658630371094, "rewards/margins": -0.1891937255859375, "rewards/rejected": -48.112464904785156, "step": 5546 }, { "epoch": 0.7553104575163399, "grad_norm": 37.43054901893348, "learning_rate": 1.3727277817085793e-07, "logits/chosen": 13.879142761230469, "logits/rejected": 14.018991470336914, "logps/chosen": -4.466986179351807, "logps/rejected": -4.607525825500488, "loss": 4.1599, "rewards/accuracies": 0.5, "rewards/chosen": -44.66986083984375, "rewards/margins": 1.4053974151611328, "rewards/rejected": -46.075260162353516, "step": 5547 }, { "epoch": 0.7554466230936819, "grad_norm": 42.26612177578258, "learning_rate": 1.371294327577987e-07, "logits/chosen": 14.13011360168457, "logits/rejected": 13.918987274169922, "logps/chosen": -4.291253089904785, "logps/rejected": -4.30405855178833, "loss": 3.8718, "rewards/accuracies": 0.5, "rewards/chosen": -42.91252899169922, "rewards/margins": 0.12806034088134766, "rewards/rejected": -43.04058837890625, "step": 5548 }, { "epoch": 0.755582788671024, "grad_norm": 39.3792735996203, "learning_rate": 1.3698614674251708e-07, "logits/chosen": 13.498214721679688, "logits/rejected": 14.448673248291016, "logps/chosen": -4.15938663482666, "logps/rejected": -4.899319171905518, "loss": 3.8472, "rewards/accuracies": 0.75, "rewards/chosen": -41.59386444091797, "rewards/margins": 7.39932918548584, "rewards/rejected": -48.993194580078125, "step": 5549 }, { "epoch": 0.755718954248366, "grad_norm": 39.74123695444384, "learning_rate": 1.3684292015738982e-07, "logits/chosen": 13.869915962219238, "logits/rejected": 14.339401245117188, "logps/chosen": -4.407602310180664, "logps/rejected": -4.667283058166504, "loss": 3.5782, "rewards/accuracies": 0.5, "rewards/chosen": -44.07602310180664, "rewards/margins": 2.596808433532715, "rewards/rejected": -46.672828674316406, "step": 5550 }, { "epoch": 0.7558551198257081, "grad_norm": 41.08241585841069, "learning_rate": 1.366997530347799e-07, "logits/chosen": 14.687314987182617, "logits/rejected": 15.029864311218262, "logps/chosen": -4.731919288635254, "logps/rejected": -4.829631805419922, "loss": 4.1758, "rewards/accuracies": 0.5, "rewards/chosen": -47.319190979003906, "rewards/margins": 0.9771289825439453, "rewards/rejected": -48.29631805419922, "step": 5551 }, { "epoch": 0.7559912854030502, "grad_norm": 41.616413760311545, "learning_rate": 1.3655664540703722e-07, "logits/chosen": 14.513629913330078, "logits/rejected": 14.517251968383789, "logps/chosen": -5.012890815734863, "logps/rejected": -4.896389961242676, "loss": 3.9711, "rewards/accuracies": 0.75, "rewards/chosen": -50.12890625, "rewards/margins": -1.1650075912475586, "rewards/rejected": -48.963897705078125, "step": 5552 }, { "epoch": 0.7561274509803921, "grad_norm": 38.90142728338876, "learning_rate": 1.3641359730649828e-07, "logits/chosen": 13.8802490234375, "logits/rejected": 13.853185653686523, "logps/chosen": -4.400108337402344, "logps/rejected": -4.390847206115723, "loss": 3.9357, "rewards/accuracies": 0.5, "rewards/chosen": -44.00108337402344, "rewards/margins": -0.09261131286621094, "rewards/rejected": -43.908470153808594, "step": 5553 }, { "epoch": 0.7562636165577342, "grad_norm": 41.4903502516843, "learning_rate": 1.3627060876548572e-07, "logits/chosen": 14.542795181274414, "logits/rejected": 14.327238082885742, "logps/chosen": -4.508077621459961, "logps/rejected": -4.6259260177612305, "loss": 4.0185, "rewards/accuracies": 0.5, "rewards/chosen": -45.080780029296875, "rewards/margins": 1.1784820556640625, "rewards/rejected": -46.25925827026367, "step": 5554 }, { "epoch": 0.7563997821350763, "grad_norm": 40.65767927133919, "learning_rate": 1.3612767981630917e-07, "logits/chosen": 14.060426712036133, "logits/rejected": 14.332279205322266, "logps/chosen": -4.671550750732422, "logps/rejected": -4.807744026184082, "loss": 4.0743, "rewards/accuracies": 0.5, "rewards/chosen": -46.71550750732422, "rewards/margins": 1.3619318008422852, "rewards/rejected": -48.07743835449219, "step": 5555 }, { "epoch": 0.7565359477124183, "grad_norm": 39.990564650778694, "learning_rate": 1.3598481049126464e-07, "logits/chosen": 14.128091812133789, "logits/rejected": 13.986690521240234, "logps/chosen": -4.542899131774902, "logps/rejected": -4.536048889160156, "loss": 3.9287, "rewards/accuracies": 0.5, "rewards/chosen": -45.428993225097656, "rewards/margins": -0.06850814819335938, "rewards/rejected": -45.3604850769043, "step": 5556 }, { "epoch": 0.7566721132897604, "grad_norm": 38.56814401869383, "learning_rate": 1.3584200082263446e-07, "logits/chosen": 13.727434158325195, "logits/rejected": 14.332192420959473, "logps/chosen": -4.205220699310303, "logps/rejected": -4.4651641845703125, "loss": 4.4045, "rewards/accuracies": 0.75, "rewards/chosen": -42.052207946777344, "rewards/margins": 2.5994348526000977, "rewards/rejected": -44.651641845703125, "step": 5557 }, { "epoch": 0.7568082788671024, "grad_norm": 39.714543691794404, "learning_rate": 1.356992508426877e-07, "logits/chosen": 14.28707504272461, "logits/rejected": 14.036579132080078, "logps/chosen": -4.583162307739258, "logps/rejected": -4.29792594909668, "loss": 3.4109, "rewards/accuracies": 0.25, "rewards/chosen": -45.83161926269531, "rewards/margins": -2.8523597717285156, "rewards/rejected": -42.9792594909668, "step": 5558 }, { "epoch": 0.7569444444444444, "grad_norm": 42.21459826890421, "learning_rate": 1.355565605836801e-07, "logits/chosen": 13.234626770019531, "logits/rejected": 14.560678482055664, "logps/chosen": -4.11259126663208, "logps/rejected": -4.598485946655273, "loss": 3.5359, "rewards/accuracies": 1.0, "rewards/chosen": -41.12590789794922, "rewards/margins": 4.858949661254883, "rewards/rejected": -45.984859466552734, "step": 5559 }, { "epoch": 0.7570806100217865, "grad_norm": 45.699407321244365, "learning_rate": 1.3541393007785345e-07, "logits/chosen": 13.731775283813477, "logits/rejected": 14.25265884399414, "logps/chosen": -4.200397491455078, "logps/rejected": -4.611247539520264, "loss": 3.9823, "rewards/accuracies": 1.0, "rewards/chosen": -42.00397872924805, "rewards/margins": 4.108495712280273, "rewards/rejected": -46.11247253417969, "step": 5560 }, { "epoch": 0.7572167755991286, "grad_norm": 36.00565613804867, "learning_rate": 1.3527135935743634e-07, "logits/chosen": 13.773311614990234, "logits/rejected": 14.257511138916016, "logps/chosen": -4.441835880279541, "logps/rejected": -4.701517581939697, "loss": 3.7253, "rewards/accuracies": 1.0, "rewards/chosen": -44.418357849121094, "rewards/margins": 2.5968170166015625, "rewards/rejected": -47.015174865722656, "step": 5561 }, { "epoch": 0.7573529411764706, "grad_norm": 39.08891560204778, "learning_rate": 1.35128848454644e-07, "logits/chosen": 13.32796859741211, "logits/rejected": 14.329183578491211, "logps/chosen": -4.2174391746521, "logps/rejected": -4.446962356567383, "loss": 3.6976, "rewards/accuracies": 0.75, "rewards/chosen": -42.17439270019531, "rewards/margins": 2.2952356338500977, "rewards/rejected": -44.469627380371094, "step": 5562 }, { "epoch": 0.7574891067538126, "grad_norm": 38.784781290560964, "learning_rate": 1.3498639740167766e-07, "logits/chosen": 13.795032501220703, "logits/rejected": 14.810049057006836, "logps/chosen": -4.528302192687988, "logps/rejected": -5.09271240234375, "loss": 3.4439, "rewards/accuracies": 1.0, "rewards/chosen": -45.283023834228516, "rewards/margins": 5.644103050231934, "rewards/rejected": -50.9271240234375, "step": 5563 }, { "epoch": 0.7576252723311547, "grad_norm": 36.70282307223763, "learning_rate": 1.3484400623072546e-07, "logits/chosen": 14.63863754272461, "logits/rejected": 15.55841064453125, "logps/chosen": -4.698625564575195, "logps/rejected": -4.835026741027832, "loss": 3.5858, "rewards/accuracies": 0.75, "rewards/chosen": -46.98625946044922, "rewards/margins": 1.3640127182006836, "rewards/rejected": -48.35027313232422, "step": 5564 }, { "epoch": 0.7577614379084967, "grad_norm": 36.914188355831186, "learning_rate": 1.3470167497396197e-07, "logits/chosen": 13.992704391479492, "logits/rejected": 13.99746322631836, "logps/chosen": -4.168871879577637, "logps/rejected": -4.431326866149902, "loss": 4.2432, "rewards/accuracies": 1.0, "rewards/chosen": -41.688720703125, "rewards/margins": 2.6245460510253906, "rewards/rejected": -44.313270568847656, "step": 5565 }, { "epoch": 0.7578976034858388, "grad_norm": 38.766107042102114, "learning_rate": 1.345594036635479e-07, "logits/chosen": 13.818853378295898, "logits/rejected": 14.25798225402832, "logps/chosen": -4.477576732635498, "logps/rejected": -4.450368404388428, "loss": 4.1002, "rewards/accuracies": 0.5, "rewards/chosen": -44.77576446533203, "rewards/margins": -0.2720804214477539, "rewards/rejected": -44.503684997558594, "step": 5566 }, { "epoch": 0.7580337690631809, "grad_norm": 40.9689077171127, "learning_rate": 1.3441719233163072e-07, "logits/chosen": 13.999433517456055, "logits/rejected": 13.910160064697266, "logps/chosen": -4.207528114318848, "logps/rejected": -4.2233171463012695, "loss": 3.8363, "rewards/accuracies": 0.75, "rewards/chosen": -42.075279235839844, "rewards/margins": 0.15789127349853516, "rewards/rejected": -42.23316955566406, "step": 5567 }, { "epoch": 0.7581699346405228, "grad_norm": 37.047148594540516, "learning_rate": 1.3427504101034438e-07, "logits/chosen": 14.325066566467285, "logits/rejected": 14.008306503295898, "logps/chosen": -4.465212345123291, "logps/rejected": -4.2053914070129395, "loss": 4.3286, "rewards/accuracies": 0.25, "rewards/chosen": -44.652122497558594, "rewards/margins": -2.59820556640625, "rewards/rejected": -42.053916931152344, "step": 5568 }, { "epoch": 0.7583061002178649, "grad_norm": 39.38665821732543, "learning_rate": 1.3413294973180889e-07, "logits/chosen": 14.356393814086914, "logits/rejected": 14.741952896118164, "logps/chosen": -4.428266525268555, "logps/rejected": -4.85260009765625, "loss": 4.1028, "rewards/accuracies": 1.0, "rewards/chosen": -44.28266525268555, "rewards/margins": 4.2433366775512695, "rewards/rejected": -48.5260009765625, "step": 5569 }, { "epoch": 0.758442265795207, "grad_norm": 37.78431805078418, "learning_rate": 1.3399091852813107e-07, "logits/chosen": 13.586018562316895, "logits/rejected": 13.672805786132812, "logps/chosen": -4.173873424530029, "logps/rejected": -4.285773277282715, "loss": 3.583, "rewards/accuracies": 0.5, "rewards/chosen": -41.73873519897461, "rewards/margins": 1.118997573852539, "rewards/rejected": -42.857730865478516, "step": 5570 }, { "epoch": 0.758578431372549, "grad_norm": 42.96326119623289, "learning_rate": 1.3384894743140422e-07, "logits/chosen": 13.518796920776367, "logits/rejected": 14.372268676757812, "logps/chosen": -4.259490013122559, "logps/rejected": -4.5681376457214355, "loss": 4.1379, "rewards/accuracies": 0.5, "rewards/chosen": -42.59489822387695, "rewards/margins": 3.0864791870117188, "rewards/rejected": -45.68137741088867, "step": 5571 }, { "epoch": 0.7587145969498911, "grad_norm": 39.54513382038862, "learning_rate": 1.3370703647370762e-07, "logits/chosen": 14.28075122833252, "logits/rejected": 14.534160614013672, "logps/chosen": -4.486155986785889, "logps/rejected": -4.6814093589782715, "loss": 3.5805, "rewards/accuracies": 0.5, "rewards/chosen": -44.86156463623047, "rewards/margins": 1.952530860900879, "rewards/rejected": -46.81409454345703, "step": 5572 }, { "epoch": 0.7588507625272332, "grad_norm": 37.23385376215571, "learning_rate": 1.3356518568710725e-07, "logits/chosen": 14.067716598510742, "logits/rejected": 14.17739486694336, "logps/chosen": -4.3924150466918945, "logps/rejected": -4.5223069190979, "loss": 3.8291, "rewards/accuracies": 0.75, "rewards/chosen": -43.92414855957031, "rewards/margins": 1.2989187240600586, "rewards/rejected": -45.22306823730469, "step": 5573 }, { "epoch": 0.7589869281045751, "grad_norm": 41.664978315663355, "learning_rate": 1.3342339510365576e-07, "logits/chosen": 14.04944133758545, "logits/rejected": 14.724096298217773, "logps/chosen": -4.550043106079102, "logps/rejected": -4.756322860717773, "loss": 4.2647, "rewards/accuracies": 0.5, "rewards/chosen": -45.50043487548828, "rewards/margins": 2.0627965927124023, "rewards/rejected": -47.563228607177734, "step": 5574 }, { "epoch": 0.7591230936819172, "grad_norm": 41.01763913875852, "learning_rate": 1.3328166475539151e-07, "logits/chosen": 13.686445236206055, "logits/rejected": 14.468902587890625, "logps/chosen": -4.439970016479492, "logps/rejected": -4.540366172790527, "loss": 3.9094, "rewards/accuracies": 0.5, "rewards/chosen": -44.399696350097656, "rewards/margins": 1.0039634704589844, "rewards/rejected": -45.40365982055664, "step": 5575 }, { "epoch": 0.7592592592592593, "grad_norm": 48.8249071963218, "learning_rate": 1.3313999467433986e-07, "logits/chosen": 13.222982406616211, "logits/rejected": 13.881509780883789, "logps/chosen": -4.337894439697266, "logps/rejected": -4.625082015991211, "loss": 4.3347, "rewards/accuracies": 0.75, "rewards/chosen": -43.378944396972656, "rewards/margins": 2.871870994567871, "rewards/rejected": -46.250816345214844, "step": 5576 }, { "epoch": 0.7593954248366013, "grad_norm": 41.2970628264379, "learning_rate": 1.3299838489251242e-07, "logits/chosen": 13.681732177734375, "logits/rejected": 14.393827438354492, "logps/chosen": -4.646655559539795, "logps/rejected": -4.788730621337891, "loss": 3.221, "rewards/accuracies": 0.75, "rewards/chosen": -46.466556549072266, "rewards/margins": 1.4207496643066406, "rewards/rejected": -47.887306213378906, "step": 5577 }, { "epoch": 0.7595315904139434, "grad_norm": 38.0188872058844, "learning_rate": 1.3285683544190685e-07, "logits/chosen": 14.454358100891113, "logits/rejected": 14.609262466430664, "logps/chosen": -4.221370697021484, "logps/rejected": -4.612435817718506, "loss": 4.0226, "rewards/accuracies": 1.0, "rewards/chosen": -42.213706970214844, "rewards/margins": 3.910651206970215, "rewards/rejected": -46.124359130859375, "step": 5578 }, { "epoch": 0.7596677559912854, "grad_norm": 39.17683934955319, "learning_rate": 1.3271534635450753e-07, "logits/chosen": 14.131114959716797, "logits/rejected": 15.142184257507324, "logps/chosen": -4.268512725830078, "logps/rejected": -4.7970452308654785, "loss": 4.1893, "rewards/accuracies": 0.75, "rewards/chosen": -42.685123443603516, "rewards/margins": 5.285327911376953, "rewards/rejected": -47.97045135498047, "step": 5579 }, { "epoch": 0.7598039215686274, "grad_norm": 47.77880540369449, "learning_rate": 1.3257391766228518e-07, "logits/chosen": 14.554035186767578, "logits/rejected": 14.08841323852539, "logps/chosen": -4.489441871643066, "logps/rejected": -4.5970683097839355, "loss": 4.1663, "rewards/accuracies": 0.75, "rewards/chosen": -44.89441680908203, "rewards/margins": 1.0762672424316406, "rewards/rejected": -45.97068405151367, "step": 5580 }, { "epoch": 0.7599400871459695, "grad_norm": 34.52904498640069, "learning_rate": 1.3243254939719663e-07, "logits/chosen": 14.18400764465332, "logits/rejected": 15.431964874267578, "logps/chosen": -4.669486045837402, "logps/rejected": -4.976099491119385, "loss": 3.5963, "rewards/accuracies": 1.0, "rewards/chosen": -46.694862365722656, "rewards/margins": 3.066131591796875, "rewards/rejected": -49.76099395751953, "step": 5581 }, { "epoch": 0.7600762527233116, "grad_norm": 36.25160652473205, "learning_rate": 1.3229124159118525e-07, "logits/chosen": 13.398944854736328, "logits/rejected": 13.805702209472656, "logps/chosen": -4.068849563598633, "logps/rejected": -4.314018726348877, "loss": 3.6814, "rewards/accuracies": 0.5, "rewards/chosen": -40.68849182128906, "rewards/margins": 2.4516944885253906, "rewards/rejected": -43.14018630981445, "step": 5582 }, { "epoch": 0.7602124183006536, "grad_norm": 38.32721062312382, "learning_rate": 1.3214999427618087e-07, "logits/chosen": 14.236120223999023, "logits/rejected": 14.506889343261719, "logps/chosen": -4.427547931671143, "logps/rejected": -4.61588191986084, "loss": 3.9068, "rewards/accuracies": 0.75, "rewards/chosen": -44.275482177734375, "rewards/margins": 1.8833351135253906, "rewards/rejected": -46.1588134765625, "step": 5583 }, { "epoch": 0.7603485838779956, "grad_norm": 38.946738275712086, "learning_rate": 1.3200880748409925e-07, "logits/chosen": 14.893272399902344, "logits/rejected": 15.08493709564209, "logps/chosen": -4.524251937866211, "logps/rejected": -5.0949506759643555, "loss": 3.7848, "rewards/accuracies": 1.0, "rewards/chosen": -45.24251937866211, "rewards/margins": 5.706984519958496, "rewards/rejected": -50.94950485229492, "step": 5584 }, { "epoch": 0.7604847494553377, "grad_norm": 37.787373606787504, "learning_rate": 1.318676812468428e-07, "logits/chosen": 13.958062171936035, "logits/rejected": 13.699176788330078, "logps/chosen": -4.446962833404541, "logps/rejected": -4.513396739959717, "loss": 4.0654, "rewards/accuracies": 0.75, "rewards/chosen": -44.469627380371094, "rewards/margins": 0.6643390655517578, "rewards/rejected": -45.133968353271484, "step": 5585 }, { "epoch": 0.7606209150326797, "grad_norm": 40.14111685419316, "learning_rate": 1.317266155963003e-07, "logits/chosen": 14.112873077392578, "logits/rejected": 14.293167114257812, "logps/chosen": -4.431156158447266, "logps/rejected": -4.575014114379883, "loss": 3.9829, "rewards/accuracies": 0.75, "rewards/chosen": -44.311561584472656, "rewards/margins": 1.4385766983032227, "rewards/rejected": -45.75013732910156, "step": 5586 }, { "epoch": 0.7607570806100218, "grad_norm": 35.604051333417665, "learning_rate": 1.3158561056434646e-07, "logits/chosen": 13.553786277770996, "logits/rejected": 14.219736099243164, "logps/chosen": -4.5769362449646, "logps/rejected": -4.570927619934082, "loss": 3.7917, "rewards/accuracies": 0.5, "rewards/chosen": -45.76936340332031, "rewards/margins": -0.060088157653808594, "rewards/rejected": -45.70927429199219, "step": 5587 }, { "epoch": 0.7608932461873639, "grad_norm": 36.97508423365026, "learning_rate": 1.3144466618284265e-07, "logits/chosen": 14.127452850341797, "logits/rejected": 14.856586456298828, "logps/chosen": -4.417459964752197, "logps/rejected": -4.690392971038818, "loss": 3.6036, "rewards/accuracies": 0.75, "rewards/chosen": -44.174598693847656, "rewards/margins": 2.7293291091918945, "rewards/rejected": -46.9039306640625, "step": 5588 }, { "epoch": 0.7610294117647058, "grad_norm": 40.14774884128508, "learning_rate": 1.313037824836364e-07, "logits/chosen": 13.352567672729492, "logits/rejected": 13.909446716308594, "logps/chosen": -4.407590866088867, "logps/rejected": -4.69856595993042, "loss": 3.8107, "rewards/accuracies": 0.75, "rewards/chosen": -44.075904846191406, "rewards/margins": 2.909754753112793, "rewards/rejected": -46.985660552978516, "step": 5589 }, { "epoch": 0.7611655773420479, "grad_norm": 40.31048425507567, "learning_rate": 1.3116295949856172e-07, "logits/chosen": 14.642584800720215, "logits/rejected": 14.127893447875977, "logps/chosen": -4.402432441711426, "logps/rejected": -4.46055269241333, "loss": 4.0459, "rewards/accuracies": 0.5, "rewards/chosen": -44.024330139160156, "rewards/margins": 0.5812005996704102, "rewards/rejected": -44.60552978515625, "step": 5590 }, { "epoch": 0.76130174291939, "grad_norm": 40.14220253103249, "learning_rate": 1.310221972594384e-07, "logits/chosen": 14.52902603149414, "logits/rejected": 14.098041534423828, "logps/chosen": -4.286136150360107, "logps/rejected": -4.194372653961182, "loss": 4.3246, "rewards/accuracies": 0.5, "rewards/chosen": -42.861358642578125, "rewards/margins": -0.9176340103149414, "rewards/rejected": -41.9437255859375, "step": 5591 }, { "epoch": 0.761437908496732, "grad_norm": 37.859767868633675, "learning_rate": 1.3088149579807303e-07, "logits/chosen": 13.989137649536133, "logits/rejected": 13.509391784667969, "logps/chosen": -4.252656936645508, "logps/rejected": -4.264188766479492, "loss": 3.7371, "rewards/accuracies": 0.5, "rewards/chosen": -42.52656555175781, "rewards/margins": 0.11532115936279297, "rewards/rejected": -42.64188766479492, "step": 5592 }, { "epoch": 0.7615740740740741, "grad_norm": 37.28031626837362, "learning_rate": 1.3074085514625837e-07, "logits/chosen": 14.008108139038086, "logits/rejected": 14.333932876586914, "logps/chosen": -4.178294658660889, "logps/rejected": -4.364407539367676, "loss": 3.9406, "rewards/accuracies": 0.5, "rewards/chosen": -41.7829475402832, "rewards/margins": 1.8611268997192383, "rewards/rejected": -43.644073486328125, "step": 5593 }, { "epoch": 0.7617102396514162, "grad_norm": 38.44583404491641, "learning_rate": 1.3060027533577308e-07, "logits/chosen": 14.517988204956055, "logits/rejected": 14.205390930175781, "logps/chosen": -4.810807228088379, "logps/rejected": -4.9240007400512695, "loss": 3.6906, "rewards/accuracies": 0.75, "rewards/chosen": -48.108070373535156, "rewards/margins": 1.1319398880004883, "rewards/rejected": -49.24000930786133, "step": 5594 }, { "epoch": 0.7618464052287581, "grad_norm": 39.20064439559325, "learning_rate": 1.3045975639838242e-07, "logits/chosen": 14.651629447937012, "logits/rejected": 14.454765319824219, "logps/chosen": -4.48134183883667, "logps/rejected": -4.691323280334473, "loss": 4.433, "rewards/accuracies": 0.75, "rewards/chosen": -44.813419342041016, "rewards/margins": 2.0998144149780273, "rewards/rejected": -46.91323471069336, "step": 5595 }, { "epoch": 0.7619825708061002, "grad_norm": 33.91858046844986, "learning_rate": 1.3031929836583788e-07, "logits/chosen": 14.880180358886719, "logits/rejected": 14.384057998657227, "logps/chosen": -4.43082332611084, "logps/rejected": -4.661168098449707, "loss": 3.6107, "rewards/accuracies": 0.75, "rewards/chosen": -44.30823516845703, "rewards/margins": 2.3034467697143555, "rewards/rejected": -46.61167907714844, "step": 5596 }, { "epoch": 0.7621187363834423, "grad_norm": 49.633137575501955, "learning_rate": 1.301789012698772e-07, "logits/chosen": 13.266348838806152, "logits/rejected": 14.19858455657959, "logps/chosen": -4.221254348754883, "logps/rejected": -4.713201522827148, "loss": 3.7352, "rewards/accuracies": 1.0, "rewards/chosen": -42.21253967285156, "rewards/margins": 4.9194746017456055, "rewards/rejected": -47.13201904296875, "step": 5597 }, { "epoch": 0.7622549019607843, "grad_norm": 36.28392221996176, "learning_rate": 1.3003856514222403e-07, "logits/chosen": 13.976022720336914, "logits/rejected": 14.711565971374512, "logps/chosen": -4.520552158355713, "logps/rejected": -4.809803009033203, "loss": 3.4956, "rewards/accuracies": 0.75, "rewards/chosen": -45.20552444458008, "rewards/margins": 2.8925046920776367, "rewards/rejected": -48.09803009033203, "step": 5598 }, { "epoch": 0.7623910675381264, "grad_norm": 38.8022801040372, "learning_rate": 1.298982900145886e-07, "logits/chosen": 14.565013885498047, "logits/rejected": 14.082475662231445, "logps/chosen": -4.707691192626953, "logps/rejected": -4.495502471923828, "loss": 3.8662, "rewards/accuracies": 0.25, "rewards/chosen": -47.07691192626953, "rewards/margins": -2.1218862533569336, "rewards/rejected": -44.95502471923828, "step": 5599 }, { "epoch": 0.7625272331154684, "grad_norm": 39.865565841883175, "learning_rate": 1.297580759186673e-07, "logits/chosen": 13.496282577514648, "logits/rejected": 14.07275676727295, "logps/chosen": -4.24442720413208, "logps/rejected": -4.564708232879639, "loss": 4.4556, "rewards/accuracies": 0.75, "rewards/chosen": -42.444271087646484, "rewards/margins": 3.2028141021728516, "rewards/rejected": -45.64708709716797, "step": 5600 }, { "epoch": 0.7626633986928104, "grad_norm": 36.99756981801035, "learning_rate": 1.296179228861425e-07, "logits/chosen": 14.519363403320312, "logits/rejected": 14.043079376220703, "logps/chosen": -4.426578521728516, "logps/rejected": -4.460514545440674, "loss": 3.8654, "rewards/accuracies": 0.25, "rewards/chosen": -44.265785217285156, "rewards/margins": 0.33936214447021484, "rewards/rejected": -44.60514831542969, "step": 5601 }, { "epoch": 0.7627995642701525, "grad_norm": 39.59268756170076, "learning_rate": 1.29477830948683e-07, "logits/chosen": 14.256927490234375, "logits/rejected": 14.602092742919922, "logps/chosen": -4.692584037780762, "logps/rejected": -5.041968822479248, "loss": 3.5173, "rewards/accuracies": 0.75, "rewards/chosen": -46.92584228515625, "rewards/margins": 3.4938440322875977, "rewards/rejected": -50.41968536376953, "step": 5602 }, { "epoch": 0.7629357298474946, "grad_norm": 43.994695970955924, "learning_rate": 1.2933780013794374e-07, "logits/chosen": 13.675063133239746, "logits/rejected": 14.222838401794434, "logps/chosen": -4.086150169372559, "logps/rejected": -4.402163505554199, "loss": 4.1276, "rewards/accuracies": 1.0, "rewards/chosen": -40.86150360107422, "rewards/margins": 3.1601266860961914, "rewards/rejected": -44.021629333496094, "step": 5603 }, { "epoch": 0.7630718954248366, "grad_norm": 40.67867397351134, "learning_rate": 1.2919783048556604e-07, "logits/chosen": 14.558488845825195, "logits/rejected": 14.75025749206543, "logps/chosen": -4.874192237854004, "logps/rejected": -4.7798004150390625, "loss": 3.613, "rewards/accuracies": 0.5, "rewards/chosen": -48.741920471191406, "rewards/margins": -0.9439191818237305, "rewards/rejected": -47.79800033569336, "step": 5604 }, { "epoch": 0.7632080610021786, "grad_norm": 41.36412143507726, "learning_rate": 1.2905792202317686e-07, "logits/chosen": 14.30364990234375, "logits/rejected": 13.889623641967773, "logps/chosen": -4.4102373123168945, "logps/rejected": -4.493747711181641, "loss": 4.1099, "rewards/accuracies": 0.5, "rewards/chosen": -44.10237503051758, "rewards/margins": 0.8350982666015625, "rewards/rejected": -44.93747329711914, "step": 5605 }, { "epoch": 0.7633442265795207, "grad_norm": 38.224580026732696, "learning_rate": 1.2891807478238982e-07, "logits/chosen": 14.479266166687012, "logits/rejected": 14.734210968017578, "logps/chosen": -4.234340667724609, "logps/rejected": -4.705758571624756, "loss": 3.7928, "rewards/accuracies": 1.0, "rewards/chosen": -42.343406677246094, "rewards/margins": 4.714179992675781, "rewards/rejected": -47.05759048461914, "step": 5606 }, { "epoch": 0.7634803921568627, "grad_norm": 40.026270968696416, "learning_rate": 1.287782887948047e-07, "logits/chosen": 13.899066925048828, "logits/rejected": 13.670585632324219, "logps/chosen": -4.265866279602051, "logps/rejected": -4.4815144538879395, "loss": 4.3284, "rewards/accuracies": 0.75, "rewards/chosen": -42.658668518066406, "rewards/margins": 2.1564807891845703, "rewards/rejected": -44.815147399902344, "step": 5607 }, { "epoch": 0.7636165577342048, "grad_norm": 39.440848668616304, "learning_rate": 1.2863856409200707e-07, "logits/chosen": 14.079916000366211, "logits/rejected": 14.331298828125, "logps/chosen": -4.539489269256592, "logps/rejected": -4.591105937957764, "loss": 3.8445, "rewards/accuracies": 0.75, "rewards/chosen": -45.394893646240234, "rewards/margins": 0.5161666870117188, "rewards/rejected": -45.91106033325195, "step": 5608 }, { "epoch": 0.7637527233115469, "grad_norm": 39.63566390967711, "learning_rate": 1.2849890070556897e-07, "logits/chosen": 14.632741928100586, "logits/rejected": 15.001008033752441, "logps/chosen": -4.745911121368408, "logps/rejected": -4.903106212615967, "loss": 4.0598, "rewards/accuracies": 0.75, "rewards/chosen": -47.45911407470703, "rewards/margins": 1.5719480514526367, "rewards/rejected": -49.03105926513672, "step": 5609 }, { "epoch": 0.7638888888888888, "grad_norm": 41.02489395631618, "learning_rate": 1.2835929866704862e-07, "logits/chosen": 13.549549102783203, "logits/rejected": 14.494224548339844, "logps/chosen": -4.404088020324707, "logps/rejected": -4.740214824676514, "loss": 4.1418, "rewards/accuracies": 0.75, "rewards/chosen": -44.0408821105957, "rewards/margins": 3.3612661361694336, "rewards/rejected": -47.40214920043945, "step": 5610 }, { "epoch": 0.7640250544662309, "grad_norm": 38.35510279981077, "learning_rate": 1.2821975800799e-07, "logits/chosen": 14.186199188232422, "logits/rejected": 14.062854766845703, "logps/chosen": -4.173039436340332, "logps/rejected": -4.107839107513428, "loss": 4.1854, "rewards/accuracies": 0.5, "rewards/chosen": -41.73039627075195, "rewards/margins": -0.6520071029663086, "rewards/rejected": -41.078392028808594, "step": 5611 }, { "epoch": 0.764161220043573, "grad_norm": 41.32824471386722, "learning_rate": 1.2808027875992365e-07, "logits/chosen": 14.110937118530273, "logits/rejected": 14.227188110351562, "logps/chosen": -4.682538986206055, "logps/rejected": -4.5331525802612305, "loss": 4.158, "rewards/accuracies": 0.25, "rewards/chosen": -46.82539367675781, "rewards/margins": -1.493865966796875, "rewards/rejected": -45.33152770996094, "step": 5612 }, { "epoch": 0.764297385620915, "grad_norm": 43.497489229405566, "learning_rate": 1.279408609543661e-07, "logits/chosen": 13.914285659790039, "logits/rejected": 14.447004318237305, "logps/chosen": -4.3579912185668945, "logps/rejected": -4.637155532836914, "loss": 3.8617, "rewards/accuracies": 0.75, "rewards/chosen": -43.57991027832031, "rewards/margins": 2.7916479110717773, "rewards/rejected": -46.371559143066406, "step": 5613 }, { "epoch": 0.7644335511982571, "grad_norm": 38.70331565652983, "learning_rate": 1.278015046228198e-07, "logits/chosen": 14.370426177978516, "logits/rejected": 14.63452434539795, "logps/chosen": -4.815320014953613, "logps/rejected": -5.103156089782715, "loss": 3.9519, "rewards/accuracies": 0.75, "rewards/chosen": -48.153202056884766, "rewards/margins": 2.878361701965332, "rewards/rejected": -51.03156280517578, "step": 5614 }, { "epoch": 0.7645697167755992, "grad_norm": 46.59857631208105, "learning_rate": 1.2766220979677354e-07, "logits/chosen": 13.537556648254395, "logits/rejected": 13.460002899169922, "logps/chosen": -4.095533847808838, "logps/rejected": -4.138472080230713, "loss": 3.7206, "rewards/accuracies": 0.5, "rewards/chosen": -40.95534133911133, "rewards/margins": 0.4293813705444336, "rewards/rejected": -41.38471984863281, "step": 5615 }, { "epoch": 0.7647058823529411, "grad_norm": 37.883999557357775, "learning_rate": 1.2752297650770225e-07, "logits/chosen": 14.082853317260742, "logits/rejected": 14.146614074707031, "logps/chosen": -4.555307865142822, "logps/rejected": -4.564228057861328, "loss": 3.5926, "rewards/accuracies": 0.25, "rewards/chosen": -45.553077697753906, "rewards/margins": 0.0892038345336914, "rewards/rejected": -45.64228057861328, "step": 5616 }, { "epoch": 0.7648420479302832, "grad_norm": 38.41835282556846, "learning_rate": 1.2738380478706662e-07, "logits/chosen": 13.773786544799805, "logits/rejected": 14.710403442382812, "logps/chosen": -4.115747928619385, "logps/rejected": -4.574131488800049, "loss": 3.658, "rewards/accuracies": 1.0, "rewards/chosen": -41.1574821472168, "rewards/margins": 4.583834648132324, "rewards/rejected": -45.74131774902344, "step": 5617 }, { "epoch": 0.7649782135076253, "grad_norm": 37.9272984368531, "learning_rate": 1.2724469466631376e-07, "logits/chosen": 15.043062210083008, "logits/rejected": 14.684608459472656, "logps/chosen": -4.744317531585693, "logps/rejected": -4.831933975219727, "loss": 4.0954, "rewards/accuracies": 0.5, "rewards/chosen": -47.44317626953125, "rewards/margins": 0.8761634826660156, "rewards/rejected": -48.319339752197266, "step": 5618 }, { "epoch": 0.7651143790849673, "grad_norm": 38.95992695463609, "learning_rate": 1.271056461768769e-07, "logits/chosen": 13.325735092163086, "logits/rejected": 14.402170181274414, "logps/chosen": -4.261857509613037, "logps/rejected": -4.665276050567627, "loss": 4.0969, "rewards/accuracies": 1.0, "rewards/chosen": -42.61857604980469, "rewards/margins": 4.034183502197266, "rewards/rejected": -46.65276336669922, "step": 5619 }, { "epoch": 0.7652505446623094, "grad_norm": 40.195676824308826, "learning_rate": 1.2696665935017494e-07, "logits/chosen": 14.832653045654297, "logits/rejected": 15.014530181884766, "logps/chosen": -4.384906768798828, "logps/rejected": -4.675673484802246, "loss": 4.1679, "rewards/accuracies": 1.0, "rewards/chosen": -43.84906768798828, "rewards/margins": 2.9076690673828125, "rewards/rejected": -46.756736755371094, "step": 5620 }, { "epoch": 0.7653867102396514, "grad_norm": 38.81845936237223, "learning_rate": 1.2682773421761316e-07, "logits/chosen": 13.219707489013672, "logits/rejected": 13.376240730285645, "logps/chosen": -4.021632194519043, "logps/rejected": -4.201604843139648, "loss": 4.1248, "rewards/accuracies": 0.75, "rewards/chosen": -40.2163200378418, "rewards/margins": 1.7997245788574219, "rewards/rejected": -42.01604461669922, "step": 5621 }, { "epoch": 0.7655228758169934, "grad_norm": 39.763218139473764, "learning_rate": 1.2668887081058312e-07, "logits/chosen": 13.757129669189453, "logits/rejected": 14.04682731628418, "logps/chosen": -4.2930755615234375, "logps/rejected": -4.450170516967773, "loss": 4.0216, "rewards/accuracies": 0.75, "rewards/chosen": -42.930755615234375, "rewards/margins": 1.570948600769043, "rewards/rejected": -44.501708984375, "step": 5622 }, { "epoch": 0.7656590413943355, "grad_norm": 38.40850354229552, "learning_rate": 1.2655006916046173e-07, "logits/chosen": 13.435977935791016, "logits/rejected": 14.281557083129883, "logps/chosen": -4.268099784851074, "logps/rejected": -4.7151336669921875, "loss": 4.0994, "rewards/accuracies": 1.0, "rewards/chosen": -42.680999755859375, "rewards/margins": 4.470334053039551, "rewards/rejected": -47.151336669921875, "step": 5623 }, { "epoch": 0.7657952069716776, "grad_norm": 37.94196044729813, "learning_rate": 1.264113292986126e-07, "logits/chosen": 13.98609733581543, "logits/rejected": 14.282112121582031, "logps/chosen": -4.517553329467773, "logps/rejected": -4.738566875457764, "loss": 3.692, "rewards/accuracies": 0.75, "rewards/chosen": -45.175537109375, "rewards/margins": 2.210134506225586, "rewards/rejected": -47.38566970825195, "step": 5624 }, { "epoch": 0.7659313725490197, "grad_norm": 44.22491028461276, "learning_rate": 1.2627265125638524e-07, "logits/chosen": 14.57441520690918, "logits/rejected": 14.452371597290039, "logps/chosen": -4.622122287750244, "logps/rejected": -4.73846960067749, "loss": 3.7116, "rewards/accuracies": 0.75, "rewards/chosen": -46.221221923828125, "rewards/margins": 1.163473129272461, "rewards/rejected": -47.38469696044922, "step": 5625 }, { "epoch": 0.7660675381263616, "grad_norm": 38.84650262093643, "learning_rate": 1.2613403506511482e-07, "logits/chosen": 13.238494873046875, "logits/rejected": 13.721019744873047, "logps/chosen": -4.2251129150390625, "logps/rejected": -4.642542839050293, "loss": 3.5708, "rewards/accuracies": 0.75, "rewards/chosen": -42.251129150390625, "rewards/margins": 4.174295425415039, "rewards/rejected": -46.4254264831543, "step": 5626 }, { "epoch": 0.7662037037037037, "grad_norm": 41.55180447184879, "learning_rate": 1.2599548075612302e-07, "logits/chosen": 13.790950775146484, "logits/rejected": 13.819287300109863, "logps/chosen": -4.3951897621154785, "logps/rejected": -4.348651885986328, "loss": 4.2475, "rewards/accuracies": 0.5, "rewards/chosen": -43.95189666748047, "rewards/margins": -0.4653816223144531, "rewards/rejected": -43.48651885986328, "step": 5627 }, { "epoch": 0.7663398692810458, "grad_norm": 38.49233608944197, "learning_rate": 1.2585698836071742e-07, "logits/chosen": 13.81061840057373, "logits/rejected": 14.073429107666016, "logps/chosen": -4.429342269897461, "logps/rejected": -4.704920768737793, "loss": 3.3955, "rewards/accuracies": 0.75, "rewards/chosen": -44.29342269897461, "rewards/margins": 2.755784034729004, "rewards/rejected": -47.04920959472656, "step": 5628 }, { "epoch": 0.7664760348583878, "grad_norm": 38.403456952021756, "learning_rate": 1.2571855791019124e-07, "logits/chosen": 14.967643737792969, "logits/rejected": 14.040390014648438, "logps/chosen": -4.405791282653809, "logps/rejected": -4.449872016906738, "loss": 3.7501, "rewards/accuracies": 0.5, "rewards/chosen": -44.05791473388672, "rewards/margins": 0.4408073425292969, "rewards/rejected": -44.49871826171875, "step": 5629 }, { "epoch": 0.7666122004357299, "grad_norm": 38.857289476092305, "learning_rate": 1.2558018943582417e-07, "logits/chosen": 13.657642364501953, "logits/rejected": 14.474964141845703, "logps/chosen": -4.236854553222656, "logps/rejected": -4.5799455642700195, "loss": 3.7773, "rewards/accuracies": 0.75, "rewards/chosen": -42.36854553222656, "rewards/margins": 3.4309139251708984, "rewards/rejected": -45.79945373535156, "step": 5630 }, { "epoch": 0.766748366013072, "grad_norm": 41.72027769983034, "learning_rate": 1.2544188296888175e-07, "logits/chosen": 14.470495223999023, "logits/rejected": 15.308862686157227, "logps/chosen": -4.622588157653809, "logps/rejected": -4.897582054138184, "loss": 3.5899, "rewards/accuracies": 1.0, "rewards/chosen": -46.22587966918945, "rewards/margins": 2.7499446868896484, "rewards/rejected": -48.975826263427734, "step": 5631 }, { "epoch": 0.7668845315904139, "grad_norm": 34.12137106275905, "learning_rate": 1.253036385406153e-07, "logits/chosen": 13.666593551635742, "logits/rejected": 14.961936950683594, "logps/chosen": -4.392199516296387, "logps/rejected": -4.9365081787109375, "loss": 3.8825, "rewards/accuracies": 0.75, "rewards/chosen": -43.9219970703125, "rewards/margins": 5.443083763122559, "rewards/rejected": -49.365081787109375, "step": 5632 }, { "epoch": 0.767020697167756, "grad_norm": 41.354473172997736, "learning_rate": 1.2516545618226236e-07, "logits/chosen": 14.163612365722656, "logits/rejected": 14.064456939697266, "logps/chosen": -4.765947341918945, "logps/rejected": -4.744585037231445, "loss": 4.1975, "rewards/accuracies": 0.5, "rewards/chosen": -47.65946960449219, "rewards/margins": -0.2136220932006836, "rewards/rejected": -47.44584655761719, "step": 5633 }, { "epoch": 0.7671568627450981, "grad_norm": 40.807838972011155, "learning_rate": 1.2502733592504658e-07, "logits/chosen": 15.105443954467773, "logits/rejected": 14.709806442260742, "logps/chosen": -4.754083633422852, "logps/rejected": -4.708385467529297, "loss": 4.4384, "rewards/accuracies": 0.25, "rewards/chosen": -47.54083251953125, "rewards/margins": -0.45697689056396484, "rewards/rejected": -47.08385467529297, "step": 5634 }, { "epoch": 0.7672930283224401, "grad_norm": 42.89496008822212, "learning_rate": 1.2488927780017702e-07, "logits/chosen": 14.982809066772461, "logits/rejected": 14.207693099975586, "logps/chosen": -4.791214466094971, "logps/rejected": -4.5389204025268555, "loss": 4.3428, "rewards/accuracies": 0.0, "rewards/chosen": -47.91214370727539, "rewards/margins": -2.5229368209838867, "rewards/rejected": -45.38920593261719, "step": 5635 }, { "epoch": 0.7674291938997821, "grad_norm": 39.80792056868469, "learning_rate": 1.2475128183884925e-07, "logits/chosen": 14.246057510375977, "logits/rejected": 14.364889144897461, "logps/chosen": -4.375437259674072, "logps/rejected": -4.8266801834106445, "loss": 3.6942, "rewards/accuracies": 0.75, "rewards/chosen": -43.754371643066406, "rewards/margins": 4.5124311447143555, "rewards/rejected": -48.26680374145508, "step": 5636 }, { "epoch": 0.7675653594771242, "grad_norm": 34.310169151243336, "learning_rate": 1.2461334807224466e-07, "logits/chosen": 14.124332427978516, "logits/rejected": 14.43673324584961, "logps/chosen": -4.498546123504639, "logps/rejected": -4.64738655090332, "loss": 3.3937, "rewards/accuracies": 0.5, "rewards/chosen": -44.98545837402344, "rewards/margins": 1.4884033203125, "rewards/rejected": -46.47386169433594, "step": 5637 }, { "epoch": 0.7677015250544662, "grad_norm": 38.70735112954646, "learning_rate": 1.2447547653153034e-07, "logits/chosen": 14.000341415405273, "logits/rejected": 14.51120662689209, "logps/chosen": -4.428569793701172, "logps/rejected": -4.651586055755615, "loss": 3.5086, "rewards/accuracies": 0.75, "rewards/chosen": -44.28569793701172, "rewards/margins": 2.2301645278930664, "rewards/rejected": -46.51586151123047, "step": 5638 }, { "epoch": 0.7678376906318083, "grad_norm": 40.22561282826405, "learning_rate": 1.2433766724785962e-07, "logits/chosen": 13.915426254272461, "logits/rejected": 13.784917831420898, "logps/chosen": -4.207361221313477, "logps/rejected": -4.581878185272217, "loss": 4.4157, "rewards/accuracies": 1.0, "rewards/chosen": -42.07361602783203, "rewards/margins": 3.745168685913086, "rewards/rejected": -45.81877899169922, "step": 5639 }, { "epoch": 0.7679738562091504, "grad_norm": 41.05700738643152, "learning_rate": 1.2419992025237176e-07, "logits/chosen": 14.046747207641602, "logits/rejected": 13.086820602416992, "logps/chosen": -4.166533470153809, "logps/rejected": -4.148186683654785, "loss": 4.2551, "rewards/accuracies": 0.5, "rewards/chosen": -41.66533660888672, "rewards/margins": -0.1834697723388672, "rewards/rejected": -41.48186492919922, "step": 5640 }, { "epoch": 0.7681100217864923, "grad_norm": 44.660388645087785, "learning_rate": 1.240622355761916e-07, "logits/chosen": 13.438922882080078, "logits/rejected": 14.012855529785156, "logps/chosen": -4.341772079467773, "logps/rejected": -4.46322774887085, "loss": 3.9711, "rewards/accuracies": 0.75, "rewards/chosen": -43.417720794677734, "rewards/margins": 1.2145566940307617, "rewards/rejected": -44.63227844238281, "step": 5641 }, { "epoch": 0.7682461873638344, "grad_norm": 44.50595332210119, "learning_rate": 1.2392461325043018e-07, "logits/chosen": 13.719907760620117, "logits/rejected": 13.744251251220703, "logps/chosen": -4.387425422668457, "logps/rejected": -4.249169826507568, "loss": 4.314, "rewards/accuracies": 0.5, "rewards/chosen": -43.8742561340332, "rewards/margins": -1.3825578689575195, "rewards/rejected": -42.49169921875, "step": 5642 }, { "epoch": 0.7683823529411765, "grad_norm": 40.976890076921364, "learning_rate": 1.2378705330618463e-07, "logits/chosen": 14.807106971740723, "logits/rejected": 14.582481384277344, "logps/chosen": -4.333232402801514, "logps/rejected": -4.43709659576416, "loss": 4.5813, "rewards/accuracies": 0.75, "rewards/chosen": -43.33232498168945, "rewards/margins": 1.038640022277832, "rewards/rejected": -44.37096405029297, "step": 5643 }, { "epoch": 0.7685185185185185, "grad_norm": 38.82207793677755, "learning_rate": 1.2364955577453743e-07, "logits/chosen": 13.997828483581543, "logits/rejected": 14.016386032104492, "logps/chosen": -4.175657272338867, "logps/rejected": -4.305792808532715, "loss": 4.0049, "rewards/accuracies": 0.75, "rewards/chosen": -41.75657653808594, "rewards/margins": 1.301356315612793, "rewards/rejected": -43.05792999267578, "step": 5644 }, { "epoch": 0.7686546840958606, "grad_norm": 39.80064362158424, "learning_rate": 1.2351212068655749e-07, "logits/chosen": 14.74579906463623, "logits/rejected": 14.638507843017578, "logps/chosen": -4.3957600593566895, "logps/rejected": -4.587817192077637, "loss": 3.9915, "rewards/accuracies": 0.75, "rewards/chosen": -43.957603454589844, "rewards/margins": 1.9205732345581055, "rewards/rejected": -45.878173828125, "step": 5645 }, { "epoch": 0.7687908496732027, "grad_norm": 39.42173038677042, "learning_rate": 1.2337474807329944e-07, "logits/chosen": 14.07056999206543, "logits/rejected": 14.164383888244629, "logps/chosen": -4.361886978149414, "logps/rejected": -4.518380165100098, "loss": 3.8326, "rewards/accuracies": 0.5, "rewards/chosen": -43.61886978149414, "rewards/margins": 1.5649299621582031, "rewards/rejected": -45.183799743652344, "step": 5646 }, { "epoch": 0.7689270152505446, "grad_norm": 38.50441408678834, "learning_rate": 1.2323743796580351e-07, "logits/chosen": 14.516233444213867, "logits/rejected": 14.579586029052734, "logps/chosen": -4.444485664367676, "logps/rejected": -4.69749116897583, "loss": 3.8782, "rewards/accuracies": 0.75, "rewards/chosen": -44.44485855102539, "rewards/margins": 2.5300559997558594, "rewards/rejected": -46.97491455078125, "step": 5647 }, { "epoch": 0.7690631808278867, "grad_norm": 38.7508034570656, "learning_rate": 1.2310019039509628e-07, "logits/chosen": 14.14126968383789, "logits/rejected": 14.242799758911133, "logps/chosen": -4.223897933959961, "logps/rejected": -4.390593528747559, "loss": 3.525, "rewards/accuracies": 0.75, "rewards/chosen": -42.238983154296875, "rewards/margins": 1.6669492721557617, "rewards/rejected": -43.90592956542969, "step": 5648 }, { "epoch": 0.7691993464052288, "grad_norm": 38.21450741817792, "learning_rate": 1.2296300539219e-07, "logits/chosen": 13.979811668395996, "logits/rejected": 14.378336906433105, "logps/chosen": -4.345090866088867, "logps/rejected": -4.658984661102295, "loss": 4.1845, "rewards/accuracies": 0.5, "rewards/chosen": -43.450904846191406, "rewards/margins": 3.13893985748291, "rewards/rejected": -46.589847564697266, "step": 5649 }, { "epoch": 0.7693355119825708, "grad_norm": 39.95685336805532, "learning_rate": 1.2282588298808255e-07, "logits/chosen": 14.27170181274414, "logits/rejected": 14.614397048950195, "logps/chosen": -4.208680152893066, "logps/rejected": -4.505899429321289, "loss": 4.0797, "rewards/accuracies": 0.75, "rewards/chosen": -42.0868034362793, "rewards/margins": 2.9721927642822266, "rewards/rejected": -45.058998107910156, "step": 5650 }, { "epoch": 0.7694716775599129, "grad_norm": 45.58876328569294, "learning_rate": 1.2268882321375796e-07, "logits/chosen": 14.219783782958984, "logits/rejected": 14.359502792358398, "logps/chosen": -4.102450370788574, "logps/rejected": -4.436038017272949, "loss": 3.9542, "rewards/accuracies": 1.0, "rewards/chosen": -41.024505615234375, "rewards/margins": 3.3358755111694336, "rewards/rejected": -44.360382080078125, "step": 5651 }, { "epoch": 0.7696078431372549, "grad_norm": 42.32975492505328, "learning_rate": 1.2255182610018619e-07, "logits/chosen": 13.582220077514648, "logits/rejected": 12.783576965332031, "logps/chosen": -4.065442085266113, "logps/rejected": -4.051008224487305, "loss": 3.813, "rewards/accuracies": 0.5, "rewards/chosen": -40.6544189453125, "rewards/margins": -0.14433670043945312, "rewards/rejected": -40.51008605957031, "step": 5652 }, { "epoch": 0.7697440087145969, "grad_norm": 41.372508580090866, "learning_rate": 1.2241489167832257e-07, "logits/chosen": 14.440937995910645, "logits/rejected": 14.884101867675781, "logps/chosen": -4.508207321166992, "logps/rejected": -4.732891082763672, "loss": 3.9148, "rewards/accuracies": 0.75, "rewards/chosen": -45.08207702636719, "rewards/margins": 2.246832847595215, "rewards/rejected": -47.32891082763672, "step": 5653 }, { "epoch": 0.769880174291939, "grad_norm": 39.15302977386807, "learning_rate": 1.2227801997910872e-07, "logits/chosen": 14.392437934875488, "logits/rejected": 15.366939544677734, "logps/chosen": -4.813186168670654, "logps/rejected": -4.942863464355469, "loss": 3.8029, "rewards/accuracies": 0.5, "rewards/chosen": -48.13186264038086, "rewards/margins": 1.2967729568481445, "rewards/rejected": -49.42863464355469, "step": 5654 }, { "epoch": 0.7700163398692811, "grad_norm": 38.19047971286501, "learning_rate": 1.2214121103347213e-07, "logits/chosen": 13.949682235717773, "logits/rejected": 13.736394882202148, "logps/chosen": -4.323986053466797, "logps/rejected": -4.297916412353516, "loss": 3.5804, "rewards/accuracies": 0.5, "rewards/chosen": -43.23986053466797, "rewards/margins": -0.2606935501098633, "rewards/rejected": -42.97916793823242, "step": 5655 }, { "epoch": 0.7701525054466231, "grad_norm": 38.47279034768139, "learning_rate": 1.220044648723255e-07, "logits/chosen": 14.208236694335938, "logits/rejected": 13.839848518371582, "logps/chosen": -4.403988838195801, "logps/rejected": -4.324082374572754, "loss": 4.0599, "rewards/accuracies": 0.25, "rewards/chosen": -44.039886474609375, "rewards/margins": -0.7990627288818359, "rewards/rejected": -43.24082565307617, "step": 5656 }, { "epoch": 0.7702886710239651, "grad_norm": 35.15734876942793, "learning_rate": 1.2186778152656797e-07, "logits/chosen": 13.59637451171875, "logits/rejected": 14.329949378967285, "logps/chosen": -4.195446491241455, "logps/rejected": -4.538139343261719, "loss": 3.4881, "rewards/accuracies": 0.75, "rewards/chosen": -41.954463958740234, "rewards/margins": 3.4269275665283203, "rewards/rejected": -45.38139343261719, "step": 5657 }, { "epoch": 0.7704248366013072, "grad_norm": 43.06222975877636, "learning_rate": 1.2173116102708446e-07, "logits/chosen": 14.68614387512207, "logits/rejected": 15.477060317993164, "logps/chosen": -4.908509254455566, "logps/rejected": -4.8096818923950195, "loss": 4.2178, "rewards/accuracies": 0.5, "rewards/chosen": -49.0850944519043, "rewards/margins": -0.9882736206054688, "rewards/rejected": -48.09682083129883, "step": 5658 }, { "epoch": 0.7705610021786492, "grad_norm": 45.188683104571695, "learning_rate": 1.2159460340474513e-07, "logits/chosen": 14.79708480834961, "logits/rejected": 15.152910232543945, "logps/chosen": -4.876819610595703, "logps/rejected": -4.879629611968994, "loss": 4.4331, "rewards/accuracies": 0.75, "rewards/chosen": -48.7681999206543, "rewards/margins": 0.028098106384277344, "rewards/rejected": -48.79629898071289, "step": 5659 }, { "epoch": 0.7706971677559913, "grad_norm": 37.91591928160795, "learning_rate": 1.2145810869040652e-07, "logits/chosen": 14.624488830566406, "logits/rejected": 14.189950942993164, "logps/chosen": -4.573659896850586, "logps/rejected": -4.7441325187683105, "loss": 3.6224, "rewards/accuracies": 0.5, "rewards/chosen": -45.73659896850586, "rewards/margins": 1.704728126525879, "rewards/rejected": -47.44132614135742, "step": 5660 }, { "epoch": 0.7708333333333334, "grad_norm": 41.13226426786149, "learning_rate": 1.213216769149108e-07, "logits/chosen": 14.61143684387207, "logits/rejected": 15.111087799072266, "logps/chosen": -4.680964946746826, "logps/rejected": -4.789310932159424, "loss": 3.4363, "rewards/accuracies": 0.75, "rewards/chosen": -46.80965042114258, "rewards/margins": 1.0834627151489258, "rewards/rejected": -47.89311218261719, "step": 5661 }, { "epoch": 0.7709694989106753, "grad_norm": 38.167125380275145, "learning_rate": 1.2118530810908563e-07, "logits/chosen": 14.013130187988281, "logits/rejected": 14.721121788024902, "logps/chosen": -4.478293418884277, "logps/rejected": -4.803564071655273, "loss": 3.6735, "rewards/accuracies": 1.0, "rewards/chosen": -44.78293228149414, "rewards/margins": 3.25270938873291, "rewards/rejected": -48.035640716552734, "step": 5662 }, { "epoch": 0.7711056644880174, "grad_norm": 39.05605171378359, "learning_rate": 1.210490023037448e-07, "logits/chosen": 14.325078964233398, "logits/rejected": 14.482702255249023, "logps/chosen": -4.416125774383545, "logps/rejected": -4.51987361907959, "loss": 3.8077, "rewards/accuracies": 0.5, "rewards/chosen": -44.1612548828125, "rewards/margins": 1.0374841690063477, "rewards/rejected": -45.1987419128418, "step": 5663 }, { "epoch": 0.7712418300653595, "grad_norm": 37.823106330606954, "learning_rate": 1.2091275952968784e-07, "logits/chosen": 14.511639595031738, "logits/rejected": 14.392687797546387, "logps/chosen": -4.489223957061768, "logps/rejected": -4.434319019317627, "loss": 3.9065, "rewards/accuracies": 0.5, "rewards/chosen": -44.89223861694336, "rewards/margins": -0.5490503311157227, "rewards/rejected": -44.34318923950195, "step": 5664 }, { "epoch": 0.7713779956427015, "grad_norm": 40.29378156339233, "learning_rate": 1.207765798176997e-07, "logits/chosen": 14.330982208251953, "logits/rejected": 14.657979965209961, "logps/chosen": -4.5375165939331055, "logps/rejected": -4.523286819458008, "loss": 3.8925, "rewards/accuracies": 0.5, "rewards/chosen": -45.37516403198242, "rewards/margins": -0.14229583740234375, "rewards/rejected": -45.23286819458008, "step": 5665 }, { "epoch": 0.7715141612200436, "grad_norm": 37.39360486514734, "learning_rate": 1.206404631985515e-07, "logits/chosen": 13.686346054077148, "logits/rejected": 13.521522521972656, "logps/chosen": -4.238258361816406, "logps/rejected": -4.350803375244141, "loss": 3.6056, "rewards/accuracies": 0.25, "rewards/chosen": -42.38258361816406, "rewards/margins": 1.125448226928711, "rewards/rejected": -43.50802993774414, "step": 5666 }, { "epoch": 0.7716503267973857, "grad_norm": 34.425019737219884, "learning_rate": 1.205044097029999e-07, "logits/chosen": 13.304872512817383, "logits/rejected": 13.796926498413086, "logps/chosen": -4.316047191619873, "logps/rejected": -4.558591842651367, "loss": 3.856, "rewards/accuracies": 0.75, "rewards/chosen": -43.16047286987305, "rewards/margins": 2.425445556640625, "rewards/rejected": -45.585914611816406, "step": 5667 }, { "epoch": 0.7717864923747276, "grad_norm": 47.38744327377759, "learning_rate": 1.203684193617872e-07, "logits/chosen": 13.208889961242676, "logits/rejected": 14.413003921508789, "logps/chosen": -4.340152263641357, "logps/rejected": -4.675119400024414, "loss": 4.1008, "rewards/accuracies": 0.75, "rewards/chosen": -43.40152359008789, "rewards/margins": 3.3496713638305664, "rewards/rejected": -46.75119400024414, "step": 5668 }, { "epoch": 0.7719226579520697, "grad_norm": 37.78700155683079, "learning_rate": 1.2023249220564155e-07, "logits/chosen": 15.363763809204102, "logits/rejected": 14.766979217529297, "logps/chosen": -4.8800272941589355, "logps/rejected": -4.627688884735107, "loss": 3.9363, "rewards/accuracies": 0.5, "rewards/chosen": -48.800270080566406, "rewards/margins": -2.523383140563965, "rewards/rejected": -46.276885986328125, "step": 5669 }, { "epoch": 0.7720588235294118, "grad_norm": 41.97183316752014, "learning_rate": 1.2009662826527703e-07, "logits/chosen": 14.247354507446289, "logits/rejected": 14.381233215332031, "logps/chosen": -4.470034599304199, "logps/rejected": -4.572587490081787, "loss": 3.8984, "rewards/accuracies": 0.25, "rewards/chosen": -44.700347900390625, "rewards/margins": 1.0255298614501953, "rewards/rejected": -45.72587585449219, "step": 5670 }, { "epoch": 0.7721949891067538, "grad_norm": 36.63796732387944, "learning_rate": 1.199608275713929e-07, "logits/chosen": 15.341137886047363, "logits/rejected": 14.804916381835938, "logps/chosen": -4.5100202560424805, "logps/rejected": -4.745884418487549, "loss": 3.9176, "rewards/accuracies": 0.75, "rewards/chosen": -45.10020065307617, "rewards/margins": 2.3586435317993164, "rewards/rejected": -47.45884323120117, "step": 5671 }, { "epoch": 0.7723311546840959, "grad_norm": 36.5680870455806, "learning_rate": 1.1982509015467458e-07, "logits/chosen": 13.989466667175293, "logits/rejected": 14.840437889099121, "logps/chosen": -4.496700286865234, "logps/rejected": -4.830822944641113, "loss": 3.7479, "rewards/accuracies": 0.75, "rewards/chosen": -44.967002868652344, "rewards/margins": 3.3412256240844727, "rewards/rejected": -48.308231353759766, "step": 5672 }, { "epoch": 0.7724673202614379, "grad_norm": 39.0846159619394, "learning_rate": 1.196894160457933e-07, "logits/chosen": 14.277139663696289, "logits/rejected": 13.671123504638672, "logps/chosen": -4.525290489196777, "logps/rejected": -4.548878192901611, "loss": 4.0264, "rewards/accuracies": 0.25, "rewards/chosen": -45.252906799316406, "rewards/margins": 0.23587703704833984, "rewards/rejected": -45.48878479003906, "step": 5673 }, { "epoch": 0.7726034858387799, "grad_norm": 44.81379163837069, "learning_rate": 1.1955380527540535e-07, "logits/chosen": 13.972475051879883, "logits/rejected": 14.842155456542969, "logps/chosen": -4.257530212402344, "logps/rejected": -4.983997344970703, "loss": 4.5566, "rewards/accuracies": 1.0, "rewards/chosen": -42.57529830932617, "rewards/margins": 7.26467227935791, "rewards/rejected": -49.83997344970703, "step": 5674 }, { "epoch": 0.772739651416122, "grad_norm": 89.78585542757055, "learning_rate": 1.194182578741533e-07, "logits/chosen": 14.072322845458984, "logits/rejected": 14.673192024230957, "logps/chosen": -4.1344757080078125, "logps/rejected": -4.560107707977295, "loss": 3.8325, "rewards/accuracies": 1.0, "rewards/chosen": -41.344757080078125, "rewards/margins": 4.256319999694824, "rewards/rejected": -45.601078033447266, "step": 5675 }, { "epoch": 0.7728758169934641, "grad_norm": 44.71111048998493, "learning_rate": 1.1928277387266535e-07, "logits/chosen": 14.469425201416016, "logits/rejected": 14.332906723022461, "logps/chosen": -4.485210418701172, "logps/rejected": -4.475129127502441, "loss": 4.1881, "rewards/accuracies": 0.5, "rewards/chosen": -44.85210418701172, "rewards/margins": -0.10081291198730469, "rewards/rejected": -44.75128936767578, "step": 5676 }, { "epoch": 0.773011982570806, "grad_norm": 39.075616556703984, "learning_rate": 1.1914735330155492e-07, "logits/chosen": 14.014701843261719, "logits/rejected": 14.756753921508789, "logps/chosen": -4.4224042892456055, "logps/rejected": -4.539067268371582, "loss": 4.1442, "rewards/accuracies": 0.5, "rewards/chosen": -44.224037170410156, "rewards/margins": 1.1666383743286133, "rewards/rejected": -45.39067840576172, "step": 5677 }, { "epoch": 0.7731481481481481, "grad_norm": 41.50083646381442, "learning_rate": 1.1901199619142155e-07, "logits/chosen": 14.33553695678711, "logits/rejected": 14.306388854980469, "logps/chosen": -4.701131820678711, "logps/rejected": -4.864518642425537, "loss": 4.0062, "rewards/accuracies": 0.5, "rewards/chosen": -47.01131820678711, "rewards/margins": 1.6338701248168945, "rewards/rejected": -48.64518737792969, "step": 5678 }, { "epoch": 0.7732843137254902, "grad_norm": 70.44951900260126, "learning_rate": 1.1887670257285045e-07, "logits/chosen": 13.814079284667969, "logits/rejected": 13.961986541748047, "logps/chosen": -4.346708297729492, "logps/rejected": -4.406839847564697, "loss": 3.964, "rewards/accuracies": 0.5, "rewards/chosen": -43.46708297729492, "rewards/margins": 0.6013164520263672, "rewards/rejected": -44.068397521972656, "step": 5679 }, { "epoch": 0.7734204793028322, "grad_norm": 36.831506120025445, "learning_rate": 1.187414724764121e-07, "logits/chosen": 13.516314506530762, "logits/rejected": 14.53175163269043, "logps/chosen": -4.192337989807129, "logps/rejected": -4.612570762634277, "loss": 3.9724, "rewards/accuracies": 0.75, "rewards/chosen": -41.923377990722656, "rewards/margins": 4.202329635620117, "rewards/rejected": -46.12570571899414, "step": 5680 }, { "epoch": 0.7735566448801743, "grad_norm": 41.26897339090409, "learning_rate": 1.1860630593266291e-07, "logits/chosen": 14.036127090454102, "logits/rejected": 15.417734146118164, "logps/chosen": -4.381640434265137, "logps/rejected": -4.540989875793457, "loss": 3.7021, "rewards/accuracies": 0.5, "rewards/chosen": -43.81640625, "rewards/margins": 1.5934944152832031, "rewards/rejected": -45.4099006652832, "step": 5681 }, { "epoch": 0.7736928104575164, "grad_norm": 41.93180903458659, "learning_rate": 1.1847120297214508e-07, "logits/chosen": 13.62434196472168, "logits/rejected": 14.429901123046875, "logps/chosen": -4.099067211151123, "logps/rejected": -4.487484931945801, "loss": 3.927, "rewards/accuracies": 0.75, "rewards/chosen": -40.99066925048828, "rewards/margins": 3.8841800689697266, "rewards/rejected": -44.87485122680664, "step": 5682 }, { "epoch": 0.7738289760348583, "grad_norm": 39.82283160859244, "learning_rate": 1.183361636253859e-07, "logits/chosen": 14.1903657913208, "logits/rejected": 15.012303352355957, "logps/chosen": -4.275615692138672, "logps/rejected": -4.406623363494873, "loss": 3.9707, "rewards/accuracies": 1.0, "rewards/chosen": -42.75615692138672, "rewards/margins": 1.3100805282592773, "rewards/rejected": -44.06623458862305, "step": 5683 }, { "epoch": 0.7739651416122004, "grad_norm": 40.996923653409226, "learning_rate": 1.1820118792289883e-07, "logits/chosen": 13.813310623168945, "logits/rejected": 13.559019088745117, "logps/chosen": -4.175429821014404, "logps/rejected": -4.264900207519531, "loss": 3.5222, "rewards/accuracies": 0.75, "rewards/chosen": -41.754302978515625, "rewards/margins": 0.8947010040283203, "rewards/rejected": -42.64900207519531, "step": 5684 }, { "epoch": 0.7741013071895425, "grad_norm": 38.967038980084894, "learning_rate": 1.1806627589518288e-07, "logits/chosen": 13.739677429199219, "logits/rejected": 14.158069610595703, "logps/chosen": -4.071963787078857, "logps/rejected": -4.581300735473633, "loss": 3.8907, "rewards/accuracies": 0.75, "rewards/chosen": -40.71963882446289, "rewards/margins": 5.093366622924805, "rewards/rejected": -45.81300735473633, "step": 5685 }, { "epoch": 0.7742374727668845, "grad_norm": 43.581515407144536, "learning_rate": 1.1793142757272221e-07, "logits/chosen": 14.006292343139648, "logits/rejected": 14.062322616577148, "logps/chosen": -3.9715378284454346, "logps/rejected": -4.4912214279174805, "loss": 4.2606, "rewards/accuracies": 0.75, "rewards/chosen": -39.71537780761719, "rewards/margins": 5.196836471557617, "rewards/rejected": -44.91221237182617, "step": 5686 }, { "epoch": 0.7743736383442266, "grad_norm": 44.081865330934036, "learning_rate": 1.1779664298598713e-07, "logits/chosen": 14.618497848510742, "logits/rejected": 14.836763381958008, "logps/chosen": -4.262563228607178, "logps/rejected": -4.367587089538574, "loss": 4.1145, "rewards/accuracies": 0.5, "rewards/chosen": -42.62562942504883, "rewards/margins": 1.050241470336914, "rewards/rejected": -43.675872802734375, "step": 5687 }, { "epoch": 0.7745098039215687, "grad_norm": 36.321449278787675, "learning_rate": 1.1766192216543323e-07, "logits/chosen": 14.943914413452148, "logits/rejected": 14.62935733795166, "logps/chosen": -4.774352073669434, "logps/rejected": -4.833764553070068, "loss": 3.3848, "rewards/accuracies": 0.5, "rewards/chosen": -47.74352264404297, "rewards/margins": 0.5941228866577148, "rewards/rejected": -48.337646484375, "step": 5688 }, { "epoch": 0.7746459694989106, "grad_norm": 43.98762242329059, "learning_rate": 1.1752726514150201e-07, "logits/chosen": 14.151179313659668, "logits/rejected": 14.861506462097168, "logps/chosen": -4.286400318145752, "logps/rejected": -4.802187919616699, "loss": 4.0286, "rewards/accuracies": 1.0, "rewards/chosen": -42.8640022277832, "rewards/margins": 5.157879829406738, "rewards/rejected": -48.021881103515625, "step": 5689 }, { "epoch": 0.7747821350762527, "grad_norm": 49.73385139372557, "learning_rate": 1.1739267194462002e-07, "logits/chosen": 14.54884147644043, "logits/rejected": 15.539012908935547, "logps/chosen": -4.5513811111450195, "logps/rejected": -5.096293926239014, "loss": 3.4949, "rewards/accuracies": 1.0, "rewards/chosen": -45.51381301879883, "rewards/margins": 5.449124336242676, "rewards/rejected": -50.96293640136719, "step": 5690 }, { "epoch": 0.7749183006535948, "grad_norm": 40.74999816269814, "learning_rate": 1.1725814260519986e-07, "logits/chosen": 14.773560523986816, "logits/rejected": 14.994588851928711, "logps/chosen": -4.6916913986206055, "logps/rejected": -4.8053669929504395, "loss": 3.9147, "rewards/accuracies": 0.75, "rewards/chosen": -46.91691589355469, "rewards/margins": 1.1367559432983398, "rewards/rejected": -48.053672790527344, "step": 5691 }, { "epoch": 0.7750544662309368, "grad_norm": 36.53512098478931, "learning_rate": 1.1712367715363968e-07, "logits/chosen": 14.391843795776367, "logits/rejected": 14.975767135620117, "logps/chosen": -4.621842861175537, "logps/rejected": -5.03913688659668, "loss": 3.608, "rewards/accuracies": 0.75, "rewards/chosen": -46.21842956542969, "rewards/margins": 4.172943115234375, "rewards/rejected": -50.39137268066406, "step": 5692 }, { "epoch": 0.7751906318082789, "grad_norm": 38.58617356302892, "learning_rate": 1.1698927562032284e-07, "logits/chosen": 14.748811721801758, "logits/rejected": 15.718502044677734, "logps/chosen": -4.551575183868408, "logps/rejected": -4.819770812988281, "loss": 4.0558, "rewards/accuracies": 0.5, "rewards/chosen": -45.515750885009766, "rewards/margins": 2.681957244873047, "rewards/rejected": -48.19770812988281, "step": 5693 }, { "epoch": 0.7753267973856209, "grad_norm": 36.46268881573395, "learning_rate": 1.1685493803561853e-07, "logits/chosen": 14.530473709106445, "logits/rejected": 14.897503852844238, "logps/chosen": -4.76064395904541, "logps/rejected": -4.974615573883057, "loss": 3.8256, "rewards/accuracies": 0.75, "rewards/chosen": -47.60643768310547, "rewards/margins": 2.1397199630737305, "rewards/rejected": -49.74615478515625, "step": 5694 }, { "epoch": 0.7754629629629629, "grad_norm": 37.06816087101859, "learning_rate": 1.1672066442988149e-07, "logits/chosen": 13.494394302368164, "logits/rejected": 13.907293319702148, "logps/chosen": -4.2795491218566895, "logps/rejected": -4.749677658081055, "loss": 4.0152, "rewards/accuracies": 1.0, "rewards/chosen": -42.795494079589844, "rewards/margins": 4.7012834548950195, "rewards/rejected": -47.49678039550781, "step": 5695 }, { "epoch": 0.775599128540305, "grad_norm": 37.236838217589906, "learning_rate": 1.1658645483345205e-07, "logits/chosen": 14.121963500976562, "logits/rejected": 14.235885620117188, "logps/chosen": -4.679152488708496, "logps/rejected": -4.692688941955566, "loss": 3.5529, "rewards/accuracies": 0.5, "rewards/chosen": -46.791526794433594, "rewards/margins": 0.1353626251220703, "rewards/rejected": -46.92688751220703, "step": 5696 }, { "epoch": 0.7757352941176471, "grad_norm": 37.854543417922066, "learning_rate": 1.164523092766557e-07, "logits/chosen": 14.725704193115234, "logits/rejected": 14.564481735229492, "logps/chosen": -4.583535194396973, "logps/rejected": -4.764836311340332, "loss": 3.5602, "rewards/accuracies": 1.0, "rewards/chosen": -45.835350036621094, "rewards/margins": 1.8130102157592773, "rewards/rejected": -47.64836120605469, "step": 5697 }, { "epoch": 0.775871459694989, "grad_norm": 45.05631486718914, "learning_rate": 1.1631822778980392e-07, "logits/chosen": 14.645827293395996, "logits/rejected": 14.205249786376953, "logps/chosen": -4.09178352355957, "logps/rejected": -4.09559440612793, "loss": 4.2884, "rewards/accuracies": 0.5, "rewards/chosen": -40.91783142089844, "rewards/margins": 0.03811073303222656, "rewards/rejected": -40.95594024658203, "step": 5698 }, { "epoch": 0.7760076252723311, "grad_norm": 40.43347377868916, "learning_rate": 1.1618421040319364e-07, "logits/chosen": 14.495975494384766, "logits/rejected": 14.590145111083984, "logps/chosen": -4.507421016693115, "logps/rejected": -4.572638511657715, "loss": 3.5977, "rewards/accuracies": 0.5, "rewards/chosen": -45.07421112060547, "rewards/margins": 0.6521749496459961, "rewards/rejected": -45.72638702392578, "step": 5699 }, { "epoch": 0.7761437908496732, "grad_norm": 43.026340145607364, "learning_rate": 1.1605025714710697e-07, "logits/chosen": 14.36618423461914, "logits/rejected": 14.88578987121582, "logps/chosen": -4.170990943908691, "logps/rejected": -4.333329200744629, "loss": 4.0517, "rewards/accuracies": 0.75, "rewards/chosen": -41.70990753173828, "rewards/margins": 1.6233844757080078, "rewards/rejected": -43.33329391479492, "step": 5700 }, { "epoch": 0.7762799564270153, "grad_norm": 46.476388470183196, "learning_rate": 1.1591636805181178e-07, "logits/chosen": 14.925273895263672, "logits/rejected": 14.960551261901855, "logps/chosen": -4.7010393142700195, "logps/rejected": -4.824668884277344, "loss": 4.3881, "rewards/accuracies": 0.5, "rewards/chosen": -47.01039123535156, "rewards/margins": 1.2362937927246094, "rewards/rejected": -48.24668884277344, "step": 5701 }, { "epoch": 0.7764161220043573, "grad_norm": 37.43887409761715, "learning_rate": 1.1578254314756155e-07, "logits/chosen": 13.748067855834961, "logits/rejected": 14.836714744567871, "logps/chosen": -4.290709495544434, "logps/rejected": -4.735413551330566, "loss": 3.8014, "rewards/accuracies": 1.0, "rewards/chosen": -42.9070930480957, "rewards/margins": 4.4470415115356445, "rewards/rejected": -47.35413360595703, "step": 5702 }, { "epoch": 0.7765522875816994, "grad_norm": 39.12017846634417, "learning_rate": 1.1564878246459517e-07, "logits/chosen": 13.531929016113281, "logits/rejected": 14.43589973449707, "logps/chosen": -4.494485855102539, "logps/rejected": -4.386806964874268, "loss": 3.9479, "rewards/accuracies": 0.5, "rewards/chosen": -44.944854736328125, "rewards/margins": -1.0767879486083984, "rewards/rejected": -43.868072509765625, "step": 5703 }, { "epoch": 0.7766884531590414, "grad_norm": 37.61030421243097, "learning_rate": 1.1551508603313673e-07, "logits/chosen": 14.665983200073242, "logits/rejected": 14.829902648925781, "logps/chosen": -4.722308158874512, "logps/rejected": -5.019077301025391, "loss": 3.9632, "rewards/accuracies": 1.0, "rewards/chosen": -47.223079681396484, "rewards/margins": 2.967693328857422, "rewards/rejected": -50.190773010253906, "step": 5704 }, { "epoch": 0.7768246187363834, "grad_norm": 38.30333167745173, "learning_rate": 1.1538145388339615e-07, "logits/chosen": 14.356627464294434, "logits/rejected": 13.792730331420898, "logps/chosen": -4.425596714019775, "logps/rejected": -4.4047160148620605, "loss": 3.875, "rewards/accuracies": 0.5, "rewards/chosen": -44.2559700012207, "rewards/margins": -0.20881175994873047, "rewards/rejected": -44.047157287597656, "step": 5705 }, { "epoch": 0.7769607843137255, "grad_norm": 39.96503935671665, "learning_rate": 1.1524788604556891e-07, "logits/chosen": 13.165546417236328, "logits/rejected": 14.031322479248047, "logps/chosen": -4.125448226928711, "logps/rejected": -4.577517509460449, "loss": 3.9084, "rewards/accuracies": 0.75, "rewards/chosen": -41.254486083984375, "rewards/margins": 4.52069091796875, "rewards/rejected": -45.775177001953125, "step": 5706 }, { "epoch": 0.7770969498910676, "grad_norm": 47.592237215780415, "learning_rate": 1.1511438254983548e-07, "logits/chosen": 14.698970794677734, "logits/rejected": 14.673604965209961, "logps/chosen": -4.596567153930664, "logps/rejected": -4.823680877685547, "loss": 4.1352, "rewards/accuracies": 0.75, "rewards/chosen": -45.965675354003906, "rewards/margins": 2.271134376525879, "rewards/rejected": -48.23680877685547, "step": 5707 }, { "epoch": 0.7772331154684096, "grad_norm": 37.22942219507858, "learning_rate": 1.1498094342636218e-07, "logits/chosen": 14.780866622924805, "logits/rejected": 13.711647033691406, "logps/chosen": -4.8070855140686035, "logps/rejected": -4.494001865386963, "loss": 3.7723, "rewards/accuracies": 0.25, "rewards/chosen": -48.070858001708984, "rewards/margins": -3.1308374404907227, "rewards/rejected": -44.94001770019531, "step": 5708 }, { "epoch": 0.7773692810457516, "grad_norm": 45.92216170087361, "learning_rate": 1.1484756870530081e-07, "logits/chosen": 14.185853004455566, "logits/rejected": 14.639914512634277, "logps/chosen": -4.568367958068848, "logps/rejected": -4.754631042480469, "loss": 3.1665, "rewards/accuracies": 0.75, "rewards/chosen": -45.68368148803711, "rewards/margins": 1.8626298904418945, "rewards/rejected": -47.54631042480469, "step": 5709 }, { "epoch": 0.7775054466230937, "grad_norm": 38.168369678156466, "learning_rate": 1.147142584167883e-07, "logits/chosen": 15.10877799987793, "logits/rejected": 15.23116683959961, "logps/chosen": -4.534515380859375, "logps/rejected": -4.749897003173828, "loss": 4.1803, "rewards/accuracies": 0.5, "rewards/chosen": -45.34516143798828, "rewards/margins": 2.1538095474243164, "rewards/rejected": -47.498966217041016, "step": 5710 }, { "epoch": 0.7776416122004357, "grad_norm": 38.85462350190125, "learning_rate": 1.145810125909473e-07, "logits/chosen": 15.268678665161133, "logits/rejected": 15.3750638961792, "logps/chosen": -4.682337760925293, "logps/rejected": -4.772264003753662, "loss": 4.2962, "rewards/accuracies": 0.5, "rewards/chosen": -46.8233757019043, "rewards/margins": 0.8992643356323242, "rewards/rejected": -47.72264099121094, "step": 5711 }, { "epoch": 0.7777777777777778, "grad_norm": 40.04196231674549, "learning_rate": 1.1444783125788591e-07, "logits/chosen": 13.701454162597656, "logits/rejected": 13.741792678833008, "logps/chosen": -4.254435062408447, "logps/rejected": -4.326572418212891, "loss": 4.5657, "rewards/accuracies": 0.25, "rewards/chosen": -42.544349670410156, "rewards/margins": 0.7213735580444336, "rewards/rejected": -43.265724182128906, "step": 5712 }, { "epoch": 0.7779139433551199, "grad_norm": 38.67347840204266, "learning_rate": 1.1431471444769734e-07, "logits/chosen": 14.692649841308594, "logits/rejected": 14.023914337158203, "logps/chosen": -4.451409339904785, "logps/rejected": -4.230489730834961, "loss": 4.2176, "rewards/accuracies": 0.5, "rewards/chosen": -44.514095306396484, "rewards/margins": -2.2092018127441406, "rewards/rejected": -42.304893493652344, "step": 5713 }, { "epoch": 0.7780501089324618, "grad_norm": 36.70907742993963, "learning_rate": 1.1418166219046051e-07, "logits/chosen": 13.504664421081543, "logits/rejected": 13.839032173156738, "logps/chosen": -4.294804573059082, "logps/rejected": -4.491350173950195, "loss": 3.9648, "rewards/accuracies": 0.75, "rewards/chosen": -42.94804763793945, "rewards/margins": 1.9654531478881836, "rewards/rejected": -44.91349792480469, "step": 5714 }, { "epoch": 0.7781862745098039, "grad_norm": 40.68327543651791, "learning_rate": 1.1404867451623981e-07, "logits/chosen": 14.15156364440918, "logits/rejected": 15.39415168762207, "logps/chosen": -4.398858070373535, "logps/rejected": -4.608109951019287, "loss": 4.2401, "rewards/accuracies": 0.75, "rewards/chosen": -43.98857879638672, "rewards/margins": 2.0925235748291016, "rewards/rejected": -46.08110046386719, "step": 5715 }, { "epoch": 0.778322440087146, "grad_norm": 43.59230029757043, "learning_rate": 1.1391575145508471e-07, "logits/chosen": 14.182010650634766, "logits/rejected": 14.766955375671387, "logps/chosen": -4.319879531860352, "logps/rejected": -4.555859565734863, "loss": 4.2233, "rewards/accuracies": 0.75, "rewards/chosen": -43.198795318603516, "rewards/margins": 2.35980224609375, "rewards/rejected": -45.558597564697266, "step": 5716 }, { "epoch": 0.778458605664488, "grad_norm": 39.087086603315065, "learning_rate": 1.1378289303703036e-07, "logits/chosen": 13.398059844970703, "logits/rejected": 14.341941833496094, "logps/chosen": -4.4998979568481445, "logps/rejected": -4.646645545959473, "loss": 4.291, "rewards/accuracies": 0.5, "rewards/chosen": -44.998985290527344, "rewards/margins": 1.4674739837646484, "rewards/rejected": -46.46645736694336, "step": 5717 }, { "epoch": 0.7785947712418301, "grad_norm": 40.59972671504458, "learning_rate": 1.1365009929209737e-07, "logits/chosen": 14.915294647216797, "logits/rejected": 14.44692611694336, "logps/chosen": -4.43843412399292, "logps/rejected": -4.525755882263184, "loss": 3.9854, "rewards/accuracies": 0.75, "rewards/chosen": -44.384342193603516, "rewards/margins": 0.8732175827026367, "rewards/rejected": -45.25756072998047, "step": 5718 }, { "epoch": 0.7787309368191722, "grad_norm": 36.23700934357582, "learning_rate": 1.1351737025029132e-07, "logits/chosen": 15.191688537597656, "logits/rejected": 15.179349899291992, "logps/chosen": -5.025216102600098, "logps/rejected": -4.675272464752197, "loss": 3.8979, "rewards/accuracies": 0.25, "rewards/chosen": -50.252166748046875, "rewards/margins": -3.499441146850586, "rewards/rejected": -46.752723693847656, "step": 5719 }, { "epoch": 0.7788671023965141, "grad_norm": 83.9680574069421, "learning_rate": 1.133847059416035e-07, "logits/chosen": 14.266483306884766, "logits/rejected": 15.364709854125977, "logps/chosen": -4.735245704650879, "logps/rejected": -4.8016557693481445, "loss": 3.6304, "rewards/accuracies": 0.75, "rewards/chosen": -47.352455139160156, "rewards/margins": 0.6641054153442383, "rewards/rejected": -48.01655960083008, "step": 5720 }, { "epoch": 0.7790032679738562, "grad_norm": 47.209773408043894, "learning_rate": 1.1325210639601071e-07, "logits/chosen": 14.802776336669922, "logits/rejected": 15.18587589263916, "logps/chosen": -4.559417724609375, "logps/rejected": -4.642132759094238, "loss": 3.5007, "rewards/accuracies": 0.5, "rewards/chosen": -45.59417724609375, "rewards/margins": 0.8271484375, "rewards/rejected": -46.42132568359375, "step": 5721 }, { "epoch": 0.7791394335511983, "grad_norm": 37.66273227804506, "learning_rate": 1.1311957164347461e-07, "logits/chosen": 14.805015563964844, "logits/rejected": 15.320865631103516, "logps/chosen": -4.638826370239258, "logps/rejected": -4.946510314941406, "loss": 3.7116, "rewards/accuracies": 0.5, "rewards/chosen": -46.38826370239258, "rewards/margins": 3.076838493347168, "rewards/rejected": -49.4650993347168, "step": 5722 }, { "epoch": 0.7792755991285403, "grad_norm": 39.03927424885514, "learning_rate": 1.1298710171394272e-07, "logits/chosen": 14.736282348632812, "logits/rejected": 14.735061645507812, "logps/chosen": -4.59003210067749, "logps/rejected": -4.604189872741699, "loss": 4.5788, "rewards/accuracies": 0.25, "rewards/chosen": -45.90031814575195, "rewards/margins": 0.14158058166503906, "rewards/rejected": -46.041900634765625, "step": 5723 }, { "epoch": 0.7794117647058824, "grad_norm": 42.03456151647066, "learning_rate": 1.1285469663734777e-07, "logits/chosen": 14.469480514526367, "logits/rejected": 14.755803108215332, "logps/chosen": -4.509407997131348, "logps/rejected": -4.692409515380859, "loss": 4.2003, "rewards/accuracies": 0.5, "rewards/chosen": -45.094078063964844, "rewards/margins": 1.830021858215332, "rewards/rejected": -46.924095153808594, "step": 5724 }, { "epoch": 0.7795479302832244, "grad_norm": 40.43229983872206, "learning_rate": 1.1272235644360754e-07, "logits/chosen": 13.583520889282227, "logits/rejected": 15.282991409301758, "logps/chosen": -4.403611183166504, "logps/rejected": -4.901257038116455, "loss": 4.1928, "rewards/accuracies": 1.0, "rewards/chosen": -44.036109924316406, "rewards/margins": 4.976459503173828, "rewards/rejected": -49.012569427490234, "step": 5725 }, { "epoch": 0.7796840958605664, "grad_norm": 37.44080192974052, "learning_rate": 1.1259008116262556e-07, "logits/chosen": 14.243423461914062, "logits/rejected": 14.76718521118164, "logps/chosen": -4.608924865722656, "logps/rejected": -4.7693071365356445, "loss": 3.6238, "rewards/accuracies": 0.5, "rewards/chosen": -46.08924865722656, "rewards/margins": 1.6038227081298828, "rewards/rejected": -47.69307327270508, "step": 5726 }, { "epoch": 0.7798202614379085, "grad_norm": 37.025334324919825, "learning_rate": 1.1245787082429061e-07, "logits/chosen": 15.18557357788086, "logits/rejected": 14.832741737365723, "logps/chosen": -4.3448591232299805, "logps/rejected": -4.333054542541504, "loss": 3.6773, "rewards/accuracies": 0.25, "rewards/chosen": -43.44858932495117, "rewards/margins": -0.11804008483886719, "rewards/rejected": -43.33054733276367, "step": 5727 }, { "epoch": 0.7799564270152506, "grad_norm": 36.52601704826779, "learning_rate": 1.1232572545847649e-07, "logits/chosen": 14.527981758117676, "logits/rejected": 14.10037612915039, "logps/chosen": -4.409407138824463, "logps/rejected": -4.412310600280762, "loss": 4.099, "rewards/accuracies": 0.75, "rewards/chosen": -44.09407043457031, "rewards/margins": 0.029035568237304688, "rewards/rejected": -44.12310791015625, "step": 5728 }, { "epoch": 0.7800925925925926, "grad_norm": 45.729733637340225, "learning_rate": 1.1219364509504266e-07, "logits/chosen": 14.0801420211792, "logits/rejected": 14.495939254760742, "logps/chosen": -4.170741081237793, "logps/rejected": -4.575620174407959, "loss": 3.8207, "rewards/accuracies": 0.75, "rewards/chosen": -41.70741271972656, "rewards/margins": 4.048789024353027, "rewards/rejected": -45.756202697753906, "step": 5729 }, { "epoch": 0.7802287581699346, "grad_norm": 37.23511451094243, "learning_rate": 1.1206162976383384e-07, "logits/chosen": 14.69888687133789, "logits/rejected": 15.075382232666016, "logps/chosen": -4.4755859375, "logps/rejected": -4.62767219543457, "loss": 4.1229, "rewards/accuracies": 0.5, "rewards/chosen": -44.755859375, "rewards/margins": 1.5208606719970703, "rewards/rejected": -46.27672576904297, "step": 5730 }, { "epoch": 0.7803649237472767, "grad_norm": 39.56987639239277, "learning_rate": 1.1192967949467975e-07, "logits/chosen": 14.164909362792969, "logits/rejected": 14.838753700256348, "logps/chosen": -4.443421363830566, "logps/rejected": -4.876070022583008, "loss": 4.1516, "rewards/accuracies": 1.0, "rewards/chosen": -44.43421173095703, "rewards/margins": 4.32649040222168, "rewards/rejected": -48.760704040527344, "step": 5731 }, { "epoch": 0.7805010893246187, "grad_norm": 36.54082516297034, "learning_rate": 1.1179779431739582e-07, "logits/chosen": 13.715658187866211, "logits/rejected": 14.055251121520996, "logps/chosen": -4.082406520843506, "logps/rejected": -4.0392374992370605, "loss": 3.8, "rewards/accuracies": 0.25, "rewards/chosen": -40.824066162109375, "rewards/margins": -0.43169116973876953, "rewards/rejected": -40.39237594604492, "step": 5732 }, { "epoch": 0.7806372549019608, "grad_norm": 39.0838294387542, "learning_rate": 1.116659742617827e-07, "logits/chosen": 14.572234153747559, "logits/rejected": 14.202239990234375, "logps/chosen": -4.649413108825684, "logps/rejected": -4.559403419494629, "loss": 3.9151, "rewards/accuracies": 0.5, "rewards/chosen": -46.49413299560547, "rewards/margins": -0.9001026153564453, "rewards/rejected": -45.594032287597656, "step": 5733 }, { "epoch": 0.7807734204793029, "grad_norm": 47.38624804758152, "learning_rate": 1.1153421935762595e-07, "logits/chosen": 13.519439697265625, "logits/rejected": 13.555397033691406, "logps/chosen": -4.161354064941406, "logps/rejected": -4.146875381469727, "loss": 4.5644, "rewards/accuracies": 0.5, "rewards/chosen": -41.61354446411133, "rewards/margins": -0.1447896957397461, "rewards/rejected": -41.468753814697266, "step": 5734 }, { "epoch": 0.7809095860566448, "grad_norm": 40.70232731626503, "learning_rate": 1.1140252963469686e-07, "logits/chosen": 13.523628234863281, "logits/rejected": 14.175947189331055, "logps/chosen": -4.302828311920166, "logps/rejected": -4.7697978019714355, "loss": 3.8258, "rewards/accuracies": 1.0, "rewards/chosen": -43.028282165527344, "rewards/margins": 4.6696929931640625, "rewards/rejected": -47.697975158691406, "step": 5735 }, { "epoch": 0.7810457516339869, "grad_norm": 41.57247100779494, "learning_rate": 1.1127090512275183e-07, "logits/chosen": 14.34999942779541, "logits/rejected": 14.462081909179688, "logps/chosen": -4.7075581550598145, "logps/rejected": -4.756110191345215, "loss": 3.7768, "rewards/accuracies": 0.5, "rewards/chosen": -47.07558059692383, "rewards/margins": 0.4855184555053711, "rewards/rejected": -47.561100006103516, "step": 5736 }, { "epoch": 0.781181917211329, "grad_norm": 38.458845970753224, "learning_rate": 1.1113934585153235e-07, "logits/chosen": 13.923282623291016, "logits/rejected": 14.02604866027832, "logps/chosen": -4.482592582702637, "logps/rejected": -4.511903762817383, "loss": 3.8089, "rewards/accuracies": 0.5, "rewards/chosen": -44.825927734375, "rewards/margins": 0.2931079864501953, "rewards/rejected": -45.11903381347656, "step": 5737 }, { "epoch": 0.781318082788671, "grad_norm": 39.27898402232914, "learning_rate": 1.1100785185076542e-07, "logits/chosen": 14.135177612304688, "logits/rejected": 14.364238739013672, "logps/chosen": -4.328272819519043, "logps/rejected": -4.4341535568237305, "loss": 3.9327, "rewards/accuracies": 0.5, "rewards/chosen": -43.28273010253906, "rewards/margins": 1.0588092803955078, "rewards/rejected": -44.34153747558594, "step": 5738 }, { "epoch": 0.7814542483660131, "grad_norm": 36.1269728058331, "learning_rate": 1.1087642315016332e-07, "logits/chosen": 13.721851348876953, "logits/rejected": 14.395835876464844, "logps/chosen": -4.42467737197876, "logps/rejected": -4.79154109954834, "loss": 3.4942, "rewards/accuracies": 0.75, "rewards/chosen": -44.24677276611328, "rewards/margins": 3.668638229370117, "rewards/rejected": -47.91541290283203, "step": 5739 }, { "epoch": 0.7815904139433552, "grad_norm": 37.655961048683345, "learning_rate": 1.1074505977942323e-07, "logits/chosen": 13.483680725097656, "logits/rejected": 14.448722839355469, "logps/chosen": -4.336871147155762, "logps/rejected": -4.647190093994141, "loss": 4.091, "rewards/accuracies": 0.75, "rewards/chosen": -43.36871337890625, "rewards/margins": 3.1031932830810547, "rewards/rejected": -46.47190475463867, "step": 5740 }, { "epoch": 0.7817265795206971, "grad_norm": 47.262869747576055, "learning_rate": 1.1061376176822785e-07, "logits/chosen": 14.08481216430664, "logits/rejected": 13.48979377746582, "logps/chosen": -4.314541816711426, "logps/rejected": -4.239765167236328, "loss": 3.7608, "rewards/accuracies": 0.5, "rewards/chosen": -43.14542007446289, "rewards/margins": -0.7477645874023438, "rewards/rejected": -42.39765548706055, "step": 5741 }, { "epoch": 0.7818627450980392, "grad_norm": 38.024824798431865, "learning_rate": 1.1048252914624522e-07, "logits/chosen": 14.21851921081543, "logits/rejected": 14.36053466796875, "logps/chosen": -4.293205261230469, "logps/rejected": -4.504943370819092, "loss": 4.0794, "rewards/accuracies": 0.75, "rewards/chosen": -42.93205261230469, "rewards/margins": 2.1173791885375977, "rewards/rejected": -45.049434661865234, "step": 5742 }, { "epoch": 0.7819989106753813, "grad_norm": 41.09537489379588, "learning_rate": 1.1035136194312822e-07, "logits/chosen": 14.49172306060791, "logits/rejected": 14.510232925415039, "logps/chosen": -4.436324119567871, "logps/rejected": -4.692808151245117, "loss": 4.443, "rewards/accuracies": 0.75, "rewards/chosen": -44.363243103027344, "rewards/margins": 2.564835548400879, "rewards/rejected": -46.928077697753906, "step": 5743 }, { "epoch": 0.7821350762527233, "grad_norm": 39.22876125726238, "learning_rate": 1.102202601885152e-07, "logits/chosen": 14.745070457458496, "logits/rejected": 14.746528625488281, "logps/chosen": -4.240134239196777, "logps/rejected": -4.451305866241455, "loss": 3.9426, "rewards/accuracies": 0.75, "rewards/chosen": -42.401344299316406, "rewards/margins": 2.1117143630981445, "rewards/rejected": -44.5130615234375, "step": 5744 }, { "epoch": 0.7822712418300654, "grad_norm": 40.214373944327264, "learning_rate": 1.1008922391202986e-07, "logits/chosen": 14.829322814941406, "logits/rejected": 14.779655456542969, "logps/chosen": -4.7204132080078125, "logps/rejected": -4.805782318115234, "loss": 3.6826, "rewards/accuracies": 0.75, "rewards/chosen": -47.204132080078125, "rewards/margins": 0.8536901473999023, "rewards/rejected": -48.057823181152344, "step": 5745 }, { "epoch": 0.7824074074074074, "grad_norm": 40.05139583350185, "learning_rate": 1.0995825314328073e-07, "logits/chosen": 14.396685600280762, "logits/rejected": 15.179804801940918, "logps/chosen": -4.4575700759887695, "logps/rejected": -4.862285614013672, "loss": 4.1354, "rewards/accuracies": 0.75, "rewards/chosen": -44.57570266723633, "rewards/margins": 4.047153472900391, "rewards/rejected": -48.62285614013672, "step": 5746 }, { "epoch": 0.7825435729847494, "grad_norm": 36.83202703612041, "learning_rate": 1.0982734791186179e-07, "logits/chosen": 14.022233963012695, "logits/rejected": 15.051794052124023, "logps/chosen": -4.177624702453613, "logps/rejected": -4.6438164710998535, "loss": 4.1229, "rewards/accuracies": 0.75, "rewards/chosen": -41.776248931884766, "rewards/margins": 4.661918640136719, "rewards/rejected": -46.43816375732422, "step": 5747 }, { "epoch": 0.7826797385620915, "grad_norm": 37.46845047135907, "learning_rate": 1.0969650824735226e-07, "logits/chosen": 14.511734008789062, "logits/rejected": 14.241329193115234, "logps/chosen": -4.394179344177246, "logps/rejected": -4.604743003845215, "loss": 3.8508, "rewards/accuracies": 1.0, "rewards/chosen": -43.941795349121094, "rewards/margins": 2.105632781982422, "rewards/rejected": -46.047428131103516, "step": 5748 }, { "epoch": 0.7828159041394336, "grad_norm": 57.24765336899015, "learning_rate": 1.0956573417931627e-07, "logits/chosen": 13.673500061035156, "logits/rejected": 14.31021499633789, "logps/chosen": -4.628271579742432, "logps/rejected": -4.61915397644043, "loss": 3.6626, "rewards/accuracies": 0.5, "rewards/chosen": -46.28271484375, "rewards/margins": -0.09117412567138672, "rewards/rejected": -46.19154357910156, "step": 5749 }, { "epoch": 0.7829520697167756, "grad_norm": 40.41615360581304, "learning_rate": 1.0943502573730343e-07, "logits/chosen": 14.367679595947266, "logits/rejected": 14.419190406799316, "logps/chosen": -4.397086143493652, "logps/rejected": -4.5454206466674805, "loss": 4.0851, "rewards/accuracies": 0.75, "rewards/chosen": -43.970863342285156, "rewards/margins": 1.4833440780639648, "rewards/rejected": -45.45420455932617, "step": 5750 }, { "epoch": 0.7830882352941176, "grad_norm": 38.16821108602082, "learning_rate": 1.0930438295084842e-07, "logits/chosen": 14.211664199829102, "logits/rejected": 14.314178466796875, "logps/chosen": -4.395319938659668, "logps/rejected": -4.84636116027832, "loss": 3.8108, "rewards/accuracies": 1.0, "rewards/chosen": -43.95320129394531, "rewards/margins": 4.510410308837891, "rewards/rejected": -48.46360778808594, "step": 5751 }, { "epoch": 0.7832244008714597, "grad_norm": 39.86995704856372, "learning_rate": 1.0917380584947094e-07, "logits/chosen": 13.85518741607666, "logits/rejected": 14.402044296264648, "logps/chosen": -4.276808738708496, "logps/rejected": -4.63535213470459, "loss": 3.867, "rewards/accuracies": 0.75, "rewards/chosen": -42.768089294433594, "rewards/margins": 3.5854339599609375, "rewards/rejected": -46.35352325439453, "step": 5752 }, { "epoch": 0.7833605664488017, "grad_norm": 40.526056108969875, "learning_rate": 1.0904329446267597e-07, "logits/chosen": 14.154977798461914, "logits/rejected": 15.635126113891602, "logps/chosen": -4.398804664611816, "logps/rejected": -4.817831993103027, "loss": 3.6026, "rewards/accuracies": 1.0, "rewards/chosen": -43.98804473876953, "rewards/margins": 4.19027042388916, "rewards/rejected": -48.17831802368164, "step": 5753 }, { "epoch": 0.7834967320261438, "grad_norm": 35.72800189476371, "learning_rate": 1.0891284881995387e-07, "logits/chosen": 14.604423522949219, "logits/rejected": 15.073470115661621, "logps/chosen": -4.507349014282227, "logps/rejected": -4.903584957122803, "loss": 3.7812, "rewards/accuracies": 0.75, "rewards/chosen": -45.073490142822266, "rewards/margins": 3.962357521057129, "rewards/rejected": -49.03584671020508, "step": 5754 }, { "epoch": 0.7836328976034859, "grad_norm": 40.442078293571115, "learning_rate": 1.0878246895077956e-07, "logits/chosen": 15.000764846801758, "logits/rejected": 15.10855484008789, "logps/chosen": -4.585391998291016, "logps/rejected": -4.768609523773193, "loss": 4.0588, "rewards/accuracies": 0.5, "rewards/chosen": -45.853919982910156, "rewards/margins": 1.8321733474731445, "rewards/rejected": -47.686092376708984, "step": 5755 }, { "epoch": 0.7837690631808278, "grad_norm": 38.46490864975581, "learning_rate": 1.0865215488461359e-07, "logits/chosen": 14.106115341186523, "logits/rejected": 14.713537216186523, "logps/chosen": -4.452970504760742, "logps/rejected": -4.879981994628906, "loss": 3.5783, "rewards/accuracies": 1.0, "rewards/chosen": -44.52970504760742, "rewards/margins": 4.270115852355957, "rewards/rejected": -48.79981994628906, "step": 5756 }, { "epoch": 0.7839052287581699, "grad_norm": 40.15131242243624, "learning_rate": 1.0852190665090173e-07, "logits/chosen": 14.476571083068848, "logits/rejected": 14.982429504394531, "logps/chosen": -4.558833122253418, "logps/rejected": -4.579498291015625, "loss": 4.4868, "rewards/accuracies": 0.5, "rewards/chosen": -45.58832931518555, "rewards/margins": 0.20665454864501953, "rewards/rejected": -45.79498291015625, "step": 5757 }, { "epoch": 0.784041394335512, "grad_norm": 42.84068377051866, "learning_rate": 1.0839172427907426e-07, "logits/chosen": 14.898122787475586, "logits/rejected": 14.869636535644531, "logps/chosen": -4.611692428588867, "logps/rejected": -4.6348419189453125, "loss": 4.1277, "rewards/accuracies": 0.5, "rewards/chosen": -46.11692428588867, "rewards/margins": 0.23149490356445312, "rewards/rejected": -46.348419189453125, "step": 5758 }, { "epoch": 0.784177559912854, "grad_norm": 44.63104278905328, "learning_rate": 1.0826160779854716e-07, "logits/chosen": 14.159764289855957, "logits/rejected": 14.409942626953125, "logps/chosen": -4.554424285888672, "logps/rejected": -4.605576515197754, "loss": 3.7012, "rewards/accuracies": 0.75, "rewards/chosen": -45.54423904418945, "rewards/margins": 0.5115251541137695, "rewards/rejected": -46.055763244628906, "step": 5759 }, { "epoch": 0.7843137254901961, "grad_norm": 42.26155274864442, "learning_rate": 1.0813155723872145e-07, "logits/chosen": 14.44944953918457, "logits/rejected": 14.385339736938477, "logps/chosen": -4.620996475219727, "logps/rejected": -4.599935531616211, "loss": 4.0746, "rewards/accuracies": 0.5, "rewards/chosen": -46.20996856689453, "rewards/margins": -0.21061134338378906, "rewards/rejected": -45.999359130859375, "step": 5760 }, { "epoch": 0.7844498910675382, "grad_norm": 39.26525202890714, "learning_rate": 1.0800157262898286e-07, "logits/chosen": 14.056884765625, "logits/rejected": 14.794978141784668, "logps/chosen": -4.6681036949157715, "logps/rejected": -4.867157936096191, "loss": 4.0493, "rewards/accuracies": 0.75, "rewards/chosen": -46.681034088134766, "rewards/margins": 1.9905414581298828, "rewards/rejected": -48.67157745361328, "step": 5761 }, { "epoch": 0.7845860566448801, "grad_norm": 35.06241558617745, "learning_rate": 1.0787165399870261e-07, "logits/chosen": 14.877090454101562, "logits/rejected": 15.182310104370117, "logps/chosen": -4.767795562744141, "logps/rejected": -4.993520736694336, "loss": 3.8555, "rewards/accuracies": 0.75, "rewards/chosen": -47.677955627441406, "rewards/margins": 2.2572526931762695, "rewards/rejected": -49.935211181640625, "step": 5762 }, { "epoch": 0.7847222222222222, "grad_norm": 36.260677628635925, "learning_rate": 1.0774180137723705e-07, "logits/chosen": 13.83546257019043, "logits/rejected": 14.514688491821289, "logps/chosen": -4.1560516357421875, "logps/rejected": -4.662993907928467, "loss": 4.3444, "rewards/accuracies": 0.75, "rewards/chosen": -41.560516357421875, "rewards/margins": 5.069421768188477, "rewards/rejected": -46.62993621826172, "step": 5763 }, { "epoch": 0.7848583877995643, "grad_norm": 43.215019694718265, "learning_rate": 1.0761201479392714e-07, "logits/chosen": 14.667905807495117, "logits/rejected": 15.074211120605469, "logps/chosen": -4.417331695556641, "logps/rejected": -4.980630874633789, "loss": 4.3761, "rewards/accuracies": 1.0, "rewards/chosen": -44.173316955566406, "rewards/margins": 5.632993698120117, "rewards/rejected": -49.80630874633789, "step": 5764 }, { "epoch": 0.7849945533769063, "grad_norm": 40.36022983048283, "learning_rate": 1.0748229427809942e-07, "logits/chosen": 14.970096588134766, "logits/rejected": 14.504278182983398, "logps/chosen": -4.671291828155518, "logps/rejected": -4.412277698516846, "loss": 4.0985, "rewards/accuracies": 0.5, "rewards/chosen": -46.712921142578125, "rewards/margins": -2.590137481689453, "rewards/rejected": -44.122779846191406, "step": 5765 }, { "epoch": 0.7851307189542484, "grad_norm": 42.778215932062416, "learning_rate": 1.073526398590654e-07, "logits/chosen": 13.99822998046875, "logits/rejected": 14.004087448120117, "logps/chosen": -4.480314254760742, "logps/rejected": -4.536770343780518, "loss": 4.0631, "rewards/accuracies": 0.75, "rewards/chosen": -44.80314254760742, "rewards/margins": 0.5645618438720703, "rewards/rejected": -45.36770248413086, "step": 5766 }, { "epoch": 0.7852668845315904, "grad_norm": 42.896259365698, "learning_rate": 1.072230515661213e-07, "logits/chosen": 14.947877883911133, "logits/rejected": 14.628145217895508, "logps/chosen": -4.824390411376953, "logps/rejected": -4.719995975494385, "loss": 4.5199, "rewards/accuracies": 0.5, "rewards/chosen": -48.24390411376953, "rewards/margins": -1.0439443588256836, "rewards/rejected": -47.19995880126953, "step": 5767 }, { "epoch": 0.7854030501089324, "grad_norm": 39.60169352187389, "learning_rate": 1.0709352942854887e-07, "logits/chosen": 14.256536483764648, "logits/rejected": 14.236099243164062, "logps/chosen": -4.613772392272949, "logps/rejected": -4.538069725036621, "loss": 3.9343, "rewards/accuracies": 0.5, "rewards/chosen": -46.137725830078125, "rewards/margins": -0.7570314407348633, "rewards/rejected": -45.38069152832031, "step": 5768 }, { "epoch": 0.7855392156862745, "grad_norm": 39.85903204234491, "learning_rate": 1.0696407347561471e-07, "logits/chosen": 13.809864044189453, "logits/rejected": 14.060471534729004, "logps/chosen": -4.201246738433838, "logps/rejected": -4.488053321838379, "loss": 4.3506, "rewards/accuracies": 0.75, "rewards/chosen": -42.01247024536133, "rewards/margins": 2.868062973022461, "rewards/rejected": -44.880531311035156, "step": 5769 }, { "epoch": 0.7856753812636166, "grad_norm": 37.964944126493606, "learning_rate": 1.0683468373657034e-07, "logits/chosen": 14.110630989074707, "logits/rejected": 14.490439414978027, "logps/chosen": -4.465205669403076, "logps/rejected": -4.636377334594727, "loss": 3.9272, "rewards/accuracies": 0.5, "rewards/chosen": -44.65205764770508, "rewards/margins": 1.711714744567871, "rewards/rejected": -46.36376953125, "step": 5770 }, { "epoch": 0.7858115468409586, "grad_norm": 38.496617039145576, "learning_rate": 1.0670536024065251e-07, "logits/chosen": 14.561408996582031, "logits/rejected": 14.121955871582031, "logps/chosen": -4.549405097961426, "logps/rejected": -4.598509311676025, "loss": 3.7299, "rewards/accuracies": 0.75, "rewards/chosen": -45.494049072265625, "rewards/margins": 0.4910449981689453, "rewards/rejected": -45.98509216308594, "step": 5771 }, { "epoch": 0.7859477124183006, "grad_norm": 38.024892697742104, "learning_rate": 1.0657610301708304e-07, "logits/chosen": 14.793951034545898, "logits/rejected": 14.969096183776855, "logps/chosen": -4.517861366271973, "logps/rejected": -4.933682441711426, "loss": 3.8837, "rewards/accuracies": 0.75, "rewards/chosen": -45.178619384765625, "rewards/margins": 4.158202171325684, "rewards/rejected": -49.336822509765625, "step": 5772 }, { "epoch": 0.7860838779956427, "grad_norm": 43.886630192795685, "learning_rate": 1.064469120950684e-07, "logits/chosen": 14.522085189819336, "logits/rejected": 13.72800064086914, "logps/chosen": -4.7209930419921875, "logps/rejected": -4.569974899291992, "loss": 4.0389, "rewards/accuracies": 0.5, "rewards/chosen": -47.209930419921875, "rewards/margins": -1.5101861953735352, "rewards/rejected": -45.699745178222656, "step": 5773 }, { "epoch": 0.7862200435729847, "grad_norm": 37.24448413202068, "learning_rate": 1.0631778750380055e-07, "logits/chosen": 15.006937026977539, "logits/rejected": 14.879056930541992, "logps/chosen": -4.901158332824707, "logps/rejected": -4.765447616577148, "loss": 3.8486, "rewards/accuracies": 0.5, "rewards/chosen": -49.01158142089844, "rewards/margins": -1.3571033477783203, "rewards/rejected": -47.65447998046875, "step": 5774 }, { "epoch": 0.7863562091503268, "grad_norm": 39.80179603278516, "learning_rate": 1.0618872927245632e-07, "logits/chosen": 14.085212707519531, "logits/rejected": 14.112081527709961, "logps/chosen": -4.528921127319336, "logps/rejected": -4.45905876159668, "loss": 3.7788, "rewards/accuracies": 0.5, "rewards/chosen": -45.289215087890625, "rewards/margins": -0.6986207962036133, "rewards/rejected": -44.59059143066406, "step": 5775 }, { "epoch": 0.7864923747276689, "grad_norm": 37.71446150277731, "learning_rate": 1.060597374301973e-07, "logits/chosen": 13.38298225402832, "logits/rejected": 13.799636840820312, "logps/chosen": -4.224756240844727, "logps/rejected": -4.549910068511963, "loss": 3.7476, "rewards/accuracies": 0.75, "rewards/chosen": -42.24755859375, "rewards/margins": 3.251542091369629, "rewards/rejected": -45.49909973144531, "step": 5776 }, { "epoch": 0.786628540305011, "grad_norm": 36.57750084935871, "learning_rate": 1.059308120061703e-07, "logits/chosen": 14.756442070007324, "logits/rejected": 14.63194465637207, "logps/chosen": -4.57130241394043, "logps/rejected": -4.7670698165893555, "loss": 3.8781, "rewards/accuracies": 0.75, "rewards/chosen": -45.71302032470703, "rewards/margins": 1.9576749801635742, "rewards/rejected": -47.67070007324219, "step": 5777 }, { "epoch": 0.7867647058823529, "grad_norm": 38.1385758950163, "learning_rate": 1.0580195302950725e-07, "logits/chosen": 13.79625129699707, "logits/rejected": 14.636434555053711, "logps/chosen": -4.3365654945373535, "logps/rejected": -4.340211868286133, "loss": 4.1549, "rewards/accuracies": 0.5, "rewards/chosen": -43.36565399169922, "rewards/margins": 0.03646373748779297, "rewards/rejected": -43.40211868286133, "step": 5778 }, { "epoch": 0.786900871459695, "grad_norm": 39.1687893305254, "learning_rate": 1.0567316052932467e-07, "logits/chosen": 14.488921165466309, "logits/rejected": 15.093486785888672, "logps/chosen": -4.69553279876709, "logps/rejected": -4.787905693054199, "loss": 3.7804, "rewards/accuracies": 0.5, "rewards/chosen": -46.95532989501953, "rewards/margins": 0.923731803894043, "rewards/rejected": -47.879058837890625, "step": 5779 }, { "epoch": 0.7870370370370371, "grad_norm": 41.85515643676251, "learning_rate": 1.0554443453472436e-07, "logits/chosen": 14.071908950805664, "logits/rejected": 14.540837287902832, "logps/chosen": -4.415698528289795, "logps/rejected": -4.615734100341797, "loss": 3.4188, "rewards/accuracies": 0.75, "rewards/chosen": -44.156986236572266, "rewards/margins": 2.000356674194336, "rewards/rejected": -46.15734100341797, "step": 5780 }, { "epoch": 0.7871732026143791, "grad_norm": 38.34343751374458, "learning_rate": 1.0541577507479322e-07, "logits/chosen": 14.48638916015625, "logits/rejected": 14.738738059997559, "logps/chosen": -4.8145856857299805, "logps/rejected": -4.977237224578857, "loss": 4.1211, "rewards/accuracies": 0.75, "rewards/chosen": -48.14585876464844, "rewards/margins": 1.626516342163086, "rewards/rejected": -49.772377014160156, "step": 5781 }, { "epoch": 0.7873093681917211, "grad_norm": 46.030978521332315, "learning_rate": 1.0528718217860263e-07, "logits/chosen": 14.46072006225586, "logits/rejected": 14.24075984954834, "logps/chosen": -4.542375087738037, "logps/rejected": -4.712249279022217, "loss": 4.9686, "rewards/accuracies": 0.5, "rewards/chosen": -45.423744201660156, "rewards/margins": 1.6987438201904297, "rewards/rejected": -47.12248992919922, "step": 5782 }, { "epoch": 0.7874455337690632, "grad_norm": 35.908812115730065, "learning_rate": 1.0515865587520938e-07, "logits/chosen": 14.7987060546875, "logits/rejected": 14.768884658813477, "logps/chosen": -4.68555212020874, "logps/rejected": -4.623966693878174, "loss": 3.8968, "rewards/accuracies": 0.5, "rewards/chosen": -46.85551834106445, "rewards/margins": -0.6158514022827148, "rewards/rejected": -46.23966979980469, "step": 5783 }, { "epoch": 0.7875816993464052, "grad_norm": 36.74865124489306, "learning_rate": 1.050301961936551e-07, "logits/chosen": 13.98806381225586, "logits/rejected": 14.239923477172852, "logps/chosen": -4.208168983459473, "logps/rejected": -4.696098804473877, "loss": 3.5832, "rewards/accuracies": 1.0, "rewards/chosen": -42.081687927246094, "rewards/margins": 4.879299163818359, "rewards/rejected": -46.96098709106445, "step": 5784 }, { "epoch": 0.7877178649237473, "grad_norm": 37.24594146428656, "learning_rate": 1.0490180316296613e-07, "logits/chosen": 14.809937477111816, "logits/rejected": 15.050369262695312, "logps/chosen": -4.812223434448242, "logps/rejected": -4.791630744934082, "loss": 3.8012, "rewards/accuracies": 0.25, "rewards/chosen": -48.122230529785156, "rewards/margins": -0.20592403411865234, "rewards/rejected": -47.91630935668945, "step": 5785 }, { "epoch": 0.7878540305010894, "grad_norm": 39.58661709853289, "learning_rate": 1.0477347681215402e-07, "logits/chosen": 13.656399726867676, "logits/rejected": 13.627050399780273, "logps/chosen": -4.239450454711914, "logps/rejected": -4.519804954528809, "loss": 4.3945, "rewards/accuracies": 0.75, "rewards/chosen": -42.394508361816406, "rewards/margins": 2.8035430908203125, "rewards/rejected": -45.19805145263672, "step": 5786 }, { "epoch": 0.7879901960784313, "grad_norm": 40.90961957307297, "learning_rate": 1.0464521717021524e-07, "logits/chosen": 14.703052520751953, "logits/rejected": 15.303930282592773, "logps/chosen": -4.774855136871338, "logps/rejected": -4.836064338684082, "loss": 3.9471, "rewards/accuracies": 0.5, "rewards/chosen": -47.74855422973633, "rewards/margins": 0.6120872497558594, "rewards/rejected": -48.36064147949219, "step": 5787 }, { "epoch": 0.7881263616557734, "grad_norm": 36.64569570822722, "learning_rate": 1.0451702426613116e-07, "logits/chosen": 13.973939895629883, "logits/rejected": 14.009578704833984, "logps/chosen": -4.065906047821045, "logps/rejected": -4.41847038269043, "loss": 3.512, "rewards/accuracies": 1.0, "rewards/chosen": -40.6590576171875, "rewards/margins": 3.525646209716797, "rewards/rejected": -44.1847038269043, "step": 5788 }, { "epoch": 0.7882625272331155, "grad_norm": 37.39579528373276, "learning_rate": 1.0438889812886777e-07, "logits/chosen": 14.097270965576172, "logits/rejected": 14.349544525146484, "logps/chosen": -3.8463282585144043, "logps/rejected": -4.35467529296875, "loss": 3.976, "rewards/accuracies": 1.0, "rewards/chosen": -38.46328353881836, "rewards/margins": 5.083469390869141, "rewards/rejected": -43.5467529296875, "step": 5789 }, { "epoch": 0.7883986928104575, "grad_norm": 44.712368668413774, "learning_rate": 1.0426083878737646e-07, "logits/chosen": 14.226505279541016, "logits/rejected": 14.968263626098633, "logps/chosen": -4.524736404418945, "logps/rejected": -4.932816028594971, "loss": 4.2184, "rewards/accuracies": 0.75, "rewards/chosen": -45.24736404418945, "rewards/margins": 4.080799102783203, "rewards/rejected": -49.32815933227539, "step": 5790 }, { "epoch": 0.7885348583877996, "grad_norm": 39.3791813481041, "learning_rate": 1.0413284627059331e-07, "logits/chosen": 13.841215133666992, "logits/rejected": 14.033584594726562, "logps/chosen": -4.5384745597839355, "logps/rejected": -4.678555011749268, "loss": 4.1227, "rewards/accuracies": 1.0, "rewards/chosen": -45.384742736816406, "rewards/margins": 1.4008054733276367, "rewards/rejected": -46.785552978515625, "step": 5791 }, { "epoch": 0.7886710239651417, "grad_norm": 36.17961670344476, "learning_rate": 1.040049206074391e-07, "logits/chosen": 14.198217391967773, "logits/rejected": 14.621724128723145, "logps/chosen": -4.512195587158203, "logps/rejected": -4.4861907958984375, "loss": 3.351, "rewards/accuracies": 0.5, "rewards/chosen": -45.1219596862793, "rewards/margins": -0.2600526809692383, "rewards/rejected": -44.861907958984375, "step": 5792 }, { "epoch": 0.7888071895424836, "grad_norm": 33.99701862168758, "learning_rate": 1.0387706182681984e-07, "logits/chosen": 14.362363815307617, "logits/rejected": 14.628229141235352, "logps/chosen": -4.531039237976074, "logps/rejected": -4.956393241882324, "loss": 3.796, "rewards/accuracies": 0.75, "rewards/chosen": -45.310394287109375, "rewards/margins": 4.253537178039551, "rewards/rejected": -49.563934326171875, "step": 5793 }, { "epoch": 0.7889433551198257, "grad_norm": 37.51278844290447, "learning_rate": 1.0374926995762616e-07, "logits/chosen": 14.079061508178711, "logits/rejected": 13.764017105102539, "logps/chosen": -4.618532657623291, "logps/rejected": -4.39358377456665, "loss": 4.1755, "rewards/accuracies": 0.25, "rewards/chosen": -46.185325622558594, "rewards/margins": -2.2494888305664062, "rewards/rejected": -43.93583679199219, "step": 5794 }, { "epoch": 0.7890795206971678, "grad_norm": 37.059475256529964, "learning_rate": 1.0362154502873393e-07, "logits/chosen": 14.060879707336426, "logits/rejected": 14.336762428283691, "logps/chosen": -4.306009769439697, "logps/rejected": -4.611771583557129, "loss": 3.679, "rewards/accuracies": 0.75, "rewards/chosen": -43.06010055541992, "rewards/margins": 3.0576210021972656, "rewards/rejected": -46.11772155761719, "step": 5795 }, { "epoch": 0.7892156862745098, "grad_norm": 40.7381996904162, "learning_rate": 1.0349388706900337e-07, "logits/chosen": 14.288989067077637, "logits/rejected": 14.510605812072754, "logps/chosen": -4.223931789398193, "logps/rejected": -4.547547340393066, "loss": 4.4137, "rewards/accuracies": 0.75, "rewards/chosen": -42.23931884765625, "rewards/margins": 3.2361526489257812, "rewards/rejected": -45.47547149658203, "step": 5796 }, { "epoch": 0.7893518518518519, "grad_norm": 38.04136428582788, "learning_rate": 1.0336629610727995e-07, "logits/chosen": 14.143560409545898, "logits/rejected": 15.450874328613281, "logps/chosen": -4.559293746948242, "logps/rejected": -4.862492561340332, "loss": 3.9295, "rewards/accuracies": 0.75, "rewards/chosen": -45.59294128417969, "rewards/margins": 3.031984329223633, "rewards/rejected": -48.62492370605469, "step": 5797 }, { "epoch": 0.789488017429194, "grad_norm": 41.27452146489439, "learning_rate": 1.032387721723941e-07, "logits/chosen": 14.567211151123047, "logits/rejected": 14.196776390075684, "logps/chosen": -4.778327941894531, "logps/rejected": -4.673303604125977, "loss": 4.2686, "rewards/accuracies": 0.5, "rewards/chosen": -47.78327941894531, "rewards/margins": -1.0502452850341797, "rewards/rejected": -46.7330322265625, "step": 5798 }, { "epoch": 0.7896241830065359, "grad_norm": 36.18637179349842, "learning_rate": 1.0311131529316055e-07, "logits/chosen": 14.291933059692383, "logits/rejected": 14.900662422180176, "logps/chosen": -4.579693794250488, "logps/rejected": -4.5757646560668945, "loss": 3.6554, "rewards/accuracies": 0.5, "rewards/chosen": -45.79693603515625, "rewards/margins": -0.03929328918457031, "rewards/rejected": -45.75764465332031, "step": 5799 }, { "epoch": 0.789760348583878, "grad_norm": 40.694160808157456, "learning_rate": 1.0298392549837944e-07, "logits/chosen": 14.323158264160156, "logits/rejected": 15.134937286376953, "logps/chosen": -4.562424659729004, "logps/rejected": -4.764618873596191, "loss": 4.1766, "rewards/accuracies": 0.5, "rewards/chosen": -45.62425231933594, "rewards/margins": 2.021940231323242, "rewards/rejected": -47.64619064331055, "step": 5800 }, { "epoch": 0.7898965141612201, "grad_norm": 37.81458224684876, "learning_rate": 1.0285660281683553e-07, "logits/chosen": 13.161870956420898, "logits/rejected": 14.237741470336914, "logps/chosen": -4.085578441619873, "logps/rejected": -4.477263450622559, "loss": 4.0501, "rewards/accuracies": 1.0, "rewards/chosen": -40.85578155517578, "rewards/margins": 3.9168529510498047, "rewards/rejected": -44.77263641357422, "step": 5801 }, { "epoch": 0.7900326797385621, "grad_norm": 38.81100122286865, "learning_rate": 1.0272934727729854e-07, "logits/chosen": 13.96016788482666, "logits/rejected": 15.180620193481445, "logps/chosen": -4.12387228012085, "logps/rejected": -4.583261489868164, "loss": 3.706, "rewards/accuracies": 1.0, "rewards/chosen": -41.23872375488281, "rewards/margins": 4.5938873291015625, "rewards/rejected": -45.832611083984375, "step": 5802 }, { "epoch": 0.7901688453159041, "grad_norm": 37.371986142820596, "learning_rate": 1.0260215890852268e-07, "logits/chosen": 14.730487823486328, "logits/rejected": 14.761348724365234, "logps/chosen": -4.316005706787109, "logps/rejected": -4.603004455566406, "loss": 4.1035, "rewards/accuracies": 0.75, "rewards/chosen": -43.160057067871094, "rewards/margins": 2.869992256164551, "rewards/rejected": -46.03004455566406, "step": 5803 }, { "epoch": 0.7903050108932462, "grad_norm": 83.31882484201657, "learning_rate": 1.0247503773924733e-07, "logits/chosen": 14.254950523376465, "logits/rejected": 14.084283828735352, "logps/chosen": -4.52705192565918, "logps/rejected": -4.186893463134766, "loss": 4.6059, "rewards/accuracies": 0.25, "rewards/chosen": -45.27051544189453, "rewards/margins": -3.4015769958496094, "rewards/rejected": -41.868934631347656, "step": 5804 }, { "epoch": 0.7904411764705882, "grad_norm": 38.456844788138284, "learning_rate": 1.0234798379819664e-07, "logits/chosen": 14.14826774597168, "logits/rejected": 14.790750503540039, "logps/chosen": -4.397568702697754, "logps/rejected": -4.847780227661133, "loss": 3.5852, "rewards/accuracies": 1.0, "rewards/chosen": -43.975685119628906, "rewards/margins": 4.502115249633789, "rewards/rejected": -48.47780227661133, "step": 5805 }, { "epoch": 0.7905773420479303, "grad_norm": 39.04607597530641, "learning_rate": 1.0222099711407937e-07, "logits/chosen": 14.222051620483398, "logits/rejected": 14.736576080322266, "logps/chosen": -4.716719150543213, "logps/rejected": -5.0983967781066895, "loss": 3.7812, "rewards/accuracies": 0.75, "rewards/chosen": -47.167198181152344, "rewards/margins": 3.8167724609375, "rewards/rejected": -50.98396682739258, "step": 5806 }, { "epoch": 0.7907135076252724, "grad_norm": 36.9770920433292, "learning_rate": 1.0209407771558924e-07, "logits/chosen": 14.986255645751953, "logits/rejected": 14.603660583496094, "logps/chosen": -4.555784225463867, "logps/rejected": -4.735238075256348, "loss": 3.5131, "rewards/accuracies": 0.75, "rewards/chosen": -45.557838439941406, "rewards/margins": 1.7945404052734375, "rewards/rejected": -47.352378845214844, "step": 5807 }, { "epoch": 0.7908496732026143, "grad_norm": 39.74305540870175, "learning_rate": 1.0196722563140489e-07, "logits/chosen": 14.596076965332031, "logits/rejected": 14.191197395324707, "logps/chosen": -4.498486518859863, "logps/rejected": -4.491780757904053, "loss": 4.134, "rewards/accuracies": 0.25, "rewards/chosen": -44.98487091064453, "rewards/margins": -0.06705951690673828, "rewards/rejected": -44.917808532714844, "step": 5808 }, { "epoch": 0.7909858387799564, "grad_norm": 37.58924599482362, "learning_rate": 1.018404408901894e-07, "logits/chosen": 14.225900650024414, "logits/rejected": 14.810354232788086, "logps/chosen": -4.42808723449707, "logps/rejected": -4.587508201599121, "loss": 3.9513, "rewards/accuracies": 1.0, "rewards/chosen": -44.28087615966797, "rewards/margins": 1.5942096710205078, "rewards/rejected": -45.875083923339844, "step": 5809 }, { "epoch": 0.7911220043572985, "grad_norm": 38.76569065397893, "learning_rate": 1.0171372352059084e-07, "logits/chosen": 15.215877532958984, "logits/rejected": 15.331483840942383, "logps/chosen": -4.661171913146973, "logps/rejected": -4.777826309204102, "loss": 4.2797, "rewards/accuracies": 0.75, "rewards/chosen": -46.61172103881836, "rewards/margins": 1.1665430068969727, "rewards/rejected": -47.778263092041016, "step": 5810 }, { "epoch": 0.7912581699346405, "grad_norm": 40.965476116168205, "learning_rate": 1.0158707355124225e-07, "logits/chosen": 13.644904136657715, "logits/rejected": 14.462841033935547, "logps/chosen": -4.161472797393799, "logps/rejected": -4.378970623016357, "loss": 4.2892, "rewards/accuracies": 0.75, "rewards/chosen": -41.61473083496094, "rewards/margins": 2.1749753952026367, "rewards/rejected": -43.789703369140625, "step": 5811 }, { "epoch": 0.7913943355119826, "grad_norm": 40.373984011592455, "learning_rate": 1.0146049101076095e-07, "logits/chosen": 15.043664932250977, "logits/rejected": 14.830734252929688, "logps/chosen": -4.840903282165527, "logps/rejected": -4.739621639251709, "loss": 4.2534, "rewards/accuracies": 0.25, "rewards/chosen": -48.409034729003906, "rewards/margins": -1.0128183364868164, "rewards/rejected": -47.396217346191406, "step": 5812 }, { "epoch": 0.7915305010893247, "grad_norm": 40.54197189394295, "learning_rate": 1.0133397592774952e-07, "logits/chosen": 13.987703323364258, "logits/rejected": 13.648883819580078, "logps/chosen": -4.2136993408203125, "logps/rejected": -4.349675178527832, "loss": 3.7863, "rewards/accuracies": 0.75, "rewards/chosen": -42.136993408203125, "rewards/margins": 1.3597536087036133, "rewards/rejected": -43.49674987792969, "step": 5813 }, { "epoch": 0.7916666666666666, "grad_norm": 37.11846591404945, "learning_rate": 1.0120752833079511e-07, "logits/chosen": 14.231754302978516, "logits/rejected": 13.751358032226562, "logps/chosen": -4.579748630523682, "logps/rejected": -4.37483549118042, "loss": 3.9335, "rewards/accuracies": 0.25, "rewards/chosen": -45.7974853515625, "rewards/margins": -2.04913330078125, "rewards/rejected": -43.74835205078125, "step": 5814 }, { "epoch": 0.7918028322440087, "grad_norm": 43.20636688582859, "learning_rate": 1.0108114824846938e-07, "logits/chosen": 14.464818954467773, "logits/rejected": 15.261762619018555, "logps/chosen": -4.565864562988281, "logps/rejected": -4.960123062133789, "loss": 3.2181, "rewards/accuracies": 0.5, "rewards/chosen": -45.65864181518555, "rewards/margins": 3.9425859451293945, "rewards/rejected": -49.601226806640625, "step": 5815 }, { "epoch": 0.7919389978213508, "grad_norm": 39.589749058757036, "learning_rate": 1.0095483570932915e-07, "logits/chosen": 14.584695816040039, "logits/rejected": 14.791637420654297, "logps/chosen": -4.351122856140137, "logps/rejected": -4.719498157501221, "loss": 4.0666, "rewards/accuracies": 0.75, "rewards/chosen": -43.51123046875, "rewards/margins": 3.6837501525878906, "rewards/rejected": -47.194984436035156, "step": 5816 }, { "epoch": 0.7920751633986928, "grad_norm": 45.17489174157606, "learning_rate": 1.0082859074191579e-07, "logits/chosen": 14.145036697387695, "logits/rejected": 14.8359375, "logps/chosen": -4.35888147354126, "logps/rejected": -5.192155838012695, "loss": 4.2729, "rewards/accuracies": 1.0, "rewards/chosen": -43.58881378173828, "rewards/margins": 8.33273983001709, "rewards/rejected": -51.92155456542969, "step": 5817 }, { "epoch": 0.7922113289760349, "grad_norm": 40.24948206880588, "learning_rate": 1.007024133747552e-07, "logits/chosen": 13.455072402954102, "logits/rejected": 14.65591049194336, "logps/chosen": -4.254306316375732, "logps/rejected": -4.698533058166504, "loss": 3.9527, "rewards/accuracies": 0.5, "rewards/chosen": -42.543060302734375, "rewards/margins": 4.442269325256348, "rewards/rejected": -46.985328674316406, "step": 5818 }, { "epoch": 0.7923474945533769, "grad_norm": 36.662311363674924, "learning_rate": 1.0057630363635836e-07, "logits/chosen": 14.032716751098633, "logits/rejected": 14.311899185180664, "logps/chosen": -4.615462303161621, "logps/rejected": -4.7697038650512695, "loss": 3.6422, "rewards/accuracies": 0.5, "rewards/chosen": -46.15462112426758, "rewards/margins": 1.5424203872680664, "rewards/rejected": -47.69704055786133, "step": 5819 }, { "epoch": 0.7924836601307189, "grad_norm": 44.99376410434393, "learning_rate": 1.0045026155522087e-07, "logits/chosen": 14.870820999145508, "logits/rejected": 14.82870864868164, "logps/chosen": -4.657829284667969, "logps/rejected": -4.592367172241211, "loss": 4.131, "rewards/accuracies": 0.5, "rewards/chosen": -46.57829284667969, "rewards/margins": -0.6546201705932617, "rewards/rejected": -45.92367172241211, "step": 5820 }, { "epoch": 0.792619825708061, "grad_norm": 40.159705886002314, "learning_rate": 1.003242871598228e-07, "logits/chosen": 14.07341480255127, "logits/rejected": 14.26177978515625, "logps/chosen": -4.589354515075684, "logps/rejected": -4.723963260650635, "loss": 3.8775, "rewards/accuracies": 0.75, "rewards/chosen": -45.89354705810547, "rewards/margins": 1.3460893630981445, "rewards/rejected": -47.2396354675293, "step": 5821 }, { "epoch": 0.7927559912854031, "grad_norm": 41.18766037565458, "learning_rate": 1.0019838047862919e-07, "logits/chosen": 13.701557159423828, "logits/rejected": 14.611272811889648, "logps/chosen": -4.180795669555664, "logps/rejected": -4.655045986175537, "loss": 4.2688, "rewards/accuracies": 0.75, "rewards/chosen": -41.807960510253906, "rewards/margins": 4.742504119873047, "rewards/rejected": -46.55046081542969, "step": 5822 }, { "epoch": 0.7928921568627451, "grad_norm": 38.4843422153375, "learning_rate": 1.0007254154008977e-07, "logits/chosen": 13.894157409667969, "logits/rejected": 14.740864753723145, "logps/chosen": -4.344817638397217, "logps/rejected": -4.688411712646484, "loss": 3.4312, "rewards/accuracies": 0.75, "rewards/chosen": -43.448177337646484, "rewards/margins": 3.4359350204467773, "rewards/rejected": -46.88411331176758, "step": 5823 }, { "epoch": 0.7930283224400871, "grad_norm": 41.322166545116055, "learning_rate": 9.994677037263874e-08, "logits/chosen": 14.597574234008789, "logits/rejected": 14.436383247375488, "logps/chosen": -4.357822418212891, "logps/rejected": -4.404834747314453, "loss": 3.76, "rewards/accuracies": 0.5, "rewards/chosen": -43.578224182128906, "rewards/margins": 0.4701213836669922, "rewards/rejected": -44.04834747314453, "step": 5824 }, { "epoch": 0.7931644880174292, "grad_norm": 40.21862113877917, "learning_rate": 9.982106700469519e-08, "logits/chosen": 14.016295433044434, "logits/rejected": 14.727437973022461, "logps/chosen": -4.290724754333496, "logps/rejected": -4.663107872009277, "loss": 3.7715, "rewards/accuracies": 0.75, "rewards/chosen": -42.90724182128906, "rewards/margins": 3.723832130432129, "rewards/rejected": -46.63107681274414, "step": 5825 }, { "epoch": 0.7933006535947712, "grad_norm": 37.709024499667294, "learning_rate": 9.969543146466297e-08, "logits/chosen": 14.169352531433105, "logits/rejected": 14.77524471282959, "logps/chosen": -4.657143592834473, "logps/rejected": -4.654212951660156, "loss": 3.8025, "rewards/accuracies": 0.5, "rewards/chosen": -46.57143783569336, "rewards/margins": -0.029311180114746094, "rewards/rejected": -46.5421257019043, "step": 5826 }, { "epoch": 0.7934368191721133, "grad_norm": 38.13700305408297, "learning_rate": 9.956986378093022e-08, "logits/chosen": 15.278459548950195, "logits/rejected": 15.409231185913086, "logps/chosen": -5.040332794189453, "logps/rejected": -5.159884929656982, "loss": 3.9047, "rewards/accuracies": 0.5, "rewards/chosen": -50.4033317565918, "rewards/margins": 1.195516586303711, "rewards/rejected": -51.598846435546875, "step": 5827 }, { "epoch": 0.7935729847494554, "grad_norm": 37.88053321716757, "learning_rate": 9.944436398187014e-08, "logits/chosen": 14.041229248046875, "logits/rejected": 14.266833305358887, "logps/chosen": -4.179825782775879, "logps/rejected": -4.484430313110352, "loss": 3.2505, "rewards/accuracies": 1.0, "rewards/chosen": -41.798255920410156, "rewards/margins": 3.0460519790649414, "rewards/rejected": -44.84430694580078, "step": 5828 }, { "epoch": 0.7937091503267973, "grad_norm": 41.87481359665193, "learning_rate": 9.931893209584061e-08, "logits/chosen": 13.92513656616211, "logits/rejected": 14.416133880615234, "logps/chosen": -4.398487567901611, "logps/rejected": -4.720396041870117, "loss": 4.6502, "rewards/accuracies": 0.75, "rewards/chosen": -43.98487854003906, "rewards/margins": 3.2190847396850586, "rewards/rejected": -47.20396423339844, "step": 5829 }, { "epoch": 0.7938453159041394, "grad_norm": 38.77636177860016, "learning_rate": 9.919356815118364e-08, "logits/chosen": 14.923818588256836, "logits/rejected": 15.328634262084961, "logps/chosen": -4.453158378601074, "logps/rejected": -4.810763359069824, "loss": 3.788, "rewards/accuracies": 0.75, "rewards/chosen": -44.531585693359375, "rewards/margins": 3.576046943664551, "rewards/rejected": -48.107635498046875, "step": 5830 }, { "epoch": 0.7939814814814815, "grad_norm": 39.25219636650198, "learning_rate": 9.906827217622647e-08, "logits/chosen": 13.383845329284668, "logits/rejected": 14.251371383666992, "logps/chosen": -3.95786452293396, "logps/rejected": -4.497220516204834, "loss": 3.7702, "rewards/accuracies": 1.0, "rewards/chosen": -39.578643798828125, "rewards/margins": 5.393559455871582, "rewards/rejected": -44.972206115722656, "step": 5831 }, { "epoch": 0.7941176470588235, "grad_norm": 39.50534215957421, "learning_rate": 9.894304419928086e-08, "logits/chosen": 14.489078521728516, "logits/rejected": 14.621365547180176, "logps/chosen": -4.929034233093262, "logps/rejected": -4.840936660766602, "loss": 3.8749, "rewards/accuracies": 0.5, "rewards/chosen": -49.290340423583984, "rewards/margins": -0.880976676940918, "rewards/rejected": -48.40936279296875, "step": 5832 }, { "epoch": 0.7942538126361656, "grad_norm": 36.893897003262225, "learning_rate": 9.881788424864281e-08, "logits/chosen": 14.807644844055176, "logits/rejected": 14.939477920532227, "logps/chosen": -4.614345550537109, "logps/rejected": -4.751694679260254, "loss": 3.8382, "rewards/accuracies": 0.75, "rewards/chosen": -46.143455505371094, "rewards/margins": 1.373488426208496, "rewards/rejected": -47.516944885253906, "step": 5833 }, { "epoch": 0.7943899782135077, "grad_norm": 40.617177836115964, "learning_rate": 9.869279235259345e-08, "logits/chosen": 13.984821319580078, "logits/rejected": 14.666159629821777, "logps/chosen": -4.552350044250488, "logps/rejected": -4.724667549133301, "loss": 4.2859, "rewards/accuracies": 0.75, "rewards/chosen": -45.52349853515625, "rewards/margins": 1.7231731414794922, "rewards/rejected": -47.246673583984375, "step": 5834 }, { "epoch": 0.7945261437908496, "grad_norm": 40.85883345948418, "learning_rate": 9.856776853939837e-08, "logits/chosen": 14.370046615600586, "logits/rejected": 14.40657901763916, "logps/chosen": -4.265610694885254, "logps/rejected": -4.6123480796813965, "loss": 3.7009, "rewards/accuracies": 0.75, "rewards/chosen": -42.656105041503906, "rewards/margins": 3.4673728942871094, "rewards/rejected": -46.12348175048828, "step": 5835 }, { "epoch": 0.7946623093681917, "grad_norm": 44.509760198434584, "learning_rate": 9.844281283730755e-08, "logits/chosen": 14.724872589111328, "logits/rejected": 14.99220085144043, "logps/chosen": -4.747544288635254, "logps/rejected": -4.756779670715332, "loss": 4.0538, "rewards/accuracies": 0.5, "rewards/chosen": -47.475440979003906, "rewards/margins": 0.09235858917236328, "rewards/rejected": -47.56780242919922, "step": 5836 }, { "epoch": 0.7947984749455338, "grad_norm": 43.64115310145884, "learning_rate": 9.83179252745559e-08, "logits/chosen": 13.500917434692383, "logits/rejected": 14.12077808380127, "logps/chosen": -4.438043594360352, "logps/rejected": -4.690156936645508, "loss": 4.5718, "rewards/accuracies": 0.75, "rewards/chosen": -44.38043212890625, "rewards/margins": 2.521132469177246, "rewards/rejected": -46.90156555175781, "step": 5837 }, { "epoch": 0.7949346405228758, "grad_norm": 40.13794632171727, "learning_rate": 9.819310587936285e-08, "logits/chosen": 14.708335876464844, "logits/rejected": 15.003961563110352, "logps/chosen": -4.405733108520508, "logps/rejected": -4.464817047119141, "loss": 3.9952, "rewards/accuracies": 0.25, "rewards/chosen": -44.057334899902344, "rewards/margins": 0.5908374786376953, "rewards/rejected": -44.648170471191406, "step": 5838 }, { "epoch": 0.7950708061002179, "grad_norm": 38.465143516241945, "learning_rate": 9.806835467993217e-08, "logits/chosen": 14.19576644897461, "logits/rejected": 14.891361236572266, "logps/chosen": -4.533047199249268, "logps/rejected": -4.7140960693359375, "loss": 3.306, "rewards/accuracies": 1.0, "rewards/chosen": -45.330474853515625, "rewards/margins": 1.8104915618896484, "rewards/rejected": -47.14096450805664, "step": 5839 }, { "epoch": 0.7952069716775599, "grad_norm": 45.15822417327045, "learning_rate": 9.794367170445257e-08, "logits/chosen": 13.776152610778809, "logits/rejected": 14.972187042236328, "logps/chosen": -4.246457099914551, "logps/rejected": -4.794102668762207, "loss": 3.9516, "rewards/accuracies": 0.75, "rewards/chosen": -42.464569091796875, "rewards/margins": 5.47645378112793, "rewards/rejected": -47.94102478027344, "step": 5840 }, { "epoch": 0.7953431372549019, "grad_norm": 39.033188960170975, "learning_rate": 9.781905698109722e-08, "logits/chosen": 14.457569122314453, "logits/rejected": 15.206502914428711, "logps/chosen": -4.3855671882629395, "logps/rejected": -4.939456939697266, "loss": 4.0372, "rewards/accuracies": 1.0, "rewards/chosen": -43.85567092895508, "rewards/margins": 5.5388994216918945, "rewards/rejected": -49.394569396972656, "step": 5841 }, { "epoch": 0.795479302832244, "grad_norm": 38.56100275136614, "learning_rate": 9.769451053802376e-08, "logits/chosen": 13.900657653808594, "logits/rejected": 14.40966796875, "logps/chosen": -4.4898271560668945, "logps/rejected": -4.696765422821045, "loss": 3.8502, "rewards/accuracies": 0.75, "rewards/chosen": -44.89826965332031, "rewards/margins": 2.0693836212158203, "rewards/rejected": -46.9676513671875, "step": 5842 }, { "epoch": 0.7956154684095861, "grad_norm": 35.0009578238592, "learning_rate": 9.75700324033745e-08, "logits/chosen": 13.842421531677246, "logits/rejected": 14.003843307495117, "logps/chosen": -4.286505222320557, "logps/rejected": -4.405100345611572, "loss": 3.6812, "rewards/accuracies": 0.75, "rewards/chosen": -42.86505126953125, "rewards/margins": 1.1859512329101562, "rewards/rejected": -44.051002502441406, "step": 5843 }, { "epoch": 0.795751633986928, "grad_norm": 40.718485125564364, "learning_rate": 9.744562260527645e-08, "logits/chosen": 14.163511276245117, "logits/rejected": 14.261419296264648, "logps/chosen": -4.289643287658691, "logps/rejected": -4.505450248718262, "loss": 4.1678, "rewards/accuracies": 0.75, "rewards/chosen": -42.89643096923828, "rewards/margins": 2.1580705642700195, "rewards/rejected": -45.054500579833984, "step": 5844 }, { "epoch": 0.7958877995642701, "grad_norm": 39.460702886564604, "learning_rate": 9.732128117184078e-08, "logits/chosen": 15.140753746032715, "logits/rejected": 15.939004898071289, "logps/chosen": -4.18901252746582, "logps/rejected": -4.577223777770996, "loss": 3.9389, "rewards/accuracies": 0.75, "rewards/chosen": -41.89012145996094, "rewards/margins": 3.8821144104003906, "rewards/rejected": -45.77223587036133, "step": 5845 }, { "epoch": 0.7960239651416122, "grad_norm": 44.18992778829539, "learning_rate": 9.719700813116358e-08, "logits/chosen": 14.31562328338623, "logits/rejected": 14.75115966796875, "logps/chosen": -4.738744735717773, "logps/rejected": -4.583049297332764, "loss": 4.5811, "rewards/accuracies": 0.25, "rewards/chosen": -47.38744354248047, "rewards/margins": -1.5569524765014648, "rewards/rejected": -45.83049011230469, "step": 5846 }, { "epoch": 0.7961601307189542, "grad_norm": 38.275233185081056, "learning_rate": 9.707280351132552e-08, "logits/chosen": 14.17281723022461, "logits/rejected": 14.178112030029297, "logps/chosen": -4.871132850646973, "logps/rejected": -5.080628395080566, "loss": 4.0403, "rewards/accuracies": 0.75, "rewards/chosen": -48.711326599121094, "rewards/margins": 2.094961166381836, "rewards/rejected": -50.8062858581543, "step": 5847 }, { "epoch": 0.7962962962962963, "grad_norm": 37.7371674069461, "learning_rate": 9.694866734039143e-08, "logits/chosen": 13.690494537353516, "logits/rejected": 14.025132179260254, "logps/chosen": -4.188799858093262, "logps/rejected": -4.633423805236816, "loss": 3.5595, "rewards/accuracies": 1.0, "rewards/chosen": -41.88800048828125, "rewards/margins": 4.446235656738281, "rewards/rejected": -46.33423614501953, "step": 5848 }, { "epoch": 0.7964324618736384, "grad_norm": 38.63590582343593, "learning_rate": 9.682459964641099e-08, "logits/chosen": 13.989885330200195, "logits/rejected": 14.131209373474121, "logps/chosen": -4.424520492553711, "logps/rejected": -4.579358100891113, "loss": 3.9667, "rewards/accuracies": 0.75, "rewards/chosen": -44.245201110839844, "rewards/margins": 1.5483818054199219, "rewards/rejected": -45.793582916259766, "step": 5849 }, { "epoch": 0.7965686274509803, "grad_norm": 41.311474035919254, "learning_rate": 9.670060045741846e-08, "logits/chosen": 14.57642650604248, "logits/rejected": 15.357458114624023, "logps/chosen": -4.684351921081543, "logps/rejected": -4.853907585144043, "loss": 4.1556, "rewards/accuracies": 0.75, "rewards/chosen": -46.8435173034668, "rewards/margins": 1.6955623626708984, "rewards/rejected": -48.53907775878906, "step": 5850 }, { "epoch": 0.7967047930283224, "grad_norm": 39.92168843615386, "learning_rate": 9.657666980143222e-08, "logits/chosen": 14.50838851928711, "logits/rejected": 14.479791641235352, "logps/chosen": -4.733049392700195, "logps/rejected": -4.716470718383789, "loss": 3.7277, "rewards/accuracies": 0.5, "rewards/chosen": -47.33049774169922, "rewards/margins": -0.16579055786132812, "rewards/rejected": -47.16470718383789, "step": 5851 }, { "epoch": 0.7968409586056645, "grad_norm": 39.9189749730272, "learning_rate": 9.645280770645556e-08, "logits/chosen": 14.661588668823242, "logits/rejected": 14.61728286743164, "logps/chosen": -4.80888032913208, "logps/rejected": -4.698112487792969, "loss": 3.7895, "rewards/accuracies": 0.5, "rewards/chosen": -48.08880615234375, "rewards/margins": -1.1076784133911133, "rewards/rejected": -46.98112487792969, "step": 5852 }, { "epoch": 0.7969771241830066, "grad_norm": 44.68210489853009, "learning_rate": 9.632901420047627e-08, "logits/chosen": 14.535797119140625, "logits/rejected": 14.824564933776855, "logps/chosen": -4.505653381347656, "logps/rejected": -4.869056701660156, "loss": 4.3716, "rewards/accuracies": 0.75, "rewards/chosen": -45.05653762817383, "rewards/margins": 3.634031295776367, "rewards/rejected": -48.69056701660156, "step": 5853 }, { "epoch": 0.7971132897603486, "grad_norm": 42.13604117988285, "learning_rate": 9.620528931146629e-08, "logits/chosen": 13.906352996826172, "logits/rejected": 14.072528839111328, "logps/chosen": -4.31857442855835, "logps/rejected": -4.186518669128418, "loss": 3.9608, "rewards/accuracies": 0.5, "rewards/chosen": -43.18574523925781, "rewards/margins": -1.320561408996582, "rewards/rejected": -41.86518096923828, "step": 5854 }, { "epoch": 0.7972494553376906, "grad_norm": 43.4500452221261, "learning_rate": 9.608163306738238e-08, "logits/chosen": 14.750917434692383, "logits/rejected": 14.634017944335938, "logps/chosen": -4.9505157470703125, "logps/rejected": -5.008807182312012, "loss": 4.1219, "rewards/accuracies": 0.75, "rewards/chosen": -49.505157470703125, "rewards/margins": 0.5829172134399414, "rewards/rejected": -50.08807373046875, "step": 5855 }, { "epoch": 0.7973856209150327, "grad_norm": 39.19343973559085, "learning_rate": 9.59580454961658e-08, "logits/chosen": 14.427316665649414, "logits/rejected": 15.264729499816895, "logps/chosen": -4.620697498321533, "logps/rejected": -4.911288738250732, "loss": 4.025, "rewards/accuracies": 0.75, "rewards/chosen": -46.206974029541016, "rewards/margins": 2.905914306640625, "rewards/rejected": -49.112884521484375, "step": 5856 }, { "epoch": 0.7975217864923747, "grad_norm": 43.18675705231442, "learning_rate": 9.583452662574196e-08, "logits/chosen": 14.662398338317871, "logits/rejected": 14.128294944763184, "logps/chosen": -4.754229545593262, "logps/rejected": -4.70023775100708, "loss": 4.3735, "rewards/accuracies": 0.75, "rewards/chosen": -47.54229736328125, "rewards/margins": -0.5399179458618164, "rewards/rejected": -47.002376556396484, "step": 5857 }, { "epoch": 0.7976579520697168, "grad_norm": 38.50830527120611, "learning_rate": 9.571107648402108e-08, "logits/chosen": 15.297714233398438, "logits/rejected": 15.377946853637695, "logps/chosen": -4.895421981811523, "logps/rejected": -5.061967372894287, "loss": 3.9033, "rewards/accuracies": 0.75, "rewards/chosen": -48.954219818115234, "rewards/margins": 1.665456771850586, "rewards/rejected": -50.61967468261719, "step": 5858 }, { "epoch": 0.7977941176470589, "grad_norm": 38.230019733948765, "learning_rate": 9.558769509889786e-08, "logits/chosen": 12.942526817321777, "logits/rejected": 13.977249145507812, "logps/chosen": -4.077188491821289, "logps/rejected": -4.5168986320495605, "loss": 4.168, "rewards/accuracies": 0.75, "rewards/chosen": -40.77188491821289, "rewards/margins": 4.397101402282715, "rewards/rejected": -45.168983459472656, "step": 5859 }, { "epoch": 0.7979302832244008, "grad_norm": 37.645982174057515, "learning_rate": 9.54643824982511e-08, "logits/chosen": 14.420122146606445, "logits/rejected": 14.37843132019043, "logps/chosen": -4.469005584716797, "logps/rejected": -4.897367477416992, "loss": 3.7149, "rewards/accuracies": 0.75, "rewards/chosen": -44.69005584716797, "rewards/margins": 4.2836198806762695, "rewards/rejected": -48.97367858886719, "step": 5860 }, { "epoch": 0.7980664488017429, "grad_norm": 49.22919642158945, "learning_rate": 9.53411387099445e-08, "logits/chosen": 13.525896072387695, "logits/rejected": 13.317147254943848, "logps/chosen": -4.217768669128418, "logps/rejected": -4.446861267089844, "loss": 4.1363, "rewards/accuracies": 0.75, "rewards/chosen": -42.17768096923828, "rewards/margins": 2.29093074798584, "rewards/rejected": -44.46861267089844, "step": 5861 }, { "epoch": 0.798202614379085, "grad_norm": 40.04345069032302, "learning_rate": 9.521796376182601e-08, "logits/chosen": 14.07340145111084, "logits/rejected": 14.544795036315918, "logps/chosen": -4.365888595581055, "logps/rejected": -4.7676568031311035, "loss": 4.3707, "rewards/accuracies": 1.0, "rewards/chosen": -43.65888595581055, "rewards/margins": 4.017681121826172, "rewards/rejected": -47.67656707763672, "step": 5862 }, { "epoch": 0.798338779956427, "grad_norm": 44.783861248695644, "learning_rate": 9.509485768172783e-08, "logits/chosen": 14.499017715454102, "logits/rejected": 14.288980484008789, "logps/chosen": -4.713958740234375, "logps/rejected": -4.574858665466309, "loss": 4.3147, "rewards/accuracies": 0.5, "rewards/chosen": -47.13958740234375, "rewards/margins": -1.3910045623779297, "rewards/rejected": -45.74858474731445, "step": 5863 }, { "epoch": 0.7984749455337691, "grad_norm": 37.62801847024133, "learning_rate": 9.497182049746694e-08, "logits/chosen": 14.277193069458008, "logits/rejected": 14.937021255493164, "logps/chosen": -4.485063552856445, "logps/rejected": -4.835273742675781, "loss": 4.093, "rewards/accuracies": 1.0, "rewards/chosen": -44.85063934326172, "rewards/margins": 3.502100944519043, "rewards/rejected": -48.35273742675781, "step": 5864 }, { "epoch": 0.7986111111111112, "grad_norm": 40.56827854879488, "learning_rate": 9.484885223684473e-08, "logits/chosen": 13.947807312011719, "logits/rejected": 14.445765495300293, "logps/chosen": -4.435813903808594, "logps/rejected": -4.638808250427246, "loss": 4.3157, "rewards/accuracies": 0.5, "rewards/chosen": -44.35813903808594, "rewards/margins": 2.0299463272094727, "rewards/rejected": -46.388084411621094, "step": 5865 }, { "epoch": 0.7987472766884531, "grad_norm": 37.764401095377835, "learning_rate": 9.472595292764664e-08, "logits/chosen": 14.833200454711914, "logits/rejected": 14.935943603515625, "logps/chosen": -4.705025672912598, "logps/rejected": -4.877585411071777, "loss": 3.9934, "rewards/accuracies": 0.75, "rewards/chosen": -47.05025863647461, "rewards/margins": 1.7255935668945312, "rewards/rejected": -48.77585220336914, "step": 5866 }, { "epoch": 0.7988834422657952, "grad_norm": 37.89841804557501, "learning_rate": 9.460312259764291e-08, "logits/chosen": 14.165899276733398, "logits/rejected": 14.151148796081543, "logps/chosen": -4.381875991821289, "logps/rejected": -4.416895866394043, "loss": 4.1625, "rewards/accuracies": 0.5, "rewards/chosen": -43.81875991821289, "rewards/margins": 0.35019779205322266, "rewards/rejected": -44.16896057128906, "step": 5867 }, { "epoch": 0.7990196078431373, "grad_norm": 40.83635585436507, "learning_rate": 9.448036127458813e-08, "logits/chosen": 14.143373489379883, "logits/rejected": 13.969709396362305, "logps/chosen": -4.436952590942383, "logps/rejected": -4.34206485748291, "loss": 4.2694, "rewards/accuracies": 0.25, "rewards/chosen": -44.36952209472656, "rewards/margins": -0.9488735198974609, "rewards/rejected": -43.42064666748047, "step": 5868 }, { "epoch": 0.7991557734204793, "grad_norm": 37.90598239534107, "learning_rate": 9.435766898622115e-08, "logits/chosen": 14.067087173461914, "logits/rejected": 14.613480567932129, "logps/chosen": -4.139352798461914, "logps/rejected": -4.403562545776367, "loss": 3.9129, "rewards/accuracies": 0.75, "rewards/chosen": -41.393524169921875, "rewards/margins": 2.642106056213379, "rewards/rejected": -44.03562927246094, "step": 5869 }, { "epoch": 0.7992919389978214, "grad_norm": 39.13550760306614, "learning_rate": 9.423504576026524e-08, "logits/chosen": 14.25044059753418, "logits/rejected": 14.64400863647461, "logps/chosen": -3.947425603866577, "logps/rejected": -4.500377655029297, "loss": 3.3355, "rewards/accuracies": 1.0, "rewards/chosen": -39.4742546081543, "rewards/margins": 5.5295209884643555, "rewards/rejected": -45.00377655029297, "step": 5870 }, { "epoch": 0.7994281045751634, "grad_norm": 46.626695480869095, "learning_rate": 9.411249162442838e-08, "logits/chosen": 13.762287139892578, "logits/rejected": 13.458314895629883, "logps/chosen": -4.436384201049805, "logps/rejected": -4.2659831047058105, "loss": 3.4018, "rewards/accuracies": 0.25, "rewards/chosen": -44.36383819580078, "rewards/margins": -1.7040081024169922, "rewards/rejected": -42.65983200073242, "step": 5871 }, { "epoch": 0.7995642701525054, "grad_norm": 42.036882370343086, "learning_rate": 9.399000660640242e-08, "logits/chosen": 14.53626537322998, "logits/rejected": 14.77933406829834, "logps/chosen": -4.394855499267578, "logps/rejected": -4.419583797454834, "loss": 3.8877, "rewards/accuracies": 0.5, "rewards/chosen": -43.94855499267578, "rewards/margins": 0.2472848892211914, "rewards/rejected": -44.195838928222656, "step": 5872 }, { "epoch": 0.7997004357298475, "grad_norm": 40.10996630041656, "learning_rate": 9.386759073386397e-08, "logits/chosen": 14.71010971069336, "logits/rejected": 13.856935501098633, "logps/chosen": -4.393038749694824, "logps/rejected": -4.635592460632324, "loss": 3.6762, "rewards/accuracies": 0.5, "rewards/chosen": -43.93038558959961, "rewards/margins": 2.4255428314208984, "rewards/rejected": -46.355926513671875, "step": 5873 }, { "epoch": 0.7998366013071896, "grad_norm": 37.79774380862776, "learning_rate": 9.374524403447401e-08, "logits/chosen": 14.088391304016113, "logits/rejected": 13.601177215576172, "logps/chosen": -4.609445095062256, "logps/rejected": -4.676619529724121, "loss": 3.9875, "rewards/accuracies": 0.5, "rewards/chosen": -46.094451904296875, "rewards/margins": 0.6717443466186523, "rewards/rejected": -46.76619338989258, "step": 5874 }, { "epoch": 0.7999727668845316, "grad_norm": 41.805867163316364, "learning_rate": 9.362296653587755e-08, "logits/chosen": 13.823497772216797, "logits/rejected": 13.861629486083984, "logps/chosen": -4.359407424926758, "logps/rejected": -4.338052749633789, "loss": 4.2962, "rewards/accuracies": 0.5, "rewards/chosen": -43.594078063964844, "rewards/margins": -0.2135477066040039, "rewards/rejected": -43.380531311035156, "step": 5875 }, { "epoch": 0.8001089324618736, "grad_norm": 41.49127139673004, "learning_rate": 9.350075826570436e-08, "logits/chosen": 14.100189208984375, "logits/rejected": 13.914894104003906, "logps/chosen": -4.351858615875244, "logps/rejected": -4.376986980438232, "loss": 4.4202, "rewards/accuracies": 0.75, "rewards/chosen": -43.51858901977539, "rewards/margins": 0.2512836456298828, "rewards/rejected": -43.769874572753906, "step": 5876 }, { "epoch": 0.8002450980392157, "grad_norm": 45.837706610758055, "learning_rate": 9.337861925156851e-08, "logits/chosen": 14.091355323791504, "logits/rejected": 14.105955123901367, "logps/chosen": -4.495134353637695, "logps/rejected": -4.643576622009277, "loss": 4.4119, "rewards/accuracies": 0.75, "rewards/chosen": -44.95133972167969, "rewards/margins": 1.4844255447387695, "rewards/rejected": -46.435768127441406, "step": 5877 }, { "epoch": 0.8003812636165577, "grad_norm": 38.53421111756381, "learning_rate": 9.325654952106811e-08, "logits/chosen": 14.269759178161621, "logits/rejected": 14.904448509216309, "logps/chosen": -4.477669715881348, "logps/rejected": -4.707452774047852, "loss": 3.9003, "rewards/accuracies": 1.0, "rewards/chosen": -44.776695251464844, "rewards/margins": 2.2978296279907227, "rewards/rejected": -47.07452392578125, "step": 5878 }, { "epoch": 0.8005174291938998, "grad_norm": 36.593348143079595, "learning_rate": 9.313454910178587e-08, "logits/chosen": 14.202925682067871, "logits/rejected": 14.88376235961914, "logps/chosen": -4.193268299102783, "logps/rejected": -4.415882587432861, "loss": 3.9214, "rewards/accuracies": 0.75, "rewards/chosen": -41.932682037353516, "rewards/margins": 2.2261428833007812, "rewards/rejected": -44.1588249206543, "step": 5879 }, { "epoch": 0.8006535947712419, "grad_norm": 39.59292274315738, "learning_rate": 9.301261802128895e-08, "logits/chosen": 13.773832321166992, "logits/rejected": 14.24542236328125, "logps/chosen": -4.244541645050049, "logps/rejected": -4.625784397125244, "loss": 3.9416, "rewards/accuracies": 0.75, "rewards/chosen": -42.44541549682617, "rewards/margins": 3.8124313354492188, "rewards/rejected": -46.25784683227539, "step": 5880 }, { "epoch": 0.8007897603485838, "grad_norm": 37.16492103828308, "learning_rate": 9.289075630712848e-08, "logits/chosen": 14.88839340209961, "logits/rejected": 15.247340202331543, "logps/chosen": -4.84327507019043, "logps/rejected": -4.976205348968506, "loss": 3.9248, "rewards/accuracies": 0.75, "rewards/chosen": -48.43274688720703, "rewards/margins": 1.329305648803711, "rewards/rejected": -49.762054443359375, "step": 5881 }, { "epoch": 0.8009259259259259, "grad_norm": 40.19938908738232, "learning_rate": 9.276896398684022e-08, "logits/chosen": 14.359718322753906, "logits/rejected": 14.289857864379883, "logps/chosen": -4.8155903816223145, "logps/rejected": -4.787743091583252, "loss": 4.229, "rewards/accuracies": 0.5, "rewards/chosen": -48.155906677246094, "rewards/margins": -0.278472900390625, "rewards/rejected": -47.8774299621582, "step": 5882 }, { "epoch": 0.801062091503268, "grad_norm": 37.988062843046905, "learning_rate": 9.264724108794424e-08, "logits/chosen": 13.926454544067383, "logits/rejected": 14.405553817749023, "logps/chosen": -4.416146278381348, "logps/rejected": -4.745273590087891, "loss": 3.5438, "rewards/accuracies": 1.0, "rewards/chosen": -44.161460876464844, "rewards/margins": 3.291274070739746, "rewards/rejected": -47.452735900878906, "step": 5883 }, { "epoch": 0.80119825708061, "grad_norm": 37.63724416860559, "learning_rate": 9.252558763794463e-08, "logits/chosen": 14.67416000366211, "logits/rejected": 15.121391296386719, "logps/chosen": -4.345058441162109, "logps/rejected": -4.76071834564209, "loss": 3.9978, "rewards/accuracies": 1.0, "rewards/chosen": -43.450584411621094, "rewards/margins": 4.1565961837768555, "rewards/rejected": -47.607181549072266, "step": 5884 }, { "epoch": 0.8013344226579521, "grad_norm": 42.30746900847136, "learning_rate": 9.24040036643301e-08, "logits/chosen": 14.22047233581543, "logits/rejected": 13.264766693115234, "logps/chosen": -4.551102638244629, "logps/rejected": -4.328115940093994, "loss": 3.8952, "rewards/accuracies": 0.25, "rewards/chosen": -45.511024475097656, "rewards/margins": -2.229865074157715, "rewards/rejected": -43.281158447265625, "step": 5885 }, { "epoch": 0.8014705882352942, "grad_norm": 34.96930888049449, "learning_rate": 9.228248919457357e-08, "logits/chosen": 14.541492462158203, "logits/rejected": 14.62446403503418, "logps/chosen": -4.656530380249023, "logps/rejected": -4.429816246032715, "loss": 3.7411, "rewards/accuracies": 0.25, "rewards/chosen": -46.5653076171875, "rewards/margins": -2.267141342163086, "rewards/rejected": -44.29816436767578, "step": 5886 }, { "epoch": 0.8016067538126361, "grad_norm": 35.73340291681941, "learning_rate": 9.216104425613234e-08, "logits/chosen": 14.868568420410156, "logits/rejected": 15.508049964904785, "logps/chosen": -4.992710590362549, "logps/rejected": -5.2867655754089355, "loss": 3.4719, "rewards/accuracies": 0.5, "rewards/chosen": -49.927101135253906, "rewards/margins": 2.9405508041381836, "rewards/rejected": -52.867652893066406, "step": 5887 }, { "epoch": 0.8017429193899782, "grad_norm": 46.61655076208719, "learning_rate": 9.203966887644763e-08, "logits/chosen": 14.56045913696289, "logits/rejected": 14.37047290802002, "logps/chosen": -4.530306816101074, "logps/rejected": -4.558052062988281, "loss": 3.7673, "rewards/accuracies": 0.5, "rewards/chosen": -45.303070068359375, "rewards/margins": 0.2774534225463867, "rewards/rejected": -45.58052444458008, "step": 5888 }, { "epoch": 0.8018790849673203, "grad_norm": 38.06570559449307, "learning_rate": 9.191836308294538e-08, "logits/chosen": 14.050430297851562, "logits/rejected": 14.562328338623047, "logps/chosen": -4.249926567077637, "logps/rejected": -4.2545857429504395, "loss": 3.5119, "rewards/accuracies": 0.5, "rewards/chosen": -42.499263763427734, "rewards/margins": 0.04659557342529297, "rewards/rejected": -42.545860290527344, "step": 5889 }, { "epoch": 0.8020152505446623, "grad_norm": 35.61319692372334, "learning_rate": 9.179712690303575e-08, "logits/chosen": 14.516969680786133, "logits/rejected": 14.99941349029541, "logps/chosen": -4.13459587097168, "logps/rejected": -4.579952239990234, "loss": 3.9356, "rewards/accuracies": 1.0, "rewards/chosen": -41.34595489501953, "rewards/margins": 4.4535627365112305, "rewards/rejected": -45.79951858520508, "step": 5890 }, { "epoch": 0.8021514161220044, "grad_norm": 37.562501540640135, "learning_rate": 9.16759603641128e-08, "logits/chosen": 14.104646682739258, "logits/rejected": 13.752344131469727, "logps/chosen": -4.513653755187988, "logps/rejected": -4.334837913513184, "loss": 3.7293, "rewards/accuracies": 0.25, "rewards/chosen": -45.13653564453125, "rewards/margins": -1.7881574630737305, "rewards/rejected": -43.34838104248047, "step": 5891 }, { "epoch": 0.8022875816993464, "grad_norm": 40.625499454820044, "learning_rate": 9.155486349355528e-08, "logits/chosen": 14.686031341552734, "logits/rejected": 14.407032012939453, "logps/chosen": -4.763664245605469, "logps/rejected": -4.840604305267334, "loss": 4.1708, "rewards/accuracies": 0.75, "rewards/chosen": -47.63664245605469, "rewards/margins": 0.7694015502929688, "rewards/rejected": -48.406044006347656, "step": 5892 }, { "epoch": 0.8024237472766884, "grad_norm": 39.83490345140271, "learning_rate": 9.143383631872592e-08, "logits/chosen": 14.32259464263916, "logits/rejected": 14.440021514892578, "logps/chosen": -4.327004432678223, "logps/rejected": -4.688529968261719, "loss": 4.1406, "rewards/accuracies": 0.75, "rewards/chosen": -43.270042419433594, "rewards/margins": 3.6152515411376953, "rewards/rejected": -46.88529586791992, "step": 5893 }, { "epoch": 0.8025599128540305, "grad_norm": 42.08917846424957, "learning_rate": 9.131287886697205e-08, "logits/chosen": 14.327943801879883, "logits/rejected": 14.153949737548828, "logps/chosen": -4.287843704223633, "logps/rejected": -4.443264484405518, "loss": 3.7393, "rewards/accuracies": 0.5, "rewards/chosen": -42.87843322753906, "rewards/margins": 1.5542078018188477, "rewards/rejected": -44.43264389038086, "step": 5894 }, { "epoch": 0.8026960784313726, "grad_norm": 39.76807574374182, "learning_rate": 9.11919911656247e-08, "logits/chosen": 14.521716117858887, "logits/rejected": 14.90334701538086, "logps/chosen": -4.635422706604004, "logps/rejected": -4.94874382019043, "loss": 4.3342, "rewards/accuracies": 1.0, "rewards/chosen": -46.354225158691406, "rewards/margins": 3.133213996887207, "rewards/rejected": -49.48744201660156, "step": 5895 }, { "epoch": 0.8028322440087146, "grad_norm": 38.63771081762357, "learning_rate": 9.107117324199958e-08, "logits/chosen": 13.870574951171875, "logits/rejected": 14.3047456741333, "logps/chosen": -4.312923431396484, "logps/rejected": -4.549087047576904, "loss": 3.8638, "rewards/accuracies": 0.75, "rewards/chosen": -43.129234313964844, "rewards/margins": 2.3616342544555664, "rewards/rejected": -45.49087142944336, "step": 5896 }, { "epoch": 0.8029684095860566, "grad_norm": 36.42191679154656, "learning_rate": 9.095042512339656e-08, "logits/chosen": 14.289392471313477, "logits/rejected": 15.022002220153809, "logps/chosen": -4.50941801071167, "logps/rejected": -4.948990345001221, "loss": 3.9086, "rewards/accuracies": 1.0, "rewards/chosen": -45.094181060791016, "rewards/margins": 4.395724296569824, "rewards/rejected": -49.489906311035156, "step": 5897 }, { "epoch": 0.8031045751633987, "grad_norm": 40.62433641401019, "learning_rate": 9.082974683709959e-08, "logits/chosen": 14.615007400512695, "logits/rejected": 15.395355224609375, "logps/chosen": -4.920952796936035, "logps/rejected": -5.190067291259766, "loss": 3.974, "rewards/accuracies": 0.75, "rewards/chosen": -49.20952606201172, "rewards/margins": 2.6911497116088867, "rewards/rejected": -51.900672912597656, "step": 5898 }, { "epoch": 0.8032407407407407, "grad_norm": 39.92054605378445, "learning_rate": 9.070913841037691e-08, "logits/chosen": 13.966619491577148, "logits/rejected": 14.54836654663086, "logps/chosen": -4.682391166687012, "logps/rejected": -4.753957748413086, "loss": 3.7744, "rewards/accuracies": 0.5, "rewards/chosen": -46.82390594482422, "rewards/margins": 0.7156667709350586, "rewards/rejected": -47.539573669433594, "step": 5899 }, { "epoch": 0.8033769063180828, "grad_norm": 38.876405830432624, "learning_rate": 9.0588599870481e-08, "logits/chosen": 13.601486206054688, "logits/rejected": 14.364862442016602, "logps/chosen": -4.299788475036621, "logps/rejected": -4.750932693481445, "loss": 3.4393, "rewards/accuracies": 0.75, "rewards/chosen": -42.997886657714844, "rewards/margins": 4.511444091796875, "rewards/rejected": -47.50933074951172, "step": 5900 }, { "epoch": 0.8035130718954249, "grad_norm": 38.97777634816298, "learning_rate": 9.04681312446487e-08, "logits/chosen": 13.850069046020508, "logits/rejected": 14.088943481445312, "logps/chosen": -4.291769981384277, "logps/rejected": -4.470534801483154, "loss": 3.7628, "rewards/accuracies": 0.75, "rewards/chosen": -42.917694091796875, "rewards/margins": 1.7876548767089844, "rewards/rejected": -44.70534896850586, "step": 5901 }, { "epoch": 0.8036492374727668, "grad_norm": 40.71349209898055, "learning_rate": 9.034773256010066e-08, "logits/chosen": 14.206422805786133, "logits/rejected": 14.029400825500488, "logps/chosen": -4.6674113273620605, "logps/rejected": -4.482800483703613, "loss": 3.4994, "rewards/accuracies": 0.25, "rewards/chosen": -46.674110412597656, "rewards/margins": -1.8461055755615234, "rewards/rejected": -44.828006744384766, "step": 5902 }, { "epoch": 0.8037854030501089, "grad_norm": 41.10672183637358, "learning_rate": 9.022740384404204e-08, "logits/chosen": 15.014954566955566, "logits/rejected": 13.802576065063477, "logps/chosen": -4.239407539367676, "logps/rejected": -4.420812129974365, "loss": 4.521, "rewards/accuracies": 0.75, "rewards/chosen": -42.394073486328125, "rewards/margins": 1.8140478134155273, "rewards/rejected": -44.20812225341797, "step": 5903 }, { "epoch": 0.803921568627451, "grad_norm": 39.14806194648868, "learning_rate": 9.010714512366227e-08, "logits/chosen": 14.88601303100586, "logits/rejected": 14.432610511779785, "logps/chosen": -4.8704657554626465, "logps/rejected": -4.7265119552612305, "loss": 4.2905, "rewards/accuracies": 0.0, "rewards/chosen": -48.70465850830078, "rewards/margins": -1.4395427703857422, "rewards/rejected": -47.26511764526367, "step": 5904 }, { "epoch": 0.804057734204793, "grad_norm": 36.56067696241171, "learning_rate": 8.998695642613454e-08, "logits/chosen": 13.806495666503906, "logits/rejected": 13.7815523147583, "logps/chosen": -4.154634475708008, "logps/rejected": -4.202207565307617, "loss": 3.9221, "rewards/accuracies": 0.75, "rewards/chosen": -41.54634094238281, "rewards/margins": 0.4757366180419922, "rewards/rejected": -42.02207946777344, "step": 5905 }, { "epoch": 0.8041938997821351, "grad_norm": 40.16063189318031, "learning_rate": 8.986683777861657e-08, "logits/chosen": 14.807365417480469, "logits/rejected": 15.241941452026367, "logps/chosen": -4.683028221130371, "logps/rejected": -4.97167444229126, "loss": 3.618, "rewards/accuracies": 0.75, "rewards/chosen": -46.830284118652344, "rewards/margins": 2.8864593505859375, "rewards/rejected": -49.71674346923828, "step": 5906 }, { "epoch": 0.8043300653594772, "grad_norm": 39.213558753404605, "learning_rate": 8.974678920825036e-08, "logits/chosen": 13.54819393157959, "logits/rejected": 15.289299964904785, "logps/chosen": -4.199798107147217, "logps/rejected": -4.643363952636719, "loss": 3.7128, "rewards/accuracies": 1.0, "rewards/chosen": -41.99797821044922, "rewards/margins": 4.435660362243652, "rewards/rejected": -46.43363952636719, "step": 5907 }, { "epoch": 0.8044662309368191, "grad_norm": 39.018029314019955, "learning_rate": 8.962681074216156e-08, "logits/chosen": 14.40921401977539, "logits/rejected": 14.415655136108398, "logps/chosen": -4.224647521972656, "logps/rejected": -4.2524309158325195, "loss": 4.0978, "rewards/accuracies": 0.5, "rewards/chosen": -42.2464714050293, "rewards/margins": 0.27784299850463867, "rewards/rejected": -42.524314880371094, "step": 5908 }, { "epoch": 0.8046023965141612, "grad_norm": 42.4070027190942, "learning_rate": 8.950690240746043e-08, "logits/chosen": 14.334455490112305, "logits/rejected": 14.485715866088867, "logps/chosen": -4.259821891784668, "logps/rejected": -4.253848075866699, "loss": 4.505, "rewards/accuracies": 0.75, "rewards/chosen": -42.59822082519531, "rewards/margins": -0.05974006652832031, "rewards/rejected": -42.53847885131836, "step": 5909 }, { "epoch": 0.8047385620915033, "grad_norm": 38.01841568874496, "learning_rate": 8.938706423124141e-08, "logits/chosen": 14.07107925415039, "logits/rejected": 15.260488510131836, "logps/chosen": -4.487781524658203, "logps/rejected": -5.260578155517578, "loss": 3.6562, "rewards/accuracies": 1.0, "rewards/chosen": -44.87781524658203, "rewards/margins": 7.727964401245117, "rewards/rejected": -52.60578155517578, "step": 5910 }, { "epoch": 0.8048747276688453, "grad_norm": 38.9821344496136, "learning_rate": 8.926729624058263e-08, "logits/chosen": 14.05634880065918, "logits/rejected": 15.384538650512695, "logps/chosen": -4.446194648742676, "logps/rejected": -4.877450942993164, "loss": 4.2772, "rewards/accuracies": 1.0, "rewards/chosen": -44.461944580078125, "rewards/margins": 4.312561988830566, "rewards/rejected": -48.774505615234375, "step": 5911 }, { "epoch": 0.8050108932461874, "grad_norm": 39.609925257980464, "learning_rate": 8.914759846254681e-08, "logits/chosen": 14.869691848754883, "logits/rejected": 14.654422760009766, "logps/chosen": -4.729803562164307, "logps/rejected": -4.688678741455078, "loss": 4.2343, "rewards/accuracies": 0.5, "rewards/chosen": -47.298038482666016, "rewards/margins": -0.41124725341796875, "rewards/rejected": -46.88679122924805, "step": 5912 }, { "epoch": 0.8051470588235294, "grad_norm": 38.041867898281076, "learning_rate": 8.902797092418079e-08, "logits/chosen": 14.408761978149414, "logits/rejected": 15.2320556640625, "logps/chosen": -4.3508710861206055, "logps/rejected": -4.718716144561768, "loss": 3.9519, "rewards/accuracies": 0.75, "rewards/chosen": -43.50870895385742, "rewards/margins": 3.678452491760254, "rewards/rejected": -47.187164306640625, "step": 5913 }, { "epoch": 0.8052832244008714, "grad_norm": 39.404742371078974, "learning_rate": 8.890841365251511e-08, "logits/chosen": 14.233793258666992, "logits/rejected": 14.278242111206055, "logps/chosen": -4.237553596496582, "logps/rejected": -4.5231828689575195, "loss": 4.2065, "rewards/accuracies": 0.75, "rewards/chosen": -42.37553787231445, "rewards/margins": 2.856292724609375, "rewards/rejected": -45.23183059692383, "step": 5914 }, { "epoch": 0.8054193899782135, "grad_norm": 37.77134505851736, "learning_rate": 8.87889266745649e-08, "logits/chosen": 13.28253173828125, "logits/rejected": 13.891529083251953, "logps/chosen": -4.314485549926758, "logps/rejected": -4.327990531921387, "loss": 3.9272, "rewards/accuracies": 0.75, "rewards/chosen": -43.14485549926758, "rewards/margins": 0.13505077362060547, "rewards/rejected": -43.2799072265625, "step": 5915 }, { "epoch": 0.8055555555555556, "grad_norm": 39.48716009503864, "learning_rate": 8.866951001732932e-08, "logits/chosen": 14.670038223266602, "logits/rejected": 13.773996353149414, "logps/chosen": -4.363339424133301, "logps/rejected": -4.004512786865234, "loss": 3.6104, "rewards/accuracies": 0.5, "rewards/chosen": -43.63339614868164, "rewards/margins": -3.5882644653320312, "rewards/rejected": -40.045127868652344, "step": 5916 }, { "epoch": 0.8056917211328976, "grad_norm": 37.05049004250311, "learning_rate": 8.855016370779131e-08, "logits/chosen": 14.585895538330078, "logits/rejected": 14.813472747802734, "logps/chosen": -4.728743553161621, "logps/rejected": -4.965561389923096, "loss": 4.1517, "rewards/accuracies": 0.5, "rewards/chosen": -47.287437438964844, "rewards/margins": 2.368178367614746, "rewards/rejected": -49.655616760253906, "step": 5917 }, { "epoch": 0.8058278867102396, "grad_norm": 65.64404718200544, "learning_rate": 8.84308877729183e-08, "logits/chosen": 13.500251770019531, "logits/rejected": 14.265083312988281, "logps/chosen": -4.265846252441406, "logps/rejected": -4.692889213562012, "loss": 4.1028, "rewards/accuracies": 1.0, "rewards/chosen": -42.65846252441406, "rewards/margins": 4.2704315185546875, "rewards/rejected": -46.92889404296875, "step": 5918 }, { "epoch": 0.8059640522875817, "grad_norm": 45.25140278907453, "learning_rate": 8.831168223966177e-08, "logits/chosen": 14.278087615966797, "logits/rejected": 15.171079635620117, "logps/chosen": -4.567094802856445, "logps/rejected": -4.972670555114746, "loss": 4.5901, "rewards/accuracies": 0.75, "rewards/chosen": -45.67094802856445, "rewards/margins": 4.055754661560059, "rewards/rejected": -49.72670364379883, "step": 5919 }, { "epoch": 0.8061002178649237, "grad_norm": 37.71360364703213, "learning_rate": 8.819254713495694e-08, "logits/chosen": 13.441143035888672, "logits/rejected": 14.293105125427246, "logps/chosen": -4.05526065826416, "logps/rejected": -4.281932830810547, "loss": 3.6255, "rewards/accuracies": 0.5, "rewards/chosen": -40.5526123046875, "rewards/margins": 2.2667179107666016, "rewards/rejected": -42.81932830810547, "step": 5920 }, { "epoch": 0.8062363834422658, "grad_norm": 38.00487458105584, "learning_rate": 8.807348248572352e-08, "logits/chosen": 13.960278511047363, "logits/rejected": 14.703856468200684, "logps/chosen": -4.246329307556152, "logps/rejected": -4.464231014251709, "loss": 3.8406, "rewards/accuracies": 1.0, "rewards/chosen": -42.463294982910156, "rewards/margins": 2.179011344909668, "rewards/rejected": -44.64230728149414, "step": 5921 }, { "epoch": 0.8063725490196079, "grad_norm": 36.77045893231751, "learning_rate": 8.795448831886525e-08, "logits/chosen": 14.162620544433594, "logits/rejected": 14.818033218383789, "logps/chosen": -4.313787460327148, "logps/rejected": -4.696712970733643, "loss": 3.3705, "rewards/accuracies": 1.0, "rewards/chosen": -43.13787841796875, "rewards/margins": 3.829252243041992, "rewards/rejected": -46.967132568359375, "step": 5922 }, { "epoch": 0.8065087145969498, "grad_norm": 41.972896331712725, "learning_rate": 8.783556466126966e-08, "logits/chosen": 15.560569763183594, "logits/rejected": 15.008764266967773, "logps/chosen": -4.8329267501831055, "logps/rejected": -4.956234931945801, "loss": 4.23, "rewards/accuracies": 0.75, "rewards/chosen": -48.32926559448242, "rewards/margins": 1.233083724975586, "rewards/rejected": -49.562347412109375, "step": 5923 }, { "epoch": 0.8066448801742919, "grad_norm": 41.25415343167443, "learning_rate": 8.771671153980858e-08, "logits/chosen": 14.194786071777344, "logits/rejected": 14.418377876281738, "logps/chosen": -4.587717533111572, "logps/rejected": -4.541269779205322, "loss": 4.0243, "rewards/accuracies": 0.25, "rewards/chosen": -45.877174377441406, "rewards/margins": -0.4644765853881836, "rewards/rejected": -45.412696838378906, "step": 5924 }, { "epoch": 0.806781045751634, "grad_norm": 42.8766090601599, "learning_rate": 8.759792898133799e-08, "logits/chosen": 15.327417373657227, "logits/rejected": 14.786402702331543, "logps/chosen": -4.862028121948242, "logps/rejected": -4.608826160430908, "loss": 4.2816, "rewards/accuracies": 0.25, "rewards/chosen": -48.62028503417969, "rewards/margins": -2.532022476196289, "rewards/rejected": -46.088260650634766, "step": 5925 }, { "epoch": 0.806917211328976, "grad_norm": 38.24618243760967, "learning_rate": 8.747921701269762e-08, "logits/chosen": 14.011923789978027, "logits/rejected": 14.923734664916992, "logps/chosen": -4.642291069030762, "logps/rejected": -4.683123588562012, "loss": 4.2958, "rewards/accuracies": 0.75, "rewards/chosen": -46.42291259765625, "rewards/margins": 0.4083232879638672, "rewards/rejected": -46.83123779296875, "step": 5926 }, { "epoch": 0.8070533769063181, "grad_norm": 35.88304010486211, "learning_rate": 8.736057566071147e-08, "logits/chosen": 14.518033027648926, "logits/rejected": 14.525077819824219, "logps/chosen": -4.489914417266846, "logps/rejected": -4.34963321685791, "loss": 4.1013, "rewards/accuracies": 0.25, "rewards/chosen": -44.899147033691406, "rewards/margins": -1.4028100967407227, "rewards/rejected": -43.496334075927734, "step": 5927 }, { "epoch": 0.8071895424836601, "grad_norm": 39.32817276268087, "learning_rate": 8.724200495218764e-08, "logits/chosen": 13.318973541259766, "logits/rejected": 14.46104907989502, "logps/chosen": -4.2823381423950195, "logps/rejected": -4.589698791503906, "loss": 4.4324, "rewards/accuracies": 0.75, "rewards/chosen": -42.82337951660156, "rewards/margins": 3.0736083984375, "rewards/rejected": -45.89698791503906, "step": 5928 }, { "epoch": 0.8073257080610022, "grad_norm": 39.19619748631662, "learning_rate": 8.712350491391797e-08, "logits/chosen": 14.250974655151367, "logits/rejected": 14.100057601928711, "logps/chosen": -4.455926895141602, "logps/rejected": -4.542829513549805, "loss": 4.498, "rewards/accuracies": 0.5, "rewards/chosen": -44.55926513671875, "rewards/margins": 0.8690319061279297, "rewards/rejected": -45.42829895019531, "step": 5929 }, { "epoch": 0.8074618736383442, "grad_norm": 36.365845490622384, "learning_rate": 8.700507557267864e-08, "logits/chosen": 14.63542652130127, "logits/rejected": 14.522285461425781, "logps/chosen": -4.638678550720215, "logps/rejected": -4.766232967376709, "loss": 3.6796, "rewards/accuracies": 0.75, "rewards/chosen": -46.38678741455078, "rewards/margins": 1.2755413055419922, "rewards/rejected": -47.662330627441406, "step": 5930 }, { "epoch": 0.8075980392156863, "grad_norm": 43.120773610341864, "learning_rate": 8.688671695522987e-08, "logits/chosen": 15.025951385498047, "logits/rejected": 15.049642562866211, "logps/chosen": -4.503501892089844, "logps/rejected": -4.720507621765137, "loss": 4.2887, "rewards/accuracies": 0.75, "rewards/chosen": -45.03501892089844, "rewards/margins": 2.170058250427246, "rewards/rejected": -47.205078125, "step": 5931 }, { "epoch": 0.8077342047930284, "grad_norm": 44.681823524413026, "learning_rate": 8.676842908831545e-08, "logits/chosen": 14.910184860229492, "logits/rejected": 14.721660614013672, "logps/chosen": -4.861626625061035, "logps/rejected": -4.5478386878967285, "loss": 3.8169, "rewards/accuracies": 0.0, "rewards/chosen": -48.61626434326172, "rewards/margins": -3.137875556945801, "rewards/rejected": -45.478389739990234, "step": 5932 }, { "epoch": 0.8078703703703703, "grad_norm": 44.5788137759227, "learning_rate": 8.665021199866371e-08, "logits/chosen": 13.695148468017578, "logits/rejected": 13.822647094726562, "logps/chosen": -4.320206642150879, "logps/rejected": -4.383138656616211, "loss": 4.2314, "rewards/accuracies": 0.5, "rewards/chosen": -43.202064514160156, "rewards/margins": 0.6293230056762695, "rewards/rejected": -43.831390380859375, "step": 5933 }, { "epoch": 0.8080065359477124, "grad_norm": 41.52942128993249, "learning_rate": 8.653206571298688e-08, "logits/chosen": 14.264415740966797, "logits/rejected": 14.656353950500488, "logps/chosen": -4.555352687835693, "logps/rejected": -4.7813920974731445, "loss": 3.4061, "rewards/accuracies": 0.5, "rewards/chosen": -45.55352783203125, "rewards/margins": 2.2603893280029297, "rewards/rejected": -47.81391906738281, "step": 5934 }, { "epoch": 0.8081427015250545, "grad_norm": 38.167132262748446, "learning_rate": 8.64139902579808e-08, "logits/chosen": 14.273015975952148, "logits/rejected": 14.813937187194824, "logps/chosen": -4.713262557983398, "logps/rejected": -4.973721981048584, "loss": 3.8241, "rewards/accuracies": 1.0, "rewards/chosen": -47.132625579833984, "rewards/margins": 2.6045961380004883, "rewards/rejected": -49.737220764160156, "step": 5935 }, { "epoch": 0.8082788671023965, "grad_norm": 38.20651156020037, "learning_rate": 8.629598566032577e-08, "logits/chosen": 14.565279006958008, "logits/rejected": 14.525590896606445, "logps/chosen": -4.300577163696289, "logps/rejected": -4.512255668640137, "loss": 3.5535, "rewards/accuracies": 0.5, "rewards/chosen": -43.00577163696289, "rewards/margins": 2.116786003112793, "rewards/rejected": -45.12255859375, "step": 5936 }, { "epoch": 0.8084150326797386, "grad_norm": 40.03864756274502, "learning_rate": 8.617805194668597e-08, "logits/chosen": 13.924434661865234, "logits/rejected": 14.62696647644043, "logps/chosen": -4.586489200592041, "logps/rejected": -4.879673004150391, "loss": 3.8105, "rewards/accuracies": 0.75, "rewards/chosen": -45.86489486694336, "rewards/margins": 2.931839942932129, "rewards/rejected": -48.79673385620117, "step": 5937 }, { "epoch": 0.8085511982570807, "grad_norm": 39.88198609865187, "learning_rate": 8.606018914370933e-08, "logits/chosen": 15.071495056152344, "logits/rejected": 14.974924087524414, "logps/chosen": -4.976263999938965, "logps/rejected": -5.075257301330566, "loss": 3.8319, "rewards/accuracies": 0.25, "rewards/chosen": -49.76264190673828, "rewards/margins": 0.9899349212646484, "rewards/rejected": -50.7525749206543, "step": 5938 }, { "epoch": 0.8086873638344226, "grad_norm": 40.959109021689855, "learning_rate": 8.594239727802799e-08, "logits/chosen": 13.585075378417969, "logits/rejected": 14.305577278137207, "logps/chosen": -4.405613899230957, "logps/rejected": -4.458742141723633, "loss": 3.6199, "rewards/accuracies": 0.5, "rewards/chosen": -44.05614471435547, "rewards/margins": 0.5312767028808594, "rewards/rejected": -44.58741760253906, "step": 5939 }, { "epoch": 0.8088235294117647, "grad_norm": 43.313869463425256, "learning_rate": 8.582467637625814e-08, "logits/chosen": 15.060260772705078, "logits/rejected": 14.91296672821045, "logps/chosen": -4.489808082580566, "logps/rejected": -4.73790168762207, "loss": 4.1126, "rewards/accuracies": 0.75, "rewards/chosen": -44.8980827331543, "rewards/margins": 2.4809341430664062, "rewards/rejected": -47.37901306152344, "step": 5940 }, { "epoch": 0.8089596949891068, "grad_norm": 40.93410040514597, "learning_rate": 8.570702646499954e-08, "logits/chosen": 13.738775253295898, "logits/rejected": 13.418939590454102, "logps/chosen": -4.364017486572266, "logps/rejected": -4.486673355102539, "loss": 3.7599, "rewards/accuracies": 0.5, "rewards/chosen": -43.64017868041992, "rewards/margins": 1.226552963256836, "rewards/rejected": -44.866729736328125, "step": 5941 }, { "epoch": 0.8090958605664488, "grad_norm": 54.9890744513786, "learning_rate": 8.55894475708363e-08, "logits/chosen": 13.642179489135742, "logits/rejected": 14.590495109558105, "logps/chosen": -4.267358303070068, "logps/rejected": -4.598443031311035, "loss": 4.8214, "rewards/accuracies": 0.75, "rewards/chosen": -42.673583984375, "rewards/margins": 3.3108434677124023, "rewards/rejected": -45.98442840576172, "step": 5942 }, { "epoch": 0.8092320261437909, "grad_norm": 38.90725392622357, "learning_rate": 8.547193972033642e-08, "logits/chosen": 14.573885917663574, "logits/rejected": 15.516555786132812, "logps/chosen": -4.3824896812438965, "logps/rejected": -4.7457804679870605, "loss": 3.9164, "rewards/accuracies": 0.75, "rewards/chosen": -43.82489776611328, "rewards/margins": 3.632906913757324, "rewards/rejected": -47.45780563354492, "step": 5943 }, { "epoch": 0.809368191721133, "grad_norm": 38.168911455199684, "learning_rate": 8.535450294005153e-08, "logits/chosen": 15.358293533325195, "logits/rejected": 15.001398086547852, "logps/chosen": -4.673744201660156, "logps/rejected": -4.85722541809082, "loss": 4.1146, "rewards/accuracies": 0.75, "rewards/chosen": -46.73744201660156, "rewards/margins": 1.8348150253295898, "rewards/rejected": -48.57225799560547, "step": 5944 }, { "epoch": 0.8095043572984749, "grad_norm": 40.34460427906498, "learning_rate": 8.523713725651762e-08, "logits/chosen": 13.890295028686523, "logits/rejected": 14.66389274597168, "logps/chosen": -4.209802150726318, "logps/rejected": -4.341030120849609, "loss": 4.4538, "rewards/accuracies": 0.75, "rewards/chosen": -42.0980224609375, "rewards/margins": 1.3122787475585938, "rewards/rejected": -43.410301208496094, "step": 5945 }, { "epoch": 0.809640522875817, "grad_norm": 39.424298461826254, "learning_rate": 8.511984269625454e-08, "logits/chosen": 14.016956329345703, "logits/rejected": 14.532503128051758, "logps/chosen": -4.425013542175293, "logps/rejected": -4.671117782592773, "loss": 4.0148, "rewards/accuracies": 0.75, "rewards/chosen": -44.25013732910156, "rewards/margins": 2.4610424041748047, "rewards/rejected": -46.711181640625, "step": 5946 }, { "epoch": 0.8097766884531591, "grad_norm": 40.71316964845014, "learning_rate": 8.500261928576567e-08, "logits/chosen": 14.601615905761719, "logits/rejected": 14.428509712219238, "logps/chosen": -4.758155822753906, "logps/rejected": -4.3809733390808105, "loss": 4.0016, "rewards/accuracies": 0.5, "rewards/chosen": -47.58155822753906, "rewards/margins": -3.771824836730957, "rewards/rejected": -43.809730529785156, "step": 5947 }, { "epoch": 0.8099128540305011, "grad_norm": 59.09422651741242, "learning_rate": 8.488546705153879e-08, "logits/chosen": 13.724329948425293, "logits/rejected": 15.033513069152832, "logps/chosen": -4.634148120880127, "logps/rejected": -5.21290397644043, "loss": 3.8491, "rewards/accuracies": 1.0, "rewards/chosen": -46.34148406982422, "rewards/margins": 5.787555694580078, "rewards/rejected": -52.1290397644043, "step": 5948 }, { "epoch": 0.8100490196078431, "grad_norm": 39.25391352698822, "learning_rate": 8.476838602004553e-08, "logits/chosen": 14.688551902770996, "logits/rejected": 14.948799133300781, "logps/chosen": -4.8425374031066895, "logps/rejected": -4.732821464538574, "loss": 4.3251, "rewards/accuracies": 0.5, "rewards/chosen": -48.42537307739258, "rewards/margins": -1.0971603393554688, "rewards/rejected": -47.32821273803711, "step": 5949 }, { "epoch": 0.8101851851851852, "grad_norm": 37.95054128183307, "learning_rate": 8.465137621774104e-08, "logits/chosen": 14.270305633544922, "logits/rejected": 14.497447967529297, "logps/chosen": -4.515114784240723, "logps/rejected": -4.742190361022949, "loss": 3.8979, "rewards/accuracies": 0.75, "rewards/chosen": -45.15114974975586, "rewards/margins": 2.2707509994506836, "rewards/rejected": -47.42190170288086, "step": 5950 }, { "epoch": 0.8103213507625272, "grad_norm": 39.27207047215153, "learning_rate": 8.453443767106478e-08, "logits/chosen": 15.200324058532715, "logits/rejected": 15.295531272888184, "logps/chosen": -4.963079452514648, "logps/rejected": -4.956237316131592, "loss": 4.3582, "rewards/accuracies": 0.5, "rewards/chosen": -49.63079833984375, "rewards/margins": -0.06842041015625, "rewards/rejected": -49.5623779296875, "step": 5951 }, { "epoch": 0.8104575163398693, "grad_norm": 43.52304816480106, "learning_rate": 8.441757040644013e-08, "logits/chosen": 13.907913208007812, "logits/rejected": 13.42329216003418, "logps/chosen": -4.419384002685547, "logps/rejected": -4.159393310546875, "loss": 4.3081, "rewards/accuracies": 0.25, "rewards/chosen": -44.19384002685547, "rewards/margins": -2.599905014038086, "rewards/rejected": -41.593936920166016, "step": 5952 }, { "epoch": 0.8105936819172114, "grad_norm": 39.748771447335116, "learning_rate": 8.430077445027395e-08, "logits/chosen": 13.845823287963867, "logits/rejected": 14.749619483947754, "logps/chosen": -4.641053199768066, "logps/rejected": -5.031785011291504, "loss": 4.0156, "rewards/accuracies": 0.75, "rewards/chosen": -46.41053009033203, "rewards/margins": 3.9073171615600586, "rewards/rejected": -50.317848205566406, "step": 5953 }, { "epoch": 0.8107298474945533, "grad_norm": 39.80139587547837, "learning_rate": 8.41840498289574e-08, "logits/chosen": 14.836318969726562, "logits/rejected": 14.85396957397461, "logps/chosen": -4.617396831512451, "logps/rejected": -4.917135238647461, "loss": 3.4762, "rewards/accuracies": 0.75, "rewards/chosen": -46.17396545410156, "rewards/margins": 2.997385025024414, "rewards/rejected": -49.17135238647461, "step": 5954 }, { "epoch": 0.8108660130718954, "grad_norm": 42.536047051265314, "learning_rate": 8.406739656886541e-08, "logits/chosen": 14.19238567352295, "logits/rejected": 14.197776794433594, "logps/chosen": -4.451101303100586, "logps/rejected": -4.409175395965576, "loss": 4.2492, "rewards/accuracies": 0.75, "rewards/chosen": -44.511009216308594, "rewards/margins": -0.41925811767578125, "rewards/rejected": -44.09175109863281, "step": 5955 }, { "epoch": 0.8110021786492375, "grad_norm": 38.69962621509171, "learning_rate": 8.395081469635661e-08, "logits/chosen": 14.380526542663574, "logits/rejected": 13.313873291015625, "logps/chosen": -4.5262451171875, "logps/rejected": -4.367093086242676, "loss": 4.2917, "rewards/accuracies": 0.5, "rewards/chosen": -45.26244354248047, "rewards/margins": -1.591517448425293, "rewards/rejected": -43.670928955078125, "step": 5956 }, { "epoch": 0.8111383442265795, "grad_norm": 41.32309678030828, "learning_rate": 8.383430423777373e-08, "logits/chosen": 14.138900756835938, "logits/rejected": 14.633164405822754, "logps/chosen": -4.605594158172607, "logps/rejected": -4.779536247253418, "loss": 4.1561, "rewards/accuracies": 0.75, "rewards/chosen": -46.055938720703125, "rewards/margins": 1.7394189834594727, "rewards/rejected": -47.79535675048828, "step": 5957 }, { "epoch": 0.8112745098039216, "grad_norm": 41.419002518441324, "learning_rate": 8.371786521944338e-08, "logits/chosen": 14.00497055053711, "logits/rejected": 13.995765686035156, "logps/chosen": -4.56736421585083, "logps/rejected": -4.498154640197754, "loss": 3.761, "rewards/accuracies": 0.25, "rewards/chosen": -45.673641204833984, "rewards/margins": -0.6920976638793945, "rewards/rejected": -44.981544494628906, "step": 5958 }, { "epoch": 0.8114106753812637, "grad_norm": 42.621307371049305, "learning_rate": 8.360149766767568e-08, "logits/chosen": 14.58839225769043, "logits/rejected": 14.432904243469238, "logps/chosen": -4.53748893737793, "logps/rejected": -4.819978713989258, "loss": 3.8146, "rewards/accuracies": 0.75, "rewards/chosen": -45.37488555908203, "rewards/margins": 2.824901580810547, "rewards/rejected": -48.199790954589844, "step": 5959 }, { "epoch": 0.8115468409586056, "grad_norm": 39.142536966750356, "learning_rate": 8.348520160876496e-08, "logits/chosen": 14.075616836547852, "logits/rejected": 15.106019020080566, "logps/chosen": -4.640757083892822, "logps/rejected": -5.1974029541015625, "loss": 4.0892, "rewards/accuracies": 0.75, "rewards/chosen": -46.407569885253906, "rewards/margins": 5.566462516784668, "rewards/rejected": -51.974029541015625, "step": 5960 }, { "epoch": 0.8116830065359477, "grad_norm": 39.67993228288481, "learning_rate": 8.336897706898937e-08, "logits/chosen": 13.648653984069824, "logits/rejected": 14.67386245727539, "logps/chosen": -4.519143104553223, "logps/rejected": -4.636691093444824, "loss": 4.0268, "rewards/accuracies": 0.5, "rewards/chosen": -45.191429138183594, "rewards/margins": 1.1754817962646484, "rewards/rejected": -46.366912841796875, "step": 5961 }, { "epoch": 0.8118191721132898, "grad_norm": 43.45193599955311, "learning_rate": 8.32528240746106e-08, "logits/chosen": 14.625337600708008, "logits/rejected": 15.289390563964844, "logps/chosen": -4.710155487060547, "logps/rejected": -5.071971893310547, "loss": 4.2366, "rewards/accuracies": 0.75, "rewards/chosen": -47.10155487060547, "rewards/margins": 3.618166923522949, "rewards/rejected": -50.719722747802734, "step": 5962 }, { "epoch": 0.8119553376906318, "grad_norm": 42.92663767861539, "learning_rate": 8.31367426518745e-08, "logits/chosen": 14.02552318572998, "logits/rejected": 14.808690071105957, "logps/chosen": -3.99782395362854, "logps/rejected": -4.301850318908691, "loss": 3.7736, "rewards/accuracies": 0.75, "rewards/chosen": -39.978240966796875, "rewards/margins": 3.040262222290039, "rewards/rejected": -43.01850509643555, "step": 5963 }, { "epoch": 0.8120915032679739, "grad_norm": 42.49265988638519, "learning_rate": 8.302073282701072e-08, "logits/chosen": 14.126104354858398, "logits/rejected": 14.036066055297852, "logps/chosen": -4.386066436767578, "logps/rejected": -4.334537506103516, "loss": 4.1523, "rewards/accuracies": 0.5, "rewards/chosen": -43.86066436767578, "rewards/margins": -0.5152864456176758, "rewards/rejected": -43.345375061035156, "step": 5964 }, { "epoch": 0.8122276688453159, "grad_norm": 37.48199879869295, "learning_rate": 8.290479462623242e-08, "logits/chosen": 13.968345642089844, "logits/rejected": 14.923519134521484, "logps/chosen": -4.555505752563477, "logps/rejected": -4.607358932495117, "loss": 3.6458, "rewards/accuracies": 0.5, "rewards/chosen": -45.55506134033203, "rewards/margins": 0.5185298919677734, "rewards/rejected": -46.07359313964844, "step": 5965 }, { "epoch": 0.8123638344226579, "grad_norm": 40.72401787214701, "learning_rate": 8.278892807573691e-08, "logits/chosen": 14.48054313659668, "logits/rejected": 15.00457763671875, "logps/chosen": -4.472964763641357, "logps/rejected": -4.88226318359375, "loss": 3.8332, "rewards/accuracies": 1.0, "rewards/chosen": -44.72964859008789, "rewards/margins": 4.092981338500977, "rewards/rejected": -48.822628021240234, "step": 5966 }, { "epoch": 0.8125, "grad_norm": 39.502440585978235, "learning_rate": 8.26731332017053e-08, "logits/chosen": 14.006482124328613, "logits/rejected": 13.844148635864258, "logps/chosen": -4.667939186096191, "logps/rejected": -4.520122051239014, "loss": 3.9473, "rewards/accuracies": 0.5, "rewards/chosen": -46.67938995361328, "rewards/margins": -1.4781684875488281, "rewards/rejected": -45.20121765136719, "step": 5967 }, { "epoch": 0.8126361655773421, "grad_norm": 39.97706236221565, "learning_rate": 8.255741003030219e-08, "logits/chosen": 13.442566871643066, "logits/rejected": 14.125707626342773, "logps/chosen": -4.2375335693359375, "logps/rejected": -4.565403461456299, "loss": 3.6014, "rewards/accuracies": 1.0, "rewards/chosen": -42.375335693359375, "rewards/margins": 3.278697967529297, "rewards/rejected": -45.65403747558594, "step": 5968 }, { "epoch": 0.8127723311546841, "grad_norm": 38.39395284347786, "learning_rate": 8.24417585876763e-08, "logits/chosen": 14.477357864379883, "logits/rejected": 14.289912223815918, "logps/chosen": -4.719048500061035, "logps/rejected": -4.824642658233643, "loss": 4.187, "rewards/accuracies": 0.5, "rewards/chosen": -47.190486907958984, "rewards/margins": 1.0559415817260742, "rewards/rejected": -48.246429443359375, "step": 5969 }, { "epoch": 0.8129084967320261, "grad_norm": 41.56757475349332, "learning_rate": 8.232617889996012e-08, "logits/chosen": 13.82181453704834, "logits/rejected": 13.756048202514648, "logps/chosen": -4.297563076019287, "logps/rejected": -4.60530424118042, "loss": 3.6989, "rewards/accuracies": 0.75, "rewards/chosen": -42.97563171386719, "rewards/margins": 3.0774059295654297, "rewards/rejected": -46.05303955078125, "step": 5970 }, { "epoch": 0.8130446623093682, "grad_norm": 38.91711468336806, "learning_rate": 8.221067099326964e-08, "logits/chosen": 14.455221176147461, "logits/rejected": 14.992659568786621, "logps/chosen": -4.529034614562988, "logps/rejected": -4.720772743225098, "loss": 3.9075, "rewards/accuracies": 0.5, "rewards/chosen": -45.29034423828125, "rewards/margins": 1.9173851013183594, "rewards/rejected": -47.207725524902344, "step": 5971 }, { "epoch": 0.8131808278867102, "grad_norm": 40.41484196596743, "learning_rate": 8.209523489370491e-08, "logits/chosen": 14.99003791809082, "logits/rejected": 15.161992073059082, "logps/chosen": -4.753668308258057, "logps/rejected": -4.746298789978027, "loss": 3.9749, "rewards/accuracies": 0.5, "rewards/chosen": -47.536685943603516, "rewards/margins": -0.07369709014892578, "rewards/rejected": -47.46298599243164, "step": 5972 }, { "epoch": 0.8133169934640523, "grad_norm": 42.863366460165864, "learning_rate": 8.197987062734979e-08, "logits/chosen": 14.588125228881836, "logits/rejected": 13.865604400634766, "logps/chosen": -4.568199157714844, "logps/rejected": -4.261551856994629, "loss": 3.6759, "rewards/accuracies": 0.25, "rewards/chosen": -45.68199157714844, "rewards/margins": -3.066472053527832, "rewards/rejected": -42.615516662597656, "step": 5973 }, { "epoch": 0.8134531590413944, "grad_norm": 39.39356703737146, "learning_rate": 8.18645782202716e-08, "logits/chosen": 14.709318161010742, "logits/rejected": 14.88947868347168, "logps/chosen": -4.692794322967529, "logps/rejected": -4.994320869445801, "loss": 4.1284, "rewards/accuracies": 0.75, "rewards/chosen": -46.92794418334961, "rewards/margins": 3.015268325805664, "rewards/rejected": -49.94321060180664, "step": 5974 }, { "epoch": 0.8135893246187363, "grad_norm": 41.35502510370381, "learning_rate": 8.174935769852167e-08, "logits/chosen": 14.01414680480957, "logits/rejected": 13.559452056884766, "logps/chosen": -4.461628437042236, "logps/rejected": -4.543180465698242, "loss": 3.4559, "rewards/accuracies": 0.75, "rewards/chosen": -44.61628723144531, "rewards/margins": 0.815516471862793, "rewards/rejected": -45.43180465698242, "step": 5975 }, { "epoch": 0.8137254901960784, "grad_norm": 39.35951595529105, "learning_rate": 8.163420908813519e-08, "logits/chosen": 13.809406280517578, "logits/rejected": 13.13976001739502, "logps/chosen": -4.321087837219238, "logps/rejected": -4.218573093414307, "loss": 4.1447, "rewards/accuracies": 0.5, "rewards/chosen": -43.21087646484375, "rewards/margins": -1.0251455307006836, "rewards/rejected": -42.18572998046875, "step": 5976 }, { "epoch": 0.8138616557734205, "grad_norm": 41.70396136533112, "learning_rate": 8.151913241513067e-08, "logits/chosen": 14.445197105407715, "logits/rejected": 15.04287338256836, "logps/chosen": -4.504457473754883, "logps/rejected": -4.894477844238281, "loss": 3.7754, "rewards/accuracies": 1.0, "rewards/chosen": -45.04457092285156, "rewards/margins": 3.9002017974853516, "rewards/rejected": -48.94477844238281, "step": 5977 }, { "epoch": 0.8139978213507625, "grad_norm": 36.002746484524, "learning_rate": 8.140412770551078e-08, "logits/chosen": 14.899600982666016, "logits/rejected": 15.074695587158203, "logps/chosen": -5.011843204498291, "logps/rejected": -4.862305641174316, "loss": 3.6716, "rewards/accuracies": 0.5, "rewards/chosen": -50.118431091308594, "rewards/margins": -1.4953737258911133, "rewards/rejected": -48.62305450439453, "step": 5978 }, { "epoch": 0.8141339869281046, "grad_norm": 37.47735394265672, "learning_rate": 8.128919498526188e-08, "logits/chosen": 14.18497371673584, "logits/rejected": 14.40826416015625, "logps/chosen": -4.486652374267578, "logps/rejected": -4.522211074829102, "loss": 3.5547, "rewards/accuracies": 0.25, "rewards/chosen": -44.86652374267578, "rewards/margins": 0.3555908203125, "rewards/rejected": -45.22211456298828, "step": 5979 }, { "epoch": 0.8142701525054467, "grad_norm": 39.42916105247551, "learning_rate": 8.117433428035373e-08, "logits/chosen": 14.080976486206055, "logits/rejected": 14.256771087646484, "logps/chosen": -4.8810343742370605, "logps/rejected": -4.915968418121338, "loss": 3.9493, "rewards/accuracies": 0.5, "rewards/chosen": -48.810340881347656, "rewards/margins": 0.34934139251708984, "rewards/rejected": -49.15968322753906, "step": 5980 }, { "epoch": 0.8144063180827886, "grad_norm": 42.225547312054985, "learning_rate": 8.105954561674022e-08, "logits/chosen": 13.970691680908203, "logits/rejected": 14.726436614990234, "logps/chosen": -4.296528339385986, "logps/rejected": -4.902047157287598, "loss": 3.637, "rewards/accuracies": 1.0, "rewards/chosen": -42.96528625488281, "rewards/margins": 6.05518913269043, "rewards/rejected": -49.020469665527344, "step": 5981 }, { "epoch": 0.8145424836601307, "grad_norm": 37.35595463276281, "learning_rate": 8.094482902035884e-08, "logits/chosen": 14.033466339111328, "logits/rejected": 14.853551864624023, "logps/chosen": -4.571111679077148, "logps/rejected": -4.516030311584473, "loss": 3.7458, "rewards/accuracies": 0.5, "rewards/chosen": -45.711116790771484, "rewards/margins": -0.550811767578125, "rewards/rejected": -45.160301208496094, "step": 5982 }, { "epoch": 0.8146786492374728, "grad_norm": 47.058701467569904, "learning_rate": 8.083018451713064e-08, "logits/chosen": 14.616726875305176, "logits/rejected": 14.430315017700195, "logps/chosen": -4.604534149169922, "logps/rejected": -4.576606750488281, "loss": 4.6356, "rewards/accuracies": 0.5, "rewards/chosen": -46.04534149169922, "rewards/margins": -0.2792778015136719, "rewards/rejected": -45.76606369018555, "step": 5983 }, { "epoch": 0.8148148148148148, "grad_norm": 40.71159623163237, "learning_rate": 8.071561213296046e-08, "logits/chosen": 14.405717849731445, "logits/rejected": 13.954167366027832, "logps/chosen": -4.652739524841309, "logps/rejected": -4.442274570465088, "loss": 4.1329, "rewards/accuracies": 0.25, "rewards/chosen": -46.52739334106445, "rewards/margins": -2.104644775390625, "rewards/rejected": -44.42274856567383, "step": 5984 }, { "epoch": 0.8149509803921569, "grad_norm": 41.11399300919271, "learning_rate": 8.0601111893737e-08, "logits/chosen": 13.913833618164062, "logits/rejected": 15.100085258483887, "logps/chosen": -4.523806095123291, "logps/rejected": -4.941041946411133, "loss": 4.2144, "rewards/accuracies": 0.75, "rewards/chosen": -45.238059997558594, "rewards/margins": 4.172357559204102, "rewards/rejected": -49.41041564941406, "step": 5985 }, { "epoch": 0.8150871459694989, "grad_norm": 42.32518286821609, "learning_rate": 8.048668382533255e-08, "logits/chosen": 15.027626037597656, "logits/rejected": 15.266153335571289, "logps/chosen": -5.002243995666504, "logps/rejected": -5.434590816497803, "loss": 3.7192, "rewards/accuracies": 1.0, "rewards/chosen": -50.02244186401367, "rewards/margins": 4.3234663009643555, "rewards/rejected": -54.345909118652344, "step": 5986 }, { "epoch": 0.8152233115468409, "grad_norm": 39.6423507955909, "learning_rate": 8.037232795360296e-08, "logits/chosen": 14.341405868530273, "logits/rejected": 14.96290397644043, "logps/chosen": -4.646063804626465, "logps/rejected": -5.086950302124023, "loss": 3.8002, "rewards/accuracies": 0.75, "rewards/chosen": -46.46063995361328, "rewards/margins": 4.408862113952637, "rewards/rejected": -50.869503021240234, "step": 5987 }, { "epoch": 0.815359477124183, "grad_norm": 45.62692365112215, "learning_rate": 8.025804430438791e-08, "logits/chosen": 14.34824275970459, "logits/rejected": 14.066190719604492, "logps/chosen": -4.647307395935059, "logps/rejected": -4.657141208648682, "loss": 4.4485, "rewards/accuracies": 0.5, "rewards/chosen": -46.47307586669922, "rewards/margins": 0.09833717346191406, "rewards/rejected": -46.5714111328125, "step": 5988 }, { "epoch": 0.8154956427015251, "grad_norm": 44.67069489110868, "learning_rate": 8.014383290351086e-08, "logits/chosen": 13.898908615112305, "logits/rejected": 13.98951530456543, "logps/chosen": -4.478249549865723, "logps/rejected": -4.4642744064331055, "loss": 3.7264, "rewards/accuracies": 0.5, "rewards/chosen": -44.78249740600586, "rewards/margins": -0.13975238800048828, "rewards/rejected": -44.64274597167969, "step": 5989 }, { "epoch": 0.815631808278867, "grad_norm": 37.83986801723868, "learning_rate": 8.002969377677864e-08, "logits/chosen": 15.22791862487793, "logits/rejected": 15.105445861816406, "logps/chosen": -4.866739273071289, "logps/rejected": -4.930823802947998, "loss": 4.2934, "rewards/accuracies": 0.5, "rewards/chosen": -48.667388916015625, "rewards/margins": 0.6408491134643555, "rewards/rejected": -49.3082389831543, "step": 5990 }, { "epoch": 0.8157679738562091, "grad_norm": 46.16594308366116, "learning_rate": 7.991562694998197e-08, "logits/chosen": 13.300935745239258, "logits/rejected": 13.537652969360352, "logps/chosen": -4.128688812255859, "logps/rejected": -4.34090518951416, "loss": 3.7124, "rewards/accuracies": 0.75, "rewards/chosen": -41.28688430786133, "rewards/margins": 2.122163772583008, "rewards/rejected": -43.40904998779297, "step": 5991 }, { "epoch": 0.8159041394335512, "grad_norm": 39.342616222945196, "learning_rate": 7.980163244889527e-08, "logits/chosen": 14.394256591796875, "logits/rejected": 15.285100936889648, "logps/chosen": -4.384585380554199, "logps/rejected": -4.7795257568359375, "loss": 4.1074, "rewards/accuracies": 0.75, "rewards/chosen": -43.845855712890625, "rewards/margins": 3.949404716491699, "rewards/rejected": -47.795257568359375, "step": 5992 }, { "epoch": 0.8160403050108932, "grad_norm": 37.24742532391498, "learning_rate": 7.968771029927662e-08, "logits/chosen": 13.775382041931152, "logits/rejected": 14.016046524047852, "logps/chosen": -4.167253494262695, "logps/rejected": -4.498120307922363, "loss": 3.8618, "rewards/accuracies": 0.75, "rewards/chosen": -41.67253112792969, "rewards/margins": 3.3086700439453125, "rewards/rejected": -44.981201171875, "step": 5993 }, { "epoch": 0.8161764705882353, "grad_norm": 40.43348018371849, "learning_rate": 7.957386052686743e-08, "logits/chosen": 13.213447570800781, "logits/rejected": 14.130258560180664, "logps/chosen": -4.073032379150391, "logps/rejected": -4.427971363067627, "loss": 3.6286, "rewards/accuracies": 0.5, "rewards/chosen": -40.730323791503906, "rewards/margins": 3.549391746520996, "rewards/rejected": -44.27971267700195, "step": 5994 }, { "epoch": 0.8163126361655774, "grad_norm": 51.53770334467604, "learning_rate": 7.946008315739314e-08, "logits/chosen": 13.868278503417969, "logits/rejected": 14.528266906738281, "logps/chosen": -4.235813617706299, "logps/rejected": -4.657301902770996, "loss": 4.8453, "rewards/accuracies": 1.0, "rewards/chosen": -42.35813522338867, "rewards/margins": 4.2148847579956055, "rewards/rejected": -46.573020935058594, "step": 5995 }, { "epoch": 0.8164488017429193, "grad_norm": 40.37791025683695, "learning_rate": 7.934637821656274e-08, "logits/chosen": 14.105789184570312, "logits/rejected": 14.497702598571777, "logps/chosen": -4.525142192840576, "logps/rejected": -4.527972221374512, "loss": 3.9995, "rewards/accuracies": 0.25, "rewards/chosen": -45.25141906738281, "rewards/margins": 0.02830028533935547, "rewards/rejected": -45.279720306396484, "step": 5996 }, { "epoch": 0.8165849673202614, "grad_norm": 37.44154201089707, "learning_rate": 7.923274573006864e-08, "logits/chosen": 14.351537704467773, "logits/rejected": 14.339595794677734, "logps/chosen": -4.49985408782959, "logps/rejected": -4.813037872314453, "loss": 3.45, "rewards/accuracies": 1.0, "rewards/chosen": -44.998538970947266, "rewards/margins": 3.1318397521972656, "rewards/rejected": -48.13037872314453, "step": 5997 }, { "epoch": 0.8167211328976035, "grad_norm": 38.77137552600056, "learning_rate": 7.911918572358715e-08, "logits/chosen": 15.392932891845703, "logits/rejected": 15.069473266601562, "logps/chosen": -4.582574367523193, "logps/rejected": -4.66007137298584, "loss": 3.6746, "rewards/accuracies": 0.75, "rewards/chosen": -45.82574462890625, "rewards/margins": 0.774968147277832, "rewards/rejected": -46.60071563720703, "step": 5998 }, { "epoch": 0.8168572984749455, "grad_norm": 45.527567111576936, "learning_rate": 7.900569822277807e-08, "logits/chosen": 13.793088912963867, "logits/rejected": 14.95860481262207, "logps/chosen": -4.186483383178711, "logps/rejected": -4.5067338943481445, "loss": 4.1254, "rewards/accuracies": 0.5, "rewards/chosen": -41.86483383178711, "rewards/margins": 3.202507972717285, "rewards/rejected": -45.067344665527344, "step": 5999 }, { "epoch": 0.8169934640522876, "grad_norm": 39.7086072585647, "learning_rate": 7.889228325328496e-08, "logits/chosen": 13.312541961669922, "logits/rejected": 14.288604736328125, "logps/chosen": -4.176128387451172, "logps/rejected": -4.695748329162598, "loss": 3.9615, "rewards/accuracies": 0.75, "rewards/chosen": -41.76128005981445, "rewards/margins": 5.196202278137207, "rewards/rejected": -46.957481384277344, "step": 6000 }, { "epoch": 0.8171296296296297, "grad_norm": 42.84416408147112, "learning_rate": 7.877894084073462e-08, "logits/chosen": 14.21241569519043, "logits/rejected": 15.148483276367188, "logps/chosen": -4.63804817199707, "logps/rejected": -4.725125312805176, "loss": 4.7357, "rewards/accuracies": 0.5, "rewards/chosen": -46.3804817199707, "rewards/margins": 0.8707761764526367, "rewards/rejected": -47.251258850097656, "step": 6001 }, { "epoch": 0.8172657952069716, "grad_norm": 40.68527356803043, "learning_rate": 7.866567101073785e-08, "logits/chosen": 14.65075397491455, "logits/rejected": 14.127361297607422, "logps/chosen": -4.830407619476318, "logps/rejected": -4.78848934173584, "loss": 4.0365, "rewards/accuracies": 0.5, "rewards/chosen": -48.3040771484375, "rewards/margins": -0.41918182373046875, "rewards/rejected": -47.88489532470703, "step": 6002 }, { "epoch": 0.8174019607843137, "grad_norm": 41.44227141268145, "learning_rate": 7.8552473788889e-08, "logits/chosen": 14.529004096984863, "logits/rejected": 14.925827026367188, "logps/chosen": -4.819284915924072, "logps/rejected": -4.94648551940918, "loss": 4.0586, "rewards/accuracies": 0.75, "rewards/chosen": -48.192848205566406, "rewards/margins": 1.2720069885253906, "rewards/rejected": -49.4648551940918, "step": 6003 }, { "epoch": 0.8175381263616558, "grad_norm": 38.97238558530849, "learning_rate": 7.84393492007657e-08, "logits/chosen": 15.182727813720703, "logits/rejected": 15.260002136230469, "logps/chosen": -4.441224098205566, "logps/rejected": -4.7142229080200195, "loss": 3.8897, "rewards/accuracies": 1.0, "rewards/chosen": -44.41224670410156, "rewards/margins": 2.729985237121582, "rewards/rejected": -47.14222717285156, "step": 6004 }, { "epoch": 0.8176742919389978, "grad_norm": 40.77722180849458, "learning_rate": 7.83262972719295e-08, "logits/chosen": 13.922283172607422, "logits/rejected": 13.459244728088379, "logps/chosen": -4.328864574432373, "logps/rejected": -4.51921272277832, "loss": 4.5926, "rewards/accuracies": 0.5, "rewards/chosen": -43.28864288330078, "rewards/margins": 1.9034833908081055, "rewards/rejected": -45.1921272277832, "step": 6005 }, { "epoch": 0.8178104575163399, "grad_norm": 36.8196919522761, "learning_rate": 7.82133180279255e-08, "logits/chosen": 13.888526916503906, "logits/rejected": 14.603412628173828, "logps/chosen": -4.2383623123168945, "logps/rejected": -4.490116119384766, "loss": 3.4368, "rewards/accuracies": 0.75, "rewards/chosen": -42.383628845214844, "rewards/margins": 2.517533302307129, "rewards/rejected": -44.901161193847656, "step": 6006 }, { "epoch": 0.8179466230936819, "grad_norm": 45.09999256735687, "learning_rate": 7.810041149428213e-08, "logits/chosen": 14.70777702331543, "logits/rejected": 14.518636703491211, "logps/chosen": -4.847630977630615, "logps/rejected": -4.802252769470215, "loss": 4.1843, "rewards/accuracies": 0.25, "rewards/chosen": -48.47631072998047, "rewards/margins": -0.4537830352783203, "rewards/rejected": -48.02252960205078, "step": 6007 }, { "epoch": 0.818082788671024, "grad_norm": 39.49084739922532, "learning_rate": 7.798757769651159e-08, "logits/chosen": 14.043169021606445, "logits/rejected": 14.672694206237793, "logps/chosen": -4.298036098480225, "logps/rejected": -4.756779670715332, "loss": 4.2137, "rewards/accuracies": 0.75, "rewards/chosen": -42.98036575317383, "rewards/margins": 4.587434768676758, "rewards/rejected": -47.56779861450195, "step": 6008 }, { "epoch": 0.818218954248366, "grad_norm": 37.38726425743419, "learning_rate": 7.78748166601098e-08, "logits/chosen": 13.774232864379883, "logits/rejected": 14.196138381958008, "logps/chosen": -4.099627494812012, "logps/rejected": -4.488529205322266, "loss": 3.7554, "rewards/accuracies": 0.75, "rewards/chosen": -40.99627685546875, "rewards/margins": 3.8890199661254883, "rewards/rejected": -44.885292053222656, "step": 6009 }, { "epoch": 0.8183551198257081, "grad_norm": 41.67462302435928, "learning_rate": 7.776212841055576e-08, "logits/chosen": 14.527641296386719, "logits/rejected": 14.932121276855469, "logps/chosen": -4.476648807525635, "logps/rejected": -4.751904487609863, "loss": 3.9847, "rewards/accuracies": 0.75, "rewards/chosen": -44.7664909362793, "rewards/margins": 2.7525558471679688, "rewards/rejected": -47.51904296875, "step": 6010 }, { "epoch": 0.8184912854030502, "grad_norm": 39.64720346861866, "learning_rate": 7.764951297331248e-08, "logits/chosen": 13.923417091369629, "logits/rejected": 14.514671325683594, "logps/chosen": -4.346155166625977, "logps/rejected": -4.800126552581787, "loss": 3.7122, "rewards/accuracies": 0.75, "rewards/chosen": -43.46155548095703, "rewards/margins": 4.539711952209473, "rewards/rejected": -48.00126647949219, "step": 6011 }, { "epoch": 0.8186274509803921, "grad_norm": 40.90383538814268, "learning_rate": 7.753697037382641e-08, "logits/chosen": 14.497492790222168, "logits/rejected": 14.856223106384277, "logps/chosen": -4.5915069580078125, "logps/rejected": -4.876042366027832, "loss": 3.433, "rewards/accuracies": 0.75, "rewards/chosen": -45.915069580078125, "rewards/margins": 2.8453550338745117, "rewards/rejected": -48.76042175292969, "step": 6012 }, { "epoch": 0.8187636165577342, "grad_norm": 40.30233468732079, "learning_rate": 7.742450063752728e-08, "logits/chosen": 13.649055480957031, "logits/rejected": 14.532654762268066, "logps/chosen": -4.246085166931152, "logps/rejected": -4.5871734619140625, "loss": 3.8167, "rewards/accuracies": 0.5, "rewards/chosen": -42.460853576660156, "rewards/margins": 3.4108810424804688, "rewards/rejected": -45.871734619140625, "step": 6013 }, { "epoch": 0.8188997821350763, "grad_norm": 37.84183705078692, "learning_rate": 7.731210378982868e-08, "logits/chosen": 15.104113578796387, "logits/rejected": 15.233173370361328, "logps/chosen": -4.539424419403076, "logps/rejected": -4.681975364685059, "loss": 3.55, "rewards/accuracies": 0.75, "rewards/chosen": -45.39424133300781, "rewards/margins": 1.4255104064941406, "rewards/rejected": -46.81975173950195, "step": 6014 }, { "epoch": 0.8190359477124183, "grad_norm": 41.869400484523105, "learning_rate": 7.71997798561277e-08, "logits/chosen": 14.354220390319824, "logits/rejected": 14.560026168823242, "logps/chosen": -4.498408317565918, "logps/rejected": -5.034021377563477, "loss": 4.1282, "rewards/accuracies": 1.0, "rewards/chosen": -44.98408508300781, "rewards/margins": 5.3561296463012695, "rewards/rejected": -50.340213775634766, "step": 6015 }, { "epoch": 0.8191721132897604, "grad_norm": 37.82529577888831, "learning_rate": 7.708752886180465e-08, "logits/chosen": 13.403682708740234, "logits/rejected": 13.321876525878906, "logps/chosen": -4.138321399688721, "logps/rejected": -4.319758892059326, "loss": 3.3822, "rewards/accuracies": 0.5, "rewards/chosen": -41.383216857910156, "rewards/margins": 1.8143739700317383, "rewards/rejected": -43.19758987426758, "step": 6016 }, { "epoch": 0.8193082788671024, "grad_norm": 41.299530671489165, "learning_rate": 7.697535083222363e-08, "logits/chosen": 14.587408065795898, "logits/rejected": 13.723617553710938, "logps/chosen": -4.508700370788574, "logps/rejected": -4.379712104797363, "loss": 4.2294, "rewards/accuracies": 0.25, "rewards/chosen": -45.087005615234375, "rewards/margins": -1.289881706237793, "rewards/rejected": -43.797122955322266, "step": 6017 }, { "epoch": 0.8194444444444444, "grad_norm": 45.148239293173596, "learning_rate": 7.686324579273242e-08, "logits/chosen": 13.760384559631348, "logits/rejected": 14.680395126342773, "logps/chosen": -4.040334701538086, "logps/rejected": -4.405794143676758, "loss": 3.8365, "rewards/accuracies": 1.0, "rewards/chosen": -40.403350830078125, "rewards/margins": 3.654590606689453, "rewards/rejected": -44.05793762207031, "step": 6018 }, { "epoch": 0.8195806100217865, "grad_norm": 42.208550702314824, "learning_rate": 7.675121376866176e-08, "logits/chosen": 14.00645923614502, "logits/rejected": 14.804350852966309, "logps/chosen": -4.248836040496826, "logps/rejected": -4.825677871704102, "loss": 4.3342, "rewards/accuracies": 1.0, "rewards/chosen": -42.48836135864258, "rewards/margins": 5.768416404724121, "rewards/rejected": -48.256778717041016, "step": 6019 }, { "epoch": 0.8197167755991286, "grad_norm": 39.55780723380706, "learning_rate": 7.663925478532633e-08, "logits/chosen": 13.28636646270752, "logits/rejected": 14.549040794372559, "logps/chosen": -4.185599327087402, "logps/rejected": -4.7709150314331055, "loss": 4.1734, "rewards/accuracies": 1.0, "rewards/chosen": -41.855995178222656, "rewards/margins": 5.853155136108398, "rewards/rejected": -47.709144592285156, "step": 6020 }, { "epoch": 0.8198529411764706, "grad_norm": 44.37264641692493, "learning_rate": 7.652736886802431e-08, "logits/chosen": 14.862899780273438, "logits/rejected": 14.767428398132324, "logps/chosen": -4.800068378448486, "logps/rejected": -4.765845775604248, "loss": 3.8437, "rewards/accuracies": 0.5, "rewards/chosen": -48.00068664550781, "rewards/margins": -0.3422269821166992, "rewards/rejected": -47.65845489501953, "step": 6021 }, { "epoch": 0.8199891067538126, "grad_norm": 35.820957050594956, "learning_rate": 7.641555604203707e-08, "logits/chosen": 13.720703125, "logits/rejected": 15.157005310058594, "logps/chosen": -4.502200603485107, "logps/rejected": -4.699634552001953, "loss": 3.7267, "rewards/accuracies": 0.5, "rewards/chosen": -45.022003173828125, "rewards/margins": 1.9743413925170898, "rewards/rejected": -46.99634552001953, "step": 6022 }, { "epoch": 0.8201252723311547, "grad_norm": 43.14914018406039, "learning_rate": 7.630381633262972e-08, "logits/chosen": 13.94264030456543, "logits/rejected": 14.842921257019043, "logps/chosen": -4.264797210693359, "logps/rejected": -4.774679183959961, "loss": 4.0824, "rewards/accuracies": 0.75, "rewards/chosen": -42.647972106933594, "rewards/margins": 5.098819732666016, "rewards/rejected": -47.74679183959961, "step": 6023 }, { "epoch": 0.8202614379084967, "grad_norm": 38.01295637237164, "learning_rate": 7.619214976505089e-08, "logits/chosen": 14.180292129516602, "logits/rejected": 14.760187149047852, "logps/chosen": -4.580408096313477, "logps/rejected": -4.951174736022949, "loss": 3.6712, "rewards/accuracies": 0.5, "rewards/chosen": -45.80408477783203, "rewards/margins": 3.70766544342041, "rewards/rejected": -49.511749267578125, "step": 6024 }, { "epoch": 0.8203976034858388, "grad_norm": 39.420328184255794, "learning_rate": 7.608055636453228e-08, "logits/chosen": 13.734947204589844, "logits/rejected": 13.956835746765137, "logps/chosen": -4.356785774230957, "logps/rejected": -4.7616376876831055, "loss": 4.1174, "rewards/accuracies": 1.0, "rewards/chosen": -43.56785583496094, "rewards/margins": 4.048520088195801, "rewards/rejected": -47.61637878417969, "step": 6025 }, { "epoch": 0.8205337690631809, "grad_norm": 40.6878508493158, "learning_rate": 7.596903615628955e-08, "logits/chosen": 14.094951629638672, "logits/rejected": 15.34115219116211, "logps/chosen": -4.436038017272949, "logps/rejected": -4.957910537719727, "loss": 3.9797, "rewards/accuracies": 0.75, "rewards/chosen": -44.360382080078125, "rewards/margins": 5.218725204467773, "rewards/rejected": -49.57910919189453, "step": 6026 }, { "epoch": 0.8206699346405228, "grad_norm": 38.55172053224083, "learning_rate": 7.585758916552167e-08, "logits/chosen": 14.326987266540527, "logits/rejected": 14.182968139648438, "logps/chosen": -4.309831619262695, "logps/rejected": -4.456246376037598, "loss": 3.9367, "rewards/accuracies": 0.5, "rewards/chosen": -43.09831237792969, "rewards/margins": 1.4641494750976562, "rewards/rejected": -44.56246566772461, "step": 6027 }, { "epoch": 0.8208061002178649, "grad_norm": 43.892139808241495, "learning_rate": 7.57462154174108e-08, "logits/chosen": 12.973543167114258, "logits/rejected": 13.643218994140625, "logps/chosen": -4.181532859802246, "logps/rejected": -4.392844200134277, "loss": 4.2176, "rewards/accuracies": 0.75, "rewards/chosen": -41.81532669067383, "rewards/margins": 2.113117218017578, "rewards/rejected": -43.928443908691406, "step": 6028 }, { "epoch": 0.820942265795207, "grad_norm": 42.98742633791599, "learning_rate": 7.563491493712284e-08, "logits/chosen": 14.538965225219727, "logits/rejected": 14.696380615234375, "logps/chosen": -4.671580791473389, "logps/rejected": -4.6215314865112305, "loss": 4.3134, "rewards/accuracies": 0.5, "rewards/chosen": -46.7158088684082, "rewards/margins": -0.5004920959472656, "rewards/rejected": -46.21531677246094, "step": 6029 }, { "epoch": 0.821078431372549, "grad_norm": 42.792363734210795, "learning_rate": 7.55236877498072e-08, "logits/chosen": 14.236913681030273, "logits/rejected": 14.832027435302734, "logps/chosen": -4.381623268127441, "logps/rejected": -4.66511344909668, "loss": 4.1069, "rewards/accuracies": 0.5, "rewards/chosen": -43.81623458862305, "rewards/margins": 2.8348989486694336, "rewards/rejected": -46.65113067626953, "step": 6030 }, { "epoch": 0.8212145969498911, "grad_norm": 42.60488475463362, "learning_rate": 7.541253388059634e-08, "logits/chosen": 14.829753875732422, "logits/rejected": 14.456899642944336, "logps/chosen": -4.776880741119385, "logps/rejected": -4.58027458190918, "loss": 3.7884, "rewards/accuracies": 0.25, "rewards/chosen": -47.76880645751953, "rewards/margins": -1.9660625457763672, "rewards/rejected": -45.8027458190918, "step": 6031 }, { "epoch": 0.8213507625272332, "grad_norm": 40.67034811736415, "learning_rate": 7.530145335460654e-08, "logits/chosen": 14.48868179321289, "logits/rejected": 15.850229263305664, "logps/chosen": -4.392571449279785, "logps/rejected": -5.050787448883057, "loss": 3.8525, "rewards/accuracies": 1.0, "rewards/chosen": -43.92571258544922, "rewards/margins": 6.5821638107299805, "rewards/rejected": -50.50787353515625, "step": 6032 }, { "epoch": 0.8214869281045751, "grad_norm": 42.890476420547955, "learning_rate": 7.519044619693744e-08, "logits/chosen": 14.490255355834961, "logits/rejected": 14.099477767944336, "logps/chosen": -4.651735305786133, "logps/rejected": -4.536527633666992, "loss": 4.5203, "rewards/accuracies": 0.25, "rewards/chosen": -46.517356872558594, "rewards/margins": -1.1520757675170898, "rewards/rejected": -45.36527633666992, "step": 6033 }, { "epoch": 0.8216230936819172, "grad_norm": 40.06238697374678, "learning_rate": 7.507951243267183e-08, "logits/chosen": 13.968732833862305, "logits/rejected": 13.863668441772461, "logps/chosen": -4.4317426681518555, "logps/rejected": -4.556494235992432, "loss": 4.1649, "rewards/accuracies": 0.75, "rewards/chosen": -44.31742477416992, "rewards/margins": 1.2475175857543945, "rewards/rejected": -45.56494140625, "step": 6034 }, { "epoch": 0.8217592592592593, "grad_norm": 41.58586539423261, "learning_rate": 7.49686520868762e-08, "logits/chosen": 14.617986679077148, "logits/rejected": 14.889283180236816, "logps/chosen": -4.61223030090332, "logps/rejected": -4.665738582611084, "loss": 4.312, "rewards/accuracies": 0.5, "rewards/chosen": -46.1223030090332, "rewards/margins": 0.5350809097290039, "rewards/rejected": -46.65738296508789, "step": 6035 }, { "epoch": 0.8218954248366013, "grad_norm": 38.98389365935182, "learning_rate": 7.485786518460045e-08, "logits/chosen": 14.428482055664062, "logits/rejected": 14.809194564819336, "logps/chosen": -4.68992805480957, "logps/rejected": -4.982572555541992, "loss": 3.5075, "rewards/accuracies": 0.75, "rewards/chosen": -46.89927673339844, "rewards/margins": 2.9264450073242188, "rewards/rejected": -49.825721740722656, "step": 6036 }, { "epoch": 0.8220315904139434, "grad_norm": 36.12857559552748, "learning_rate": 7.474715175087763e-08, "logits/chosen": 13.700933456420898, "logits/rejected": 15.094429016113281, "logps/chosen": -4.3954057693481445, "logps/rejected": -4.76460075378418, "loss": 3.7239, "rewards/accuracies": 0.75, "rewards/chosen": -43.95405578613281, "rewards/margins": 3.691946029663086, "rewards/rejected": -47.64600372314453, "step": 6037 }, { "epoch": 0.8221677559912854, "grad_norm": 39.77657348436124, "learning_rate": 7.463651181072444e-08, "logits/chosen": 13.901912689208984, "logits/rejected": 14.38042163848877, "logps/chosen": -4.383412837982178, "logps/rejected": -4.752869606018066, "loss": 3.8457, "rewards/accuracies": 0.75, "rewards/chosen": -43.834129333496094, "rewards/margins": 3.6945648193359375, "rewards/rejected": -47.52869415283203, "step": 6038 }, { "epoch": 0.8223039215686274, "grad_norm": 41.25046943462297, "learning_rate": 7.4525945389141e-08, "logits/chosen": 13.642072677612305, "logits/rejected": 13.95231819152832, "logps/chosen": -4.622012615203857, "logps/rejected": -4.602715492248535, "loss": 3.8579, "rewards/accuracies": 0.5, "rewards/chosen": -46.22012710571289, "rewards/margins": -0.19297027587890625, "rewards/rejected": -46.02715301513672, "step": 6039 }, { "epoch": 0.8224400871459695, "grad_norm": 44.45644247315561, "learning_rate": 7.441545251111047e-08, "logits/chosen": 14.169889450073242, "logits/rejected": 14.953592300415039, "logps/chosen": -4.692682266235352, "logps/rejected": -4.68890380859375, "loss": 4.1269, "rewards/accuracies": 0.75, "rewards/chosen": -46.926822662353516, "rewards/margins": -0.037784576416015625, "rewards/rejected": -46.8890380859375, "step": 6040 }, { "epoch": 0.8225762527233116, "grad_norm": 41.33819260670901, "learning_rate": 7.430503320159975e-08, "logits/chosen": 14.512789726257324, "logits/rejected": 15.010868072509766, "logps/chosen": -4.672266006469727, "logps/rejected": -4.976963996887207, "loss": 3.9977, "rewards/accuracies": 0.5, "rewards/chosen": -46.722660064697266, "rewards/margins": 3.0469770431518555, "rewards/rejected": -49.76963806152344, "step": 6041 }, { "epoch": 0.8227124183006536, "grad_norm": 38.97483506026432, "learning_rate": 7.419468748555915e-08, "logits/chosen": 14.723848342895508, "logits/rejected": 15.457330703735352, "logps/chosen": -4.706051349639893, "logps/rejected": -5.184124946594238, "loss": 3.8592, "rewards/accuracies": 0.75, "rewards/chosen": -47.060516357421875, "rewards/margins": 4.780730247497559, "rewards/rejected": -51.84124755859375, "step": 6042 }, { "epoch": 0.8228485838779956, "grad_norm": 39.777421664435224, "learning_rate": 7.408441538792187e-08, "logits/chosen": 13.797050476074219, "logits/rejected": 13.77688217163086, "logps/chosen": -4.33291482925415, "logps/rejected": -4.585965156555176, "loss": 3.6713, "rewards/accuracies": 1.0, "rewards/chosen": -43.32914733886719, "rewards/margins": 2.530501365661621, "rewards/rejected": -45.859649658203125, "step": 6043 }, { "epoch": 0.8229847494553377, "grad_norm": 39.19536825226281, "learning_rate": 7.397421693360506e-08, "logits/chosen": 14.190515518188477, "logits/rejected": 14.768600463867188, "logps/chosen": -4.296804428100586, "logps/rejected": -4.6865715980529785, "loss": 3.4448, "rewards/accuracies": 0.75, "rewards/chosen": -42.96804428100586, "rewards/margins": 3.8976736068725586, "rewards/rejected": -46.86571502685547, "step": 6044 }, { "epoch": 0.8231209150326797, "grad_norm": 40.55619799491416, "learning_rate": 7.386409214750893e-08, "logits/chosen": 14.436704635620117, "logits/rejected": 15.049564361572266, "logps/chosen": -4.373745918273926, "logps/rejected": -5.028958320617676, "loss": 3.8709, "rewards/accuracies": 1.0, "rewards/chosen": -43.73746109008789, "rewards/margins": 6.552123069763184, "rewards/rejected": -50.289581298828125, "step": 6045 }, { "epoch": 0.8232570806100218, "grad_norm": 43.8288917732911, "learning_rate": 7.375404105451699e-08, "logits/chosen": 14.19094467163086, "logits/rejected": 14.12060546875, "logps/chosen": -4.609737873077393, "logps/rejected": -4.720578193664551, "loss": 4.4131, "rewards/accuracies": 0.75, "rewards/chosen": -46.09737777709961, "rewards/margins": 1.1084051132202148, "rewards/rejected": -47.205780029296875, "step": 6046 }, { "epoch": 0.8233932461873639, "grad_norm": 36.264605527825104, "learning_rate": 7.364406367949621e-08, "logits/chosen": 14.587498664855957, "logits/rejected": 14.168087005615234, "logps/chosen": -4.745700836181641, "logps/rejected": -4.8435821533203125, "loss": 3.5591, "rewards/accuracies": 0.75, "rewards/chosen": -47.457008361816406, "rewards/margins": 0.9788122177124023, "rewards/rejected": -48.435821533203125, "step": 6047 }, { "epoch": 0.8235294117647058, "grad_norm": 39.18062200608471, "learning_rate": 7.353416004729705e-08, "logits/chosen": 13.910175323486328, "logits/rejected": 13.795553207397461, "logps/chosen": -4.389157295227051, "logps/rejected": -4.447350025177002, "loss": 3.7641, "rewards/accuracies": 0.25, "rewards/chosen": -43.891571044921875, "rewards/margins": 0.5819282531738281, "rewards/rejected": -44.4734992980957, "step": 6048 }, { "epoch": 0.8236655773420479, "grad_norm": 42.163829542705194, "learning_rate": 7.342433018275289e-08, "logits/chosen": 14.539979934692383, "logits/rejected": 14.612295150756836, "logps/chosen": -4.5252180099487305, "logps/rejected": -4.813600063323975, "loss": 3.7983, "rewards/accuracies": 0.75, "rewards/chosen": -45.25218200683594, "rewards/margins": 2.883817672729492, "rewards/rejected": -48.13600158691406, "step": 6049 }, { "epoch": 0.82380174291939, "grad_norm": 37.166195884237496, "learning_rate": 7.331457411068088e-08, "logits/chosen": 13.957697868347168, "logits/rejected": 14.299046516418457, "logps/chosen": -4.417788505554199, "logps/rejected": -4.654058933258057, "loss": 4.1422, "rewards/accuracies": 0.75, "rewards/chosen": -44.177879333496094, "rewards/margins": 2.3627071380615234, "rewards/rejected": -46.54058837890625, "step": 6050 }, { "epoch": 0.823937908496732, "grad_norm": 38.54861778160979, "learning_rate": 7.320489185588132e-08, "logits/chosen": 14.177787780761719, "logits/rejected": 13.58263874053955, "logps/chosen": -4.572007179260254, "logps/rejected": -4.531595230102539, "loss": 3.6926, "rewards/accuracies": 0.5, "rewards/chosen": -45.72007751464844, "rewards/margins": -0.40412330627441406, "rewards/rejected": -45.315948486328125, "step": 6051 }, { "epoch": 0.8240740740740741, "grad_norm": 38.46426123282261, "learning_rate": 7.309528344313766e-08, "logits/chosen": 13.940523147583008, "logits/rejected": 14.691329956054688, "logps/chosen": -4.559811592102051, "logps/rejected": -4.7814106941223145, "loss": 4.2127, "rewards/accuracies": 0.75, "rewards/chosen": -45.598114013671875, "rewards/margins": 2.2159910202026367, "rewards/rejected": -47.81410598754883, "step": 6052 }, { "epoch": 0.8242102396514162, "grad_norm": 38.05176965795897, "learning_rate": 7.298574889721694e-08, "logits/chosen": 14.417356491088867, "logits/rejected": 14.971731185913086, "logps/chosen": -4.423007011413574, "logps/rejected": -4.9835686683654785, "loss": 3.592, "rewards/accuracies": 1.0, "rewards/chosen": -44.230072021484375, "rewards/margins": 5.605617523193359, "rewards/rejected": -49.83568572998047, "step": 6053 }, { "epoch": 0.8243464052287581, "grad_norm": 37.033503860806164, "learning_rate": 7.287628824286951e-08, "logits/chosen": 13.865543365478516, "logits/rejected": 14.304594039916992, "logps/chosen": -4.413761138916016, "logps/rejected": -4.481498718261719, "loss": 4.0815, "rewards/accuracies": 0.75, "rewards/chosen": -44.137611389160156, "rewards/margins": 0.6773738861083984, "rewards/rejected": -44.81498718261719, "step": 6054 }, { "epoch": 0.8244825708061002, "grad_norm": 38.56911520104676, "learning_rate": 7.276690150482862e-08, "logits/chosen": 14.020017623901367, "logits/rejected": 14.25662612915039, "logps/chosen": -4.535367965698242, "logps/rejected": -4.5906524658203125, "loss": 4.0919, "rewards/accuracies": 0.25, "rewards/chosen": -45.353675842285156, "rewards/margins": 0.5528469085693359, "rewards/rejected": -45.906524658203125, "step": 6055 }, { "epoch": 0.8246187363834423, "grad_norm": 38.19646418764338, "learning_rate": 7.265758870781132e-08, "logits/chosen": 14.823880195617676, "logits/rejected": 14.873763084411621, "logps/chosen": -4.848873138427734, "logps/rejected": -4.657222747802734, "loss": 3.9068, "rewards/accuracies": 0.25, "rewards/chosen": -48.488731384277344, "rewards/margins": -1.9165029525756836, "rewards/rejected": -46.572227478027344, "step": 6056 }, { "epoch": 0.8247549019607843, "grad_norm": 42.41969740889559, "learning_rate": 7.254834987651781e-08, "logits/chosen": 15.266988754272461, "logits/rejected": 14.674005508422852, "logps/chosen": -5.024270057678223, "logps/rejected": -4.6833176612854, "loss": 3.8537, "rewards/accuracies": 0.25, "rewards/chosen": -50.242698669433594, "rewards/margins": -3.4095230102539062, "rewards/rejected": -46.83317565917969, "step": 6057 }, { "epoch": 0.8248910675381264, "grad_norm": 40.921519769854655, "learning_rate": 7.243918503563122e-08, "logits/chosen": 14.441866874694824, "logits/rejected": 14.136096954345703, "logps/chosen": -4.946114540100098, "logps/rejected": -4.81185245513916, "loss": 4.1872, "rewards/accuracies": 0.5, "rewards/chosen": -49.461143493652344, "rewards/margins": -1.342616081237793, "rewards/rejected": -48.1185302734375, "step": 6058 }, { "epoch": 0.8250272331154684, "grad_norm": 37.726231791399236, "learning_rate": 7.233009420981849e-08, "logits/chosen": 14.233108520507812, "logits/rejected": 14.999802589416504, "logps/chosen": -4.610074996948242, "logps/rejected": -4.808406352996826, "loss": 3.8408, "rewards/accuracies": 0.75, "rewards/chosen": -46.10074996948242, "rewards/margins": 1.9833135604858398, "rewards/rejected": -48.08406448364258, "step": 6059 }, { "epoch": 0.8251633986928104, "grad_norm": 40.721787061834, "learning_rate": 7.222107742372957e-08, "logits/chosen": 13.760741233825684, "logits/rejected": 14.349794387817383, "logps/chosen": -4.43243408203125, "logps/rejected": -4.537227630615234, "loss": 4.202, "rewards/accuracies": 0.5, "rewards/chosen": -44.3243408203125, "rewards/margins": 1.047933578491211, "rewards/rejected": -45.372276306152344, "step": 6060 }, { "epoch": 0.8252995642701525, "grad_norm": 35.7231043780971, "learning_rate": 7.211213470199755e-08, "logits/chosen": 14.094461441040039, "logits/rejected": 14.31088638305664, "logps/chosen": -4.6793012619018555, "logps/rejected": -4.5874481201171875, "loss": 3.6418, "rewards/accuracies": 0.5, "rewards/chosen": -46.79301834106445, "rewards/margins": -0.9185371398925781, "rewards/rejected": -45.874481201171875, "step": 6061 }, { "epoch": 0.8254357298474946, "grad_norm": 39.76292417736645, "learning_rate": 7.200326606923908e-08, "logits/chosen": 13.739508628845215, "logits/rejected": 14.893240928649902, "logps/chosen": -4.333333969116211, "logps/rejected": -4.940118789672852, "loss": 3.7525, "rewards/accuracies": 1.0, "rewards/chosen": -43.333343505859375, "rewards/margins": 6.067842483520508, "rewards/rejected": -49.40118408203125, "step": 6062 }, { "epoch": 0.8255718954248366, "grad_norm": 37.449541393484104, "learning_rate": 7.189447155005397e-08, "logits/chosen": 14.251962661743164, "logits/rejected": 14.382091522216797, "logps/chosen": -4.344602584838867, "logps/rejected": -4.549226760864258, "loss": 3.7595, "rewards/accuracies": 0.5, "rewards/chosen": -43.446022033691406, "rewards/margins": 2.04624080657959, "rewards/rejected": -45.49226379394531, "step": 6063 }, { "epoch": 0.8257080610021786, "grad_norm": 41.421918294314544, "learning_rate": 7.178575116902506e-08, "logits/chosen": 14.838671684265137, "logits/rejected": 14.477760314941406, "logps/chosen": -4.598201751708984, "logps/rejected": -4.628789901733398, "loss": 3.4721, "rewards/accuracies": 0.5, "rewards/chosen": -45.982017517089844, "rewards/margins": 0.30588340759277344, "rewards/rejected": -46.28790283203125, "step": 6064 }, { "epoch": 0.8258442265795207, "grad_norm": 36.268756010251934, "learning_rate": 7.167710495071872e-08, "logits/chosen": 13.901817321777344, "logits/rejected": 14.308311462402344, "logps/chosen": -4.511014938354492, "logps/rejected": -4.507732391357422, "loss": 3.5426, "rewards/accuracies": 0.25, "rewards/chosen": -45.11014938354492, "rewards/margins": -0.03282928466796875, "rewards/rejected": -45.07732009887695, "step": 6065 }, { "epoch": 0.8259803921568627, "grad_norm": 39.46927697597172, "learning_rate": 7.156853291968458e-08, "logits/chosen": 13.99966812133789, "logits/rejected": 14.048482894897461, "logps/chosen": -4.423985481262207, "logps/rejected": -4.377402305603027, "loss": 3.8769, "rewards/accuracies": 0.75, "rewards/chosen": -44.23985290527344, "rewards/margins": -0.46582698822021484, "rewards/rejected": -43.774024963378906, "step": 6066 }, { "epoch": 0.8261165577342048, "grad_norm": 41.849915946483996, "learning_rate": 7.146003510045516e-08, "logits/chosen": 14.712206840515137, "logits/rejected": 14.81979751586914, "logps/chosen": -4.471068382263184, "logps/rejected": -4.576263427734375, "loss": 4.0703, "rewards/accuracies": 0.75, "rewards/chosen": -44.7106819152832, "rewards/margins": 1.0519514083862305, "rewards/rejected": -45.76263427734375, "step": 6067 }, { "epoch": 0.8262527233115469, "grad_norm": 38.28287099250866, "learning_rate": 7.135161151754654e-08, "logits/chosen": 14.2711181640625, "logits/rejected": 14.304998397827148, "logps/chosen": -4.531559467315674, "logps/rejected": -4.5520339012146, "loss": 4.2696, "rewards/accuracies": 0.5, "rewards/chosen": -45.31559371948242, "rewards/margins": 0.2047433853149414, "rewards/rejected": -45.52033996582031, "step": 6068 }, { "epoch": 0.8263888888888888, "grad_norm": 42.86161116679106, "learning_rate": 7.124326219545804e-08, "logits/chosen": 14.595911026000977, "logits/rejected": 14.83761215209961, "logps/chosen": -4.441425323486328, "logps/rejected": -4.513574600219727, "loss": 4.0588, "rewards/accuracies": 0.5, "rewards/chosen": -44.41425323486328, "rewards/margins": 0.7214908599853516, "rewards/rejected": -45.1357421875, "step": 6069 }, { "epoch": 0.8265250544662309, "grad_norm": 40.740306594596674, "learning_rate": 7.113498715867185e-08, "logits/chosen": 13.849004745483398, "logits/rejected": 14.19088363647461, "logps/chosen": -4.49882698059082, "logps/rejected": -4.755785942077637, "loss": 3.8346, "rewards/accuracies": 0.5, "rewards/chosen": -44.98826599121094, "rewards/margins": 2.569594383239746, "rewards/rejected": -47.557861328125, "step": 6070 }, { "epoch": 0.826661220043573, "grad_norm": 40.659656513605995, "learning_rate": 7.102678643165378e-08, "logits/chosen": 14.8900146484375, "logits/rejected": 15.178922653198242, "logps/chosen": -4.790339469909668, "logps/rejected": -5.044641971588135, "loss": 3.6308, "rewards/accuracies": 1.0, "rewards/chosen": -47.90339279174805, "rewards/margins": 2.5430259704589844, "rewards/rejected": -50.44641876220703, "step": 6071 }, { "epoch": 0.826797385620915, "grad_norm": 38.62002207136447, "learning_rate": 7.091866003885271e-08, "logits/chosen": 15.210567474365234, "logits/rejected": 14.67914867401123, "logps/chosen": -4.842087268829346, "logps/rejected": -4.590938091278076, "loss": 4.3151, "rewards/accuracies": 0.25, "rewards/chosen": -48.420867919921875, "rewards/margins": -2.511490821838379, "rewards/rejected": -45.90937805175781, "step": 6072 }, { "epoch": 0.8269335511982571, "grad_norm": 73.59826017903893, "learning_rate": 7.08106080047005e-08, "logits/chosen": 14.156423568725586, "logits/rejected": 14.884260177612305, "logps/chosen": -4.4311981201171875, "logps/rejected": -4.862832546234131, "loss": 3.5269, "rewards/accuracies": 0.75, "rewards/chosen": -44.311981201171875, "rewards/margins": 4.316341400146484, "rewards/rejected": -48.62832260131836, "step": 6073 }, { "epoch": 0.8270697167755992, "grad_norm": 38.58882584242164, "learning_rate": 7.070263035361254e-08, "logits/chosen": 13.883543014526367, "logits/rejected": 14.491338729858398, "logps/chosen": -4.581623077392578, "logps/rejected": -4.79247522354126, "loss": 4.1403, "rewards/accuracies": 0.75, "rewards/chosen": -45.81623077392578, "rewards/margins": 2.1085243225097656, "rewards/rejected": -47.92475128173828, "step": 6074 }, { "epoch": 0.8272058823529411, "grad_norm": 38.23265165719738, "learning_rate": 7.059472710998737e-08, "logits/chosen": 14.050077438354492, "logits/rejected": 14.061813354492188, "logps/chosen": -4.497598171234131, "logps/rejected": -4.554957866668701, "loss": 4.109, "rewards/accuracies": 0.25, "rewards/chosen": -44.975982666015625, "rewards/margins": 0.5735969543457031, "rewards/rejected": -45.54957580566406, "step": 6075 }, { "epoch": 0.8273420479302832, "grad_norm": 43.286006255749136, "learning_rate": 7.04868982982064e-08, "logits/chosen": 14.356332778930664, "logits/rejected": 14.884689331054688, "logps/chosen": -4.695705413818359, "logps/rejected": -4.871602535247803, "loss": 4.3218, "rewards/accuracies": 0.75, "rewards/chosen": -46.95705795288086, "rewards/margins": 1.758967399597168, "rewards/rejected": -48.716026306152344, "step": 6076 }, { "epoch": 0.8274782135076253, "grad_norm": 41.277221170860514, "learning_rate": 7.037914394263449e-08, "logits/chosen": 14.987344741821289, "logits/rejected": 15.231377601623535, "logps/chosen": -4.6642255783081055, "logps/rejected": -4.658037185668945, "loss": 4.1252, "rewards/accuracies": 0.5, "rewards/chosen": -46.64225769042969, "rewards/margins": -0.06188201904296875, "rewards/rejected": -46.58037567138672, "step": 6077 }, { "epoch": 0.8276143790849673, "grad_norm": 42.53417935675147, "learning_rate": 7.027146406761981e-08, "logits/chosen": 14.094209671020508, "logits/rejected": 14.870521545410156, "logps/chosen": -4.389730453491211, "logps/rejected": -4.828824996948242, "loss": 3.7674, "rewards/accuracies": 0.75, "rewards/chosen": -43.897300720214844, "rewards/margins": 4.3909502029418945, "rewards/rejected": -48.28824996948242, "step": 6078 }, { "epoch": 0.8277505446623094, "grad_norm": 37.423321618260196, "learning_rate": 7.016385869749331e-08, "logits/chosen": 14.175128936767578, "logits/rejected": 14.8407564163208, "logps/chosen": -4.5970845222473145, "logps/rejected": -5.072591781616211, "loss": 3.555, "rewards/accuracies": 1.0, "rewards/chosen": -45.97084426879883, "rewards/margins": 4.755074501037598, "rewards/rejected": -50.72591781616211, "step": 6079 }, { "epoch": 0.8278867102396514, "grad_norm": 37.01853354094483, "learning_rate": 7.005632785656938e-08, "logits/chosen": 14.621549606323242, "logits/rejected": 14.845047950744629, "logps/chosen": -4.951155185699463, "logps/rejected": -5.087174892425537, "loss": 3.9789, "rewards/accuracies": 0.75, "rewards/chosen": -49.51155090332031, "rewards/margins": 1.3601932525634766, "rewards/rejected": -50.87174987792969, "step": 6080 }, { "epoch": 0.8280228758169934, "grad_norm": 41.7245028450728, "learning_rate": 6.99488715691456e-08, "logits/chosen": 14.466846466064453, "logits/rejected": 15.141697883605957, "logps/chosen": -4.820626258850098, "logps/rejected": -5.128000736236572, "loss": 3.8971, "rewards/accuracies": 0.75, "rewards/chosen": -48.20626449584961, "rewards/margins": 3.073741912841797, "rewards/rejected": -51.280006408691406, "step": 6081 }, { "epoch": 0.8281590413943355, "grad_norm": 40.119852408744684, "learning_rate": 6.984148985950242e-08, "logits/chosen": 14.267662048339844, "logits/rejected": 15.034744262695312, "logps/chosen": -4.638570308685303, "logps/rejected": -4.70817232131958, "loss": 3.5555, "rewards/accuracies": 0.5, "rewards/chosen": -46.385704040527344, "rewards/margins": 0.6960182189941406, "rewards/rejected": -47.08171844482422, "step": 6082 }, { "epoch": 0.8282952069716776, "grad_norm": 40.51406236225782, "learning_rate": 6.973418275190374e-08, "logits/chosen": 15.476975440979004, "logits/rejected": 15.310139656066895, "logps/chosen": -4.855722427368164, "logps/rejected": -5.092972278594971, "loss": 3.9856, "rewards/accuracies": 0.75, "rewards/chosen": -48.557228088378906, "rewards/margins": 2.372495651245117, "rewards/rejected": -50.92972183227539, "step": 6083 }, { "epoch": 0.8284313725490197, "grad_norm": 40.748855175817674, "learning_rate": 6.962695027059649e-08, "logits/chosen": 14.129922866821289, "logits/rejected": 13.225704193115234, "logps/chosen": -4.6410932540893555, "logps/rejected": -4.3733367919921875, "loss": 3.8807, "rewards/accuracies": 0.25, "rewards/chosen": -46.41093444824219, "rewards/margins": -2.6775646209716797, "rewards/rejected": -43.733367919921875, "step": 6084 }, { "epoch": 0.8285675381263616, "grad_norm": 43.54342392869334, "learning_rate": 6.951979243981077e-08, "logits/chosen": 14.076129913330078, "logits/rejected": 14.7442626953125, "logps/chosen": -4.186595916748047, "logps/rejected": -4.805567264556885, "loss": 4.1338, "rewards/accuracies": 1.0, "rewards/chosen": -41.865962982177734, "rewards/margins": 6.189709663391113, "rewards/rejected": -48.05567169189453, "step": 6085 }, { "epoch": 0.8287037037037037, "grad_norm": 35.848727461995985, "learning_rate": 6.941270928375967e-08, "logits/chosen": 14.276762962341309, "logits/rejected": 15.221200942993164, "logps/chosen": -4.3235273361206055, "logps/rejected": -4.6821088790893555, "loss": 3.7779, "rewards/accuracies": 0.75, "rewards/chosen": -43.23527526855469, "rewards/margins": 3.5858154296875, "rewards/rejected": -46.82109069824219, "step": 6086 }, { "epoch": 0.8288398692810458, "grad_norm": 38.23800197899565, "learning_rate": 6.930570082663951e-08, "logits/chosen": 14.269173622131348, "logits/rejected": 14.183759689331055, "logps/chosen": -4.48423433303833, "logps/rejected": -4.461477279663086, "loss": 3.7819, "rewards/accuracies": 0.5, "rewards/chosen": -44.842342376708984, "rewards/margins": -0.22757434844970703, "rewards/rejected": -44.614768981933594, "step": 6087 }, { "epoch": 0.8289760348583878, "grad_norm": 50.87881341938159, "learning_rate": 6.919876709262995e-08, "logits/chosen": 14.084066390991211, "logits/rejected": 14.006141662597656, "logps/chosen": -4.435039520263672, "logps/rejected": -4.774471282958984, "loss": 3.4278, "rewards/accuracies": 1.0, "rewards/chosen": -44.35039520263672, "rewards/margins": 3.3943138122558594, "rewards/rejected": -47.744712829589844, "step": 6088 }, { "epoch": 0.8291122004357299, "grad_norm": 41.562180290865925, "learning_rate": 6.909190810589324e-08, "logits/chosen": 14.797163009643555, "logits/rejected": 14.714442253112793, "logps/chosen": -4.685112476348877, "logps/rejected": -4.850634574890137, "loss": 3.7085, "rewards/accuracies": 0.75, "rewards/chosen": -46.85112380981445, "rewards/margins": 1.6552209854125977, "rewards/rejected": -48.50634765625, "step": 6089 }, { "epoch": 0.829248366013072, "grad_norm": 42.14307703604587, "learning_rate": 6.898512389057529e-08, "logits/chosen": 14.264955520629883, "logits/rejected": 14.213687896728516, "logps/chosen": -4.537480354309082, "logps/rejected": -4.491600513458252, "loss": 4.2966, "rewards/accuracies": 0.25, "rewards/chosen": -45.37480163574219, "rewards/margins": -0.4587984085083008, "rewards/rejected": -44.9160041809082, "step": 6090 }, { "epoch": 0.8293845315904139, "grad_norm": 37.90083727767995, "learning_rate": 6.887841447080473e-08, "logits/chosen": 13.751640319824219, "logits/rejected": 14.907299041748047, "logps/chosen": -4.207671165466309, "logps/rejected": -4.587124824523926, "loss": 3.5692, "rewards/accuracies": 1.0, "rewards/chosen": -42.07671356201172, "rewards/margins": 3.794534683227539, "rewards/rejected": -45.871246337890625, "step": 6091 }, { "epoch": 0.829520697167756, "grad_norm": 40.828236016394264, "learning_rate": 6.877177987069363e-08, "logits/chosen": 14.115979194641113, "logits/rejected": 15.074551582336426, "logps/chosen": -4.5796308517456055, "logps/rejected": -5.325275421142578, "loss": 3.9801, "rewards/accuracies": 1.0, "rewards/chosen": -45.79631042480469, "rewards/margins": 7.456439971923828, "rewards/rejected": -53.25275421142578, "step": 6092 }, { "epoch": 0.8296568627450981, "grad_norm": 38.310881195718835, "learning_rate": 6.866522011433668e-08, "logits/chosen": 13.38557243347168, "logits/rejected": 14.043527603149414, "logps/chosen": -4.525265693664551, "logps/rejected": -4.837750434875488, "loss": 3.5227, "rewards/accuracies": 1.0, "rewards/chosen": -45.252655029296875, "rewards/margins": 3.124852180480957, "rewards/rejected": -48.377506256103516, "step": 6093 }, { "epoch": 0.8297930283224401, "grad_norm": 38.29228010542103, "learning_rate": 6.855873522581213e-08, "logits/chosen": 14.061481475830078, "logits/rejected": 14.448578834533691, "logps/chosen": -4.646764278411865, "logps/rejected": -4.967892646789551, "loss": 4.2158, "rewards/accuracies": 1.0, "rewards/chosen": -46.46764373779297, "rewards/margins": 3.2112836837768555, "rewards/rejected": -49.678924560546875, "step": 6094 }, { "epoch": 0.8299291938997821, "grad_norm": 44.202946707347884, "learning_rate": 6.845232522918119e-08, "logits/chosen": 14.883737564086914, "logits/rejected": 14.748565673828125, "logps/chosen": -4.53798770904541, "logps/rejected": -4.64774751663208, "loss": 4.0213, "rewards/accuracies": 0.75, "rewards/chosen": -45.37987518310547, "rewards/margins": 1.0975980758666992, "rewards/rejected": -46.47747802734375, "step": 6095 }, { "epoch": 0.8300653594771242, "grad_norm": 39.436647944078224, "learning_rate": 6.834599014848783e-08, "logits/chosen": 13.98619270324707, "logits/rejected": 13.89140510559082, "logps/chosen": -4.452051162719727, "logps/rejected": -4.421264171600342, "loss": 3.7864, "rewards/accuracies": 0.5, "rewards/chosen": -44.520511627197266, "rewards/margins": -0.30786895751953125, "rewards/rejected": -44.21263885498047, "step": 6096 }, { "epoch": 0.8302015250544662, "grad_norm": 47.09648093371716, "learning_rate": 6.82397300077595e-08, "logits/chosen": 13.766579627990723, "logits/rejected": 13.57094955444336, "logps/chosen": -4.370791435241699, "logps/rejected": -4.463755130767822, "loss": 4.2141, "rewards/accuracies": 0.75, "rewards/chosen": -43.70791244506836, "rewards/margins": 0.9296388626098633, "rewards/rejected": -44.63755416870117, "step": 6097 }, { "epoch": 0.8303376906318083, "grad_norm": 42.82206051256688, "learning_rate": 6.813354483100653e-08, "logits/chosen": 13.728118896484375, "logits/rejected": 14.299227714538574, "logps/chosen": -4.396982192993164, "logps/rejected": -4.585845470428467, "loss": 3.7613, "rewards/accuracies": 0.5, "rewards/chosen": -43.96982192993164, "rewards/margins": 1.888631820678711, "rewards/rejected": -45.858455657958984, "step": 6098 }, { "epoch": 0.8304738562091504, "grad_norm": 39.09284772063077, "learning_rate": 6.802743464222241e-08, "logits/chosen": 14.284765243530273, "logits/rejected": 14.890828132629395, "logps/chosen": -4.448030948638916, "logps/rejected": -4.792871952056885, "loss": 3.5832, "rewards/accuracies": 0.75, "rewards/chosen": -44.480308532714844, "rewards/margins": 3.448410987854004, "rewards/rejected": -47.92871856689453, "step": 6099 }, { "epoch": 0.8306100217864923, "grad_norm": 43.73625995986307, "learning_rate": 6.792139946538347e-08, "logits/chosen": 14.374214172363281, "logits/rejected": 14.570846557617188, "logps/chosen": -4.557127475738525, "logps/rejected": -4.833904266357422, "loss": 4.1962, "rewards/accuracies": 0.5, "rewards/chosen": -45.57127380371094, "rewards/margins": 2.7677698135375977, "rewards/rejected": -48.33904266357422, "step": 6100 }, { "epoch": 0.8307461873638344, "grad_norm": 38.81975609687075, "learning_rate": 6.78154393244493e-08, "logits/chosen": 14.562736511230469, "logits/rejected": 14.685369491577148, "logps/chosen": -4.77974796295166, "logps/rejected": -5.296919822692871, "loss": 3.5647, "rewards/accuracies": 0.75, "rewards/chosen": -47.79747772216797, "rewards/margins": 5.171723365783691, "rewards/rejected": -52.969200134277344, "step": 6101 }, { "epoch": 0.8308823529411765, "grad_norm": 38.201793865408334, "learning_rate": 6.77095542433626e-08, "logits/chosen": 14.210610389709473, "logits/rejected": 13.97611141204834, "logps/chosen": -4.579843044281006, "logps/rejected": -4.404514789581299, "loss": 3.5634, "rewards/accuracies": 0.25, "rewards/chosen": -45.798431396484375, "rewards/margins": -1.7532806396484375, "rewards/rejected": -44.04515075683594, "step": 6102 }, { "epoch": 0.8310185185185185, "grad_norm": 39.55754911790776, "learning_rate": 6.760374424604878e-08, "logits/chosen": 14.37745475769043, "logits/rejected": 14.512289047241211, "logps/chosen": -4.4881486892700195, "logps/rejected": -4.637349605560303, "loss": 3.9359, "rewards/accuracies": 0.5, "rewards/chosen": -44.88148498535156, "rewards/margins": 1.4920110702514648, "rewards/rejected": -46.373497009277344, "step": 6103 }, { "epoch": 0.8311546840958606, "grad_norm": 38.61303970444363, "learning_rate": 6.74980093564165e-08, "logits/chosen": 13.322860717773438, "logits/rejected": 14.440400123596191, "logps/chosen": -4.287919044494629, "logps/rejected": -4.540780067443848, "loss": 3.4992, "rewards/accuracies": 0.75, "rewards/chosen": -42.87919235229492, "rewards/margins": 2.528611183166504, "rewards/rejected": -45.407806396484375, "step": 6104 }, { "epoch": 0.8312908496732027, "grad_norm": 41.231369166912174, "learning_rate": 6.739234959835762e-08, "logits/chosen": 14.024831771850586, "logits/rejected": 14.944028854370117, "logps/chosen": -4.423428058624268, "logps/rejected": -4.973886489868164, "loss": 3.9786, "rewards/accuracies": 1.0, "rewards/chosen": -44.234283447265625, "rewards/margins": 5.504582405090332, "rewards/rejected": -49.738861083984375, "step": 6105 }, { "epoch": 0.8314270152505446, "grad_norm": 37.31273139619829, "learning_rate": 6.728676499574666e-08, "logits/chosen": 14.488388061523438, "logits/rejected": 14.546026229858398, "logps/chosen": -4.633738994598389, "logps/rejected": -4.762263774871826, "loss": 3.689, "rewards/accuracies": 0.5, "rewards/chosen": -46.3373908996582, "rewards/margins": 1.285247802734375, "rewards/rejected": -47.62263870239258, "step": 6106 }, { "epoch": 0.8315631808278867, "grad_norm": 42.5858887007741, "learning_rate": 6.718125557244133e-08, "logits/chosen": 14.460437774658203, "logits/rejected": 14.588985443115234, "logps/chosen": -4.827062606811523, "logps/rejected": -4.737194538116455, "loss": 4.4316, "rewards/accuracies": 0.25, "rewards/chosen": -48.270626068115234, "rewards/margins": -0.8986835479736328, "rewards/rejected": -47.371944427490234, "step": 6107 }, { "epoch": 0.8316993464052288, "grad_norm": 41.80023178212338, "learning_rate": 6.707582135228254e-08, "logits/chosen": 14.691854476928711, "logits/rejected": 14.034215927124023, "logps/chosen": -4.836966514587402, "logps/rejected": -4.4296135902404785, "loss": 4.1149, "rewards/accuracies": 0.0, "rewards/chosen": -48.36966323852539, "rewards/margins": -4.0735273361206055, "rewards/rejected": -44.29613494873047, "step": 6108 }, { "epoch": 0.8318355119825708, "grad_norm": 38.47996119017808, "learning_rate": 6.697046235909379e-08, "logits/chosen": 14.427728652954102, "logits/rejected": 15.146726608276367, "logps/chosen": -4.4583821296691895, "logps/rejected": -4.931671619415283, "loss": 3.1605, "rewards/accuracies": 1.0, "rewards/chosen": -44.583824157714844, "rewards/margins": 4.732892990112305, "rewards/rejected": -49.316715240478516, "step": 6109 }, { "epoch": 0.8319716775599129, "grad_norm": 41.84525210728755, "learning_rate": 6.686517861668188e-08, "logits/chosen": 14.595102310180664, "logits/rejected": 14.40718936920166, "logps/chosen": -4.365870475769043, "logps/rejected": -4.3798322677612305, "loss": 3.807, "rewards/accuracies": 0.25, "rewards/chosen": -43.65869903564453, "rewards/margins": 0.13962364196777344, "rewards/rejected": -43.79832458496094, "step": 6110 }, { "epoch": 0.8321078431372549, "grad_norm": 41.922177806802075, "learning_rate": 6.675997014883669e-08, "logits/chosen": 14.401679992675781, "logits/rejected": 14.43206787109375, "logps/chosen": -4.434107780456543, "logps/rejected": -4.497762680053711, "loss": 3.8695, "rewards/accuracies": 0.5, "rewards/chosen": -44.34107208251953, "rewards/margins": 0.6365537643432617, "rewards/rejected": -44.977630615234375, "step": 6111 }, { "epoch": 0.8322440087145969, "grad_norm": 40.40217410396761, "learning_rate": 6.665483697933077e-08, "logits/chosen": 14.472006797790527, "logits/rejected": 14.362951278686523, "logps/chosen": -4.176186561584473, "logps/rejected": -4.764763832092285, "loss": 4.061, "rewards/accuracies": 1.0, "rewards/chosen": -41.761871337890625, "rewards/margins": 5.885772705078125, "rewards/rejected": -47.64764404296875, "step": 6112 }, { "epoch": 0.832380174291939, "grad_norm": 37.21677604396383, "learning_rate": 6.654977913191988e-08, "logits/chosen": 13.975204467773438, "logits/rejected": 14.011796951293945, "logps/chosen": -4.339505195617676, "logps/rejected": -4.4712653160095215, "loss": 3.5705, "rewards/accuracies": 0.25, "rewards/chosen": -43.395050048828125, "rewards/margins": 1.317601203918457, "rewards/rejected": -44.71265411376953, "step": 6113 }, { "epoch": 0.8325163398692811, "grad_norm": 41.85529637009253, "learning_rate": 6.644479663034283e-08, "logits/chosen": 15.07288932800293, "logits/rejected": 15.013392448425293, "logps/chosen": -4.756616592407227, "logps/rejected": -4.670723915100098, "loss": 3.2553, "rewards/accuracies": 0.5, "rewards/chosen": -47.56616973876953, "rewards/margins": -0.8589248657226562, "rewards/rejected": -46.707244873046875, "step": 6114 }, { "epoch": 0.8326525054466231, "grad_norm": 38.62137882699283, "learning_rate": 6.633988949832105e-08, "logits/chosen": 14.830150604248047, "logits/rejected": 15.562679290771484, "logps/chosen": -4.585247993469238, "logps/rejected": -5.125901222229004, "loss": 3.5853, "rewards/accuracies": 1.0, "rewards/chosen": -45.85247802734375, "rewards/margins": 5.406529426574707, "rewards/rejected": -51.259010314941406, "step": 6115 }, { "epoch": 0.8327886710239651, "grad_norm": 45.31162586657573, "learning_rate": 6.623505775955936e-08, "logits/chosen": 14.009949684143066, "logits/rejected": 14.36330795288086, "logps/chosen": -4.3931965827941895, "logps/rejected": -4.497082710266113, "loss": 3.8145, "rewards/accuracies": 0.75, "rewards/chosen": -43.931968688964844, "rewards/margins": 1.0388574600219727, "rewards/rejected": -44.9708251953125, "step": 6116 }, { "epoch": 0.8329248366013072, "grad_norm": 41.43768038796932, "learning_rate": 6.613030143774536e-08, "logits/chosen": 14.024229049682617, "logits/rejected": 15.061279296875, "logps/chosen": -4.460453033447266, "logps/rejected": -5.0388360023498535, "loss": 4.0541, "rewards/accuracies": 1.0, "rewards/chosen": -44.60453414916992, "rewards/margins": 5.7838239669799805, "rewards/rejected": -50.38835906982422, "step": 6117 }, { "epoch": 0.8330610021786492, "grad_norm": 39.71478134114547, "learning_rate": 6.602562055654943e-08, "logits/chosen": 14.496557235717773, "logits/rejected": 15.253544807434082, "logps/chosen": -4.409678936004639, "logps/rejected": -4.592222213745117, "loss": 4.2439, "rewards/accuracies": 0.75, "rewards/chosen": -44.09678649902344, "rewards/margins": 1.8254318237304688, "rewards/rejected": -45.922218322753906, "step": 6118 }, { "epoch": 0.8331971677559913, "grad_norm": 40.6244443977045, "learning_rate": 6.592101513962523e-08, "logits/chosen": 14.793159484863281, "logits/rejected": 14.738750457763672, "logps/chosen": -4.637530326843262, "logps/rejected": -4.426491737365723, "loss": 4.1012, "rewards/accuracies": 0.25, "rewards/chosen": -46.37530517578125, "rewards/margins": -2.110386848449707, "rewards/rejected": -44.26491928100586, "step": 6119 }, { "epoch": 0.8333333333333334, "grad_norm": 41.033982779240986, "learning_rate": 6.581648521060925e-08, "logits/chosen": 13.746047973632812, "logits/rejected": 14.481775283813477, "logps/chosen": -4.354628562927246, "logps/rejected": -4.566404819488525, "loss": 4.3927, "rewards/accuracies": 0.5, "rewards/chosen": -43.54628372192383, "rewards/margins": 2.1177616119384766, "rewards/rejected": -45.66404724121094, "step": 6120 }, { "epoch": 0.8334694989106753, "grad_norm": 47.10512586558853, "learning_rate": 6.57120307931207e-08, "logits/chosen": 14.048791885375977, "logits/rejected": 15.098541259765625, "logps/chosen": -4.461327075958252, "logps/rejected": -4.932543754577637, "loss": 3.6637, "rewards/accuracies": 0.75, "rewards/chosen": -44.61327362060547, "rewards/margins": 4.712166786193848, "rewards/rejected": -49.325439453125, "step": 6121 }, { "epoch": 0.8336056644880174, "grad_norm": 42.35417505742097, "learning_rate": 6.56076519107621e-08, "logits/chosen": 14.029417037963867, "logits/rejected": 14.682249069213867, "logps/chosen": -4.540962219238281, "logps/rejected": -4.697817802429199, "loss": 4.2697, "rewards/accuracies": 0.75, "rewards/chosen": -45.40962219238281, "rewards/margins": 1.568558692932129, "rewards/rejected": -46.978179931640625, "step": 6122 }, { "epoch": 0.8337418300653595, "grad_norm": 40.00723517134252, "learning_rate": 6.550334858711876e-08, "logits/chosen": 13.767751693725586, "logits/rejected": 14.647310256958008, "logps/chosen": -4.458873271942139, "logps/rejected": -4.828873157501221, "loss": 4.2054, "rewards/accuracies": 1.0, "rewards/chosen": -44.5887336730957, "rewards/margins": 3.6999969482421875, "rewards/rejected": -48.288726806640625, "step": 6123 }, { "epoch": 0.8338779956427015, "grad_norm": 41.08478302129779, "learning_rate": 6.539912084575867e-08, "logits/chosen": 13.513620376586914, "logits/rejected": 14.407224655151367, "logps/chosen": -4.117555618286133, "logps/rejected": -4.595579147338867, "loss": 4.2643, "rewards/accuracies": 1.0, "rewards/chosen": -41.17555618286133, "rewards/margins": 4.780234336853027, "rewards/rejected": -45.955787658691406, "step": 6124 }, { "epoch": 0.8340141612200436, "grad_norm": 36.471061276545086, "learning_rate": 6.529496871023306e-08, "logits/chosen": 14.343762397766113, "logits/rejected": 14.522891998291016, "logps/chosen": -4.431853771209717, "logps/rejected": -4.316773891448975, "loss": 3.8081, "rewards/accuracies": 0.5, "rewards/chosen": -44.318538665771484, "rewards/margins": -1.150796890258789, "rewards/rejected": -43.16773986816406, "step": 6125 }, { "epoch": 0.8341503267973857, "grad_norm": 42.72338158586052, "learning_rate": 6.519089220407608e-08, "logits/chosen": 14.590314865112305, "logits/rejected": 15.266239166259766, "logps/chosen": -4.511697292327881, "logps/rejected": -4.740895748138428, "loss": 4.1884, "rewards/accuracies": 0.75, "rewards/chosen": -45.116973876953125, "rewards/margins": 2.2919845581054688, "rewards/rejected": -47.408958435058594, "step": 6126 }, { "epoch": 0.8342864923747276, "grad_norm": 41.66995691441735, "learning_rate": 6.508689135080447e-08, "logits/chosen": 14.707355499267578, "logits/rejected": 14.854974746704102, "logps/chosen": -4.397587776184082, "logps/rejected": -4.700079917907715, "loss": 4.4185, "rewards/accuracies": 0.75, "rewards/chosen": -43.97587966918945, "rewards/margins": 3.024921417236328, "rewards/rejected": -47.00080108642578, "step": 6127 }, { "epoch": 0.8344226579520697, "grad_norm": 43.209372333893235, "learning_rate": 6.498296617391817e-08, "logits/chosen": 14.356229782104492, "logits/rejected": 14.467267036437988, "logps/chosen": -4.749351978302002, "logps/rejected": -4.829355239868164, "loss": 3.5277, "rewards/accuracies": 0.75, "rewards/chosen": -47.4935188293457, "rewards/margins": 0.8000316619873047, "rewards/rejected": -48.29355239868164, "step": 6128 }, { "epoch": 0.8345588235294118, "grad_norm": 38.61932183689952, "learning_rate": 6.487911669690006e-08, "logits/chosen": 14.96776008605957, "logits/rejected": 14.874239921569824, "logps/chosen": -4.653112411499023, "logps/rejected": -4.952295303344727, "loss": 3.6432, "rewards/accuracies": 0.75, "rewards/chosen": -46.531124114990234, "rewards/margins": 2.9918289184570312, "rewards/rejected": -49.52295684814453, "step": 6129 }, { "epoch": 0.8346949891067538, "grad_norm": 40.85418991118127, "learning_rate": 6.477534294321554e-08, "logits/chosen": 14.857226371765137, "logits/rejected": 15.039273262023926, "logps/chosen": -4.820626258850098, "logps/rejected": -4.904312610626221, "loss": 3.8286, "rewards/accuracies": 0.75, "rewards/chosen": -48.206268310546875, "rewards/margins": 0.8368625640869141, "rewards/rejected": -49.043128967285156, "step": 6130 }, { "epoch": 0.8348311546840959, "grad_norm": 39.067598772157666, "learning_rate": 6.46716449363133e-08, "logits/chosen": 14.663080215454102, "logits/rejected": 14.955449104309082, "logps/chosen": -4.658857345581055, "logps/rejected": -4.947725772857666, "loss": 3.8929, "rewards/accuracies": 0.75, "rewards/chosen": -46.58856964111328, "rewards/margins": 2.8886852264404297, "rewards/rejected": -49.477256774902344, "step": 6131 }, { "epoch": 0.8349673202614379, "grad_norm": 41.077323034608625, "learning_rate": 6.45680226996248e-08, "logits/chosen": 13.895734786987305, "logits/rejected": 14.641590118408203, "logps/chosen": -4.461747646331787, "logps/rejected": -4.588656902313232, "loss": 3.7732, "rewards/accuracies": 0.75, "rewards/chosen": -44.61747741699219, "rewards/margins": 1.2690925598144531, "rewards/rejected": -45.886573791503906, "step": 6132 }, { "epoch": 0.8351034858387799, "grad_norm": 39.36612215917909, "learning_rate": 6.446447625656421e-08, "logits/chosen": 15.091968536376953, "logits/rejected": 15.119592666625977, "logps/chosen": -4.789133071899414, "logps/rejected": -5.024857521057129, "loss": 3.7109, "rewards/accuracies": 0.5, "rewards/chosen": -47.891334533691406, "rewards/margins": 2.357243537902832, "rewards/rejected": -50.24857711791992, "step": 6133 }, { "epoch": 0.835239651416122, "grad_norm": 40.27706146658773, "learning_rate": 6.436100563052882e-08, "logits/chosen": 13.624385833740234, "logits/rejected": 14.786209106445312, "logps/chosen": -4.163999080657959, "logps/rejected": -4.706476211547852, "loss": 4.1267, "rewards/accuracies": 0.5, "rewards/chosen": -41.639991760253906, "rewards/margins": 5.424768447875977, "rewards/rejected": -47.064762115478516, "step": 6134 }, { "epoch": 0.8353758169934641, "grad_norm": 43.08627358674043, "learning_rate": 6.425761084489867e-08, "logits/chosen": 15.021055221557617, "logits/rejected": 14.752233505249023, "logps/chosen": -4.773472785949707, "logps/rejected": -4.8339996337890625, "loss": 3.438, "rewards/accuracies": 0.5, "rewards/chosen": -47.73472595214844, "rewards/margins": 0.6052694320678711, "rewards/rejected": -48.339996337890625, "step": 6135 }, { "epoch": 0.835511982570806, "grad_norm": 42.04586068251646, "learning_rate": 6.415429192303654e-08, "logits/chosen": 14.837308883666992, "logits/rejected": 14.878507614135742, "logps/chosen": -4.5472917556762695, "logps/rejected": -4.437604904174805, "loss": 4.5095, "rewards/accuracies": 0.5, "rewards/chosen": -45.47291946411133, "rewards/margins": -1.0968732833862305, "rewards/rejected": -44.37604522705078, "step": 6136 }, { "epoch": 0.8356481481481481, "grad_norm": 43.24490516262725, "learning_rate": 6.405104888828825e-08, "logits/chosen": 14.619548797607422, "logits/rejected": 14.957396507263184, "logps/chosen": -4.497491836547852, "logps/rejected": -4.796852111816406, "loss": 3.7764, "rewards/accuracies": 0.75, "rewards/chosen": -44.974918365478516, "rewards/margins": 2.9936037063598633, "rewards/rejected": -47.96852111816406, "step": 6137 }, { "epoch": 0.8357843137254902, "grad_norm": 40.27823080070484, "learning_rate": 6.39478817639826e-08, "logits/chosen": 14.11264419555664, "logits/rejected": 14.31336784362793, "logps/chosen": -4.185548305511475, "logps/rejected": -4.3474626541137695, "loss": 3.8416, "rewards/accuracies": 1.0, "rewards/chosen": -41.85548400878906, "rewards/margins": 1.6191463470458984, "rewards/rejected": -43.474632263183594, "step": 6138 }, { "epoch": 0.8359204793028322, "grad_norm": 39.339509195850255, "learning_rate": 6.384479057343078e-08, "logits/chosen": 14.024819374084473, "logits/rejected": 14.326751708984375, "logps/chosen": -4.381712913513184, "logps/rejected": -4.776324272155762, "loss": 4.1595, "rewards/accuracies": 1.0, "rewards/chosen": -43.81713104248047, "rewards/margins": 3.9461145401000977, "rewards/rejected": -47.76324462890625, "step": 6139 }, { "epoch": 0.8360566448801743, "grad_norm": 43.173330284230204, "learning_rate": 6.374177533992719e-08, "logits/chosen": 14.6058931350708, "logits/rejected": 14.666596412658691, "logps/chosen": -4.877252578735352, "logps/rejected": -4.752077579498291, "loss": 4.2235, "rewards/accuracies": 0.25, "rewards/chosen": -48.772525787353516, "rewards/margins": -1.251749038696289, "rewards/rejected": -47.520774841308594, "step": 6140 }, { "epoch": 0.8361928104575164, "grad_norm": 38.7895818652242, "learning_rate": 6.363883608674911e-08, "logits/chosen": 13.912267684936523, "logits/rejected": 15.058631896972656, "logps/chosen": -4.231527328491211, "logps/rejected": -4.41873836517334, "loss": 4.0218, "rewards/accuracies": 0.5, "rewards/chosen": -42.31527328491211, "rewards/margins": 1.8721132278442383, "rewards/rejected": -44.18738555908203, "step": 6141 }, { "epoch": 0.8363289760348583, "grad_norm": 42.155857068953416, "learning_rate": 6.353597283715633e-08, "logits/chosen": 13.617673873901367, "logits/rejected": 14.024524688720703, "logps/chosen": -4.041223049163818, "logps/rejected": -4.361660003662109, "loss": 4.5953, "rewards/accuracies": 1.0, "rewards/chosen": -40.412227630615234, "rewards/margins": 3.204374313354492, "rewards/rejected": -43.616600036621094, "step": 6142 }, { "epoch": 0.8364651416122004, "grad_norm": 39.580595021933355, "learning_rate": 6.34331856143917e-08, "logits/chosen": 13.698816299438477, "logits/rejected": 14.800065994262695, "logps/chosen": -4.507582187652588, "logps/rejected": -4.9922404289245605, "loss": 4.0145, "rewards/accuracies": 1.0, "rewards/chosen": -45.07582092285156, "rewards/margins": 4.846578598022461, "rewards/rejected": -49.922401428222656, "step": 6143 }, { "epoch": 0.8366013071895425, "grad_norm": 38.6357797917479, "learning_rate": 6.333047444168099e-08, "logits/chosen": 14.45734977722168, "logits/rejected": 14.91269588470459, "logps/chosen": -4.8551177978515625, "logps/rejected": -4.7796196937561035, "loss": 3.6602, "rewards/accuracies": 0.25, "rewards/chosen": -48.551177978515625, "rewards/margins": -0.7549839019775391, "rewards/rejected": -47.79619598388672, "step": 6144 }, { "epoch": 0.8367374727668845, "grad_norm": 41.9213128006136, "learning_rate": 6.322783934223239e-08, "logits/chosen": 13.768735885620117, "logits/rejected": 14.860494613647461, "logps/chosen": -4.044042587280273, "logps/rejected": -4.506625175476074, "loss": 3.8875, "rewards/accuracies": 0.75, "rewards/chosen": -40.4404296875, "rewards/margins": 4.625820159912109, "rewards/rejected": -45.066246032714844, "step": 6145 }, { "epoch": 0.8368736383442266, "grad_norm": 45.12987601025814, "learning_rate": 6.312528033923734e-08, "logits/chosen": 14.709892272949219, "logits/rejected": 14.595925331115723, "logps/chosen": -4.451046943664551, "logps/rejected": -4.532772064208984, "loss": 4.3862, "rewards/accuracies": 0.5, "rewards/chosen": -44.510467529296875, "rewards/margins": 0.8172531127929688, "rewards/rejected": -45.32771682739258, "step": 6146 }, { "epoch": 0.8370098039215687, "grad_norm": 38.66383989230643, "learning_rate": 6.302279745586988e-08, "logits/chosen": 14.77701187133789, "logits/rejected": 14.687338829040527, "logps/chosen": -4.777076721191406, "logps/rejected": -4.964646339416504, "loss": 4.1904, "rewards/accuracies": 0.5, "rewards/chosen": -47.77076721191406, "rewards/margins": 1.8756980895996094, "rewards/rejected": -49.646461486816406, "step": 6147 }, { "epoch": 0.8371459694989106, "grad_norm": 44.85020414422164, "learning_rate": 6.292039071528675e-08, "logits/chosen": 14.828690528869629, "logits/rejected": 15.187923431396484, "logps/chosen": -4.696811676025391, "logps/rejected": -5.144219398498535, "loss": 3.3486, "rewards/accuracies": 1.0, "rewards/chosen": -46.968116760253906, "rewards/margins": 4.474074363708496, "rewards/rejected": -51.44219207763672, "step": 6148 }, { "epoch": 0.8372821350762527, "grad_norm": 39.71331145962513, "learning_rate": 6.281806014062763e-08, "logits/chosen": 14.739681243896484, "logits/rejected": 15.53912353515625, "logps/chosen": -4.653196334838867, "logps/rejected": -5.161829948425293, "loss": 3.9474, "rewards/accuracies": 1.0, "rewards/chosen": -46.531959533691406, "rewards/margins": 5.086338043212891, "rewards/rejected": -51.61830139160156, "step": 6149 }, { "epoch": 0.8374183006535948, "grad_norm": 41.2757293330425, "learning_rate": 6.27158057550151e-08, "logits/chosen": 14.39266300201416, "logits/rejected": 15.026956558227539, "logps/chosen": -4.79788875579834, "logps/rejected": -5.024321556091309, "loss": 3.4549, "rewards/accuracies": 0.25, "rewards/chosen": -47.9788818359375, "rewards/margins": 2.2643299102783203, "rewards/rejected": -50.24321365356445, "step": 6150 }, { "epoch": 0.8375544662309368, "grad_norm": 39.42646239268304, "learning_rate": 6.261362758155418e-08, "logits/chosen": 13.776823043823242, "logits/rejected": 13.577274322509766, "logps/chosen": -4.337510585784912, "logps/rejected": -4.531191825866699, "loss": 4.009, "rewards/accuracies": 0.75, "rewards/chosen": -43.37510681152344, "rewards/margins": 1.9368114471435547, "rewards/rejected": -45.311920166015625, "step": 6151 }, { "epoch": 0.8376906318082789, "grad_norm": 38.75485562530434, "learning_rate": 6.251152564333298e-08, "logits/chosen": 15.020989418029785, "logits/rejected": 14.878637313842773, "logps/chosen": -4.733424186706543, "logps/rejected": -4.843644618988037, "loss": 3.9061, "rewards/accuracies": 0.5, "rewards/chosen": -47.3342399597168, "rewards/margins": 1.1022071838378906, "rewards/rejected": -48.43644714355469, "step": 6152 }, { "epoch": 0.8378267973856209, "grad_norm": 42.179728642624546, "learning_rate": 6.240949996342238e-08, "logits/chosen": 15.483592987060547, "logits/rejected": 14.694740295410156, "logps/chosen": -4.84702205657959, "logps/rejected": -4.55339241027832, "loss": 4.3349, "rewards/accuracies": 0.25, "rewards/chosen": -48.470218658447266, "rewards/margins": -2.936295509338379, "rewards/rejected": -45.5339241027832, "step": 6153 }, { "epoch": 0.8379629629629629, "grad_norm": 39.283104170295836, "learning_rate": 6.230755056487571e-08, "logits/chosen": 14.04574966430664, "logits/rejected": 14.627889633178711, "logps/chosen": -4.364813804626465, "logps/rejected": -4.641451358795166, "loss": 3.6124, "rewards/accuracies": 1.0, "rewards/chosen": -43.64813995361328, "rewards/margins": 2.7663745880126953, "rewards/rejected": -46.414512634277344, "step": 6154 }, { "epoch": 0.838099128540305, "grad_norm": 43.217462728215324, "learning_rate": 6.220567747072935e-08, "logits/chosen": 14.475692749023438, "logits/rejected": 14.291702270507812, "logps/chosen": -4.546252250671387, "logps/rejected": -4.6103057861328125, "loss": 4.311, "rewards/accuracies": 0.5, "rewards/chosen": -45.4625244140625, "rewards/margins": 0.6405324935913086, "rewards/rejected": -46.10305404663086, "step": 6155 }, { "epoch": 0.8382352941176471, "grad_norm": 39.05929163167203, "learning_rate": 6.210388070400254e-08, "logits/chosen": 14.353715896606445, "logits/rejected": 14.49295425415039, "logps/chosen": -4.398253917694092, "logps/rejected": -4.641279220581055, "loss": 3.6052, "rewards/accuracies": 0.75, "rewards/chosen": -43.982540130615234, "rewards/margins": 2.4302520751953125, "rewards/rejected": -46.41279220581055, "step": 6156 }, { "epoch": 0.838371459694989, "grad_norm": 41.004710232808506, "learning_rate": 6.200216028769687e-08, "logits/chosen": 13.491340637207031, "logits/rejected": 14.131017684936523, "logps/chosen": -4.2168097496032715, "logps/rejected": -4.62806510925293, "loss": 4.473, "rewards/accuracies": 1.0, "rewards/chosen": -42.168094635009766, "rewards/margins": 4.112550735473633, "rewards/rejected": -46.28064727783203, "step": 6157 }, { "epoch": 0.8385076252723311, "grad_norm": 40.376121542445894, "learning_rate": 6.190051624479698e-08, "logits/chosen": 14.557209968566895, "logits/rejected": 14.547496795654297, "logps/chosen": -4.712552547454834, "logps/rejected": -4.689208984375, "loss": 3.9602, "rewards/accuracies": 0.5, "rewards/chosen": -47.12552261352539, "rewards/margins": -0.23343181610107422, "rewards/rejected": -46.89208984375, "step": 6158 }, { "epoch": 0.8386437908496732, "grad_norm": 48.210635158105546, "learning_rate": 6.179894859827031e-08, "logits/chosen": 14.967949867248535, "logits/rejected": 14.56056022644043, "logps/chosen": -4.754094123840332, "logps/rejected": -4.626589775085449, "loss": 4.6774, "rewards/accuracies": 0.25, "rewards/chosen": -47.54093933105469, "rewards/margins": -1.2750425338745117, "rewards/rejected": -46.265899658203125, "step": 6159 }, { "epoch": 0.8387799564270153, "grad_norm": 39.58686700824109, "learning_rate": 6.169745737106669e-08, "logits/chosen": 14.455970764160156, "logits/rejected": 15.752660751342773, "logps/chosen": -4.585963249206543, "logps/rejected": -5.091006755828857, "loss": 4.1289, "rewards/accuracies": 0.75, "rewards/chosen": -45.8596305847168, "rewards/margins": 5.050436019897461, "rewards/rejected": -50.910064697265625, "step": 6160 }, { "epoch": 0.8389161220043573, "grad_norm": 42.01968283725236, "learning_rate": 6.159604258611902e-08, "logits/chosen": 14.013614654541016, "logits/rejected": 15.05225944519043, "logps/chosen": -4.403552055358887, "logps/rejected": -5.003635883331299, "loss": 4.087, "rewards/accuracies": 1.0, "rewards/chosen": -44.035526275634766, "rewards/margins": 6.000833511352539, "rewards/rejected": -50.03635787963867, "step": 6161 }, { "epoch": 0.8390522875816994, "grad_norm": 44.71689293554512, "learning_rate": 6.149470426634291e-08, "logits/chosen": 14.076995849609375, "logits/rejected": 13.940010070800781, "logps/chosen": -4.166245460510254, "logps/rejected": -4.545990943908691, "loss": 4.0008, "rewards/accuracies": 1.0, "rewards/chosen": -41.662452697753906, "rewards/margins": 3.797451972961426, "rewards/rejected": -45.459903717041016, "step": 6162 }, { "epoch": 0.8391884531590414, "grad_norm": 38.46229799697916, "learning_rate": 6.139344243463638e-08, "logits/chosen": 15.259349822998047, "logits/rejected": 15.566142082214355, "logps/chosen": -5.065507888793945, "logps/rejected": -4.932771682739258, "loss": 3.8571, "rewards/accuracies": 0.5, "rewards/chosen": -50.65507888793945, "rewards/margins": -1.3273649215698242, "rewards/rejected": -49.32771301269531, "step": 6163 }, { "epoch": 0.8393246187363834, "grad_norm": 41.28213200738825, "learning_rate": 6.129225711388048e-08, "logits/chosen": 15.260438919067383, "logits/rejected": 15.180012702941895, "logps/chosen": -4.715411186218262, "logps/rejected": -4.860885143280029, "loss": 3.3503, "rewards/accuracies": 0.5, "rewards/chosen": -47.15411376953125, "rewards/margins": 1.454737663269043, "rewards/rejected": -48.60885238647461, "step": 6164 }, { "epoch": 0.8394607843137255, "grad_norm": 41.38236066282623, "learning_rate": 6.119114832693898e-08, "logits/chosen": 14.03988265991211, "logits/rejected": 14.337879180908203, "logps/chosen": -4.436484336853027, "logps/rejected": -4.659832000732422, "loss": 4.0044, "rewards/accuracies": 0.75, "rewards/chosen": -44.364837646484375, "rewards/margins": 2.2334823608398438, "rewards/rejected": -46.59832000732422, "step": 6165 }, { "epoch": 0.8395969498910676, "grad_norm": 41.960517039705614, "learning_rate": 6.109011609665802e-08, "logits/chosen": 14.420007705688477, "logits/rejected": 14.113322257995605, "logps/chosen": -4.6463141441345215, "logps/rejected": -4.897706031799316, "loss": 4.4334, "rewards/accuracies": 0.75, "rewards/chosen": -46.46314239501953, "rewards/margins": 2.5139217376708984, "rewards/rejected": -48.9770622253418, "step": 6166 }, { "epoch": 0.8397331154684096, "grad_norm": 42.61516196849817, "learning_rate": 6.098916044586682e-08, "logits/chosen": 15.090025901794434, "logits/rejected": 14.780232429504395, "logps/chosen": -4.814027786254883, "logps/rejected": -4.8131303787231445, "loss": 4.0517, "rewards/accuracies": 0.5, "rewards/chosen": -48.140281677246094, "rewards/margins": -0.008977890014648438, "rewards/rejected": -48.13130187988281, "step": 6167 }, { "epoch": 0.8398692810457516, "grad_norm": 40.84122874222223, "learning_rate": 6.088828139737718e-08, "logits/chosen": 13.769220352172852, "logits/rejected": 14.408266067504883, "logps/chosen": -4.324495792388916, "logps/rejected": -4.462316989898682, "loss": 4.0307, "rewards/accuracies": 0.75, "rewards/chosen": -43.244956970214844, "rewards/margins": 1.3782148361206055, "rewards/rejected": -44.6231689453125, "step": 6168 }, { "epoch": 0.8400054466230937, "grad_norm": 36.95295106229567, "learning_rate": 6.078747897398341e-08, "logits/chosen": 14.809534072875977, "logits/rejected": 15.692214012145996, "logps/chosen": -4.569239616394043, "logps/rejected": -4.8272504806518555, "loss": 3.8188, "rewards/accuracies": 0.75, "rewards/chosen": -45.69239807128906, "rewards/margins": 2.580105781555176, "rewards/rejected": -48.27250671386719, "step": 6169 }, { "epoch": 0.8401416122004357, "grad_norm": 45.94720205792897, "learning_rate": 6.068675319846268e-08, "logits/chosen": 14.114063262939453, "logits/rejected": 15.381654739379883, "logps/chosen": -4.537376403808594, "logps/rejected": -5.002058982849121, "loss": 3.5964, "rewards/accuracies": 0.75, "rewards/chosen": -45.37376022338867, "rewards/margins": 4.6468305587768555, "rewards/rejected": -50.020591735839844, "step": 6170 }, { "epoch": 0.8402777777777778, "grad_norm": 38.36597224438388, "learning_rate": 6.058610409357499e-08, "logits/chosen": 14.544548034667969, "logits/rejected": 15.386655807495117, "logps/chosen": -4.693369388580322, "logps/rejected": -4.843297481536865, "loss": 3.2828, "rewards/accuracies": 0.5, "rewards/chosen": -46.933692932128906, "rewards/margins": 1.4992828369140625, "rewards/rejected": -48.43297576904297, "step": 6171 }, { "epoch": 0.8404139433551199, "grad_norm": 37.51919111877882, "learning_rate": 6.048553168206258e-08, "logits/chosen": 14.36441707611084, "logits/rejected": 13.879219055175781, "logps/chosen": -4.493820667266846, "logps/rejected": -4.434215545654297, "loss": 3.9699, "rewards/accuracies": 0.25, "rewards/chosen": -44.938209533691406, "rewards/margins": -0.5960559844970703, "rewards/rejected": -44.34215545654297, "step": 6172 }, { "epoch": 0.8405501089324618, "grad_norm": 41.889677080820626, "learning_rate": 6.038503598665077e-08, "logits/chosen": 14.512361526489258, "logits/rejected": 14.298205375671387, "logps/chosen": -4.512404441833496, "logps/rejected": -4.675689697265625, "loss": 4.2224, "rewards/accuracies": 0.5, "rewards/chosen": -45.12404251098633, "rewards/margins": 1.632852554321289, "rewards/rejected": -46.75689697265625, "step": 6173 }, { "epoch": 0.8406862745098039, "grad_norm": 36.57883811287353, "learning_rate": 6.028461703004746e-08, "logits/chosen": 14.726298332214355, "logits/rejected": 15.269243240356445, "logps/chosen": -4.6192827224731445, "logps/rejected": -5.1405181884765625, "loss": 3.6201, "rewards/accuracies": 1.0, "rewards/chosen": -46.19282531738281, "rewards/margins": 5.212352752685547, "rewards/rejected": -51.40517807006836, "step": 6174 }, { "epoch": 0.840822440087146, "grad_norm": 39.021254757314345, "learning_rate": 6.018427483494295e-08, "logits/chosen": 14.211201667785645, "logits/rejected": 14.557226181030273, "logps/chosen": -4.757382869720459, "logps/rejected": -4.726162910461426, "loss": 3.8012, "rewards/accuracies": 0.25, "rewards/chosen": -47.573829650878906, "rewards/margins": -0.3121986389160156, "rewards/rejected": -47.26163101196289, "step": 6175 }, { "epoch": 0.840958605664488, "grad_norm": 45.08215365372742, "learning_rate": 6.008400942401048e-08, "logits/chosen": 14.277274131774902, "logits/rejected": 14.192131042480469, "logps/chosen": -4.073239326477051, "logps/rejected": -4.119436740875244, "loss": 3.8695, "rewards/accuracies": 0.25, "rewards/chosen": -40.732391357421875, "rewards/margins": 0.46197509765625, "rewards/rejected": -41.194366455078125, "step": 6176 }, { "epoch": 0.8410947712418301, "grad_norm": 43.234975982322574, "learning_rate": 5.998382081990593e-08, "logits/chosen": 14.486587524414062, "logits/rejected": 14.276681900024414, "logps/chosen": -4.428966999053955, "logps/rejected": -4.6455512046813965, "loss": 4.19, "rewards/accuracies": 0.75, "rewards/chosen": -44.289669036865234, "rewards/margins": 2.1658430099487305, "rewards/rejected": -46.45551300048828, "step": 6177 }, { "epoch": 0.8412309368191722, "grad_norm": 40.88109107005349, "learning_rate": 5.988370904526761e-08, "logits/chosen": 15.17929458618164, "logits/rejected": 15.449493408203125, "logps/chosen": -4.926140308380127, "logps/rejected": -4.982613563537598, "loss": 3.8662, "rewards/accuracies": 0.75, "rewards/chosen": -49.26139831542969, "rewards/margins": 0.5647306442260742, "rewards/rejected": -49.826133728027344, "step": 6178 }, { "epoch": 0.8413671023965141, "grad_norm": 39.98359854129148, "learning_rate": 5.978367412271663e-08, "logits/chosen": 14.520584106445312, "logits/rejected": 14.950907707214355, "logps/chosen": -4.535664081573486, "logps/rejected": -4.886196136474609, "loss": 3.8969, "rewards/accuracies": 0.75, "rewards/chosen": -45.35663986206055, "rewards/margins": 3.5053186416625977, "rewards/rejected": -48.861961364746094, "step": 6179 }, { "epoch": 0.8415032679738562, "grad_norm": 42.3219045264709, "learning_rate": 5.968371607485685e-08, "logits/chosen": 14.078399658203125, "logits/rejected": 13.802814483642578, "logps/chosen": -4.605438232421875, "logps/rejected": -4.427249908447266, "loss": 4.5161, "rewards/accuracies": 0.25, "rewards/chosen": -46.05438232421875, "rewards/margins": -1.781881332397461, "rewards/rejected": -44.27250289916992, "step": 6180 }, { "epoch": 0.8416394335511983, "grad_norm": 35.723172948545596, "learning_rate": 5.958383492427441e-08, "logits/chosen": 14.992582321166992, "logits/rejected": 15.356679916381836, "logps/chosen": -4.714812278747559, "logps/rejected": -5.084663391113281, "loss": 3.7558, "rewards/accuracies": 1.0, "rewards/chosen": -47.14812469482422, "rewards/margins": 3.698512077331543, "rewards/rejected": -50.84663772583008, "step": 6181 }, { "epoch": 0.8417755991285403, "grad_norm": 44.081230639099935, "learning_rate": 5.9484030693538376e-08, "logits/chosen": 14.68124008178711, "logits/rejected": 14.728389739990234, "logps/chosen": -4.616413116455078, "logps/rejected": -4.772045612335205, "loss": 4.4181, "rewards/accuracies": 0.75, "rewards/chosen": -46.16413116455078, "rewards/margins": 1.5563268661499023, "rewards/rejected": -47.720458984375, "step": 6182 }, { "epoch": 0.8419117647058824, "grad_norm": 45.75271595825554, "learning_rate": 5.938430340520035e-08, "logits/chosen": 14.775362014770508, "logits/rejected": 15.424379348754883, "logps/chosen": -4.6983442306518555, "logps/rejected": -4.908130645751953, "loss": 3.7167, "rewards/accuracies": 0.75, "rewards/chosen": -46.98344421386719, "rewards/margins": 2.0978622436523438, "rewards/rejected": -49.08130645751953, "step": 6183 }, { "epoch": 0.8420479302832244, "grad_norm": 43.40122623612357, "learning_rate": 5.928465308179462e-08, "logits/chosen": 15.048408508300781, "logits/rejected": 15.010659217834473, "logps/chosen": -4.747158050537109, "logps/rejected": -4.760538101196289, "loss": 4.1932, "rewards/accuracies": 0.75, "rewards/chosen": -47.47157669067383, "rewards/margins": 0.1338052749633789, "rewards/rejected": -47.605384826660156, "step": 6184 }, { "epoch": 0.8421840958605664, "grad_norm": 36.26828639680993, "learning_rate": 5.9185079745837794e-08, "logits/chosen": 14.401033401489258, "logits/rejected": 14.194051742553711, "logps/chosen": -4.384739875793457, "logps/rejected": -4.460999965667725, "loss": 3.7218, "rewards/accuracies": 0.5, "rewards/chosen": -43.84739685058594, "rewards/margins": 0.7626047134399414, "rewards/rejected": -44.61000061035156, "step": 6185 }, { "epoch": 0.8423202614379085, "grad_norm": 38.856851060940805, "learning_rate": 5.908558341982943e-08, "logits/chosen": 13.613265037536621, "logits/rejected": 13.78314208984375, "logps/chosen": -4.351958751678467, "logps/rejected": -4.4600982666015625, "loss": 3.9681, "rewards/accuracies": 0.5, "rewards/chosen": -43.519588470458984, "rewards/margins": 1.0813941955566406, "rewards/rejected": -44.600982666015625, "step": 6186 }, { "epoch": 0.8424564270152506, "grad_norm": 39.67479028405105, "learning_rate": 5.898616412625159e-08, "logits/chosen": 14.35904312133789, "logits/rejected": 14.998046875, "logps/chosen": -4.629958152770996, "logps/rejected": -4.854342937469482, "loss": 4.2508, "rewards/accuracies": 0.75, "rewards/chosen": -46.299583435058594, "rewards/margins": 2.243844985961914, "rewards/rejected": -48.543426513671875, "step": 6187 }, { "epoch": 0.8425925925925926, "grad_norm": 41.857215779174936, "learning_rate": 5.888682188756875e-08, "logits/chosen": 14.409323692321777, "logits/rejected": 14.619958877563477, "logps/chosen": -4.550907135009766, "logps/rejected": -4.771119117736816, "loss": 3.1761, "rewards/accuracies": 1.0, "rewards/chosen": -45.509071350097656, "rewards/margins": 2.2021169662475586, "rewards/rejected": -47.71118927001953, "step": 6188 }, { "epoch": 0.8427287581699346, "grad_norm": 36.88368550431302, "learning_rate": 5.878755672622815e-08, "logits/chosen": 14.540023803710938, "logits/rejected": 14.376096725463867, "logps/chosen": -4.125643730163574, "logps/rejected": -4.629081726074219, "loss": 3.8575, "rewards/accuracies": 0.75, "rewards/chosen": -41.256439208984375, "rewards/margins": 5.034379005432129, "rewards/rejected": -46.29081726074219, "step": 6189 }, { "epoch": 0.8428649237472767, "grad_norm": 45.270094681547526, "learning_rate": 5.868836866465958e-08, "logits/chosen": 14.581822395324707, "logits/rejected": 14.083311080932617, "logps/chosen": -4.806578636169434, "logps/rejected": -4.835136413574219, "loss": 4.4415, "rewards/accuracies": 0.75, "rewards/chosen": -48.06578063964844, "rewards/margins": 0.2855844497680664, "rewards/rejected": -48.35136413574219, "step": 6190 }, { "epoch": 0.8430010893246187, "grad_norm": 40.77133856801317, "learning_rate": 5.858925772527556e-08, "logits/chosen": 14.752046585083008, "logits/rejected": 15.32476806640625, "logps/chosen": -4.664943695068359, "logps/rejected": -5.133642196655273, "loss": 4.0687, "rewards/accuracies": 0.75, "rewards/chosen": -46.649436950683594, "rewards/margins": 4.68698787689209, "rewards/rejected": -51.33642578125, "step": 6191 }, { "epoch": 0.8431372549019608, "grad_norm": 45.0890983866821, "learning_rate": 5.849022393047076e-08, "logits/chosen": 14.2959566116333, "logits/rejected": 14.58751106262207, "logps/chosen": -4.4059062004089355, "logps/rejected": -4.7457098960876465, "loss": 3.9047, "rewards/accuracies": 0.75, "rewards/chosen": -44.05906295776367, "rewards/margins": 3.398036003112793, "rewards/rejected": -47.45709991455078, "step": 6192 }, { "epoch": 0.8432734204793029, "grad_norm": 44.96109433870352, "learning_rate": 5.839126730262283e-08, "logits/chosen": 14.586196899414062, "logits/rejected": 15.213176727294922, "logps/chosen": -4.68829345703125, "logps/rejected": -4.945444107055664, "loss": 3.4408, "rewards/accuracies": 0.75, "rewards/chosen": -46.882930755615234, "rewards/margins": 2.571507453918457, "rewards/rejected": -49.45444107055664, "step": 6193 }, { "epoch": 0.8434095860566448, "grad_norm": 38.28804192503305, "learning_rate": 5.829238786409188e-08, "logits/chosen": 14.391853332519531, "logits/rejected": 13.855546951293945, "logps/chosen": -4.433539390563965, "logps/rejected": -4.547141075134277, "loss": 4.3467, "rewards/accuracies": 0.5, "rewards/chosen": -44.33539581298828, "rewards/margins": 1.1360149383544922, "rewards/rejected": -45.471412658691406, "step": 6194 }, { "epoch": 0.8435457516339869, "grad_norm": 34.87864939667412, "learning_rate": 5.819358563722043e-08, "logits/chosen": 14.84921646118164, "logits/rejected": 15.526409149169922, "logps/chosen": -4.624134063720703, "logps/rejected": -5.2170257568359375, "loss": 3.3493, "rewards/accuracies": 1.0, "rewards/chosen": -46.24134063720703, "rewards/margins": 5.928915023803711, "rewards/rejected": -52.170257568359375, "step": 6195 }, { "epoch": 0.843681917211329, "grad_norm": 35.95423911072242, "learning_rate": 5.809486064433367e-08, "logits/chosen": 14.555912017822266, "logits/rejected": 15.067626953125, "logps/chosen": -4.452256202697754, "logps/rejected": -4.7399492263793945, "loss": 3.6848, "rewards/accuracies": 0.75, "rewards/chosen": -44.52256774902344, "rewards/margins": 2.876927375793457, "rewards/rejected": -47.39949035644531, "step": 6196 }, { "epoch": 0.843818082788671, "grad_norm": 39.323662968194114, "learning_rate": 5.7996212907739375e-08, "logits/chosen": 14.308629989624023, "logits/rejected": 13.723299026489258, "logps/chosen": -4.719399452209473, "logps/rejected": -4.655572891235352, "loss": 4.1455, "rewards/accuracies": 0.5, "rewards/chosen": -47.193992614746094, "rewards/margins": -0.6382637023925781, "rewards/rejected": -46.55573272705078, "step": 6197 }, { "epoch": 0.8439542483660131, "grad_norm": 34.936955729852485, "learning_rate": 5.78976424497279e-08, "logits/chosen": 12.971638679504395, "logits/rejected": 14.607259750366211, "logps/chosen": -4.095831394195557, "logps/rejected": -4.531975269317627, "loss": 3.7182, "rewards/accuracies": 1.0, "rewards/chosen": -40.95831298828125, "rewards/margins": 4.361437797546387, "rewards/rejected": -45.31975173950195, "step": 6198 }, { "epoch": 0.8440904139433552, "grad_norm": 42.971407670889505, "learning_rate": 5.779914929257188e-08, "logits/chosen": 14.443582534790039, "logits/rejected": 15.355182647705078, "logps/chosen": -4.943906784057617, "logps/rejected": -5.2371063232421875, "loss": 4.1827, "rewards/accuracies": 1.0, "rewards/chosen": -49.439064025878906, "rewards/margins": 2.9319944381713867, "rewards/rejected": -52.37105941772461, "step": 6199 }, { "epoch": 0.8442265795206971, "grad_norm": 40.568086819157756, "learning_rate": 5.770073345852671e-08, "logits/chosen": 13.393024444580078, "logits/rejected": 14.496946334838867, "logps/chosen": -4.223993301391602, "logps/rejected": -4.625687599182129, "loss": 3.8967, "rewards/accuracies": 0.75, "rewards/chosen": -42.239933013916016, "rewards/margins": 4.016942024230957, "rewards/rejected": -46.256874084472656, "step": 6200 }, { "epoch": 0.8443627450980392, "grad_norm": 40.47260068469916, "learning_rate": 5.760239496983041e-08, "logits/chosen": 14.612361907958984, "logits/rejected": 15.308700561523438, "logps/chosen": -4.74636173248291, "logps/rejected": -4.970841884613037, "loss": 4.235, "rewards/accuracies": 0.75, "rewards/chosen": -47.463619232177734, "rewards/margins": 2.244800567626953, "rewards/rejected": -49.70841979980469, "step": 6201 }, { "epoch": 0.8444989106753813, "grad_norm": 44.3024697760081, "learning_rate": 5.750413384870314e-08, "logits/chosen": 14.757448196411133, "logits/rejected": 14.528070449829102, "logps/chosen": -4.524238586425781, "logps/rejected": -4.726508140563965, "loss": 3.4976, "rewards/accuracies": 0.5, "rewards/chosen": -45.24238204956055, "rewards/margins": 2.022695541381836, "rewards/rejected": -47.26507568359375, "step": 6202 }, { "epoch": 0.8446350762527233, "grad_norm": 40.01569035723053, "learning_rate": 5.7405950117347966e-08, "logits/chosen": 14.390100479125977, "logits/rejected": 14.04139518737793, "logps/chosen": -4.70400333404541, "logps/rejected": -4.826506614685059, "loss": 3.9007, "rewards/accuracies": 0.5, "rewards/chosen": -47.040035247802734, "rewards/margins": 1.2250299453735352, "rewards/rejected": -48.26506423950195, "step": 6203 }, { "epoch": 0.8447712418300654, "grad_norm": 37.740351658924794, "learning_rate": 5.7307843797950305e-08, "logits/chosen": 14.797893524169922, "logits/rejected": 14.6875, "logps/chosen": -4.625878810882568, "logps/rejected": -4.592161178588867, "loss": 3.7939, "rewards/accuracies": 0.75, "rewards/chosen": -46.258785247802734, "rewards/margins": -0.3371744155883789, "rewards/rejected": -45.92161178588867, "step": 6204 }, { "epoch": 0.8449074074074074, "grad_norm": 39.92106051897673, "learning_rate": 5.720981491267802e-08, "logits/chosen": 14.256815910339355, "logits/rejected": 14.524845123291016, "logps/chosen": -4.709014892578125, "logps/rejected": -4.869412422180176, "loss": 3.7517, "rewards/accuracies": 0.5, "rewards/chosen": -47.09014892578125, "rewards/margins": 1.6039743423461914, "rewards/rejected": -48.694122314453125, "step": 6205 }, { "epoch": 0.8450435729847494, "grad_norm": 37.988418644436, "learning_rate": 5.7111863483681576e-08, "logits/chosen": 14.59476375579834, "logits/rejected": 15.155033111572266, "logps/chosen": -4.4277849197387695, "logps/rejected": -4.831780433654785, "loss": 4.0431, "rewards/accuracies": 0.75, "rewards/chosen": -44.27784729003906, "rewards/margins": 4.0399580001831055, "rewards/rejected": -48.31780242919922, "step": 6206 }, { "epoch": 0.8451797385620915, "grad_norm": 39.850104412504145, "learning_rate": 5.701398953309397e-08, "logits/chosen": 14.495859146118164, "logits/rejected": 15.186941146850586, "logps/chosen": -4.530017852783203, "logps/rejected": -4.972137451171875, "loss": 3.9888, "rewards/accuracies": 1.0, "rewards/chosen": -45.3001823425293, "rewards/margins": 4.421192169189453, "rewards/rejected": -49.72137451171875, "step": 6207 }, { "epoch": 0.8453159041394336, "grad_norm": 40.61901587525214, "learning_rate": 5.691619308303055e-08, "logits/chosen": 14.394906997680664, "logits/rejected": 14.918575286865234, "logps/chosen": -4.528404235839844, "logps/rejected": -4.920700550079346, "loss": 4.4171, "rewards/accuracies": 1.0, "rewards/chosen": -45.28404235839844, "rewards/margins": 3.922959327697754, "rewards/rejected": -49.20700454711914, "step": 6208 }, { "epoch": 0.8454520697167756, "grad_norm": 38.14184070228544, "learning_rate": 5.6818474155589224e-08, "logits/chosen": 14.455503463745117, "logits/rejected": 14.535888671875, "logps/chosen": -4.441728591918945, "logps/rejected": -4.473299026489258, "loss": 3.9297, "rewards/accuracies": 0.75, "rewards/chosen": -44.41728591918945, "rewards/margins": 0.3157033920288086, "rewards/rejected": -44.73299026489258, "step": 6209 }, { "epoch": 0.8455882352941176, "grad_norm": 41.845780799202, "learning_rate": 5.672083277285051e-08, "logits/chosen": 13.755717277526855, "logits/rejected": 13.721050262451172, "logps/chosen": -4.446059703826904, "logps/rejected": -4.485538959503174, "loss": 4.1288, "rewards/accuracies": 0.25, "rewards/chosen": -44.460594177246094, "rewards/margins": 0.3947944641113281, "rewards/rejected": -44.85539245605469, "step": 6210 }, { "epoch": 0.8457244008714597, "grad_norm": 44.57081158411142, "learning_rate": 5.662326895687717e-08, "logits/chosen": 14.126913070678711, "logits/rejected": 14.539234161376953, "logps/chosen": -4.888748645782471, "logps/rejected": -4.8745927810668945, "loss": 3.6811, "rewards/accuracies": 0.75, "rewards/chosen": -48.88748550415039, "rewards/margins": -0.14155960083007812, "rewards/rejected": -48.74592590332031, "step": 6211 }, { "epoch": 0.8458605664488017, "grad_norm": 45.69234604351985, "learning_rate": 5.652578272971453e-08, "logits/chosen": 15.025135040283203, "logits/rejected": 15.716615676879883, "logps/chosen": -5.061832427978516, "logps/rejected": -5.269657135009766, "loss": 3.7696, "rewards/accuracies": 0.75, "rewards/chosen": -50.61832046508789, "rewards/margins": 2.0782461166381836, "rewards/rejected": -52.69656753540039, "step": 6212 }, { "epoch": 0.8459967320261438, "grad_norm": 37.88605887876548, "learning_rate": 5.642837411339059e-08, "logits/chosen": 14.309130668640137, "logits/rejected": 14.224588394165039, "logps/chosen": -4.407349586486816, "logps/rejected": -4.920955657958984, "loss": 3.8313, "rewards/accuracies": 1.0, "rewards/chosen": -44.07349395751953, "rewards/margins": 5.136064529418945, "rewards/rejected": -49.209556579589844, "step": 6213 }, { "epoch": 0.8461328976034859, "grad_norm": 37.77512643328672, "learning_rate": 5.633104312991541e-08, "logits/chosen": 14.44084358215332, "logits/rejected": 14.85331916809082, "logps/chosen": -4.7175421714782715, "logps/rejected": -4.99468469619751, "loss": 3.6377, "rewards/accuracies": 0.75, "rewards/chosen": -47.17542266845703, "rewards/margins": 2.7714242935180664, "rewards/rejected": -49.94684600830078, "step": 6214 }, { "epoch": 0.8462690631808278, "grad_norm": 43.26155044685386, "learning_rate": 5.623378980128186e-08, "logits/chosen": 15.708230972290039, "logits/rejected": 15.759811401367188, "logps/chosen": -4.854701995849609, "logps/rejected": -4.795938491821289, "loss": 3.0077, "rewards/accuracies": 0.25, "rewards/chosen": -48.547019958496094, "rewards/margins": -0.5876321792602539, "rewards/rejected": -47.959388732910156, "step": 6215 }, { "epoch": 0.8464052287581699, "grad_norm": 42.006749130792315, "learning_rate": 5.6136614149465155e-08, "logits/chosen": 14.360015869140625, "logits/rejected": 14.760613441467285, "logps/chosen": -4.32787561416626, "logps/rejected": -4.5609893798828125, "loss": 4.1014, "rewards/accuracies": 1.0, "rewards/chosen": -43.27875900268555, "rewards/margins": 2.331134796142578, "rewards/rejected": -45.609893798828125, "step": 6216 }, { "epoch": 0.846541394335512, "grad_norm": 38.427993541509, "learning_rate": 5.603951619642284e-08, "logits/chosen": 14.526899337768555, "logits/rejected": 14.692164421081543, "logps/chosen": -4.425310134887695, "logps/rejected": -4.779153347015381, "loss": 3.8427, "rewards/accuracies": 0.75, "rewards/chosen": -44.25310516357422, "rewards/margins": 3.53842830657959, "rewards/rejected": -47.791534423828125, "step": 6217 }, { "epoch": 0.846677559912854, "grad_norm": 43.95021137828215, "learning_rate": 5.594249596409501e-08, "logits/chosen": 14.367188453674316, "logits/rejected": 14.553285598754883, "logps/chosen": -4.578090667724609, "logps/rejected": -4.491054058074951, "loss": 3.8161, "rewards/accuracies": 0.75, "rewards/chosen": -45.780906677246094, "rewards/margins": -0.8703670501708984, "rewards/rejected": -44.91053771972656, "step": 6218 }, { "epoch": 0.8468137254901961, "grad_norm": 43.203500175279245, "learning_rate": 5.5845553474404316e-08, "logits/chosen": 14.421059608459473, "logits/rejected": 15.187675476074219, "logps/chosen": -4.809326648712158, "logps/rejected": -5.061852931976318, "loss": 3.8512, "rewards/accuracies": 0.75, "rewards/chosen": -48.093265533447266, "rewards/margins": 2.525265693664551, "rewards/rejected": -50.6185302734375, "step": 6219 }, { "epoch": 0.8469498910675382, "grad_norm": 40.979904067034504, "learning_rate": 5.574868874925553e-08, "logits/chosen": 14.306295394897461, "logits/rejected": 14.983027458190918, "logps/chosen": -4.85296630859375, "logps/rejected": -4.967456817626953, "loss": 3.967, "rewards/accuracies": 0.75, "rewards/chosen": -48.529659271240234, "rewards/margins": 1.1449098587036133, "rewards/rejected": -49.67456817626953, "step": 6220 }, { "epoch": 0.8470860566448801, "grad_norm": 39.56988461017972, "learning_rate": 5.565190181053618e-08, "logits/chosen": 13.800539016723633, "logits/rejected": 15.875099182128906, "logps/chosen": -4.37888765335083, "logps/rejected": -4.916264057159424, "loss": 3.6957, "rewards/accuracies": 1.0, "rewards/chosen": -43.78887176513672, "rewards/margins": 5.37376594543457, "rewards/rejected": -49.16264343261719, "step": 6221 }, { "epoch": 0.8472222222222222, "grad_norm": 39.12626232186616, "learning_rate": 5.555519268011606e-08, "logits/chosen": 14.280525207519531, "logits/rejected": 15.293754577636719, "logps/chosen": -4.522516250610352, "logps/rejected": -4.712632179260254, "loss": 3.9968, "rewards/accuracies": 0.75, "rewards/chosen": -45.22515869140625, "rewards/margins": 1.9011659622192383, "rewards/rejected": -47.12632751464844, "step": 6222 }, { "epoch": 0.8473583877995643, "grad_norm": 43.1065934161399, "learning_rate": 5.545856137984728e-08, "logits/chosen": 15.287805557250977, "logits/rejected": 14.860462188720703, "logps/chosen": -4.956392288208008, "logps/rejected": -4.992436408996582, "loss": 3.853, "rewards/accuracies": 0.5, "rewards/chosen": -49.56391906738281, "rewards/margins": 0.360443115234375, "rewards/rejected": -49.92436218261719, "step": 6223 }, { "epoch": 0.8474945533769063, "grad_norm": 39.940572905073005, "learning_rate": 5.53620079315646e-08, "logits/chosen": 14.851907730102539, "logits/rejected": 15.204615592956543, "logps/chosen": -4.674081325531006, "logps/rejected": -4.742523670196533, "loss": 4.135, "rewards/accuracies": 0.5, "rewards/chosen": -46.740814208984375, "rewards/margins": 0.6844196319580078, "rewards/rejected": -47.42523193359375, "step": 6224 }, { "epoch": 0.8476307189542484, "grad_norm": 47.507033932539976, "learning_rate": 5.526553235708511e-08, "logits/chosen": 14.96396255493164, "logits/rejected": 15.134278297424316, "logps/chosen": -4.926098823547363, "logps/rejected": -4.695321083068848, "loss": 3.9712, "rewards/accuracies": 0.25, "rewards/chosen": -49.260986328125, "rewards/margins": -2.3077754974365234, "rewards/rejected": -46.953208923339844, "step": 6225 }, { "epoch": 0.8477668845315904, "grad_norm": 39.23235396705647, "learning_rate": 5.5169134678208076e-08, "logits/chosen": 13.959724426269531, "logits/rejected": 14.340978622436523, "logps/chosen": -4.171786308288574, "logps/rejected": -4.637228965759277, "loss": 4.1246, "rewards/accuracies": 0.75, "rewards/chosen": -41.71786117553711, "rewards/margins": 4.654431343078613, "rewards/rejected": -46.372291564941406, "step": 6226 }, { "epoch": 0.8479030501089324, "grad_norm": 36.528298606872696, "learning_rate": 5.5072814916715496e-08, "logits/chosen": 14.680169105529785, "logits/rejected": 14.655248641967773, "logps/chosen": -4.651508331298828, "logps/rejected": -4.783268451690674, "loss": 3.4495, "rewards/accuracies": 0.5, "rewards/chosen": -46.515079498291016, "rewards/margins": 1.3176050186157227, "rewards/rejected": -47.83268356323242, "step": 6227 }, { "epoch": 0.8480392156862745, "grad_norm": 39.87826099791426, "learning_rate": 5.497657309437165e-08, "logits/chosen": 14.39995002746582, "logits/rejected": 14.271187782287598, "logps/chosen": -4.762090682983398, "logps/rejected": -4.807863235473633, "loss": 4.401, "rewards/accuracies": 0.5, "rewards/chosen": -47.620906829833984, "rewards/margins": 0.45772361755371094, "rewards/rejected": -48.07862854003906, "step": 6228 }, { "epoch": 0.8481753812636166, "grad_norm": 38.80683096977193, "learning_rate": 5.488040923292301e-08, "logits/chosen": 14.86029052734375, "logits/rejected": 15.14348316192627, "logps/chosen": -4.070168495178223, "logps/rejected": -4.8217878341674805, "loss": 3.8649, "rewards/accuracies": 0.75, "rewards/chosen": -40.701683044433594, "rewards/margins": 7.516191482543945, "rewards/rejected": -48.21787643432617, "step": 6229 }, { "epoch": 0.8483115468409586, "grad_norm": 45.042836496573415, "learning_rate": 5.4784323354098725e-08, "logits/chosen": 13.79139518737793, "logits/rejected": 14.450042724609375, "logps/chosen": -4.3407745361328125, "logps/rejected": -4.573916435241699, "loss": 4.5973, "rewards/accuracies": 0.75, "rewards/chosen": -43.40774154663086, "rewards/margins": 2.3314247131347656, "rewards/rejected": -45.739166259765625, "step": 6230 }, { "epoch": 0.8484477124183006, "grad_norm": 40.78341739792413, "learning_rate": 5.468831547961019e-08, "logits/chosen": 14.71342658996582, "logits/rejected": 14.598689079284668, "logps/chosen": -4.630984306335449, "logps/rejected": -4.598812103271484, "loss": 3.8675, "rewards/accuracies": 0.75, "rewards/chosen": -46.309844970703125, "rewards/margins": -0.32172107696533203, "rewards/rejected": -45.988121032714844, "step": 6231 }, { "epoch": 0.8485838779956427, "grad_norm": 37.70773883381697, "learning_rate": 5.459238563115112e-08, "logits/chosen": 13.982807159423828, "logits/rejected": 14.943536758422852, "logps/chosen": -4.25662899017334, "logps/rejected": -4.5168890953063965, "loss": 3.4802, "rewards/accuracies": 0.75, "rewards/chosen": -42.566287994384766, "rewards/margins": 2.602602958679199, "rewards/rejected": -45.168888092041016, "step": 6232 }, { "epoch": 0.8487200435729847, "grad_norm": 41.875008256647575, "learning_rate": 5.449653383039767e-08, "logits/chosen": 14.53259563446045, "logits/rejected": 14.858951568603516, "logps/chosen": -4.411779880523682, "logps/rejected": -4.665679931640625, "loss": 4.3073, "rewards/accuracies": 0.75, "rewards/chosen": -44.1177978515625, "rewards/margins": 2.5389976501464844, "rewards/rejected": -46.656795501708984, "step": 6233 }, { "epoch": 0.8488562091503268, "grad_norm": 39.00391779302383, "learning_rate": 5.4400760099008406e-08, "logits/chosen": 14.39236068725586, "logits/rejected": 14.893011093139648, "logps/chosen": -4.7840895652771, "logps/rejected": -4.82798957824707, "loss": 4.0245, "rewards/accuracies": 0.5, "rewards/chosen": -47.84089660644531, "rewards/margins": 0.43900203704833984, "rewards/rejected": -48.2798957824707, "step": 6234 }, { "epoch": 0.8489923747276689, "grad_norm": 38.24843047579483, "learning_rate": 5.4305064458624126e-08, "logits/chosen": 14.417978286743164, "logits/rejected": 14.219121932983398, "logps/chosen": -4.583337783813477, "logps/rejected": -4.687268257141113, "loss": 3.6909, "rewards/accuracies": 0.5, "rewards/chosen": -45.833377838134766, "rewards/margins": 1.0392999649047852, "rewards/rejected": -46.8726806640625, "step": 6235 }, { "epoch": 0.849128540305011, "grad_norm": 43.47481361199636, "learning_rate": 5.420944693086804e-08, "logits/chosen": 14.543950080871582, "logits/rejected": 14.483282089233398, "logps/chosen": -4.826862335205078, "logps/rejected": -4.656125068664551, "loss": 4.2791, "rewards/accuracies": 0.25, "rewards/chosen": -48.26862335205078, "rewards/margins": -1.7073736190795898, "rewards/rejected": -46.561248779296875, "step": 6236 }, { "epoch": 0.8492647058823529, "grad_norm": 41.32670969074121, "learning_rate": 5.411390753734584e-08, "logits/chosen": 14.910539627075195, "logits/rejected": 14.888425827026367, "logps/chosen": -4.3762078285217285, "logps/rejected": -4.559186935424805, "loss": 4.3013, "rewards/accuracies": 0.75, "rewards/chosen": -43.76207733154297, "rewards/margins": 1.8297920227050781, "rewards/rejected": -45.59186935424805, "step": 6237 }, { "epoch": 0.849400871459695, "grad_norm": 45.83256655165777, "learning_rate": 5.401844629964527e-08, "logits/chosen": 14.972201347351074, "logits/rejected": 15.435983657836914, "logps/chosen": -4.845681190490723, "logps/rejected": -4.989363670349121, "loss": 4.0114, "rewards/accuracies": 0.5, "rewards/chosen": -48.45681381225586, "rewards/margins": 1.4368200302124023, "rewards/rejected": -49.89363479614258, "step": 6238 }, { "epoch": 0.8495370370370371, "grad_norm": 37.82683986764322, "learning_rate": 5.3923063239336686e-08, "logits/chosen": 13.702573776245117, "logits/rejected": 14.642902374267578, "logps/chosen": -4.009775638580322, "logps/rejected": -4.648927688598633, "loss": 3.731, "rewards/accuracies": 1.0, "rewards/chosen": -40.097755432128906, "rewards/margins": 6.391521453857422, "rewards/rejected": -46.48927688598633, "step": 6239 }, { "epoch": 0.8496732026143791, "grad_norm": 45.436021656973544, "learning_rate": 5.382775837797271e-08, "logits/chosen": 14.178096771240234, "logits/rejected": 14.532770156860352, "logps/chosen": -4.294064521789551, "logps/rejected": -4.699276924133301, "loss": 4.4758, "rewards/accuracies": 0.75, "rewards/chosen": -42.94064712524414, "rewards/margins": 4.052119255065918, "rewards/rejected": -46.992767333984375, "step": 6240 }, { "epoch": 0.8498093681917211, "grad_norm": 37.82023218715517, "learning_rate": 5.373253173708816e-08, "logits/chosen": 14.371891021728516, "logits/rejected": 14.725678443908691, "logps/chosen": -4.710378170013428, "logps/rejected": -4.559988975524902, "loss": 3.8778, "rewards/accuracies": 0.5, "rewards/chosen": -47.103782653808594, "rewards/margins": -1.5038948059082031, "rewards/rejected": -45.59988784790039, "step": 6241 }, { "epoch": 0.8499455337690632, "grad_norm": 38.03263078431588, "learning_rate": 5.363738333820036e-08, "logits/chosen": 14.202341079711914, "logits/rejected": 14.569953918457031, "logps/chosen": -4.269172191619873, "logps/rejected": -4.599857330322266, "loss": 3.8166, "rewards/accuracies": 1.0, "rewards/chosen": -42.69171905517578, "rewards/margins": 3.306854248046875, "rewards/rejected": -45.998573303222656, "step": 6242 }, { "epoch": 0.8500816993464052, "grad_norm": 38.25687395048713, "learning_rate": 5.354231320280882e-08, "logits/chosen": 14.415428161621094, "logits/rejected": 14.467449188232422, "logps/chosen": -4.239962577819824, "logps/rejected": -4.755488395690918, "loss": 3.7195, "rewards/accuracies": 0.75, "rewards/chosen": -42.399627685546875, "rewards/margins": 5.155254364013672, "rewards/rejected": -47.55487823486328, "step": 6243 }, { "epoch": 0.8502178649237473, "grad_norm": 42.80440820540707, "learning_rate": 5.3447321352395605e-08, "logits/chosen": 14.757710456848145, "logits/rejected": 15.122314453125, "logps/chosen": -4.200394630432129, "logps/rejected": -4.755496025085449, "loss": 3.9128, "rewards/accuracies": 0.75, "rewards/chosen": -42.003944396972656, "rewards/margins": 5.551016807556152, "rewards/rejected": -47.554962158203125, "step": 6244 }, { "epoch": 0.8503540305010894, "grad_norm": 36.828334768343296, "learning_rate": 5.3352407808424604e-08, "logits/chosen": 14.94629955291748, "logits/rejected": 14.972536087036133, "logps/chosen": -4.563112258911133, "logps/rejected": -4.547910213470459, "loss": 4.0248, "rewards/accuracies": 0.75, "rewards/chosen": -45.63111877441406, "rewards/margins": -0.15201759338378906, "rewards/rejected": -45.479103088378906, "step": 6245 }, { "epoch": 0.8504901960784313, "grad_norm": 40.38355351429966, "learning_rate": 5.3257572592342537e-08, "logits/chosen": 14.074896812438965, "logits/rejected": 14.758683204650879, "logps/chosen": -4.462700843811035, "logps/rejected": -4.419112205505371, "loss": 4.1295, "rewards/accuracies": 0.25, "rewards/chosen": -44.62700653076172, "rewards/margins": -0.43589019775390625, "rewards/rejected": -44.19111633300781, "step": 6246 }, { "epoch": 0.8506263616557734, "grad_norm": 38.62782498604092, "learning_rate": 5.316281572557817e-08, "logits/chosen": 13.951683044433594, "logits/rejected": 14.76471996307373, "logps/chosen": -4.551470756530762, "logps/rejected": -4.666003704071045, "loss": 3.9315, "rewards/accuracies": 0.5, "rewards/chosen": -45.51470184326172, "rewards/margins": 1.1453323364257812, "rewards/rejected": -46.6600341796875, "step": 6247 }, { "epoch": 0.8507625272331155, "grad_norm": 39.83504526295554, "learning_rate": 5.306813722954255e-08, "logits/chosen": 14.939990997314453, "logits/rejected": 15.204742431640625, "logps/chosen": -4.748744964599609, "logps/rejected": -5.148334980010986, "loss": 3.6224, "rewards/accuracies": 1.0, "rewards/chosen": -47.487449645996094, "rewards/margins": 3.9959001541137695, "rewards/rejected": -51.48334884643555, "step": 6248 }, { "epoch": 0.8508986928104575, "grad_norm": 40.67299525470967, "learning_rate": 5.2973537125629064e-08, "logits/chosen": 14.633798599243164, "logits/rejected": 14.36746883392334, "logps/chosen": -4.516666412353516, "logps/rejected": -4.746030330657959, "loss": 3.9521, "rewards/accuracies": 0.75, "rewards/chosen": -45.166664123535156, "rewards/margins": 2.29364013671875, "rewards/rejected": -47.46030044555664, "step": 6249 }, { "epoch": 0.8510348583877996, "grad_norm": 36.91582929771106, "learning_rate": 5.287901543521349e-08, "logits/chosen": 14.974002838134766, "logits/rejected": 15.308134078979492, "logps/chosen": -4.576568126678467, "logps/rejected": -4.738439559936523, "loss": 3.2937, "rewards/accuracies": 0.5, "rewards/chosen": -45.76567840576172, "rewards/margins": 1.618718147277832, "rewards/rejected": -47.3843994140625, "step": 6250 }, { "epoch": 0.8511710239651417, "grad_norm": 37.648796548499966, "learning_rate": 5.278457217965364e-08, "logits/chosen": 14.065122604370117, "logits/rejected": 14.309077262878418, "logps/chosen": -4.168010711669922, "logps/rejected": -4.472809314727783, "loss": 3.8261, "rewards/accuracies": 0.75, "rewards/chosen": -41.68010711669922, "rewards/margins": 3.0479860305786133, "rewards/rejected": -44.728092193603516, "step": 6251 }, { "epoch": 0.8513071895424836, "grad_norm": 45.23366660375961, "learning_rate": 5.269020738028982e-08, "logits/chosen": 13.807381629943848, "logits/rejected": 14.820587158203125, "logps/chosen": -4.044520378112793, "logps/rejected": -4.581121444702148, "loss": 4.1857, "rewards/accuracies": 1.0, "rewards/chosen": -40.4452018737793, "rewards/margins": 5.36601448059082, "rewards/rejected": -45.811214447021484, "step": 6252 }, { "epoch": 0.8514433551198257, "grad_norm": 39.704547730644386, "learning_rate": 5.259592105844461e-08, "logits/chosen": 14.658027648925781, "logits/rejected": 14.7610445022583, "logps/chosen": -4.396629333496094, "logps/rejected": -4.625602722167969, "loss": 4.1843, "rewards/accuracies": 0.75, "rewards/chosen": -43.96629333496094, "rewards/margins": 2.289735794067383, "rewards/rejected": -46.25603103637695, "step": 6253 }, { "epoch": 0.8515795206971678, "grad_norm": 43.89090602892998, "learning_rate": 5.250171323542263e-08, "logits/chosen": 14.496736526489258, "logits/rejected": 14.620826721191406, "logps/chosen": -4.636334419250488, "logps/rejected": -4.779732704162598, "loss": 3.4319, "rewards/accuracies": 0.5, "rewards/chosen": -46.363346099853516, "rewards/margins": 1.4339828491210938, "rewards/rejected": -47.79732894897461, "step": 6254 }, { "epoch": 0.8517156862745098, "grad_norm": 41.41320361164033, "learning_rate": 5.240758393251097e-08, "logits/chosen": 14.910444259643555, "logits/rejected": 15.330184936523438, "logps/chosen": -4.833310604095459, "logps/rejected": -5.07781982421875, "loss": 4.2818, "rewards/accuracies": 1.0, "rewards/chosen": -48.333106994628906, "rewards/margins": 2.4450931549072266, "rewards/rejected": -50.7781982421875, "step": 6255 }, { "epoch": 0.8518518518518519, "grad_norm": 39.825754031931936, "learning_rate": 5.231353317097906e-08, "logits/chosen": 14.455728530883789, "logits/rejected": 13.921907424926758, "logps/chosen": -4.674918174743652, "logps/rejected": -4.555726051330566, "loss": 4.2371, "rewards/accuracies": 0.5, "rewards/chosen": -46.749183654785156, "rewards/margins": -1.1919260025024414, "rewards/rejected": -45.55725860595703, "step": 6256 }, { "epoch": 0.851988017429194, "grad_norm": 38.94034525026774, "learning_rate": 5.2219560972078223e-08, "logits/chosen": 14.46717643737793, "logits/rejected": 14.501235008239746, "logps/chosen": -4.533130168914795, "logps/rejected": -4.5654473304748535, "loss": 3.6489, "rewards/accuracies": 0.5, "rewards/chosen": -45.33130645751953, "rewards/margins": 0.32317256927490234, "rewards/rejected": -45.654476165771484, "step": 6257 }, { "epoch": 0.8521241830065359, "grad_norm": 34.79345116952912, "learning_rate": 5.21256673570424e-08, "logits/chosen": 14.690731048583984, "logits/rejected": 14.960939407348633, "logps/chosen": -4.689195156097412, "logps/rejected": -5.0147600173950195, "loss": 3.5716, "rewards/accuracies": 0.75, "rewards/chosen": -46.89194869995117, "rewards/margins": 3.255645751953125, "rewards/rejected": -50.14759826660156, "step": 6258 }, { "epoch": 0.852260348583878, "grad_norm": 40.61209898859293, "learning_rate": 5.2031852347087643e-08, "logits/chosen": 14.656595230102539, "logits/rejected": 14.527957916259766, "logps/chosen": -4.685667037963867, "logps/rejected": -4.672515869140625, "loss": 4.377, "rewards/accuracies": 0.75, "rewards/chosen": -46.856666564941406, "rewards/margins": -0.13151168823242188, "rewards/rejected": -46.72515869140625, "step": 6259 }, { "epoch": 0.8523965141612201, "grad_norm": 38.16198480336888, "learning_rate": 5.1938115963412156e-08, "logits/chosen": 13.689838409423828, "logits/rejected": 14.223959922790527, "logps/chosen": -4.251430988311768, "logps/rejected": -4.482953071594238, "loss": 3.6684, "rewards/accuracies": 0.75, "rewards/chosen": -42.514312744140625, "rewards/margins": 2.315220832824707, "rewards/rejected": -44.82952880859375, "step": 6260 }, { "epoch": 0.8525326797385621, "grad_norm": 40.49878009704965, "learning_rate": 5.184445822719641e-08, "logits/chosen": 14.584734916687012, "logits/rejected": 15.040432929992676, "logps/chosen": -5.172530174255371, "logps/rejected": -5.165698051452637, "loss": 4.251, "rewards/accuracies": 0.25, "rewards/chosen": -51.72529983520508, "rewards/margins": -0.06832027435302734, "rewards/rejected": -51.656978607177734, "step": 6261 }, { "epoch": 0.8526688453159041, "grad_norm": 41.209830557179465, "learning_rate": 5.1750879159603344e-08, "logits/chosen": 14.945521354675293, "logits/rejected": 15.006851196289062, "logps/chosen": -5.174594879150391, "logps/rejected": -5.026777267456055, "loss": 4.0987, "rewards/accuracies": 0.75, "rewards/chosen": -51.745948791503906, "rewards/margins": -1.4781761169433594, "rewards/rejected": -50.26776885986328, "step": 6262 }, { "epoch": 0.8528050108932462, "grad_norm": 36.08724110866231, "learning_rate": 5.165737878177769e-08, "logits/chosen": 14.904624938964844, "logits/rejected": 14.844938278198242, "logps/chosen": -4.889288902282715, "logps/rejected": -4.76701545715332, "loss": 4.0085, "rewards/accuracies": 0.5, "rewards/chosen": -48.89289093017578, "rewards/margins": -1.222731590270996, "rewards/rejected": -47.67015838623047, "step": 6263 }, { "epoch": 0.8529411764705882, "grad_norm": 39.416193536296575, "learning_rate": 5.1563957114846736e-08, "logits/chosen": 14.184259414672852, "logits/rejected": 14.804023742675781, "logps/chosen": -4.466268539428711, "logps/rejected": -4.599756717681885, "loss": 3.8721, "rewards/accuracies": 0.75, "rewards/chosen": -44.662681579589844, "rewards/margins": 1.3348875045776367, "rewards/rejected": -45.99756622314453, "step": 6264 }, { "epoch": 0.8530773420479303, "grad_norm": 37.13867648902403, "learning_rate": 5.147061417991994e-08, "logits/chosen": 14.794797897338867, "logits/rejected": 15.037956237792969, "logps/chosen": -4.852409839630127, "logps/rejected": -4.955851078033447, "loss": 3.8426, "rewards/accuracies": 0.5, "rewards/chosen": -48.52410125732422, "rewards/margins": 1.0344133377075195, "rewards/rejected": -49.558509826660156, "step": 6265 }, { "epoch": 0.8532135076252724, "grad_norm": 40.095417933202015, "learning_rate": 5.137734999808878e-08, "logits/chosen": 14.95899486541748, "logits/rejected": 15.05434799194336, "logps/chosen": -4.665475845336914, "logps/rejected": -4.33648681640625, "loss": 4.2926, "rewards/accuracies": 0.25, "rewards/chosen": -46.65475845336914, "rewards/margins": -3.289891242980957, "rewards/rejected": -43.3648681640625, "step": 6266 }, { "epoch": 0.8533496732026143, "grad_norm": 48.0746933035811, "learning_rate": 5.128416459042708e-08, "logits/chosen": 14.839713096618652, "logits/rejected": 15.274589538574219, "logps/chosen": -4.6399993896484375, "logps/rejected": -4.705672740936279, "loss": 4.4962, "rewards/accuracies": 0.5, "rewards/chosen": -46.399993896484375, "rewards/margins": 0.6567296981811523, "rewards/rejected": -47.056724548339844, "step": 6267 }, { "epoch": 0.8534858387799564, "grad_norm": 37.75660007702148, "learning_rate": 5.119105797799106e-08, "logits/chosen": 15.018550872802734, "logits/rejected": 14.275456428527832, "logps/chosen": -4.8961591720581055, "logps/rejected": -4.811429023742676, "loss": 3.6815, "rewards/accuracies": 0.5, "rewards/chosen": -48.96159362792969, "rewards/margins": -0.8473052978515625, "rewards/rejected": -48.114288330078125, "step": 6268 }, { "epoch": 0.8536220043572985, "grad_norm": 44.62329233035274, "learning_rate": 5.109803018181864e-08, "logits/chosen": 14.21280288696289, "logits/rejected": 14.22084903717041, "logps/chosen": -4.599079608917236, "logps/rejected": -4.622603416442871, "loss": 3.9144, "rewards/accuracies": 0.5, "rewards/chosen": -45.99079132080078, "rewards/margins": 0.23523902893066406, "rewards/rejected": -46.22603225708008, "step": 6269 }, { "epoch": 0.8537581699346405, "grad_norm": 40.78126145898024, "learning_rate": 5.100508122293039e-08, "logits/chosen": 14.831705093383789, "logits/rejected": 14.702640533447266, "logps/chosen": -4.390435218811035, "logps/rejected": -4.6198859214782715, "loss": 4.095, "rewards/accuracies": 0.5, "rewards/chosen": -43.90435028076172, "rewards/margins": 2.2945079803466797, "rewards/rejected": -46.19886016845703, "step": 6270 }, { "epoch": 0.8538943355119826, "grad_norm": 42.420764974441816, "learning_rate": 5.091221112232893e-08, "logits/chosen": 13.901590347290039, "logits/rejected": 13.945232391357422, "logps/chosen": -4.588263988494873, "logps/rejected": -4.611536026000977, "loss": 4.0896, "rewards/accuracies": 0.5, "rewards/chosen": -45.88264083862305, "rewards/margins": 0.23271942138671875, "rewards/rejected": -46.115360260009766, "step": 6271 }, { "epoch": 0.8540305010893247, "grad_norm": 41.47533289610718, "learning_rate": 5.081941990099885e-08, "logits/chosen": 14.137815475463867, "logits/rejected": 14.872845649719238, "logps/chosen": -4.273784637451172, "logps/rejected": -4.546964168548584, "loss": 4.0938, "rewards/accuracies": 0.75, "rewards/chosen": -42.73784255981445, "rewards/margins": 2.731797218322754, "rewards/rejected": -45.469642639160156, "step": 6272 }, { "epoch": 0.8541666666666666, "grad_norm": 39.59018270659481, "learning_rate": 5.07267075799072e-08, "logits/chosen": 14.044655799865723, "logits/rejected": 13.924846649169922, "logps/chosen": -4.646505355834961, "logps/rejected": -4.752919673919678, "loss": 3.9864, "rewards/accuracies": 0.5, "rewards/chosen": -46.46505355834961, "rewards/margins": 1.064143180847168, "rewards/rejected": -47.529197692871094, "step": 6273 }, { "epoch": 0.8543028322440087, "grad_norm": 39.27157624369311, "learning_rate": 5.063407418000323e-08, "logits/chosen": 13.471273422241211, "logits/rejected": 14.210247039794922, "logps/chosen": -4.494602203369141, "logps/rejected": -4.409513473510742, "loss": 4.1626, "rewards/accuracies": 0.5, "rewards/chosen": -44.946022033691406, "rewards/margins": -0.8508920669555664, "rewards/rejected": -44.095130920410156, "step": 6274 }, { "epoch": 0.8544389978213508, "grad_norm": 40.11945573658235, "learning_rate": 5.054151972221796e-08, "logits/chosen": 14.489456176757812, "logits/rejected": 14.15043830871582, "logps/chosen": -4.576015472412109, "logps/rejected": -4.772931098937988, "loss": 4.5395, "rewards/accuracies": 0.75, "rewards/chosen": -45.760154724121094, "rewards/margins": 1.9691534042358398, "rewards/rejected": -47.72930908203125, "step": 6275 }, { "epoch": 0.8545751633986928, "grad_norm": 44.42420450224698, "learning_rate": 5.0449044227464946e-08, "logits/chosen": 14.508440971374512, "logits/rejected": 14.26313591003418, "logps/chosen": -4.572574615478516, "logps/rejected": -4.609202861785889, "loss": 4.6339, "rewards/accuracies": 0.75, "rewards/chosen": -45.725746154785156, "rewards/margins": 0.36627769470214844, "rewards/rejected": -46.09202575683594, "step": 6276 }, { "epoch": 0.8547113289760349, "grad_norm": 37.11790677404647, "learning_rate": 5.035664771663994e-08, "logits/chosen": 13.301528930664062, "logits/rejected": 14.612176895141602, "logps/chosen": -4.063612937927246, "logps/rejected": -4.874410629272461, "loss": 3.5585, "rewards/accuracies": 1.0, "rewards/chosen": -40.636131286621094, "rewards/margins": 8.107973098754883, "rewards/rejected": -48.744102478027344, "step": 6277 }, { "epoch": 0.8548474945533769, "grad_norm": 36.615768146906525, "learning_rate": 5.0264330210620445e-08, "logits/chosen": 14.525392532348633, "logits/rejected": 14.834829330444336, "logps/chosen": -4.601931571960449, "logps/rejected": -4.859325408935547, "loss": 3.3096, "rewards/accuracies": 0.75, "rewards/chosen": -46.019317626953125, "rewards/margins": 2.573932647705078, "rewards/rejected": -48.59324645996094, "step": 6278 }, { "epoch": 0.8549836601307189, "grad_norm": 39.79979013845339, "learning_rate": 5.0172091730266464e-08, "logits/chosen": 14.560958862304688, "logits/rejected": 15.448087692260742, "logps/chosen": -4.810788154602051, "logps/rejected": -5.0091633796691895, "loss": 3.9627, "rewards/accuracies": 0.5, "rewards/chosen": -48.107879638671875, "rewards/margins": 1.9837532043457031, "rewards/rejected": -50.09163284301758, "step": 6279 }, { "epoch": 0.855119825708061, "grad_norm": 41.695983125828924, "learning_rate": 5.007993229642018e-08, "logits/chosen": 15.24697494506836, "logits/rejected": 15.064852714538574, "logps/chosen": -5.0692548751831055, "logps/rejected": -4.900490760803223, "loss": 3.7666, "rewards/accuracies": 0.5, "rewards/chosen": -50.69254684448242, "rewards/margins": -1.687643051147461, "rewards/rejected": -49.004905700683594, "step": 6280 }, { "epoch": 0.8552559912854031, "grad_norm": 46.02054650951853, "learning_rate": 4.99878519299056e-08, "logits/chosen": 14.64189338684082, "logits/rejected": 14.11971664428711, "logps/chosen": -4.573061943054199, "logps/rejected": -4.551560878753662, "loss": 4.5487, "rewards/accuracies": 0.25, "rewards/chosen": -45.730621337890625, "rewards/margins": -0.2150115966796875, "rewards/rejected": -45.51560974121094, "step": 6281 }, { "epoch": 0.8553921568627451, "grad_norm": 36.96638190021719, "learning_rate": 4.989585065152906e-08, "logits/chosen": 14.410411834716797, "logits/rejected": 14.839014053344727, "logps/chosen": -4.579722881317139, "logps/rejected": -4.962399959564209, "loss": 3.5535, "rewards/accuracies": 0.75, "rewards/chosen": -45.7972297668457, "rewards/margins": 3.826770782470703, "rewards/rejected": -49.624000549316406, "step": 6282 }, { "epoch": 0.8555283224400871, "grad_norm": 42.79794399805182, "learning_rate": 4.980392848207917e-08, "logits/chosen": 14.644388198852539, "logits/rejected": 15.036510467529297, "logps/chosen": -4.586919784545898, "logps/rejected": -4.866744041442871, "loss": 4.2799, "rewards/accuracies": 0.5, "rewards/chosen": -45.869197845458984, "rewards/margins": 2.7982378005981445, "rewards/rejected": -48.66743469238281, "step": 6283 }, { "epoch": 0.8556644880174292, "grad_norm": 39.08625273884677, "learning_rate": 4.97120854423263e-08, "logits/chosen": 14.430566787719727, "logits/rejected": 15.104718208312988, "logps/chosen": -4.644044876098633, "logps/rejected": -5.298306465148926, "loss": 3.2158, "rewards/accuracies": 1.0, "rewards/chosen": -46.440452575683594, "rewards/margins": 6.542610168457031, "rewards/rejected": -52.983062744140625, "step": 6284 }, { "epoch": 0.8558006535947712, "grad_norm": 41.34573152124854, "learning_rate": 4.962032155302323e-08, "logits/chosen": 14.67190170288086, "logits/rejected": 14.395873069763184, "logps/chosen": -4.637031555175781, "logps/rejected": -4.685266971588135, "loss": 4.3066, "rewards/accuracies": 0.75, "rewards/chosen": -46.37031555175781, "rewards/margins": 0.48235034942626953, "rewards/rejected": -46.85266876220703, "step": 6285 }, { "epoch": 0.8559368191721133, "grad_norm": 39.82232551470537, "learning_rate": 4.952863683490482e-08, "logits/chosen": 14.877391815185547, "logits/rejected": 14.044604301452637, "logps/chosen": -4.951569557189941, "logps/rejected": -4.565905570983887, "loss": 4.2248, "rewards/accuracies": 0.0, "rewards/chosen": -49.51569747924805, "rewards/margins": -3.8566436767578125, "rewards/rejected": -45.659053802490234, "step": 6286 }, { "epoch": 0.8560729847494554, "grad_norm": 38.34209812050936, "learning_rate": 4.9437031308687904e-08, "logits/chosen": 15.024900436401367, "logits/rejected": 15.492517471313477, "logps/chosen": -4.724591255187988, "logps/rejected": -4.884742736816406, "loss": 3.6151, "rewards/accuracies": 0.75, "rewards/chosen": -47.24591064453125, "rewards/margins": 1.6015186309814453, "rewards/rejected": -48.84742736816406, "step": 6287 }, { "epoch": 0.8562091503267973, "grad_norm": 40.785366902068446, "learning_rate": 4.9345504995071554e-08, "logits/chosen": 14.759358406066895, "logits/rejected": 14.882135391235352, "logps/chosen": -4.594047546386719, "logps/rejected": -4.778287887573242, "loss": 3.8745, "rewards/accuracies": 0.75, "rewards/chosen": -45.94047546386719, "rewards/margins": 1.842402458190918, "rewards/rejected": -47.782875061035156, "step": 6288 }, { "epoch": 0.8563453159041394, "grad_norm": 39.730666458900494, "learning_rate": 4.925405791473687e-08, "logits/chosen": 14.211091995239258, "logits/rejected": 14.360159873962402, "logps/chosen": -4.541609764099121, "logps/rejected": -4.6906867027282715, "loss": 4.1676, "rewards/accuracies": 1.0, "rewards/chosen": -45.416099548339844, "rewards/margins": 1.4907646179199219, "rewards/rejected": -46.906864166259766, "step": 6289 }, { "epoch": 0.8564814814814815, "grad_norm": 45.72498334098101, "learning_rate": 4.916269008834719e-08, "logits/chosen": 14.470311164855957, "logits/rejected": 14.769232749938965, "logps/chosen": -4.295186519622803, "logps/rejected": -4.375543594360352, "loss": 3.4272, "rewards/accuracies": 0.75, "rewards/chosen": -42.951866149902344, "rewards/margins": 0.8035726547241211, "rewards/rejected": -43.75543975830078, "step": 6290 }, { "epoch": 0.8566176470588235, "grad_norm": 38.15986381780877, "learning_rate": 4.907140153654765e-08, "logits/chosen": 13.695808410644531, "logits/rejected": 13.967390060424805, "logps/chosen": -4.269187927246094, "logps/rejected": -4.178229808807373, "loss": 3.7416, "rewards/accuracies": 0.5, "rewards/chosen": -42.69187927246094, "rewards/margins": -0.9095830917358398, "rewards/rejected": -41.78229904174805, "step": 6291 }, { "epoch": 0.8567538126361656, "grad_norm": 39.277175984462744, "learning_rate": 4.8980192279965705e-08, "logits/chosen": 14.34804630279541, "logits/rejected": 14.849483489990234, "logps/chosen": -4.666017532348633, "logps/rejected": -4.836252212524414, "loss": 3.9532, "rewards/accuracies": 0.75, "rewards/chosen": -46.66017150878906, "rewards/margins": 1.7023448944091797, "rewards/rejected": -48.362518310546875, "step": 6292 }, { "epoch": 0.8568899782135077, "grad_norm": 42.80823472395064, "learning_rate": 4.8889062339210995e-08, "logits/chosen": 14.965873718261719, "logits/rejected": 15.61726188659668, "logps/chosen": -4.807011604309082, "logps/rejected": -4.942631244659424, "loss": 4.1476, "rewards/accuracies": 0.75, "rewards/chosen": -48.07011413574219, "rewards/margins": 1.3561992645263672, "rewards/rejected": -49.42631530761719, "step": 6293 }, { "epoch": 0.8570261437908496, "grad_norm": 41.36812119767978, "learning_rate": 4.879801173487488e-08, "logits/chosen": 14.805535316467285, "logits/rejected": 15.961185455322266, "logps/chosen": -4.653742790222168, "logps/rejected": -4.83939266204834, "loss": 4.2671, "rewards/accuracies": 0.75, "rewards/chosen": -46.53742980957031, "rewards/margins": 1.8564987182617188, "rewards/rejected": -48.39392852783203, "step": 6294 }, { "epoch": 0.8571623093681917, "grad_norm": 43.14875866328341, "learning_rate": 4.870704048753107e-08, "logits/chosen": 15.162431716918945, "logits/rejected": 15.066777229309082, "logps/chosen": -5.006462574005127, "logps/rejected": -5.187044620513916, "loss": 3.7165, "rewards/accuracies": 0.5, "rewards/chosen": -50.06462478637695, "rewards/margins": 1.8058252334594727, "rewards/rejected": -51.87044906616211, "step": 6295 }, { "epoch": 0.8572984749455338, "grad_norm": 40.954369394520256, "learning_rate": 4.861614861773526e-08, "logits/chosen": 13.985841751098633, "logits/rejected": 14.812657356262207, "logps/chosen": -4.540799140930176, "logps/rejected": -4.923230171203613, "loss": 4.2168, "rewards/accuracies": 0.75, "rewards/chosen": -45.407989501953125, "rewards/margins": 3.824312210083008, "rewards/rejected": -49.2322998046875, "step": 6296 }, { "epoch": 0.8574346405228758, "grad_norm": 42.946688023245464, "learning_rate": 4.8525336146025344e-08, "logits/chosen": 14.930102348327637, "logits/rejected": 14.69635009765625, "logps/chosen": -4.833921432495117, "logps/rejected": -4.770147323608398, "loss": 3.5737, "rewards/accuracies": 0.5, "rewards/chosen": -48.33921432495117, "rewards/margins": -0.6377449035644531, "rewards/rejected": -47.70146942138672, "step": 6297 }, { "epoch": 0.8575708061002179, "grad_norm": 38.10756849362819, "learning_rate": 4.8434603092920936e-08, "logits/chosen": 14.256793975830078, "logits/rejected": 14.08391284942627, "logps/chosen": -4.529571533203125, "logps/rejected": -4.639126300811768, "loss": 3.702, "rewards/accuracies": 0.5, "rewards/chosen": -45.29571533203125, "rewards/margins": 1.0955514907836914, "rewards/rejected": -46.391265869140625, "step": 6298 }, { "epoch": 0.8577069716775599, "grad_norm": 39.68249526126104, "learning_rate": 4.834394947892404e-08, "logits/chosen": 14.427275657653809, "logits/rejected": 14.571354866027832, "logps/chosen": -4.744769096374512, "logps/rejected": -4.808130741119385, "loss": 4.12, "rewards/accuracies": 0.5, "rewards/chosen": -47.44768524169922, "rewards/margins": 0.6336183547973633, "rewards/rejected": -48.08130645751953, "step": 6299 }, { "epoch": 0.8578431372549019, "grad_norm": 41.1038391269269, "learning_rate": 4.8253375324518676e-08, "logits/chosen": 14.447763442993164, "logits/rejected": 15.869953155517578, "logps/chosen": -4.560564994812012, "logps/rejected": -5.087528228759766, "loss": 4.2428, "rewards/accuracies": 1.0, "rewards/chosen": -45.605648040771484, "rewards/margins": 5.269634246826172, "rewards/rejected": -50.875282287597656, "step": 6300 }, { "epoch": 0.857979302832244, "grad_norm": 40.75760034436127, "learning_rate": 4.8162880650170646e-08, "logits/chosen": 15.493326187133789, "logits/rejected": 15.639666557312012, "logps/chosen": -4.961874008178711, "logps/rejected": -5.045529365539551, "loss": 3.7324, "rewards/accuracies": 0.75, "rewards/chosen": -49.618743896484375, "rewards/margins": 0.8365488052368164, "rewards/rejected": -50.455291748046875, "step": 6301 }, { "epoch": 0.8581154684095861, "grad_norm": 41.546073678012256, "learning_rate": 4.807246547632804e-08, "logits/chosen": 13.720367431640625, "logits/rejected": 13.578420639038086, "logps/chosen": -4.307520389556885, "logps/rejected": -4.258754730224609, "loss": 3.6193, "rewards/accuracies": 0.5, "rewards/chosen": -43.07520294189453, "rewards/margins": -0.4876594543457031, "rewards/rejected": -42.587547302246094, "step": 6302 }, { "epoch": 0.858251633986928, "grad_norm": 37.348960546239255, "learning_rate": 4.7982129823420915e-08, "logits/chosen": 14.91280460357666, "logits/rejected": 15.057134628295898, "logps/chosen": -4.811737060546875, "logps/rejected": -5.053389549255371, "loss": 3.6341, "rewards/accuracies": 0.5, "rewards/chosen": -48.11737060546875, "rewards/margins": 2.4165220260620117, "rewards/rejected": -50.53389358520508, "step": 6303 }, { "epoch": 0.8583877995642701, "grad_norm": 40.43452037921048, "learning_rate": 4.789187371186143e-08, "logits/chosen": 13.308171272277832, "logits/rejected": 14.667356491088867, "logps/chosen": -4.157777309417725, "logps/rejected": -4.6551971435546875, "loss": 4.1478, "rewards/accuracies": 0.5, "rewards/chosen": -41.57777404785156, "rewards/margins": 4.9741973876953125, "rewards/rejected": -46.551971435546875, "step": 6304 }, { "epoch": 0.8585239651416122, "grad_norm": 39.986494730800636, "learning_rate": 4.780169716204358e-08, "logits/chosen": 15.07441520690918, "logits/rejected": 15.273904800415039, "logps/chosen": -4.754188537597656, "logps/rejected": -4.881324291229248, "loss": 3.5399, "rewards/accuracies": 0.75, "rewards/chosen": -47.54188537597656, "rewards/margins": 1.2713565826416016, "rewards/rejected": -48.81324005126953, "step": 6305 }, { "epoch": 0.8586601307189542, "grad_norm": 38.66939877569114, "learning_rate": 4.7711600194343526e-08, "logits/chosen": 15.013084411621094, "logits/rejected": 15.113861083984375, "logps/chosen": -4.962172508239746, "logps/rejected": -5.166897773742676, "loss": 3.9113, "rewards/accuracies": 0.5, "rewards/chosen": -49.62172317504883, "rewards/margins": 2.047255516052246, "rewards/rejected": -51.668975830078125, "step": 6306 }, { "epoch": 0.8587962962962963, "grad_norm": 41.05351881172554, "learning_rate": 4.762158282911959e-08, "logits/chosen": 13.365886688232422, "logits/rejected": 14.154801368713379, "logps/chosen": -4.1039299964904785, "logps/rejected": -4.40407657623291, "loss": 3.9181, "rewards/accuracies": 0.75, "rewards/chosen": -41.03929901123047, "rewards/margins": 3.00146484375, "rewards/rejected": -44.04076385498047, "step": 6307 }, { "epoch": 0.8589324618736384, "grad_norm": 40.36085650790186, "learning_rate": 4.753164508671168e-08, "logits/chosen": 15.649408340454102, "logits/rejected": 14.316495895385742, "logps/chosen": -4.818583965301514, "logps/rejected": -4.813755989074707, "loss": 4.0167, "rewards/accuracies": 0.5, "rewards/chosen": -48.18583679199219, "rewards/margins": -0.04828071594238281, "rewards/rejected": -48.13755798339844, "step": 6308 }, { "epoch": 0.8590686274509803, "grad_norm": 39.602435270438775, "learning_rate": 4.7441786987442125e-08, "logits/chosen": 13.940446853637695, "logits/rejected": 14.756650924682617, "logps/chosen": -4.305540084838867, "logps/rejected": -4.742859840393066, "loss": 3.6927, "rewards/accuracies": 1.0, "rewards/chosen": -43.05540084838867, "rewards/margins": 4.373201370239258, "rewards/rejected": -47.42860412597656, "step": 6309 }, { "epoch": 0.8592047930283224, "grad_norm": 41.002966033853376, "learning_rate": 4.735200855161512e-08, "logits/chosen": 14.15383529663086, "logits/rejected": 14.69749927520752, "logps/chosen": -4.81422758102417, "logps/rejected": -4.769549369812012, "loss": 3.9879, "rewards/accuracies": 0.5, "rewards/chosen": -48.142276763916016, "rewards/margins": -0.44678211212158203, "rewards/rejected": -47.69549560546875, "step": 6310 }, { "epoch": 0.8593409586056645, "grad_norm": 39.10014673863383, "learning_rate": 4.7262309799516754e-08, "logits/chosen": 14.935556411743164, "logits/rejected": 14.99087142944336, "logps/chosen": -4.871364116668701, "logps/rejected": -4.726001739501953, "loss": 4.2759, "rewards/accuracies": 0.5, "rewards/chosen": -48.71364212036133, "rewards/margins": -1.4536218643188477, "rewards/rejected": -47.2600212097168, "step": 6311 }, { "epoch": 0.8594771241830066, "grad_norm": 40.922851436680965, "learning_rate": 4.717269075141521e-08, "logits/chosen": 13.639924049377441, "logits/rejected": 14.704666137695312, "logps/chosen": -4.286572456359863, "logps/rejected": -4.674952030181885, "loss": 3.5535, "rewards/accuracies": 1.0, "rewards/chosen": -42.865726470947266, "rewards/margins": 3.883790969848633, "rewards/rejected": -46.74951934814453, "step": 6312 }, { "epoch": 0.8596132897603486, "grad_norm": 37.01681683613287, "learning_rate": 4.708315142756079e-08, "logits/chosen": 15.002361297607422, "logits/rejected": 14.774904251098633, "logps/chosen": -4.786962509155273, "logps/rejected": -4.84755277633667, "loss": 3.7274, "rewards/accuracies": 0.5, "rewards/chosen": -47.86962127685547, "rewards/margins": 0.6059064865112305, "rewards/rejected": -48.475528717041016, "step": 6313 }, { "epoch": 0.8597494553376906, "grad_norm": 38.90887564265552, "learning_rate": 4.699369184818547e-08, "logits/chosen": 14.701847076416016, "logits/rejected": 14.715373039245605, "logps/chosen": -4.627903938293457, "logps/rejected": -4.945742607116699, "loss": 4.0393, "rewards/accuracies": 0.75, "rewards/chosen": -46.27903747558594, "rewards/margins": 3.1783885955810547, "rewards/rejected": -49.457427978515625, "step": 6314 }, { "epoch": 0.8598856209150327, "grad_norm": 37.721089043289645, "learning_rate": 4.690431203350344e-08, "logits/chosen": 14.638528823852539, "logits/rejected": 14.890756607055664, "logps/chosen": -4.354433059692383, "logps/rejected": -4.792845726013184, "loss": 3.4275, "rewards/accuracies": 1.0, "rewards/chosen": -43.54433059692383, "rewards/margins": 4.384125709533691, "rewards/rejected": -47.92845153808594, "step": 6315 }, { "epoch": 0.8600217864923747, "grad_norm": 45.47095279791632, "learning_rate": 4.681501200371096e-08, "logits/chosen": 14.009502410888672, "logits/rejected": 14.791031837463379, "logps/chosen": -4.540534019470215, "logps/rejected": -4.862270355224609, "loss": 3.8424, "rewards/accuracies": 0.75, "rewards/chosen": -45.40534210205078, "rewards/margins": 3.2173614501953125, "rewards/rejected": -48.622703552246094, "step": 6316 }, { "epoch": 0.8601579520697168, "grad_norm": 39.547451532197684, "learning_rate": 4.6725791778985835e-08, "logits/chosen": 15.188364028930664, "logits/rejected": 14.885579109191895, "logps/chosen": -4.7107930183410645, "logps/rejected": -4.791717529296875, "loss": 4.2577, "rewards/accuracies": 0.25, "rewards/chosen": -47.10792541503906, "rewards/margins": 0.8092489242553711, "rewards/rejected": -47.91717529296875, "step": 6317 }, { "epoch": 0.8602941176470589, "grad_norm": 38.123101067679535, "learning_rate": 4.663665137948829e-08, "logits/chosen": 13.812347412109375, "logits/rejected": 15.100082397460938, "logps/chosen": -4.596907615661621, "logps/rejected": -5.017409324645996, "loss": 3.559, "rewards/accuracies": 0.75, "rewards/chosen": -45.96907424926758, "rewards/margins": 4.205016136169434, "rewards/rejected": -50.17409133911133, "step": 6318 }, { "epoch": 0.8604302832244008, "grad_norm": 43.40147641193728, "learning_rate": 4.654759082536035e-08, "logits/chosen": 14.60171890258789, "logits/rejected": 14.63087272644043, "logps/chosen": -4.891366004943848, "logps/rejected": -4.8296589851379395, "loss": 3.834, "rewards/accuracies": 0.5, "rewards/chosen": -48.913658142089844, "rewards/margins": -0.6170730590820312, "rewards/rejected": -48.29658508300781, "step": 6319 }, { "epoch": 0.8605664488017429, "grad_norm": 45.41494401343368, "learning_rate": 4.645861013672583e-08, "logits/chosen": 14.788078308105469, "logits/rejected": 14.355230331420898, "logps/chosen": -4.598001480102539, "logps/rejected": -4.36749267578125, "loss": 4.5016, "rewards/accuracies": 0.25, "rewards/chosen": -45.980018615722656, "rewards/margins": -2.30509090423584, "rewards/rejected": -43.6749267578125, "step": 6320 }, { "epoch": 0.860702614379085, "grad_norm": 38.90037855555846, "learning_rate": 4.636970933369082e-08, "logits/chosen": 14.837747573852539, "logits/rejected": 14.951184272766113, "logps/chosen": -5.193192481994629, "logps/rejected": -5.420886039733887, "loss": 3.7017, "rewards/accuracies": 0.75, "rewards/chosen": -51.93192672729492, "rewards/margins": 2.2769346237182617, "rewards/rejected": -54.208858489990234, "step": 6321 }, { "epoch": 0.860838779956427, "grad_norm": 39.22857936543595, "learning_rate": 4.6280888436343166e-08, "logits/chosen": 14.176897048950195, "logits/rejected": 14.641630172729492, "logps/chosen": -4.5578718185424805, "logps/rejected": -4.755561351776123, "loss": 4.2293, "rewards/accuracies": 0.75, "rewards/chosen": -45.57872009277344, "rewards/margins": 1.976893424987793, "rewards/rejected": -47.55561065673828, "step": 6322 }, { "epoch": 0.8609749455337691, "grad_norm": 41.82678830100765, "learning_rate": 4.619214746475255e-08, "logits/chosen": 14.762666702270508, "logits/rejected": 14.224882125854492, "logps/chosen": -4.63667106628418, "logps/rejected": -4.8924174308776855, "loss": 4.2605, "rewards/accuracies": 0.5, "rewards/chosen": -46.36671447753906, "rewards/margins": 2.5574636459350586, "rewards/rejected": -48.92417526245117, "step": 6323 }, { "epoch": 0.8611111111111112, "grad_norm": 37.50635858742857, "learning_rate": 4.610348643897084e-08, "logits/chosen": 14.35874080657959, "logits/rejected": 15.17151165008545, "logps/chosen": -4.6660051345825195, "logps/rejected": -4.99849796295166, "loss": 3.8189, "rewards/accuracies": 0.75, "rewards/chosen": -46.66005325317383, "rewards/margins": 3.3249263763427734, "rewards/rejected": -49.98497772216797, "step": 6324 }, { "epoch": 0.8612472766884531, "grad_norm": 46.037992169220125, "learning_rate": 4.6014905379031744e-08, "logits/chosen": 14.024520874023438, "logits/rejected": 14.551539421081543, "logps/chosen": -4.4644975662231445, "logps/rejected": -4.717874050140381, "loss": 4.2576, "rewards/accuracies": 0.75, "rewards/chosen": -44.64497375488281, "rewards/margins": 2.533763885498047, "rewards/rejected": -47.178741455078125, "step": 6325 }, { "epoch": 0.8613834422657952, "grad_norm": 49.56577904407304, "learning_rate": 4.592640430495081e-08, "logits/chosen": 14.615803718566895, "logits/rejected": 14.510477066040039, "logps/chosen": -4.599398612976074, "logps/rejected": -4.791410446166992, "loss": 4.0223, "rewards/accuracies": 0.75, "rewards/chosen": -45.993988037109375, "rewards/margins": 1.9201164245605469, "rewards/rejected": -47.91410446166992, "step": 6326 }, { "epoch": 0.8615196078431373, "grad_norm": 42.50851602511244, "learning_rate": 4.583798323672563e-08, "logits/chosen": 14.847146034240723, "logits/rejected": 14.328269004821777, "logps/chosen": -4.704829216003418, "logps/rejected": -4.731247425079346, "loss": 4.0954, "rewards/accuracies": 0.5, "rewards/chosen": -47.04828643798828, "rewards/margins": 0.26418399810791016, "rewards/rejected": -47.31247329711914, "step": 6327 }, { "epoch": 0.8616557734204793, "grad_norm": 39.814323899757255, "learning_rate": 4.574964219433575e-08, "logits/chosen": 14.797603607177734, "logits/rejected": 14.770275115966797, "logps/chosen": -4.733125686645508, "logps/rejected": -4.7246994972229, "loss": 4.0692, "rewards/accuracies": 0.5, "rewards/chosen": -47.33125686645508, "rewards/margins": -0.0842599868774414, "rewards/rejected": -47.24699401855469, "step": 6328 }, { "epoch": 0.8617919389978214, "grad_norm": 41.355188428302554, "learning_rate": 4.566138119774239e-08, "logits/chosen": 14.656563758850098, "logits/rejected": 14.697566986083984, "logps/chosen": -4.840449333190918, "logps/rejected": -4.703163146972656, "loss": 4.0742, "rewards/accuracies": 0.5, "rewards/chosen": -48.40449142456055, "rewards/margins": -1.3728599548339844, "rewards/rejected": -47.03163146972656, "step": 6329 }, { "epoch": 0.8619281045751634, "grad_norm": 38.05038200946937, "learning_rate": 4.5573200266888936e-08, "logits/chosen": 14.59046745300293, "logits/rejected": 15.17820930480957, "logps/chosen": -4.554408073425293, "logps/rejected": -4.91602087020874, "loss": 4.2656, "rewards/accuracies": 1.0, "rewards/chosen": -45.5440788269043, "rewards/margins": 3.6161298751831055, "rewards/rejected": -49.16020965576172, "step": 6330 }, { "epoch": 0.8620642701525054, "grad_norm": 39.22713395964272, "learning_rate": 4.548509942170065e-08, "logits/chosen": 14.822365760803223, "logits/rejected": 14.594985961914062, "logps/chosen": -4.903665542602539, "logps/rejected": -4.6173176765441895, "loss": 3.582, "rewards/accuracies": 0.25, "rewards/chosen": -49.03665542602539, "rewards/margins": -2.863480567932129, "rewards/rejected": -46.17317581176758, "step": 6331 }, { "epoch": 0.8622004357298475, "grad_norm": 38.27874734530141, "learning_rate": 4.5397078682084575e-08, "logits/chosen": 14.002306938171387, "logits/rejected": 14.196832656860352, "logps/chosen": -4.679148197174072, "logps/rejected": -4.61630916595459, "loss": 3.7394, "rewards/accuracies": 0.5, "rewards/chosen": -46.791481018066406, "rewards/margins": -0.6283893585205078, "rewards/rejected": -46.16309356689453, "step": 6332 }, { "epoch": 0.8623366013071896, "grad_norm": 48.32822179608456, "learning_rate": 4.530913806792971e-08, "logits/chosen": 14.838268280029297, "logits/rejected": 14.686996459960938, "logps/chosen": -4.645617485046387, "logps/rejected": -4.672821044921875, "loss": 4.5305, "rewards/accuracies": 0.75, "rewards/chosen": -46.45616912841797, "rewards/margins": 0.27203845977783203, "rewards/rejected": -46.72821044921875, "step": 6333 }, { "epoch": 0.8624727668845316, "grad_norm": 42.462574930158276, "learning_rate": 4.522127759910712e-08, "logits/chosen": 14.731767654418945, "logits/rejected": 14.760116577148438, "logps/chosen": -4.8789262771606445, "logps/rejected": -4.571924209594727, "loss": 4.2373, "rewards/accuracies": 0.0, "rewards/chosen": -48.78926086425781, "rewards/margins": -3.0700197219848633, "rewards/rejected": -45.71923828125, "step": 6334 }, { "epoch": 0.8626089324618736, "grad_norm": 44.02977276715155, "learning_rate": 4.513349729546938e-08, "logits/chosen": 14.76683521270752, "logits/rejected": 14.299362182617188, "logps/chosen": -4.7286057472229, "logps/rejected": -4.564095497131348, "loss": 4.3288, "rewards/accuracies": 0.0, "rewards/chosen": -47.28606033325195, "rewards/margins": -1.645106315612793, "rewards/rejected": -45.640953063964844, "step": 6335 }, { "epoch": 0.8627450980392157, "grad_norm": 39.30485801077127, "learning_rate": 4.5045797176851284e-08, "logits/chosen": 14.209582328796387, "logits/rejected": 15.059877395629883, "logps/chosen": -4.327055931091309, "logps/rejected": -4.621273040771484, "loss": 3.8582, "rewards/accuracies": 0.5, "rewards/chosen": -43.27056121826172, "rewards/margins": 2.942169189453125, "rewards/rejected": -46.21272659301758, "step": 6336 }, { "epoch": 0.8628812636165577, "grad_norm": 39.35559184156957, "learning_rate": 4.495817726306952e-08, "logits/chosen": 14.589550018310547, "logits/rejected": 15.061738967895508, "logps/chosen": -4.38742733001709, "logps/rejected": -4.778565406799316, "loss": 4.0119, "rewards/accuracies": 1.0, "rewards/chosen": -43.87427520751953, "rewards/margins": 3.911383628845215, "rewards/rejected": -47.78565979003906, "step": 6337 }, { "epoch": 0.8630174291938998, "grad_norm": 35.244753812357025, "learning_rate": 4.4870637573922286e-08, "logits/chosen": 14.099002838134766, "logits/rejected": 14.252903938293457, "logps/chosen": -4.294690132141113, "logps/rejected": -4.596505165100098, "loss": 3.6424, "rewards/accuracies": 0.5, "rewards/chosen": -42.9468994140625, "rewards/margins": 3.018153190612793, "rewards/rejected": -45.96505355834961, "step": 6338 }, { "epoch": 0.8631535947712419, "grad_norm": 38.56747248202085, "learning_rate": 4.4783178129190036e-08, "logits/chosen": 14.757229804992676, "logits/rejected": 14.822505950927734, "logps/chosen": -4.657669544219971, "logps/rejected": -4.744350433349609, "loss": 3.8588, "rewards/accuracies": 0.75, "rewards/chosen": -46.576698303222656, "rewards/margins": 0.8668107986450195, "rewards/rejected": -47.443504333496094, "step": 6339 }, { "epoch": 0.8632897603485838, "grad_norm": 35.52922759569156, "learning_rate": 4.4695798948635e-08, "logits/chosen": 13.63220500946045, "logits/rejected": 14.68411636352539, "logps/chosen": -4.229696273803711, "logps/rejected": -4.690229415893555, "loss": 3.6981, "rewards/accuracies": 1.0, "rewards/chosen": -42.296958923339844, "rewards/margins": 4.605337142944336, "rewards/rejected": -46.90229797363281, "step": 6340 }, { "epoch": 0.8634259259259259, "grad_norm": 38.93174424866533, "learning_rate": 4.460850005200107e-08, "logits/chosen": 13.951228141784668, "logits/rejected": 15.0750732421875, "logps/chosen": -4.55496883392334, "logps/rejected": -4.951628684997559, "loss": 4.1002, "rewards/accuracies": 0.75, "rewards/chosen": -45.549686431884766, "rewards/margins": 3.966602325439453, "rewards/rejected": -49.51628875732422, "step": 6341 }, { "epoch": 0.863562091503268, "grad_norm": 41.21403395187715, "learning_rate": 4.4521281459014307e-08, "logits/chosen": 14.225114822387695, "logits/rejected": 14.672502517700195, "logps/chosen": -4.669539451599121, "logps/rejected": -4.738886833190918, "loss": 4.4615, "rewards/accuracies": 0.5, "rewards/chosen": -46.69539260864258, "rewards/margins": 0.6934795379638672, "rewards/rejected": -47.38887023925781, "step": 6342 }, { "epoch": 0.86369825708061, "grad_norm": 40.23436022055652, "learning_rate": 4.443414318938248e-08, "logits/chosen": 14.089056015014648, "logits/rejected": 15.096016883850098, "logps/chosen": -4.541509628295898, "logps/rejected": -4.787684440612793, "loss": 4.0749, "rewards/accuracies": 1.0, "rewards/chosen": -45.41509246826172, "rewards/margins": 2.461750030517578, "rewards/rejected": -47.8768424987793, "step": 6343 }, { "epoch": 0.8638344226579521, "grad_norm": 36.671213656958535, "learning_rate": 4.4347085262795e-08, "logits/chosen": 14.616316795349121, "logits/rejected": 15.145242691040039, "logps/chosen": -4.7544426918029785, "logps/rejected": -4.772441864013672, "loss": 3.9913, "rewards/accuracies": 0.25, "rewards/chosen": -47.54442596435547, "rewards/margins": 0.17998981475830078, "rewards/rejected": -47.72441864013672, "step": 6344 }, { "epoch": 0.8639705882352942, "grad_norm": 41.408391396014025, "learning_rate": 4.4260107698923524e-08, "logits/chosen": 14.163421630859375, "logits/rejected": 14.995328903198242, "logps/chosen": -4.261241912841797, "logps/rejected": -4.377178192138672, "loss": 4.0518, "rewards/accuracies": 0.5, "rewards/chosen": -42.61241912841797, "rewards/margins": 1.159367561340332, "rewards/rejected": -43.771785736083984, "step": 6345 }, { "epoch": 0.8641067538126361, "grad_norm": 40.19934066762148, "learning_rate": 4.4173210517421334e-08, "logits/chosen": 14.358656883239746, "logits/rejected": 14.392945289611816, "logps/chosen": -4.4965996742248535, "logps/rejected": -4.664833068847656, "loss": 4.1306, "rewards/accuracies": 0.75, "rewards/chosen": -44.96599578857422, "rewards/margins": 1.682332992553711, "rewards/rejected": -46.64833068847656, "step": 6346 }, { "epoch": 0.8642429193899782, "grad_norm": 38.03970813372386, "learning_rate": 4.408639373792349e-08, "logits/chosen": 13.819921493530273, "logits/rejected": 14.528249740600586, "logps/chosen": -4.261305332183838, "logps/rejected": -4.522735595703125, "loss": 4.0323, "rewards/accuracies": 0.5, "rewards/chosen": -42.61305236816406, "rewards/margins": 2.6143007278442383, "rewards/rejected": -45.22735595703125, "step": 6347 }, { "epoch": 0.8643790849673203, "grad_norm": 40.32726274180482, "learning_rate": 4.3999657380046965e-08, "logits/chosen": 14.473121643066406, "logits/rejected": 14.067892074584961, "logps/chosen": -4.418939590454102, "logps/rejected": -4.760858058929443, "loss": 4.1239, "rewards/accuracies": 0.75, "rewards/chosen": -44.18939208984375, "rewards/margins": 3.419187545776367, "rewards/rejected": -47.60858154296875, "step": 6348 }, { "epoch": 0.8645152505446623, "grad_norm": 37.72364842971453, "learning_rate": 4.391300146339065e-08, "logits/chosen": 14.58991813659668, "logits/rejected": 14.994625091552734, "logps/chosen": -4.625495433807373, "logps/rejected": -4.5501017570495605, "loss": 3.81, "rewards/accuracies": 0.25, "rewards/chosen": -46.25495910644531, "rewards/margins": -0.753941535949707, "rewards/rejected": -45.501014709472656, "step": 6349 }, { "epoch": 0.8646514161220044, "grad_norm": 37.892243854297426, "learning_rate": 4.3826426007535035e-08, "logits/chosen": 13.943976402282715, "logits/rejected": 14.650776863098145, "logps/chosen": -4.17000150680542, "logps/rejected": -4.632457256317139, "loss": 3.7473, "rewards/accuracies": 0.75, "rewards/chosen": -41.700016021728516, "rewards/margins": 4.624558448791504, "rewards/rejected": -46.3245735168457, "step": 6350 }, { "epoch": 0.8647875816993464, "grad_norm": 39.63457459908611, "learning_rate": 4.373993103204259e-08, "logits/chosen": 14.48784065246582, "logits/rejected": 15.579439163208008, "logps/chosen": -4.666556358337402, "logps/rejected": -4.915494441986084, "loss": 4.1603, "rewards/accuracies": 0.75, "rewards/chosen": -46.66556167602539, "rewards/margins": 2.489381790161133, "rewards/rejected": -49.154945373535156, "step": 6351 }, { "epoch": 0.8649237472766884, "grad_norm": 43.5721559717003, "learning_rate": 4.3653516556457725e-08, "logits/chosen": 13.793170928955078, "logits/rejected": 14.874282836914062, "logps/chosen": -4.481222629547119, "logps/rejected": -4.975920677185059, "loss": 4.174, "rewards/accuracies": 1.0, "rewards/chosen": -44.812225341796875, "rewards/margins": 4.946977615356445, "rewards/rejected": -49.75920104980469, "step": 6352 }, { "epoch": 0.8650599128540305, "grad_norm": 36.34940605659338, "learning_rate": 4.356718260030629e-08, "logits/chosen": 14.257669448852539, "logits/rejected": 15.148859024047852, "logps/chosen": -4.391460418701172, "logps/rejected": -4.656277656555176, "loss": 3.8934, "rewards/accuracies": 0.75, "rewards/chosen": -43.91460418701172, "rewards/margins": 2.648172378540039, "rewards/rejected": -46.56277847290039, "step": 6353 }, { "epoch": 0.8651960784313726, "grad_norm": 40.76399509300516, "learning_rate": 4.348092918309625e-08, "logits/chosen": 15.355062484741211, "logits/rejected": 15.206619262695312, "logps/chosen": -4.8104352951049805, "logps/rejected": -4.935366153717041, "loss": 4.2092, "rewards/accuracies": 0.75, "rewards/chosen": -48.10435485839844, "rewards/margins": 1.2493095397949219, "rewards/rejected": -49.353660583496094, "step": 6354 }, { "epoch": 0.8653322440087146, "grad_norm": 39.98221930464129, "learning_rate": 4.339475632431737e-08, "logits/chosen": 14.532602310180664, "logits/rejected": 14.461953163146973, "logps/chosen": -4.776125907897949, "logps/rejected": -4.852254867553711, "loss": 4.0017, "rewards/accuracies": 0.5, "rewards/chosen": -47.761253356933594, "rewards/margins": 0.761296272277832, "rewards/rejected": -48.522552490234375, "step": 6355 }, { "epoch": 0.8654684095860566, "grad_norm": 39.32529582981507, "learning_rate": 4.330866404344093e-08, "logits/chosen": 13.961517333984375, "logits/rejected": 14.947359085083008, "logps/chosen": -4.419835090637207, "logps/rejected": -4.542269706726074, "loss": 3.2927, "rewards/accuracies": 0.5, "rewards/chosen": -44.19834899902344, "rewards/margins": 1.2243452072143555, "rewards/rejected": -45.42269515991211, "step": 6356 }, { "epoch": 0.8656045751633987, "grad_norm": 39.39787412217841, "learning_rate": 4.3222652359920265e-08, "logits/chosen": 13.625351905822754, "logits/rejected": 14.32509994506836, "logps/chosen": -4.268312931060791, "logps/rejected": -4.703261852264404, "loss": 3.2492, "rewards/accuracies": 0.75, "rewards/chosen": -42.68313217163086, "rewards/margins": 4.3494873046875, "rewards/rejected": -47.032615661621094, "step": 6357 }, { "epoch": 0.8657407407407407, "grad_norm": 40.83966211186751, "learning_rate": 4.313672129319057e-08, "logits/chosen": 13.809965133666992, "logits/rejected": 14.025969505310059, "logps/chosen": -4.10726261138916, "logps/rejected": -4.624524116516113, "loss": 3.9195, "rewards/accuracies": 0.75, "rewards/chosen": -41.072628021240234, "rewards/margins": 5.172616004943848, "rewards/rejected": -46.24524688720703, "step": 6358 }, { "epoch": 0.8658769063180828, "grad_norm": 40.03843078542121, "learning_rate": 4.30508708626685e-08, "logits/chosen": 13.740153312683105, "logits/rejected": 14.33854866027832, "logps/chosen": -4.235652446746826, "logps/rejected": -4.592522621154785, "loss": 3.857, "rewards/accuracies": 0.75, "rewards/chosen": -42.35652160644531, "rewards/margins": 3.5687026977539062, "rewards/rejected": -45.925228118896484, "step": 6359 }, { "epoch": 0.8660130718954249, "grad_norm": 39.33779849428493, "learning_rate": 4.2965101087752663e-08, "logits/chosen": 14.395133018493652, "logits/rejected": 15.489339828491211, "logps/chosen": -4.603323936462402, "logps/rejected": -4.879022121429443, "loss": 3.5784, "rewards/accuracies": 0.5, "rewards/chosen": -46.033241271972656, "rewards/margins": 2.7569828033447266, "rewards/rejected": -48.79022216796875, "step": 6360 }, { "epoch": 0.8661492374727668, "grad_norm": 35.3028570521528, "learning_rate": 4.287941198782365e-08, "logits/chosen": 13.7296724319458, "logits/rejected": 15.065929412841797, "logps/chosen": -4.370796203613281, "logps/rejected": -4.930834770202637, "loss": 3.3214, "rewards/accuracies": 1.0, "rewards/chosen": -43.70796203613281, "rewards/margins": 5.60038948059082, "rewards/rejected": -49.308349609375, "step": 6361 }, { "epoch": 0.8662854030501089, "grad_norm": 40.960588253103666, "learning_rate": 4.2793803582243406e-08, "logits/chosen": 14.629256248474121, "logits/rejected": 15.43675422668457, "logps/chosen": -4.5090789794921875, "logps/rejected": -4.717471599578857, "loss": 3.5953, "rewards/accuracies": 0.75, "rewards/chosen": -45.09078598022461, "rewards/margins": 2.083928108215332, "rewards/rejected": -47.174713134765625, "step": 6362 }, { "epoch": 0.866421568627451, "grad_norm": 39.20555898048241, "learning_rate": 4.27082758903559e-08, "logits/chosen": 13.781767845153809, "logits/rejected": 14.216806411743164, "logps/chosen": -4.6059064865112305, "logps/rejected": -4.658565521240234, "loss": 4.1122, "rewards/accuracies": 0.75, "rewards/chosen": -46.05906295776367, "rewards/margins": 0.5265951156616211, "rewards/rejected": -46.58565902709961, "step": 6363 }, { "epoch": 0.866557734204793, "grad_norm": 40.52838905684919, "learning_rate": 4.2622828931486985e-08, "logits/chosen": 14.692264556884766, "logits/rejected": 15.071676254272461, "logps/chosen": -4.938164234161377, "logps/rejected": -5.0645952224731445, "loss": 3.8728, "rewards/accuracies": 0.5, "rewards/chosen": -49.38164138793945, "rewards/margins": 1.2643051147460938, "rewards/rejected": -50.64594650268555, "step": 6364 }, { "epoch": 0.8666938997821351, "grad_norm": 41.919995567889885, "learning_rate": 4.2537462724943875e-08, "logits/chosen": 13.250463485717773, "logits/rejected": 15.199182510375977, "logps/chosen": -4.269472599029541, "logps/rejected": -4.825620651245117, "loss": 4.0353, "rewards/accuracies": 0.75, "rewards/chosen": -42.694725036621094, "rewards/margins": 5.561481475830078, "rewards/rejected": -48.25620651245117, "step": 6365 }, { "epoch": 0.8668300653594772, "grad_norm": 40.86775917807514, "learning_rate": 4.245217729001589e-08, "logits/chosen": 14.004805564880371, "logits/rejected": 14.647396087646484, "logps/chosen": -4.5875725746154785, "logps/rejected": -4.820026397705078, "loss": 4.5524, "rewards/accuracies": 0.5, "rewards/chosen": -45.87572479248047, "rewards/margins": 2.324535369873047, "rewards/rejected": -48.20026397705078, "step": 6366 }, { "epoch": 0.8669662309368191, "grad_norm": 40.11268952249572, "learning_rate": 4.236697264597402e-08, "logits/chosen": 13.941747665405273, "logits/rejected": 14.199206352233887, "logps/chosen": -4.6065754890441895, "logps/rejected": -4.973611354827881, "loss": 3.9395, "rewards/accuracies": 1.0, "rewards/chosen": -46.065757751464844, "rewards/margins": 3.6703577041625977, "rewards/rejected": -49.736114501953125, "step": 6367 }, { "epoch": 0.8671023965141612, "grad_norm": 41.588691805029676, "learning_rate": 4.228184881207087e-08, "logits/chosen": 14.075992584228516, "logits/rejected": 13.784355163574219, "logps/chosen": -4.803714752197266, "logps/rejected": -4.761025905609131, "loss": 3.8415, "rewards/accuracies": 0.5, "rewards/chosen": -48.037147521972656, "rewards/margins": -0.42689037322998047, "rewards/rejected": -47.610260009765625, "step": 6368 }, { "epoch": 0.8672385620915033, "grad_norm": 41.656078964985646, "learning_rate": 4.21968058075409e-08, "logits/chosen": 14.40916919708252, "logits/rejected": 14.03349494934082, "logps/chosen": -4.675520896911621, "logps/rejected": -4.571463584899902, "loss": 3.7426, "rewards/accuracies": 0.5, "rewards/chosen": -46.755210876464844, "rewards/margins": -1.0405731201171875, "rewards/rejected": -45.71463394165039, "step": 6369 }, { "epoch": 0.8673747276688453, "grad_norm": 50.55974735045593, "learning_rate": 4.211184365160032e-08, "logits/chosen": 14.017412185668945, "logits/rejected": 14.118459701538086, "logps/chosen": -4.362865447998047, "logps/rejected": -4.500075340270996, "loss": 4.5588, "rewards/accuracies": 0.5, "rewards/chosen": -43.62865447998047, "rewards/margins": 1.3720989227294922, "rewards/rejected": -45.00075149536133, "step": 6370 }, { "epoch": 0.8675108932461874, "grad_norm": 39.83167413213163, "learning_rate": 4.202696236344696e-08, "logits/chosen": 15.655902862548828, "logits/rejected": 15.510913848876953, "logps/chosen": -4.986924171447754, "logps/rejected": -4.896313667297363, "loss": 4.257, "rewards/accuracies": 0.5, "rewards/chosen": -49.86924743652344, "rewards/margins": -0.9061079025268555, "rewards/rejected": -48.963138580322266, "step": 6371 }, { "epoch": 0.8676470588235294, "grad_norm": 39.644338906929036, "learning_rate": 4.1942161962260505e-08, "logits/chosen": 13.983253479003906, "logits/rejected": 14.217206001281738, "logps/chosen": -4.584132194519043, "logps/rejected": -4.66500186920166, "loss": 3.8577, "rewards/accuracies": 0.75, "rewards/chosen": -45.8413200378418, "rewards/margins": 0.8086986541748047, "rewards/rejected": -46.650020599365234, "step": 6372 }, { "epoch": 0.8677832244008714, "grad_norm": 37.66816278111342, "learning_rate": 4.185744246720233e-08, "logits/chosen": 14.483989715576172, "logits/rejected": 14.36862850189209, "logps/chosen": -4.398013591766357, "logps/rejected": -4.573889255523682, "loss": 3.8085, "rewards/accuracies": 1.0, "rewards/chosen": -43.980140686035156, "rewards/margins": 1.7587556838989258, "rewards/rejected": -45.738895416259766, "step": 6373 }, { "epoch": 0.8679193899782135, "grad_norm": 40.90662245261698, "learning_rate": 4.17728038974154e-08, "logits/chosen": 15.063484191894531, "logits/rejected": 14.95763874053955, "logps/chosen": -4.488899230957031, "logps/rejected": -4.624993324279785, "loss": 4.0199, "rewards/accuracies": 0.75, "rewards/chosen": -44.88899612426758, "rewards/margins": 1.3609380722045898, "rewards/rejected": -46.24993133544922, "step": 6374 }, { "epoch": 0.8680555555555556, "grad_norm": 38.47993550292512, "learning_rate": 4.1688246272024586e-08, "logits/chosen": 13.411813735961914, "logits/rejected": 13.572715759277344, "logps/chosen": -4.1900739669799805, "logps/rejected": -4.457431793212891, "loss": 3.8131, "rewards/accuracies": 1.0, "rewards/chosen": -41.90074157714844, "rewards/margins": 2.6735763549804688, "rewards/rejected": -44.574317932128906, "step": 6375 }, { "epoch": 0.8681917211328976, "grad_norm": 39.977858386974304, "learning_rate": 4.160376961013643e-08, "logits/chosen": 14.770072937011719, "logits/rejected": 14.605964660644531, "logps/chosen": -4.562832355499268, "logps/rejected": -4.738656997680664, "loss": 3.8964, "rewards/accuracies": 0.75, "rewards/chosen": -45.628326416015625, "rewards/margins": 1.758244514465332, "rewards/rejected": -47.386566162109375, "step": 6376 }, { "epoch": 0.8683278867102396, "grad_norm": 40.116295065051375, "learning_rate": 4.1519373930838995e-08, "logits/chosen": 15.007818222045898, "logits/rejected": 14.719850540161133, "logps/chosen": -4.626842498779297, "logps/rejected": -4.819284439086914, "loss": 3.9227, "rewards/accuracies": 0.5, "rewards/chosen": -46.268428802490234, "rewards/margins": 1.9244155883789062, "rewards/rejected": -48.192840576171875, "step": 6377 }, { "epoch": 0.8684640522875817, "grad_norm": 45.19831329132586, "learning_rate": 4.143505925320223e-08, "logits/chosen": 14.359512329101562, "logits/rejected": 14.598987579345703, "logps/chosen": -4.426144599914551, "logps/rejected": -4.638174057006836, "loss": 3.7469, "rewards/accuracies": 0.75, "rewards/chosen": -44.261444091796875, "rewards/margins": 2.1202926635742188, "rewards/rejected": -46.381736755371094, "step": 6378 }, { "epoch": 0.8686002178649237, "grad_norm": 41.0482363774543, "learning_rate": 4.135082559627783e-08, "logits/chosen": 14.60567855834961, "logits/rejected": 14.774662017822266, "logps/chosen": -4.517286777496338, "logps/rejected": -4.792003631591797, "loss": 3.8397, "rewards/accuracies": 0.5, "rewards/chosen": -45.17287063598633, "rewards/margins": 2.747164726257324, "rewards/rejected": -47.92003631591797, "step": 6379 }, { "epoch": 0.8687363834422658, "grad_norm": 40.12051018617881, "learning_rate": 4.126667297909896e-08, "logits/chosen": 14.899578094482422, "logits/rejected": 14.916675567626953, "logps/chosen": -4.8450517654418945, "logps/rejected": -4.763472557067871, "loss": 4.0686, "rewards/accuracies": 0.5, "rewards/chosen": -48.45051956176758, "rewards/margins": -0.8157920837402344, "rewards/rejected": -47.634727478027344, "step": 6380 }, { "epoch": 0.8688725490196079, "grad_norm": 42.291171494751964, "learning_rate": 4.118260142068064e-08, "logits/chosen": 14.195939064025879, "logits/rejected": 14.667601585388184, "logps/chosen": -4.503249168395996, "logps/rejected": -4.782383918762207, "loss": 4.4721, "rewards/accuracies": 0.75, "rewards/chosen": -45.03248977661133, "rewards/margins": 2.7913522720336914, "rewards/rejected": -47.8238410949707, "step": 6381 }, { "epoch": 0.8690087145969498, "grad_norm": 40.49108608262367, "learning_rate": 4.1098610940019587e-08, "logits/chosen": 13.892478942871094, "logits/rejected": 14.780257225036621, "logps/chosen": -4.32732629776001, "logps/rejected": -4.5869550704956055, "loss": 4.0007, "rewards/accuracies": 0.75, "rewards/chosen": -43.27326202392578, "rewards/margins": 2.5962867736816406, "rewards/rejected": -45.86954879760742, "step": 6382 }, { "epoch": 0.8691448801742919, "grad_norm": 36.76457662699809, "learning_rate": 4.101470155609408e-08, "logits/chosen": 14.045089721679688, "logits/rejected": 14.782567024230957, "logps/chosen": -4.273975372314453, "logps/rejected": -4.837437629699707, "loss": 3.8044, "rewards/accuracies": 0.75, "rewards/chosen": -42.739749908447266, "rewards/margins": 5.634626388549805, "rewards/rejected": -48.3743782043457, "step": 6383 }, { "epoch": 0.869281045751634, "grad_norm": 40.26885794370948, "learning_rate": 4.093087328786411e-08, "logits/chosen": 14.846922874450684, "logits/rejected": 15.688678741455078, "logps/chosen": -4.821056842803955, "logps/rejected": -5.020571708679199, "loss": 4.0473, "rewards/accuracies": 0.5, "rewards/chosen": -48.210567474365234, "rewards/margins": 1.9951496124267578, "rewards/rejected": -50.205718994140625, "step": 6384 }, { "epoch": 0.869417211328976, "grad_norm": 38.60287481977131, "learning_rate": 4.08471261542715e-08, "logits/chosen": 13.85615348815918, "logits/rejected": 15.199310302734375, "logps/chosen": -4.388878345489502, "logps/rejected": -5.079535484313965, "loss": 3.5683, "rewards/accuracies": 0.75, "rewards/chosen": -43.88878631591797, "rewards/margins": 6.90656852722168, "rewards/rejected": -50.79535675048828, "step": 6385 }, { "epoch": 0.8695533769063181, "grad_norm": 41.00170679081023, "learning_rate": 4.076346017423948e-08, "logits/chosen": 14.371435165405273, "logits/rejected": 15.028678894042969, "logps/chosen": -4.552690505981445, "logps/rejected": -5.094244956970215, "loss": 3.8903, "rewards/accuracies": 0.75, "rewards/chosen": -45.52690124511719, "rewards/margins": 5.415545463562012, "rewards/rejected": -50.94244384765625, "step": 6386 }, { "epoch": 0.8696895424836601, "grad_norm": 37.572442252781414, "learning_rate": 4.06798753666731e-08, "logits/chosen": 14.163053512573242, "logits/rejected": 14.76620864868164, "logps/chosen": -4.381942272186279, "logps/rejected": -4.7461957931518555, "loss": 3.6178, "rewards/accuracies": 0.75, "rewards/chosen": -43.81942367553711, "rewards/margins": 3.6425352096557617, "rewards/rejected": -47.46195983886719, "step": 6387 }, { "epoch": 0.8698257080610022, "grad_norm": 41.0963004304646, "learning_rate": 4.0596371750459026e-08, "logits/chosen": 14.39043140411377, "logits/rejected": 15.087326049804688, "logps/chosen": -4.517897129058838, "logps/rejected": -5.1168670654296875, "loss": 3.6419, "rewards/accuracies": 0.75, "rewards/chosen": -45.17897033691406, "rewards/margins": 5.989697456359863, "rewards/rejected": -51.16866683959961, "step": 6388 }, { "epoch": 0.8699618736383442, "grad_norm": 39.5929388083872, "learning_rate": 4.051294934446572e-08, "logits/chosen": 14.065876007080078, "logits/rejected": 14.502132415771484, "logps/chosen": -4.542237758636475, "logps/rejected": -4.488065719604492, "loss": 3.8335, "rewards/accuracies": 0.5, "rewards/chosen": -45.42237854003906, "rewards/margins": -0.5417251586914062, "rewards/rejected": -44.880653381347656, "step": 6389 }, { "epoch": 0.8700980392156863, "grad_norm": 36.20121941952343, "learning_rate": 4.0429608167543e-08, "logits/chosen": 14.593790054321289, "logits/rejected": 14.76535415649414, "logps/chosen": -4.597760200500488, "logps/rejected": -4.613552093505859, "loss": 4.0714, "rewards/accuracies": 0.5, "rewards/chosen": -45.97760009765625, "rewards/margins": 0.15792083740234375, "rewards/rejected": -46.135520935058594, "step": 6390 }, { "epoch": 0.8702342047930284, "grad_norm": 39.785869027280604, "learning_rate": 4.03463482385225e-08, "logits/chosen": 14.135372161865234, "logits/rejected": 14.58403205871582, "logps/chosen": -4.355432033538818, "logps/rejected": -4.480962753295898, "loss": 4.3292, "rewards/accuracies": 0.75, "rewards/chosen": -43.5543212890625, "rewards/margins": 1.2553062438964844, "rewards/rejected": -44.80962371826172, "step": 6391 }, { "epoch": 0.8703703703703703, "grad_norm": 41.93405444073587, "learning_rate": 4.026316957621767e-08, "logits/chosen": 14.175260543823242, "logits/rejected": 14.161913871765137, "logps/chosen": -4.3014302253723145, "logps/rejected": -4.541727066040039, "loss": 4.1096, "rewards/accuracies": 1.0, "rewards/chosen": -43.014305114746094, "rewards/margins": 2.402968406677246, "rewards/rejected": -45.41727066040039, "step": 6392 }, { "epoch": 0.8705065359477124, "grad_norm": 40.05986768663255, "learning_rate": 4.0180072199423164e-08, "logits/chosen": 15.041767120361328, "logits/rejected": 14.871746063232422, "logps/chosen": -4.71727180480957, "logps/rejected": -4.630753040313721, "loss": 4.0562, "rewards/accuracies": 0.25, "rewards/chosen": -47.17272186279297, "rewards/margins": -0.8651905059814453, "rewards/rejected": -46.307533264160156, "step": 6393 }, { "epoch": 0.8706427015250545, "grad_norm": 36.33931767068077, "learning_rate": 4.0097056126915694e-08, "logits/chosen": 14.810453414916992, "logits/rejected": 14.602926254272461, "logps/chosen": -4.5890212059021, "logps/rejected": -4.9750285148620605, "loss": 3.7588, "rewards/accuracies": 0.75, "rewards/chosen": -45.89021301269531, "rewards/margins": 3.8600730895996094, "rewards/rejected": -49.75028610229492, "step": 6394 }, { "epoch": 0.8707788671023965, "grad_norm": 39.470043846431416, "learning_rate": 4.0014121377453325e-08, "logits/chosen": 15.147323608398438, "logits/rejected": 14.37887191772461, "logps/chosen": -4.786800861358643, "logps/rejected": -4.848301887512207, "loss": 3.887, "rewards/accuracies": 0.5, "rewards/chosen": -47.86800765991211, "rewards/margins": 0.6150112152099609, "rewards/rejected": -48.4830207824707, "step": 6395 }, { "epoch": 0.8709150326797386, "grad_norm": 38.71076983900426, "learning_rate": 3.993126796977604e-08, "logits/chosen": 14.294013977050781, "logits/rejected": 14.445629119873047, "logps/chosen": -4.406101226806641, "logps/rejected": -4.454351902008057, "loss": 4.0944, "rewards/accuracies": 0.5, "rewards/chosen": -44.061012268066406, "rewards/margins": 0.48250389099121094, "rewards/rejected": -44.54351806640625, "step": 6396 }, { "epoch": 0.8710511982570807, "grad_norm": 38.57701616343297, "learning_rate": 3.9848495922605e-08, "logits/chosen": 14.683003425598145, "logits/rejected": 14.54394245147705, "logps/chosen": -4.692392349243164, "logps/rejected": -4.595215797424316, "loss": 3.4015, "rewards/accuracies": 0.5, "rewards/chosen": -46.923919677734375, "rewards/margins": -0.9717636108398438, "rewards/rejected": -45.95215606689453, "step": 6397 }, { "epoch": 0.8711873638344226, "grad_norm": 40.729167873267656, "learning_rate": 3.976580525464337e-08, "logits/chosen": 14.457220077514648, "logits/rejected": 14.10552978515625, "logps/chosen": -4.6103668212890625, "logps/rejected": -4.197274684906006, "loss": 4.1586, "rewards/accuracies": 0.25, "rewards/chosen": -46.10366439819336, "rewards/margins": -4.130918502807617, "rewards/rejected": -41.972747802734375, "step": 6398 }, { "epoch": 0.8713235294117647, "grad_norm": 37.60550338156691, "learning_rate": 3.968319598457581e-08, "logits/chosen": 15.269248008728027, "logits/rejected": 14.337472915649414, "logps/chosen": -4.5450310707092285, "logps/rejected": -4.4206438064575195, "loss": 3.7801, "rewards/accuracies": 0.25, "rewards/chosen": -45.45030975341797, "rewards/margins": -1.2438735961914062, "rewards/rejected": -44.20643615722656, "step": 6399 }, { "epoch": 0.8714596949891068, "grad_norm": 39.54139770357285, "learning_rate": 3.960066813106851e-08, "logits/chosen": 15.321500778198242, "logits/rejected": 15.195356369018555, "logps/chosen": -4.652768135070801, "logps/rejected": -4.690662384033203, "loss": 3.9558, "rewards/accuracies": 0.5, "rewards/chosen": -46.52768325805664, "rewards/margins": 0.378936767578125, "rewards/rejected": -46.90662384033203, "step": 6400 }, { "epoch": 0.8715958605664488, "grad_norm": 39.47442611446504, "learning_rate": 3.951822171276928e-08, "logits/chosen": 14.892354011535645, "logits/rejected": 15.039535522460938, "logps/chosen": -4.617968559265137, "logps/rejected": -4.524057865142822, "loss": 4.2394, "rewards/accuracies": 0.5, "rewards/chosen": -46.1796875, "rewards/margins": -0.9391107559204102, "rewards/rejected": -45.240577697753906, "step": 6401 }, { "epoch": 0.8717320261437909, "grad_norm": 44.7638425061252, "learning_rate": 3.943585674830765e-08, "logits/chosen": 14.291083335876465, "logits/rejected": 14.139823913574219, "logps/chosen": -4.563572406768799, "logps/rejected": -4.446340560913086, "loss": 4.4208, "rewards/accuracies": 0.25, "rewards/chosen": -45.63572692871094, "rewards/margins": -1.172318458557129, "rewards/rejected": -44.46340560913086, "step": 6402 }, { "epoch": 0.871868191721133, "grad_norm": 40.47312355309214, "learning_rate": 3.9353573256294715e-08, "logits/chosen": 13.39900016784668, "logits/rejected": 13.747516632080078, "logps/chosen": -4.416171073913574, "logps/rejected": -4.508476257324219, "loss": 4.1591, "rewards/accuracies": 0.5, "rewards/chosen": -44.16170883178711, "rewards/margins": 0.9230518341064453, "rewards/rejected": -45.08475875854492, "step": 6403 }, { "epoch": 0.8720043572984749, "grad_norm": 37.90651713657366, "learning_rate": 3.9271371255322985e-08, "logits/chosen": 14.547497749328613, "logits/rejected": 14.217527389526367, "logps/chosen": -4.359409332275391, "logps/rejected": -4.397309303283691, "loss": 3.7738, "rewards/accuracies": 0.5, "rewards/chosen": -43.594093322753906, "rewards/margins": 0.3789968490600586, "rewards/rejected": -43.97309112548828, "step": 6404 }, { "epoch": 0.872140522875817, "grad_norm": 42.0696874376909, "learning_rate": 3.918925076396671e-08, "logits/chosen": 15.259635925292969, "logits/rejected": 14.874441146850586, "logps/chosen": -4.765989303588867, "logps/rejected": -4.636875629425049, "loss": 4.4131, "rewards/accuracies": 0.25, "rewards/chosen": -47.659889221191406, "rewards/margins": -1.2911338806152344, "rewards/rejected": -46.36875915527344, "step": 6405 }, { "epoch": 0.8722766884531591, "grad_norm": 40.50245085802029, "learning_rate": 3.9107211800781804e-08, "logits/chosen": 14.479875564575195, "logits/rejected": 15.155355453491211, "logps/chosen": -4.6992669105529785, "logps/rejected": -4.793121337890625, "loss": 3.84, "rewards/accuracies": 0.5, "rewards/chosen": -46.99266815185547, "rewards/margins": 0.938542366027832, "rewards/rejected": -47.93121337890625, "step": 6406 }, { "epoch": 0.8724128540305011, "grad_norm": 35.997447830651716, "learning_rate": 3.902525438430544e-08, "logits/chosen": 14.69379997253418, "logits/rejected": 14.923233032226562, "logps/chosen": -4.5476274490356445, "logps/rejected": -4.7722978591918945, "loss": 3.7861, "rewards/accuracies": 0.75, "rewards/chosen": -45.47627639770508, "rewards/margins": 2.2467031478881836, "rewards/rejected": -47.72298049926758, "step": 6407 }, { "epoch": 0.8725490196078431, "grad_norm": 37.15510751947002, "learning_rate": 3.894337853305676e-08, "logits/chosen": 14.210628509521484, "logits/rejected": 15.004976272583008, "logps/chosen": -4.362610340118408, "logps/rejected": -4.722551345825195, "loss": 3.8531, "rewards/accuracies": 0.75, "rewards/chosen": -43.626102447509766, "rewards/margins": 3.599410057067871, "rewards/rejected": -47.22550964355469, "step": 6408 }, { "epoch": 0.8726851851851852, "grad_norm": 38.071353829401346, "learning_rate": 3.8861584265536253e-08, "logits/chosen": 14.066654205322266, "logits/rejected": 14.404251098632812, "logps/chosen": -4.424383163452148, "logps/rejected": -4.3051862716674805, "loss": 3.9039, "rewards/accuracies": 0.25, "rewards/chosen": -44.24382781982422, "rewards/margins": -1.191965103149414, "rewards/rejected": -43.05186462402344, "step": 6409 }, { "epoch": 0.8728213507625272, "grad_norm": 41.77887480511208, "learning_rate": 3.877987160022593e-08, "logits/chosen": 13.71021842956543, "logits/rejected": 14.153553009033203, "logps/chosen": -4.436590194702148, "logps/rejected": -4.757972240447998, "loss": 3.5819, "rewards/accuracies": 0.75, "rewards/chosen": -44.365901947021484, "rewards/margins": 3.2138185501098633, "rewards/rejected": -47.5797233581543, "step": 6410 }, { "epoch": 0.8729575163398693, "grad_norm": 40.25346198433496, "learning_rate": 3.869824055558948e-08, "logits/chosen": 13.65452766418457, "logits/rejected": 14.006407737731934, "logps/chosen": -4.330312728881836, "logps/rejected": -4.41211462020874, "loss": 3.6728, "rewards/accuracies": 0.5, "rewards/chosen": -43.303131103515625, "rewards/margins": 0.8180170059204102, "rewards/rejected": -44.12114715576172, "step": 6411 }, { "epoch": 0.8730936819172114, "grad_norm": 40.92286910777891, "learning_rate": 3.861669115007222e-08, "logits/chosen": 14.084371566772461, "logits/rejected": 14.813268661499023, "logps/chosen": -4.632843017578125, "logps/rejected": -4.821859359741211, "loss": 3.6279, "rewards/accuracies": 0.5, "rewards/chosen": -46.32843017578125, "rewards/margins": 1.8901596069335938, "rewards/rejected": -48.218589782714844, "step": 6412 }, { "epoch": 0.8732298474945533, "grad_norm": 38.09413749413868, "learning_rate": 3.8535223402100757e-08, "logits/chosen": 15.415325164794922, "logits/rejected": 15.408245086669922, "logps/chosen": -4.9154181480407715, "logps/rejected": -4.874933242797852, "loss": 4.0588, "rewards/accuracies": 0.25, "rewards/chosen": -49.15418243408203, "rewards/margins": -0.40485191345214844, "rewards/rejected": -48.74932861328125, "step": 6413 }, { "epoch": 0.8733660130718954, "grad_norm": 41.98926075885202, "learning_rate": 3.8453837330083425e-08, "logits/chosen": 13.85378646850586, "logits/rejected": 14.288373947143555, "logps/chosen": -4.49324893951416, "logps/rejected": -4.421651840209961, "loss": 4.4284, "rewards/accuracies": 0.5, "rewards/chosen": -44.93248748779297, "rewards/margins": -0.715968132019043, "rewards/rejected": -44.216522216796875, "step": 6414 }, { "epoch": 0.8735021786492375, "grad_norm": 42.710754845167614, "learning_rate": 3.837253295241023e-08, "logits/chosen": 14.951618194580078, "logits/rejected": 15.527877807617188, "logps/chosen": -5.058448314666748, "logps/rejected": -5.129994869232178, "loss": 4.4408, "rewards/accuracies": 0.5, "rewards/chosen": -50.5844841003418, "rewards/margins": 0.7154645919799805, "rewards/rejected": -51.299949645996094, "step": 6415 }, { "epoch": 0.8736383442265795, "grad_norm": 37.00068242024689, "learning_rate": 3.829131028745234e-08, "logits/chosen": 14.101432800292969, "logits/rejected": 13.843595504760742, "logps/chosen": -4.375956058502197, "logps/rejected": -4.375450134277344, "loss": 3.7716, "rewards/accuracies": 0.5, "rewards/chosen": -43.759559631347656, "rewards/margins": -0.005061149597167969, "rewards/rejected": -43.75450134277344, "step": 6416 }, { "epoch": 0.8737745098039216, "grad_norm": 42.87656418795207, "learning_rate": 3.821016935356285e-08, "logits/chosen": 14.606614112854004, "logits/rejected": 15.327038764953613, "logps/chosen": -4.449498176574707, "logps/rejected": -5.132430076599121, "loss": 3.4441, "rewards/accuracies": 0.75, "rewards/chosen": -44.49497985839844, "rewards/margins": 6.829322814941406, "rewards/rejected": -51.324302673339844, "step": 6417 }, { "epoch": 0.8739106753812637, "grad_norm": 44.05248877050632, "learning_rate": 3.8129110169076206e-08, "logits/chosen": 13.968923568725586, "logits/rejected": 13.94259262084961, "logps/chosen": -4.363430976867676, "logps/rejected": -4.554393768310547, "loss": 3.9578, "rewards/accuracies": 0.75, "rewards/chosen": -43.634315490722656, "rewards/margins": 1.9096250534057617, "rewards/rejected": -45.54393768310547, "step": 6418 }, { "epoch": 0.8740468409586056, "grad_norm": 41.2403161080449, "learning_rate": 3.804813275230834e-08, "logits/chosen": 14.553624153137207, "logits/rejected": 15.049190521240234, "logps/chosen": -4.437106609344482, "logps/rejected": -4.864668846130371, "loss": 3.6795, "rewards/accuracies": 1.0, "rewards/chosen": -44.371063232421875, "rewards/margins": 4.275623321533203, "rewards/rejected": -48.64668655395508, "step": 6419 }, { "epoch": 0.8741830065359477, "grad_norm": 41.591873630877124, "learning_rate": 3.796723712155678e-08, "logits/chosen": 14.405549049377441, "logits/rejected": 14.205538749694824, "logps/chosen": -4.39401912689209, "logps/rejected": -4.726018905639648, "loss": 4.21, "rewards/accuracies": 0.75, "rewards/chosen": -43.94019317626953, "rewards/margins": 3.320000648498535, "rewards/rejected": -47.26019287109375, "step": 6420 }, { "epoch": 0.8743191721132898, "grad_norm": 36.9634849278797, "learning_rate": 3.78864232951007e-08, "logits/chosen": 13.664339065551758, "logits/rejected": 13.938796997070312, "logps/chosen": -4.075606822967529, "logps/rejected": -4.405932426452637, "loss": 4.0264, "rewards/accuracies": 1.0, "rewards/chosen": -40.756065368652344, "rewards/margins": 3.303255081176758, "rewards/rejected": -44.059322357177734, "step": 6421 }, { "epoch": 0.8744553376906318, "grad_norm": 41.69033830001113, "learning_rate": 3.7805691291200417e-08, "logits/chosen": 14.798686027526855, "logits/rejected": 14.556225776672363, "logps/chosen": -4.440011024475098, "logps/rejected": -4.813276290893555, "loss": 4.0794, "rewards/accuracies": 1.0, "rewards/chosen": -44.400115966796875, "rewards/margins": 3.7326459884643555, "rewards/rejected": -48.13276290893555, "step": 6422 }, { "epoch": 0.8745915032679739, "grad_norm": 42.87794112344415, "learning_rate": 3.7725041128098134e-08, "logits/chosen": 14.697446823120117, "logits/rejected": 14.646210670471191, "logps/chosen": -4.720425605773926, "logps/rejected": -4.408155918121338, "loss": 3.8026, "rewards/accuracies": 0.25, "rewards/chosen": -47.20425796508789, "rewards/margins": -3.1227006912231445, "rewards/rejected": -44.08155822753906, "step": 6423 }, { "epoch": 0.8747276688453159, "grad_norm": 39.46823180434057, "learning_rate": 3.764447282401746e-08, "logits/chosen": 14.83564567565918, "logits/rejected": 15.056041717529297, "logps/chosen": -4.311041355133057, "logps/rejected": -5.001204490661621, "loss": 3.443, "rewards/accuracies": 1.0, "rewards/chosen": -43.11041259765625, "rewards/margins": 6.901633262634277, "rewards/rejected": -50.012046813964844, "step": 6424 }, { "epoch": 0.8748638344226579, "grad_norm": 37.45472014724005, "learning_rate": 3.7563986397163386e-08, "logits/chosen": 14.52560043334961, "logits/rejected": 14.564215660095215, "logps/chosen": -4.519559860229492, "logps/rejected": -4.523240089416504, "loss": 3.5561, "rewards/accuracies": 0.5, "rewards/chosen": -45.19559860229492, "rewards/margins": 0.03680610656738281, "rewards/rejected": -45.23240661621094, "step": 6425 }, { "epoch": 0.875, "grad_norm": 43.22538626113385, "learning_rate": 3.7483581865722467e-08, "logits/chosen": 14.402946472167969, "logits/rejected": 14.600278854370117, "logps/chosen": -4.495946884155273, "logps/rejected": -4.798478603363037, "loss": 4.0405, "rewards/accuracies": 0.75, "rewards/chosen": -44.95947265625, "rewards/margins": 3.0253143310546875, "rewards/rejected": -47.98478698730469, "step": 6426 }, { "epoch": 0.8751361655773421, "grad_norm": 37.77422698743975, "learning_rate": 3.74032592478629e-08, "logits/chosen": 14.58679485321045, "logits/rejected": 14.02641487121582, "logps/chosen": -4.825684547424316, "logps/rejected": -4.4012250900268555, "loss": 3.5294, "rewards/accuracies": 0.0, "rewards/chosen": -48.2568473815918, "rewards/margins": -4.244595527648926, "rewards/rejected": -44.01224899291992, "step": 6427 }, { "epoch": 0.8752723311546841, "grad_norm": 53.465241511804955, "learning_rate": 3.73230185617341e-08, "logits/chosen": 13.880291938781738, "logits/rejected": 14.591053009033203, "logps/chosen": -4.549518585205078, "logps/rejected": -4.752524375915527, "loss": 3.9847, "rewards/accuracies": 0.75, "rewards/chosen": -45.495182037353516, "rewards/margins": 2.0300588607788086, "rewards/rejected": -47.52524185180664, "step": 6428 }, { "epoch": 0.8754084967320261, "grad_norm": 41.76378554934297, "learning_rate": 3.7242859825467174e-08, "logits/chosen": 13.977788925170898, "logits/rejected": 14.167438507080078, "logps/chosen": -4.213732719421387, "logps/rejected": -4.421597480773926, "loss": 4.6347, "rewards/accuracies": 0.75, "rewards/chosen": -42.1373291015625, "rewards/margins": 2.078643798828125, "rewards/rejected": -44.215972900390625, "step": 6429 }, { "epoch": 0.8755446623093682, "grad_norm": 41.23563225462508, "learning_rate": 3.7162783057174704e-08, "logits/chosen": 14.620783805847168, "logits/rejected": 15.363748550415039, "logps/chosen": -4.457222938537598, "logps/rejected": -4.931556701660156, "loss": 3.7613, "rewards/accuracies": 0.75, "rewards/chosen": -44.572227478027344, "rewards/margins": 4.743342399597168, "rewards/rejected": -49.31556701660156, "step": 6430 }, { "epoch": 0.8756808278867102, "grad_norm": 37.162057455876806, "learning_rate": 3.7082788274950574e-08, "logits/chosen": 13.922603607177734, "logits/rejected": 14.244707107543945, "logps/chosen": -4.629504203796387, "logps/rejected": -4.73807430267334, "loss": 3.6037, "rewards/accuracies": 0.5, "rewards/chosen": -46.295040130615234, "rewards/margins": 1.085702896118164, "rewards/rejected": -47.38074493408203, "step": 6431 }, { "epoch": 0.8758169934640523, "grad_norm": 48.91059652733074, "learning_rate": 3.7002875496870354e-08, "logits/chosen": 13.73674201965332, "logits/rejected": 13.891999244689941, "logps/chosen": -4.578237533569336, "logps/rejected": -4.358515739440918, "loss": 3.8626, "rewards/accuracies": 0.25, "rewards/chosen": -45.782371520996094, "rewards/margins": -2.197211265563965, "rewards/rejected": -43.58515930175781, "step": 6432 }, { "epoch": 0.8759531590413944, "grad_norm": 38.53862115259629, "learning_rate": 3.692304474099104e-08, "logits/chosen": 14.546770095825195, "logits/rejected": 14.971866607666016, "logps/chosen": -4.656914710998535, "logps/rejected": -4.758522033691406, "loss": 3.8949, "rewards/accuracies": 0.5, "rewards/chosen": -46.56915283203125, "rewards/margins": 1.016068458557129, "rewards/rejected": -47.58522033691406, "step": 6433 }, { "epoch": 0.8760893246187363, "grad_norm": 39.73587940809473, "learning_rate": 3.6843296025350945e-08, "logits/chosen": 14.494234085083008, "logits/rejected": 14.887067794799805, "logps/chosen": -4.6801910400390625, "logps/rejected": -4.983610153198242, "loss": 3.7879, "rewards/accuracies": 1.0, "rewards/chosen": -46.801910400390625, "rewards/margins": 3.0341882705688477, "rewards/rejected": -49.836097717285156, "step": 6434 }, { "epoch": 0.8762254901960784, "grad_norm": 39.78886737946357, "learning_rate": 3.6763629367969974e-08, "logits/chosen": 14.50880241394043, "logits/rejected": 14.516910552978516, "logps/chosen": -4.720488548278809, "logps/rejected": -4.835536003112793, "loss": 3.368, "rewards/accuracies": 0.5, "rewards/chosen": -47.204891204833984, "rewards/margins": 1.1504688262939453, "rewards/rejected": -48.3553581237793, "step": 6435 }, { "epoch": 0.8763616557734205, "grad_norm": 39.970334194045364, "learning_rate": 3.668404478684954e-08, "logits/chosen": 14.05533218383789, "logits/rejected": 14.019959449768066, "logps/chosen": -4.0322723388671875, "logps/rejected": -4.450976371765137, "loss": 3.8081, "rewards/accuracies": 0.75, "rewards/chosen": -40.322723388671875, "rewards/margins": 4.187044143676758, "rewards/rejected": -44.509765625, "step": 6436 }, { "epoch": 0.8764978213507625, "grad_norm": 42.40328034872259, "learning_rate": 3.660454229997234e-08, "logits/chosen": 13.979631423950195, "logits/rejected": 14.179576873779297, "logps/chosen": -4.444636344909668, "logps/rejected": -4.57441520690918, "loss": 4.0891, "rewards/accuracies": 0.75, "rewards/chosen": -44.44636535644531, "rewards/margins": 1.2977867126464844, "rewards/rejected": -45.74415588378906, "step": 6437 }, { "epoch": 0.8766339869281046, "grad_norm": 39.800280657622, "learning_rate": 3.65251219253027e-08, "logits/chosen": 14.914583206176758, "logits/rejected": 15.477628707885742, "logps/chosen": -4.8133721351623535, "logps/rejected": -4.909406661987305, "loss": 4.2819, "rewards/accuracies": 0.5, "rewards/chosen": -48.13372039794922, "rewards/margins": 0.9603433609008789, "rewards/rejected": -49.09406661987305, "step": 6438 }, { "epoch": 0.8767701525054467, "grad_norm": 39.53725639912955, "learning_rate": 3.644578368078628e-08, "logits/chosen": 14.067806243896484, "logits/rejected": 14.349769592285156, "logps/chosen": -4.5989885330200195, "logps/rejected": -4.732901573181152, "loss": 3.5776, "rewards/accuracies": 0.5, "rewards/chosen": -45.98988342285156, "rewards/margins": 1.3391304016113281, "rewards/rejected": -47.32901382446289, "step": 6439 }, { "epoch": 0.8769063180827886, "grad_norm": 40.060070416814426, "learning_rate": 3.636652758435019e-08, "logits/chosen": 14.064960479736328, "logits/rejected": 14.566059112548828, "logps/chosen": -4.17862606048584, "logps/rejected": -4.27132511138916, "loss": 4.2407, "rewards/accuracies": 0.75, "rewards/chosen": -41.78626251220703, "rewards/margins": 0.9269905090332031, "rewards/rejected": -42.713253021240234, "step": 6440 }, { "epoch": 0.8770424836601307, "grad_norm": 39.85249439194284, "learning_rate": 3.6287353653903006e-08, "logits/chosen": 14.194461822509766, "logits/rejected": 14.901826858520508, "logps/chosen": -4.312193393707275, "logps/rejected": -4.490956783294678, "loss": 4.0289, "rewards/accuracies": 0.75, "rewards/chosen": -43.1219367980957, "rewards/margins": 1.7876348495483398, "rewards/rejected": -44.90957260131836, "step": 6441 }, { "epoch": 0.8771786492374728, "grad_norm": 36.721617717985424, "learning_rate": 3.620826190733477e-08, "logits/chosen": 14.345317840576172, "logits/rejected": 14.828203201293945, "logps/chosen": -4.683570861816406, "logps/rejected": -4.872730255126953, "loss": 3.5213, "rewards/accuracies": 0.75, "rewards/chosen": -46.83570861816406, "rewards/margins": 1.8915939331054688, "rewards/rejected": -48.72730255126953, "step": 6442 }, { "epoch": 0.8773148148148148, "grad_norm": 39.798449728964805, "learning_rate": 3.612925236251687e-08, "logits/chosen": 14.590230941772461, "logits/rejected": 14.797420501708984, "logps/chosen": -4.2295427322387695, "logps/rejected": -4.582754135131836, "loss": 3.8283, "rewards/accuracies": 1.0, "rewards/chosen": -42.29542541503906, "rewards/margins": 3.5321168899536133, "rewards/rejected": -45.827545166015625, "step": 6443 }, { "epoch": 0.8774509803921569, "grad_norm": 40.83120364961105, "learning_rate": 3.605032503730214e-08, "logits/chosen": 14.44749641418457, "logits/rejected": 14.975740432739258, "logps/chosen": -4.630241870880127, "logps/rejected": -4.796483516693115, "loss": 3.878, "rewards/accuracies": 0.75, "rewards/chosen": -46.30242156982422, "rewards/margins": 1.6624116897583008, "rewards/rejected": -47.96483612060547, "step": 6444 }, { "epoch": 0.8775871459694989, "grad_norm": 38.722213145053985, "learning_rate": 3.597147994952503e-08, "logits/chosen": 14.034849166870117, "logits/rejected": 14.833902359008789, "logps/chosen": -4.374560832977295, "logps/rejected": -4.962850570678711, "loss": 4.1173, "rewards/accuracies": 1.0, "rewards/chosen": -43.74560546875, "rewards/margins": 5.88289737701416, "rewards/rejected": -49.62850570678711, "step": 6445 }, { "epoch": 0.8777233115468409, "grad_norm": 43.15198894144574, "learning_rate": 3.5892717117001013e-08, "logits/chosen": 15.187000274658203, "logits/rejected": 15.06383228302002, "logps/chosen": -4.822427272796631, "logps/rejected": -5.0331950187683105, "loss": 4.0673, "rewards/accuracies": 0.75, "rewards/chosen": -48.224273681640625, "rewards/margins": 2.1076745986938477, "rewards/rejected": -50.331947326660156, "step": 6446 }, { "epoch": 0.877859477124183, "grad_norm": 41.67888203165422, "learning_rate": 3.581403655752733e-08, "logits/chosen": 15.028715133666992, "logits/rejected": 14.905380249023438, "logps/chosen": -4.82575798034668, "logps/rejected": -4.890745639801025, "loss": 4.3206, "rewards/accuracies": 0.75, "rewards/chosen": -48.25757598876953, "rewards/margins": 0.6498794555664062, "rewards/rejected": -48.90745544433594, "step": 6447 }, { "epoch": 0.8779956427015251, "grad_norm": 40.04307944705918, "learning_rate": 3.57354382888825e-08, "logits/chosen": 14.532066345214844, "logits/rejected": 15.145087242126465, "logps/chosen": -4.635391712188721, "logps/rejected": -5.068246841430664, "loss": 4.1382, "rewards/accuracies": 0.75, "rewards/chosen": -46.35391616821289, "rewards/margins": 4.328556060791016, "rewards/rejected": -50.682472229003906, "step": 6448 }, { "epoch": 0.878131808278867, "grad_norm": 41.682175913447935, "learning_rate": 3.565692232882638e-08, "logits/chosen": 14.342623710632324, "logits/rejected": 14.531595230102539, "logps/chosen": -4.367714881896973, "logps/rejected": -4.813499450683594, "loss": 4.3514, "rewards/accuracies": 1.0, "rewards/chosen": -43.67715072631836, "rewards/margins": 4.4578399658203125, "rewards/rejected": -48.13499069213867, "step": 6449 }, { "epoch": 0.8782679738562091, "grad_norm": 42.496024727111845, "learning_rate": 3.557848869510036e-08, "logits/chosen": 14.127886772155762, "logits/rejected": 13.943355560302734, "logps/chosen": -4.338479995727539, "logps/rejected": -4.320595741271973, "loss": 3.9022, "rewards/accuracies": 0.5, "rewards/chosen": -43.384803771972656, "rewards/margins": -0.17884349822998047, "rewards/rejected": -43.205955505371094, "step": 6450 }, { "epoch": 0.8784041394335512, "grad_norm": 38.55484709449903, "learning_rate": 3.550013740542725e-08, "logits/chosen": 14.156562805175781, "logits/rejected": 14.953237533569336, "logps/chosen": -4.557342052459717, "logps/rejected": -4.961421966552734, "loss": 4.1511, "rewards/accuracies": 0.5, "rewards/chosen": -45.573421478271484, "rewards/margins": 4.040799140930176, "rewards/rejected": -49.61422348022461, "step": 6451 }, { "epoch": 0.8785403050108932, "grad_norm": 38.49578596676452, "learning_rate": 3.542186847751099e-08, "logits/chosen": 14.615710258483887, "logits/rejected": 15.266876220703125, "logps/chosen": -4.401449680328369, "logps/rejected": -4.81148624420166, "loss": 3.9552, "rewards/accuracies": 1.0, "rewards/chosen": -44.01449966430664, "rewards/margins": 4.1003618240356445, "rewards/rejected": -48.11486053466797, "step": 6452 }, { "epoch": 0.8786764705882353, "grad_norm": 45.77917172597979, "learning_rate": 3.534368192903714e-08, "logits/chosen": 14.747373580932617, "logits/rejected": 14.953824996948242, "logps/chosen": -4.7901611328125, "logps/rejected": -4.900787353515625, "loss": 4.4658, "rewards/accuracies": 0.5, "rewards/chosen": -47.901611328125, "rewards/margins": 1.1062612533569336, "rewards/rejected": -49.007869720458984, "step": 6453 }, { "epoch": 0.8788126361655774, "grad_norm": 38.72595865184456, "learning_rate": 3.526557777767278e-08, "logits/chosen": 14.436639785766602, "logits/rejected": 14.881178855895996, "logps/chosen": -4.661087512969971, "logps/rejected": -4.811749458312988, "loss": 3.5644, "rewards/accuracies": 0.5, "rewards/chosen": -46.61087417602539, "rewards/margins": 1.5066194534301758, "rewards/rejected": -48.11749267578125, "step": 6454 }, { "epoch": 0.8789488017429193, "grad_norm": 39.61019622169989, "learning_rate": 3.518755604106594e-08, "logits/chosen": 14.946962356567383, "logits/rejected": 15.287985801696777, "logps/chosen": -4.735480308532715, "logps/rejected": -4.775103569030762, "loss": 3.8143, "rewards/accuracies": 0.25, "rewards/chosen": -47.35480499267578, "rewards/margins": 0.39623355865478516, "rewards/rejected": -47.75103759765625, "step": 6455 }, { "epoch": 0.8790849673202614, "grad_norm": 41.20304745552521, "learning_rate": 3.510961673684636e-08, "logits/chosen": 14.066795349121094, "logits/rejected": 14.952457427978516, "logps/chosen": -4.500525951385498, "logps/rejected": -4.816723823547363, "loss": 4.1008, "rewards/accuracies": 0.5, "rewards/chosen": -45.0052604675293, "rewards/margins": 3.161980628967285, "rewards/rejected": -48.167240142822266, "step": 6456 }, { "epoch": 0.8792211328976035, "grad_norm": 41.844584415698144, "learning_rate": 3.503175988262521e-08, "logits/chosen": 13.986108779907227, "logits/rejected": 14.614659309387207, "logps/chosen": -4.321345329284668, "logps/rejected": -4.513345241546631, "loss": 4.29, "rewards/accuracies": 0.75, "rewards/chosen": -43.21344757080078, "rewards/margins": 1.920003890991211, "rewards/rejected": -45.133453369140625, "step": 6457 }, { "epoch": 0.8793572984749455, "grad_norm": 42.639880685192644, "learning_rate": 3.495398549599469e-08, "logits/chosen": 14.11113166809082, "logits/rejected": 15.11379623413086, "logps/chosen": -4.3033223152160645, "logps/rejected": -4.709806442260742, "loss": 3.3097, "rewards/accuracies": 0.75, "rewards/chosen": -43.033226013183594, "rewards/margins": 4.064835548400879, "rewards/rejected": -47.098060607910156, "step": 6458 }, { "epoch": 0.8794934640522876, "grad_norm": 40.61617206098586, "learning_rate": 3.487629359452859e-08, "logits/chosen": 14.85152816772461, "logits/rejected": 14.703044891357422, "logps/chosen": -4.765834808349609, "logps/rejected": -4.671970367431641, "loss": 4.2081, "rewards/accuracies": 0.5, "rewards/chosen": -47.658348083496094, "rewards/margins": -0.9386482238769531, "rewards/rejected": -46.71969985961914, "step": 6459 }, { "epoch": 0.8796296296296297, "grad_norm": 43.90939205314352, "learning_rate": 3.479868419578223e-08, "logits/chosen": 14.797377586364746, "logits/rejected": 14.516624450683594, "logps/chosen": -4.741280555725098, "logps/rejected": -4.624700546264648, "loss": 4.5394, "rewards/accuracies": 0.5, "rewards/chosen": -47.412803649902344, "rewards/margins": -1.1657962799072266, "rewards/rejected": -46.247005462646484, "step": 6460 }, { "epoch": 0.8797657952069716, "grad_norm": 40.982325456421705, "learning_rate": 3.472115731729186e-08, "logits/chosen": 15.135599136352539, "logits/rejected": 15.718717575073242, "logps/chosen": -4.821378231048584, "logps/rejected": -5.003107070922852, "loss": 3.6441, "rewards/accuracies": 0.75, "rewards/chosen": -48.21377944946289, "rewards/margins": 1.8172883987426758, "rewards/rejected": -50.03106689453125, "step": 6461 }, { "epoch": 0.8799019607843137, "grad_norm": 49.40187349353996, "learning_rate": 3.464371297657544e-08, "logits/chosen": 13.825761795043945, "logits/rejected": 14.012065887451172, "logps/chosen": -4.740679740905762, "logps/rejected": -4.678719520568848, "loss": 4.0955, "rewards/accuracies": 0.5, "rewards/chosen": -47.40679931640625, "rewards/margins": -0.6196060180664062, "rewards/rejected": -46.787193298339844, "step": 6462 }, { "epoch": 0.8800381263616558, "grad_norm": 39.648449275949694, "learning_rate": 3.4566351191132226e-08, "logits/chosen": 15.664426803588867, "logits/rejected": 15.132089614868164, "logps/chosen": -5.049696922302246, "logps/rejected": -4.8420209884643555, "loss": 3.7368, "rewards/accuracies": 0.25, "rewards/chosen": -50.496971130371094, "rewards/margins": -2.0767593383789062, "rewards/rejected": -48.42020797729492, "step": 6463 }, { "epoch": 0.8801742919389978, "grad_norm": 39.02396905623387, "learning_rate": 3.4489071978442577e-08, "logits/chosen": 15.04260540008545, "logits/rejected": 14.05266284942627, "logps/chosen": -4.861433506011963, "logps/rejected": -4.380093097686768, "loss": 3.9452, "rewards/accuracies": 0.0, "rewards/chosen": -48.61433410644531, "rewards/margins": -4.813405990600586, "rewards/rejected": -43.800933837890625, "step": 6464 }, { "epoch": 0.8803104575163399, "grad_norm": 41.5167297176962, "learning_rate": 3.4411875355968436e-08, "logits/chosen": 13.661964416503906, "logits/rejected": 14.153857231140137, "logps/chosen": -4.556730270385742, "logps/rejected": -4.5382208824157715, "loss": 3.8949, "rewards/accuracies": 0.25, "rewards/chosen": -45.56730651855469, "rewards/margins": -0.18509387969970703, "rewards/rejected": -45.38220977783203, "step": 6465 }, { "epoch": 0.8804466230936819, "grad_norm": 40.08070452972409, "learning_rate": 3.433476134115314e-08, "logits/chosen": 15.205245971679688, "logits/rejected": 15.38247299194336, "logps/chosen": -4.76780891418457, "logps/rejected": -5.023346900939941, "loss": 3.8461, "rewards/accuracies": 0.75, "rewards/chosen": -47.67809295654297, "rewards/margins": 2.5553789138793945, "rewards/rejected": -50.23347091674805, "step": 6466 }, { "epoch": 0.880582788671024, "grad_norm": 41.73931570074017, "learning_rate": 3.425772995142107e-08, "logits/chosen": 14.333309173583984, "logits/rejected": 14.590768814086914, "logps/chosen": -4.37169075012207, "logps/rejected": -4.462538242340088, "loss": 3.5952, "rewards/accuracies": 0.5, "rewards/chosen": -43.7169075012207, "rewards/margins": 0.9084749221801758, "rewards/rejected": -44.62538146972656, "step": 6467 }, { "epoch": 0.880718954248366, "grad_norm": 38.62818444195284, "learning_rate": 3.418078120417815e-08, "logits/chosen": 13.819652557373047, "logits/rejected": 14.379454612731934, "logps/chosen": -4.448179721832275, "logps/rejected": -4.481438159942627, "loss": 4.0622, "rewards/accuracies": 0.5, "rewards/chosen": -44.48179626464844, "rewards/margins": 0.3325834274291992, "rewards/rejected": -44.81438064575195, "step": 6468 }, { "epoch": 0.8808551198257081, "grad_norm": 37.19704350555269, "learning_rate": 3.41039151168117e-08, "logits/chosen": 13.48834228515625, "logits/rejected": 14.318721771240234, "logps/chosen": -4.348040580749512, "logps/rejected": -4.689782619476318, "loss": 3.9913, "rewards/accuracies": 0.75, "rewards/chosen": -43.480403900146484, "rewards/margins": 3.417422294616699, "rewards/rejected": -46.8978271484375, "step": 6469 }, { "epoch": 0.8809912854030502, "grad_norm": 39.201112368283106, "learning_rate": 3.402713170669007e-08, "logits/chosen": 13.673286437988281, "logits/rejected": 14.439432144165039, "logps/chosen": -4.555774688720703, "logps/rejected": -4.7411789894104, "loss": 3.8302, "rewards/accuracies": 0.75, "rewards/chosen": -45.55774688720703, "rewards/margins": 1.8540449142456055, "rewards/rejected": -47.41178894042969, "step": 6470 }, { "epoch": 0.8811274509803921, "grad_norm": 42.155194732993046, "learning_rate": 3.395043099116317e-08, "logits/chosen": 14.543783187866211, "logits/rejected": 14.458722114562988, "logps/chosen": -4.850053787231445, "logps/rejected": -5.048701286315918, "loss": 4.0962, "rewards/accuracies": 0.5, "rewards/chosen": -48.50054168701172, "rewards/margins": 1.9864740371704102, "rewards/rejected": -50.48701477050781, "step": 6471 }, { "epoch": 0.8812636165577342, "grad_norm": 38.945189588369495, "learning_rate": 3.387381298756229e-08, "logits/chosen": 14.800037384033203, "logits/rejected": 14.43634033203125, "logps/chosen": -4.61683464050293, "logps/rejected": -4.846805572509766, "loss": 4.3265, "rewards/accuracies": 0.75, "rewards/chosen": -46.16835021972656, "rewards/margins": 2.2997074127197266, "rewards/rejected": -48.468055725097656, "step": 6472 }, { "epoch": 0.8813997821350763, "grad_norm": 39.676289683047436, "learning_rate": 3.379727771319971e-08, "logits/chosen": 13.932962417602539, "logits/rejected": 14.234332084655762, "logps/chosen": -4.558501720428467, "logps/rejected": -4.531645774841309, "loss": 3.3804, "rewards/accuracies": 0.5, "rewards/chosen": -45.585018157958984, "rewards/margins": -0.26856136322021484, "rewards/rejected": -45.31645584106445, "step": 6473 }, { "epoch": 0.8815359477124183, "grad_norm": 40.32732377099096, "learning_rate": 3.372082518536934e-08, "logits/chosen": 15.268407821655273, "logits/rejected": 15.248322486877441, "logps/chosen": -4.603562831878662, "logps/rejected": -4.8523173332214355, "loss": 4.0159, "rewards/accuracies": 0.75, "rewards/chosen": -46.03562927246094, "rewards/margins": 2.4875450134277344, "rewards/rejected": -48.52317428588867, "step": 6474 }, { "epoch": 0.8816721132897604, "grad_norm": 43.04886759795251, "learning_rate": 3.364445542134624e-08, "logits/chosen": 14.18839168548584, "logits/rejected": 14.802669525146484, "logps/chosen": -4.844453811645508, "logps/rejected": -4.767755031585693, "loss": 4.0422, "rewards/accuracies": 0.25, "rewards/chosen": -48.444541931152344, "rewards/margins": -0.7669906616210938, "rewards/rejected": -47.67755126953125, "step": 6475 }, { "epoch": 0.8818082788671024, "grad_norm": 47.38511245499785, "learning_rate": 3.35681684383867e-08, "logits/chosen": 14.101490020751953, "logits/rejected": 15.271114349365234, "logps/chosen": -4.59080171585083, "logps/rejected": -4.788484573364258, "loss": 4.1942, "rewards/accuracies": 0.75, "rewards/chosen": -45.90801239013672, "rewards/margins": 1.9768295288085938, "rewards/rejected": -47.88484191894531, "step": 6476 }, { "epoch": 0.8819444444444444, "grad_norm": 39.33563366229272, "learning_rate": 3.349196425372844e-08, "logits/chosen": 14.62244987487793, "logits/rejected": 14.665203094482422, "logps/chosen": -4.406145095825195, "logps/rejected": -4.47603178024292, "loss": 4.0961, "rewards/accuracies": 0.5, "rewards/chosen": -44.06145095825195, "rewards/margins": 0.6988677978515625, "rewards/rejected": -44.76032257080078, "step": 6477 }, { "epoch": 0.8820806100217865, "grad_norm": 38.489626585386304, "learning_rate": 3.341584288459054e-08, "logits/chosen": 14.42796516418457, "logits/rejected": 15.121371269226074, "logps/chosen": -4.404318332672119, "logps/rejected": -4.725629806518555, "loss": 3.5271, "rewards/accuracies": 1.0, "rewards/chosen": -44.043182373046875, "rewards/margins": 3.213113784790039, "rewards/rejected": -47.25629806518555, "step": 6478 }, { "epoch": 0.8822167755991286, "grad_norm": 43.57951044291274, "learning_rate": 3.333980434817305e-08, "logits/chosen": 15.790075302124023, "logits/rejected": 15.692468643188477, "logps/chosen": -4.901934623718262, "logps/rejected": -5.079016208648682, "loss": 3.7928, "rewards/accuracies": 0.75, "rewards/chosen": -49.01934051513672, "rewards/margins": 1.7708206176757812, "rewards/rejected": -50.7901611328125, "step": 6479 }, { "epoch": 0.8823529411764706, "grad_norm": 39.55519929447748, "learning_rate": 3.326384866165765e-08, "logits/chosen": 14.891534805297852, "logits/rejected": 15.06617546081543, "logps/chosen": -4.6144514083862305, "logps/rejected": -5.088848114013672, "loss": 3.928, "rewards/accuracies": 0.75, "rewards/chosen": -46.14451599121094, "rewards/margins": 4.743963241577148, "rewards/rejected": -50.88848114013672, "step": 6480 }, { "epoch": 0.8824891067538126, "grad_norm": 41.994809027705, "learning_rate": 3.3187975842207163e-08, "logits/chosen": 14.70077133178711, "logits/rejected": 15.293102264404297, "logps/chosen": -4.421480655670166, "logps/rejected": -4.7776618003845215, "loss": 4.0933, "rewards/accuracies": 1.0, "rewards/chosen": -44.214805603027344, "rewards/margins": 3.5618114471435547, "rewards/rejected": -47.77661895751953, "step": 6481 }, { "epoch": 0.8826252723311547, "grad_norm": 38.47332356172082, "learning_rate": 3.3112185906965586e-08, "logits/chosen": 14.484356880187988, "logits/rejected": 14.796318054199219, "logps/chosen": -4.527605056762695, "logps/rejected": -4.6509785652160645, "loss": 3.695, "rewards/accuracies": 0.5, "rewards/chosen": -45.27605438232422, "rewards/margins": 1.2337350845336914, "rewards/rejected": -46.509788513183594, "step": 6482 }, { "epoch": 0.8827614379084967, "grad_norm": 40.597213364654294, "learning_rate": 3.303647887305834e-08, "logits/chosen": 14.86036491394043, "logits/rejected": 14.855960845947266, "logps/chosen": -4.738621711730957, "logps/rejected": -4.6878509521484375, "loss": 4.1506, "rewards/accuracies": 0.5, "rewards/chosen": -47.38621520996094, "rewards/margins": -0.5077085494995117, "rewards/rejected": -46.878509521484375, "step": 6483 }, { "epoch": 0.8828976034858388, "grad_norm": 84.77280739651992, "learning_rate": 3.296085475759205e-08, "logits/chosen": 14.257421493530273, "logits/rejected": 13.95866584777832, "logps/chosen": -4.484647750854492, "logps/rejected": -4.486104488372803, "loss": 4.4487, "rewards/accuracies": 0.75, "rewards/chosen": -44.84648132324219, "rewards/margins": 0.01456451416015625, "rewards/rejected": -44.861045837402344, "step": 6484 }, { "epoch": 0.8830337690631809, "grad_norm": 40.58947203686128, "learning_rate": 3.28853135776546e-08, "logits/chosen": 14.258101463317871, "logits/rejected": 14.523179054260254, "logps/chosen": -4.498550891876221, "logps/rejected": -4.613833427429199, "loss": 3.8307, "rewards/accuracies": 0.5, "rewards/chosen": -44.985504150390625, "rewards/margins": 1.152827262878418, "rewards/rejected": -46.13833236694336, "step": 6485 }, { "epoch": 0.8831699346405228, "grad_norm": 40.5092708038606, "learning_rate": 3.280985535031511e-08, "logits/chosen": 13.768486022949219, "logits/rejected": 13.883493423461914, "logps/chosen": -4.500424385070801, "logps/rejected": -4.83055305480957, "loss": 3.4526, "rewards/accuracies": 1.0, "rewards/chosen": -45.004249572753906, "rewards/margins": 3.301283836364746, "rewards/rejected": -48.3055305480957, "step": 6486 }, { "epoch": 0.8833061002178649, "grad_norm": 40.933824978560075, "learning_rate": 3.2734480092624053e-08, "logits/chosen": 14.103248596191406, "logits/rejected": 14.533075332641602, "logps/chosen": -4.473748207092285, "logps/rejected": -4.3725714683532715, "loss": 3.3981, "rewards/accuracies": 0.0, "rewards/chosen": -44.73748016357422, "rewards/margins": -1.011763572692871, "rewards/rejected": -43.72571563720703, "step": 6487 }, { "epoch": 0.883442265795207, "grad_norm": 38.35042821355382, "learning_rate": 3.2659187821613096e-08, "logits/chosen": 14.445199012756348, "logits/rejected": 14.843838691711426, "logps/chosen": -4.609823226928711, "logps/rejected": -4.7170820236206055, "loss": 3.9873, "rewards/accuracies": 0.25, "rewards/chosen": -46.098228454589844, "rewards/margins": 1.0725946426391602, "rewards/rejected": -47.17082214355469, "step": 6488 }, { "epoch": 0.883578431372549, "grad_norm": 39.29280698865142, "learning_rate": 3.258397855429509e-08, "logits/chosen": 14.183427810668945, "logits/rejected": 14.115612983703613, "logps/chosen": -4.739110469818115, "logps/rejected": -4.806196689605713, "loss": 3.8869, "rewards/accuracies": 0.5, "rewards/chosen": -47.3911018371582, "rewards/margins": 0.670863151550293, "rewards/rejected": -48.06196594238281, "step": 6489 }, { "epoch": 0.8837145969498911, "grad_norm": 38.85671261571352, "learning_rate": 3.25088523076642e-08, "logits/chosen": 14.73836898803711, "logits/rejected": 15.28462028503418, "logps/chosen": -5.023624420166016, "logps/rejected": -5.205217361450195, "loss": 3.9726, "rewards/accuracies": 0.75, "rewards/chosen": -50.23624038696289, "rewards/margins": 1.8159351348876953, "rewards/rejected": -52.05217742919922, "step": 6490 }, { "epoch": 0.8838507625272332, "grad_norm": 39.264347204974364, "learning_rate": 3.243380909869593e-08, "logits/chosen": 14.4051513671875, "logits/rejected": 15.189664840698242, "logps/chosen": -4.561454772949219, "logps/rejected": -5.059544086456299, "loss": 3.9269, "rewards/accuracies": 0.75, "rewards/chosen": -45.61454772949219, "rewards/margins": 4.980893135070801, "rewards/rejected": -50.59543991088867, "step": 6491 }, { "epoch": 0.8839869281045751, "grad_norm": 37.74596266658387, "learning_rate": 3.235884894434675e-08, "logits/chosen": 13.953543663024902, "logits/rejected": 13.859729766845703, "logps/chosen": -4.445187568664551, "logps/rejected": -4.53847599029541, "loss": 3.7765, "rewards/accuracies": 0.5, "rewards/chosen": -44.451873779296875, "rewards/margins": 0.9328842163085938, "rewards/rejected": -45.38475799560547, "step": 6492 }, { "epoch": 0.8841230936819172, "grad_norm": 46.109190758398256, "learning_rate": 3.2283971861554626e-08, "logits/chosen": 14.522802352905273, "logits/rejected": 14.891372680664062, "logps/chosen": -4.885660171508789, "logps/rejected": -5.061139106750488, "loss": 3.9412, "rewards/accuracies": 0.75, "rewards/chosen": -48.85660171508789, "rewards/margins": 1.7547931671142578, "rewards/rejected": -50.61139678955078, "step": 6493 }, { "epoch": 0.8842592592592593, "grad_norm": 40.77399154840054, "learning_rate": 3.22091778672386e-08, "logits/chosen": 15.411334037780762, "logits/rejected": 15.396018981933594, "logps/chosen": -4.551492214202881, "logps/rejected": -4.991722106933594, "loss": 3.7867, "rewards/accuracies": 1.0, "rewards/chosen": -45.514923095703125, "rewards/margins": 4.402299880981445, "rewards/rejected": -49.91722106933594, "step": 6494 }, { "epoch": 0.8843954248366013, "grad_norm": 38.57747988376743, "learning_rate": 3.213446697829911e-08, "logits/chosen": 13.916593551635742, "logits/rejected": 15.428321838378906, "logps/chosen": -4.508435249328613, "logps/rejected": -4.912290573120117, "loss": 3.8646, "rewards/accuracies": 1.0, "rewards/chosen": -45.084354400634766, "rewards/margins": 4.038553237915039, "rewards/rejected": -49.12290573120117, "step": 6495 }, { "epoch": 0.8845315904139434, "grad_norm": 37.23214431265603, "learning_rate": 3.2059839211617545e-08, "logits/chosen": 13.838316917419434, "logits/rejected": 14.32630729675293, "logps/chosen": -4.254088401794434, "logps/rejected": -4.621793270111084, "loss": 3.5394, "rewards/accuracies": 1.0, "rewards/chosen": -42.54088592529297, "rewards/margins": 3.677046775817871, "rewards/rejected": -46.217933654785156, "step": 6496 }, { "epoch": 0.8846677559912854, "grad_norm": 37.72110853555348, "learning_rate": 3.198529458405672e-08, "logits/chosen": 14.255071640014648, "logits/rejected": 13.81844711303711, "logps/chosen": -4.492557525634766, "logps/rejected": -4.542933464050293, "loss": 3.0718, "rewards/accuracies": 0.5, "rewards/chosen": -44.92557144165039, "rewards/margins": 0.5037631988525391, "rewards/rejected": -45.42933654785156, "step": 6497 }, { "epoch": 0.8848039215686274, "grad_norm": 38.45180337932437, "learning_rate": 3.191083311246072e-08, "logits/chosen": 14.926191329956055, "logits/rejected": 14.75230598449707, "logps/chosen": -4.74140739440918, "logps/rejected": -4.903442859649658, "loss": 3.5655, "rewards/accuracies": 0.5, "rewards/chosen": -47.41407012939453, "rewards/margins": 1.620356559753418, "rewards/rejected": -49.034427642822266, "step": 6498 }, { "epoch": 0.8849400871459695, "grad_norm": 41.0604630550154, "learning_rate": 3.1836454813654536e-08, "logits/chosen": 14.664083480834961, "logits/rejected": 14.388851165771484, "logps/chosen": -4.681318283081055, "logps/rejected": -4.673686981201172, "loss": 3.8736, "rewards/accuracies": 0.5, "rewards/chosen": -46.81318664550781, "rewards/margins": -0.07631492614746094, "rewards/rejected": -46.73686981201172, "step": 6499 }, { "epoch": 0.8850762527233116, "grad_norm": 39.10369442483878, "learning_rate": 3.176215970444467e-08, "logits/chosen": 13.589301109313965, "logits/rejected": 13.76563549041748, "logps/chosen": -4.277870178222656, "logps/rejected": -4.581272125244141, "loss": 3.7416, "rewards/accuracies": 0.75, "rewards/chosen": -42.77870178222656, "rewards/margins": 3.0340137481689453, "rewards/rejected": -45.812721252441406, "step": 6500 }, { "epoch": 0.8852124183006536, "grad_norm": 39.65595163246586, "learning_rate": 3.1687947801618765e-08, "logits/chosen": 14.204534530639648, "logits/rejected": 14.233745574951172, "logps/chosen": -4.454671859741211, "logps/rejected": -4.930067539215088, "loss": 4.0185, "rewards/accuracies": 0.75, "rewards/chosen": -44.54671859741211, "rewards/margins": 4.753960609436035, "rewards/rejected": -49.30067825317383, "step": 6501 }, { "epoch": 0.8853485838779956, "grad_norm": 44.12158499723476, "learning_rate": 3.1613819121945493e-08, "logits/chosen": 13.933298110961914, "logits/rejected": 14.533862113952637, "logps/chosen": -4.399033546447754, "logps/rejected": -4.813880443572998, "loss": 4.7196, "rewards/accuracies": 0.5, "rewards/chosen": -43.99033737182617, "rewards/margins": 4.148466110229492, "rewards/rejected": -48.1388053894043, "step": 6502 }, { "epoch": 0.8854847494553377, "grad_norm": 38.35279890681306, "learning_rate": 3.1539773682174885e-08, "logits/chosen": 14.506776809692383, "logits/rejected": 14.529740333557129, "logps/chosen": -4.587214469909668, "logps/rejected": -4.688831806182861, "loss": 4.1858, "rewards/accuracies": 0.75, "rewards/chosen": -45.87214660644531, "rewards/margins": 1.0161724090576172, "rewards/rejected": -46.88832092285156, "step": 6503 }, { "epoch": 0.8856209150326797, "grad_norm": 46.256890073056574, "learning_rate": 3.146581149903818e-08, "logits/chosen": 14.744789123535156, "logits/rejected": 14.553966522216797, "logps/chosen": -4.724396705627441, "logps/rejected": -4.464123725891113, "loss": 4.1929, "rewards/accuracies": 0.25, "rewards/chosen": -47.24396896362305, "rewards/margins": -2.602733612060547, "rewards/rejected": -44.6412353515625, "step": 6504 }, { "epoch": 0.8857570806100218, "grad_norm": 42.97400171333708, "learning_rate": 3.1391932589247727e-08, "logits/chosen": 15.219793319702148, "logits/rejected": 14.948490142822266, "logps/chosen": -4.914238929748535, "logps/rejected": -4.579657554626465, "loss": 4.1575, "rewards/accuracies": 0.25, "rewards/chosen": -49.14238739013672, "rewards/margins": -3.3458127975463867, "rewards/rejected": -45.79657745361328, "step": 6505 }, { "epoch": 0.8858932461873639, "grad_norm": 42.92956418158357, "learning_rate": 3.131813696949699e-08, "logits/chosen": 14.854589462280273, "logits/rejected": 15.04582691192627, "logps/chosen": -4.806427001953125, "logps/rejected": -4.789983749389648, "loss": 3.178, "rewards/accuracies": 0.5, "rewards/chosen": -48.06427001953125, "rewards/margins": -0.16443157196044922, "rewards/rejected": -47.89984130859375, "step": 6506 }, { "epoch": 0.8860294117647058, "grad_norm": 42.9384945661305, "learning_rate": 3.124442465646075e-08, "logits/chosen": 14.417570114135742, "logits/rejected": 15.190101623535156, "logps/chosen": -4.714385032653809, "logps/rejected": -5.023967742919922, "loss": 4.2849, "rewards/accuracies": 1.0, "rewards/chosen": -47.14384841918945, "rewards/margins": 3.0958261489868164, "rewards/rejected": -50.23967742919922, "step": 6507 }, { "epoch": 0.8861655773420479, "grad_norm": 43.21315719073564, "learning_rate": 3.1170795666795036e-08, "logits/chosen": 15.22298812866211, "logits/rejected": 16.0406494140625, "logps/chosen": -4.772747993469238, "logps/rejected": -5.398681163787842, "loss": 3.8566, "rewards/accuracies": 1.0, "rewards/chosen": -47.72747802734375, "rewards/margins": 6.259328842163086, "rewards/rejected": -53.98680877685547, "step": 6508 }, { "epoch": 0.88630174291939, "grad_norm": 38.53708457844315, "learning_rate": 3.10972500171367e-08, "logits/chosen": 13.947295188903809, "logits/rejected": 14.450223922729492, "logps/chosen": -4.604883670806885, "logps/rejected": -4.813775539398193, "loss": 3.6998, "rewards/accuracies": 0.75, "rewards/chosen": -46.04883575439453, "rewards/margins": 2.0889225006103516, "rewards/rejected": -48.13775634765625, "step": 6509 }, { "epoch": 0.886437908496732, "grad_norm": 63.83185578931224, "learning_rate": 3.10237877241041e-08, "logits/chosen": 14.956218719482422, "logits/rejected": 15.553567886352539, "logps/chosen": -4.653094291687012, "logps/rejected": -4.919430255889893, "loss": 3.985, "rewards/accuracies": 0.75, "rewards/chosen": -46.53094482421875, "rewards/margins": 2.6633529663085938, "rewards/rejected": -49.19430160522461, "step": 6510 }, { "epoch": 0.8865740740740741, "grad_norm": 37.45896377005266, "learning_rate": 3.0950408804296754e-08, "logits/chosen": 13.15710735321045, "logits/rejected": 14.612422943115234, "logps/chosen": -4.361906051635742, "logps/rejected": -4.841331958770752, "loss": 3.4914, "rewards/accuracies": 0.75, "rewards/chosen": -43.619056701660156, "rewards/margins": 4.7942609786987305, "rewards/rejected": -48.4133186340332, "step": 6511 }, { "epoch": 0.8867102396514162, "grad_norm": 41.71501959447256, "learning_rate": 3.0877113274295055e-08, "logits/chosen": 14.285404205322266, "logits/rejected": 14.671859741210938, "logps/chosen": -4.454624652862549, "logps/rejected": -4.860350608825684, "loss": 4.205, "rewards/accuracies": 1.0, "rewards/chosen": -44.54624557495117, "rewards/margins": 4.057260513305664, "rewards/rejected": -48.6035041809082, "step": 6512 }, { "epoch": 0.8868464052287581, "grad_norm": 42.0519880213345, "learning_rate": 3.0803901150660805e-08, "logits/chosen": 14.303911209106445, "logits/rejected": 13.88951301574707, "logps/chosen": -4.5944414138793945, "logps/rejected": -4.819624423980713, "loss": 4.1481, "rewards/accuracies": 0.75, "rewards/chosen": -45.94441604614258, "rewards/margins": 2.251828193664551, "rewards/rejected": -48.19624328613281, "step": 6513 }, { "epoch": 0.8869825708061002, "grad_norm": 38.6362647208514, "learning_rate": 3.073077244993696e-08, "logits/chosen": 14.26382827758789, "logits/rejected": 15.243694305419922, "logps/chosen": -4.6035566329956055, "logps/rejected": -4.863302707672119, "loss": 3.5102, "rewards/accuracies": 0.75, "rewards/chosen": -46.03556823730469, "rewards/margins": 2.5974550247192383, "rewards/rejected": -48.633026123046875, "step": 6514 }, { "epoch": 0.8871187363834423, "grad_norm": 41.4799855681419, "learning_rate": 3.065772718864745e-08, "logits/chosen": 13.89389705657959, "logits/rejected": 14.234964370727539, "logps/chosen": -4.376729965209961, "logps/rejected": -4.643310070037842, "loss": 3.6646, "rewards/accuracies": 0.75, "rewards/chosen": -43.76729965209961, "rewards/margins": 2.665802001953125, "rewards/rejected": -46.433101654052734, "step": 6515 }, { "epoch": 0.8872549019607843, "grad_norm": 41.166806449111085, "learning_rate": 3.058476538329748e-08, "logits/chosen": 14.532787322998047, "logits/rejected": 14.3997163772583, "logps/chosen": -4.714715957641602, "logps/rejected": -4.699108123779297, "loss": 3.7809, "rewards/accuracies": 0.5, "rewards/chosen": -47.14716339111328, "rewards/margins": -0.1560802459716797, "rewards/rejected": -46.99108123779297, "step": 6516 }, { "epoch": 0.8873910675381264, "grad_norm": 38.352261908469934, "learning_rate": 3.051188705037346e-08, "logits/chosen": 14.510626792907715, "logits/rejected": 15.455545425415039, "logps/chosen": -4.547002792358398, "logps/rejected": -5.147993087768555, "loss": 3.2882, "rewards/accuracies": 1.0, "rewards/chosen": -45.470027923583984, "rewards/margins": 6.009905815124512, "rewards/rejected": -51.47993469238281, "step": 6517 }, { "epoch": 0.8875272331154684, "grad_norm": 41.196344579629574, "learning_rate": 3.043909220634271e-08, "logits/chosen": 13.224620819091797, "logits/rejected": 14.102914810180664, "logps/chosen": -4.327890396118164, "logps/rejected": -4.52847146987915, "loss": 3.8954, "rewards/accuracies": 0.75, "rewards/chosen": -43.278900146484375, "rewards/margins": 2.0058135986328125, "rewards/rejected": -45.28471374511719, "step": 6518 }, { "epoch": 0.8876633986928104, "grad_norm": 39.56188364427536, "learning_rate": 3.036638086765388e-08, "logits/chosen": 15.098784446716309, "logits/rejected": 14.777717590332031, "logps/chosen": -4.398246765136719, "logps/rejected": -5.045330047607422, "loss": 3.8715, "rewards/accuracies": 1.0, "rewards/chosen": -43.98246765136719, "rewards/margins": 6.470832824707031, "rewards/rejected": -50.453304290771484, "step": 6519 }, { "epoch": 0.8877995642701525, "grad_norm": 41.98395350952181, "learning_rate": 3.029375305073678e-08, "logits/chosen": 14.81173038482666, "logits/rejected": 13.871893882751465, "logps/chosen": -4.437952995300293, "logps/rejected": -4.170068740844727, "loss": 4.075, "rewards/accuracies": 0.25, "rewards/chosen": -44.37953186035156, "rewards/margins": -2.6788415908813477, "rewards/rejected": -41.700687408447266, "step": 6520 }, { "epoch": 0.8879357298474946, "grad_norm": 41.062408606476154, "learning_rate": 3.022120877200218e-08, "logits/chosen": 14.800256729125977, "logits/rejected": 14.696557998657227, "logps/chosen": -4.761910438537598, "logps/rejected": -4.760038375854492, "loss": 3.571, "rewards/accuracies": 0.5, "rewards/chosen": -47.619102478027344, "rewards/margins": -0.018721580505371094, "rewards/rejected": -47.600379943847656, "step": 6521 }, { "epoch": 0.8880718954248366, "grad_norm": 40.1535795850193, "learning_rate": 3.014874804784204e-08, "logits/chosen": 14.231813430786133, "logits/rejected": 14.687196731567383, "logps/chosen": -4.48306131362915, "logps/rejected": -5.012423992156982, "loss": 4.1894, "rewards/accuracies": 1.0, "rewards/chosen": -44.83061599731445, "rewards/margins": 5.2936248779296875, "rewards/rejected": -50.12424087524414, "step": 6522 }, { "epoch": 0.8882080610021786, "grad_norm": 41.23729999195635, "learning_rate": 3.007637089462958e-08, "logits/chosen": 14.122503280639648, "logits/rejected": 14.66187858581543, "logps/chosen": -4.67372989654541, "logps/rejected": -4.795241832733154, "loss": 4.1222, "rewards/accuracies": 0.75, "rewards/chosen": -46.737300872802734, "rewards/margins": 1.2151174545288086, "rewards/rejected": -47.952415466308594, "step": 6523 }, { "epoch": 0.8883442265795207, "grad_norm": 38.06796090785564, "learning_rate": 3.000407732871886e-08, "logits/chosen": 14.677613258361816, "logits/rejected": 15.151172637939453, "logps/chosen": -4.703119277954102, "logps/rejected": -4.88994026184082, "loss": 3.9497, "rewards/accuracies": 0.5, "rewards/chosen": -47.031192779541016, "rewards/margins": 1.868210792541504, "rewards/rejected": -48.89940643310547, "step": 6524 }, { "epoch": 0.8884803921568627, "grad_norm": 41.198144395327546, "learning_rate": 2.9931867366445306e-08, "logits/chosen": 14.827590942382812, "logits/rejected": 14.314483642578125, "logps/chosen": -4.73652458190918, "logps/rejected": -4.500007152557373, "loss": 4.0589, "rewards/accuracies": 0.0, "rewards/chosen": -47.36524200439453, "rewards/margins": -2.365171432495117, "rewards/rejected": -45.00007247924805, "step": 6525 }, { "epoch": 0.8886165577342048, "grad_norm": 43.13160440288225, "learning_rate": 2.985974102412538e-08, "logits/chosen": 14.542314529418945, "logits/rejected": 14.714337348937988, "logps/chosen": -4.397445201873779, "logps/rejected": -4.784115791320801, "loss": 4.5291, "rewards/accuracies": 0.5, "rewards/chosen": -43.97445297241211, "rewards/margins": 3.866703987121582, "rewards/rejected": -47.841156005859375, "step": 6526 }, { "epoch": 0.8887527233115469, "grad_norm": 44.275953299208176, "learning_rate": 2.97876983180565e-08, "logits/chosen": 14.13455581665039, "logits/rejected": 15.732900619506836, "logps/chosen": -4.594271659851074, "logps/rejected": -5.084941864013672, "loss": 3.6656, "rewards/accuracies": 1.0, "rewards/chosen": -45.94271469116211, "rewards/margins": 4.906705856323242, "rewards/rejected": -50.84941864013672, "step": 6527 }, { "epoch": 0.8888888888888888, "grad_norm": 38.018946844763214, "learning_rate": 2.9715739264517447e-08, "logits/chosen": 14.40975570678711, "logits/rejected": 15.13525676727295, "logps/chosen": -4.513505935668945, "logps/rejected": -4.767147064208984, "loss": 3.6107, "rewards/accuracies": 0.75, "rewards/chosen": -45.13505935668945, "rewards/margins": 2.5364112854003906, "rewards/rejected": -47.671470642089844, "step": 6528 }, { "epoch": 0.8890250544662309, "grad_norm": 92.57386271518045, "learning_rate": 2.964386387976794e-08, "logits/chosen": 14.342592239379883, "logits/rejected": 14.544661521911621, "logps/chosen": -4.437013626098633, "logps/rejected": -4.684909343719482, "loss": 4.1286, "rewards/accuracies": 1.0, "rewards/chosen": -44.37013626098633, "rewards/margins": 2.4789581298828125, "rewards/rejected": -46.84909439086914, "step": 6529 }, { "epoch": 0.889161220043573, "grad_norm": 41.47915299904694, "learning_rate": 2.9572072180048713e-08, "logits/chosen": 14.284486770629883, "logits/rejected": 14.533041000366211, "logps/chosen": -4.547621726989746, "logps/rejected": -4.635652542114258, "loss": 3.8896, "rewards/accuracies": 0.5, "rewards/chosen": -45.476219177246094, "rewards/margins": 0.8803081512451172, "rewards/rejected": -46.35652542114258, "step": 6530 }, { "epoch": 0.889297385620915, "grad_norm": 40.01834752309073, "learning_rate": 2.950036418158177e-08, "logits/chosen": 14.110508918762207, "logits/rejected": 14.464035034179688, "logps/chosen": -4.296862602233887, "logps/rejected": -4.494927406311035, "loss": 4.0428, "rewards/accuracies": 0.75, "rewards/chosen": -42.968624114990234, "rewards/margins": 1.9806489944458008, "rewards/rejected": -44.94927215576172, "step": 6531 }, { "epoch": 0.8894335511982571, "grad_norm": 39.71516064043896, "learning_rate": 2.94287399005702e-08, "logits/chosen": 14.012845993041992, "logits/rejected": 14.956276893615723, "logps/chosen": -4.4286208152771, "logps/rejected": -5.250198841094971, "loss": 4.0328, "rewards/accuracies": 1.0, "rewards/chosen": -44.28620910644531, "rewards/margins": 8.21578311920166, "rewards/rejected": -52.501991271972656, "step": 6532 }, { "epoch": 0.8895697167755992, "grad_norm": 39.342584766789905, "learning_rate": 2.9357199353197936e-08, "logits/chosen": 14.82567024230957, "logits/rejected": 15.112269401550293, "logps/chosen": -4.761421203613281, "logps/rejected": -5.020576477050781, "loss": 3.8498, "rewards/accuracies": 1.0, "rewards/chosen": -47.61421203613281, "rewards/margins": 2.5915517807006836, "rewards/rejected": -50.20576477050781, "step": 6533 }, { "epoch": 0.8897058823529411, "grad_norm": 41.59098304344986, "learning_rate": 2.9285742555630233e-08, "logits/chosen": 14.990986824035645, "logits/rejected": 14.128732681274414, "logps/chosen": -4.806604862213135, "logps/rejected": -4.455408096313477, "loss": 3.9144, "rewards/accuracies": 0.25, "rewards/chosen": -48.06604766845703, "rewards/margins": -3.511964797973633, "rewards/rejected": -44.55408477783203, "step": 6534 }, { "epoch": 0.8898420479302832, "grad_norm": 42.3853069799051, "learning_rate": 2.921436952401346e-08, "logits/chosen": 14.918155670166016, "logits/rejected": 14.71446418762207, "logps/chosen": -4.994912624359131, "logps/rejected": -5.239040851593018, "loss": 4.2119, "rewards/accuracies": 1.0, "rewards/chosen": -49.949127197265625, "rewards/margins": 2.441281318664551, "rewards/rejected": -52.390411376953125, "step": 6535 }, { "epoch": 0.8899782135076253, "grad_norm": 42.794784337538815, "learning_rate": 2.9143080274474717e-08, "logits/chosen": 14.540855407714844, "logits/rejected": 14.151235580444336, "logps/chosen": -4.6006669998168945, "logps/rejected": -4.520681381225586, "loss": 3.8148, "rewards/accuracies": 0.5, "rewards/chosen": -46.00666809082031, "rewards/margins": -0.7998552322387695, "rewards/rejected": -45.20681381225586, "step": 6536 }, { "epoch": 0.8901143790849673, "grad_norm": 41.87471138406421, "learning_rate": 2.9071874823122587e-08, "logits/chosen": 13.716867446899414, "logits/rejected": 14.964395523071289, "logps/chosen": -4.486245155334473, "logps/rejected": -4.9456400871276855, "loss": 4.4183, "rewards/accuracies": 1.0, "rewards/chosen": -44.862457275390625, "rewards/margins": 4.593945503234863, "rewards/rejected": -49.456398010253906, "step": 6537 }, { "epoch": 0.8902505446623094, "grad_norm": 41.39242481502023, "learning_rate": 2.9000753186046466e-08, "logits/chosen": 14.308740615844727, "logits/rejected": 14.953252792358398, "logps/chosen": -4.645808219909668, "logps/rejected": -4.723169803619385, "loss": 4.218, "rewards/accuracies": 0.5, "rewards/chosen": -46.45808029174805, "rewards/margins": 0.7736177444458008, "rewards/rejected": -47.23169708251953, "step": 6538 }, { "epoch": 0.8903867102396514, "grad_norm": 40.21193829072285, "learning_rate": 2.8929715379316832e-08, "logits/chosen": 14.551156997680664, "logits/rejected": 15.687885284423828, "logps/chosen": -4.419943809509277, "logps/rejected": -4.835207939147949, "loss": 4.3648, "rewards/accuracies": 0.75, "rewards/chosen": -44.19943618774414, "rewards/margins": 4.152645111083984, "rewards/rejected": -48.352081298828125, "step": 6539 }, { "epoch": 0.8905228758169934, "grad_norm": 40.362234316203185, "learning_rate": 2.8858761418985334e-08, "logits/chosen": 15.481813430786133, "logits/rejected": 15.129400253295898, "logps/chosen": -4.870872497558594, "logps/rejected": -4.626046180725098, "loss": 4.3222, "rewards/accuracies": 0.25, "rewards/chosen": -48.70872497558594, "rewards/margins": -2.4482622146606445, "rewards/rejected": -46.260459899902344, "step": 6540 }, { "epoch": 0.8906590413943355, "grad_norm": 39.13423175944397, "learning_rate": 2.8787891321084612e-08, "logits/chosen": 14.846063613891602, "logits/rejected": 14.74040412902832, "logps/chosen": -4.693353652954102, "logps/rejected": -4.666848182678223, "loss": 4.1234, "rewards/accuracies": 0.25, "rewards/chosen": -46.93354034423828, "rewards/margins": -0.26505565643310547, "rewards/rejected": -46.668479919433594, "step": 6541 }, { "epoch": 0.8907952069716776, "grad_norm": 47.71609369316438, "learning_rate": 2.871710510162826e-08, "logits/chosen": 14.300975799560547, "logits/rejected": 14.741975784301758, "logps/chosen": -4.386206150054932, "logps/rejected": -4.8303070068359375, "loss": 4.0787, "rewards/accuracies": 1.0, "rewards/chosen": -43.862064361572266, "rewards/margins": 4.441004753112793, "rewards/rejected": -48.30306625366211, "step": 6542 }, { "epoch": 0.8909313725490197, "grad_norm": 39.97217031603346, "learning_rate": 2.8646402776611078e-08, "logits/chosen": 14.820169448852539, "logits/rejected": 14.864229202270508, "logps/chosen": -4.937773704528809, "logps/rejected": -4.975769996643066, "loss": 3.8484, "rewards/accuracies": 0.75, "rewards/chosen": -49.37773895263672, "rewards/margins": 0.3799610137939453, "rewards/rejected": -49.7577018737793, "step": 6543 }, { "epoch": 0.8910675381263616, "grad_norm": 39.151714831863956, "learning_rate": 2.857578436200887e-08, "logits/chosen": 14.917496681213379, "logits/rejected": 15.085123062133789, "logps/chosen": -4.727084159851074, "logps/rejected": -4.985587120056152, "loss": 4.3374, "rewards/accuracies": 0.75, "rewards/chosen": -47.270843505859375, "rewards/margins": 2.5850257873535156, "rewards/rejected": -49.855865478515625, "step": 6544 }, { "epoch": 0.8912037037037037, "grad_norm": 40.884141562039005, "learning_rate": 2.850524987377838e-08, "logits/chosen": 14.874187469482422, "logits/rejected": 15.514604568481445, "logps/chosen": -4.756472587585449, "logps/rejected": -5.197388648986816, "loss": 3.4952, "rewards/accuracies": 0.75, "rewards/chosen": -47.56472396850586, "rewards/margins": 4.409161567687988, "rewards/rejected": -51.97388458251953, "step": 6545 }, { "epoch": 0.8913398692810458, "grad_norm": 42.256550642071915, "learning_rate": 2.8434799327857438e-08, "logits/chosen": 14.200368881225586, "logits/rejected": 14.335332870483398, "logps/chosen": -4.5042595863342285, "logps/rejected": -4.712741851806641, "loss": 3.9004, "rewards/accuracies": 0.75, "rewards/chosen": -45.04259490966797, "rewards/margins": 2.0848217010498047, "rewards/rejected": -47.127418518066406, "step": 6546 }, { "epoch": 0.8914760348583878, "grad_norm": 42.747263072572736, "learning_rate": 2.836443274016509e-08, "logits/chosen": 14.013545989990234, "logits/rejected": 14.650619506835938, "logps/chosen": -4.6046881675720215, "logps/rejected": -5.024587154388428, "loss": 4.3817, "rewards/accuracies": 1.0, "rewards/chosen": -46.04688262939453, "rewards/margins": 4.1989850997924805, "rewards/rejected": -50.24586868286133, "step": 6547 }, { "epoch": 0.8916122004357299, "grad_norm": 43.026722487285134, "learning_rate": 2.8294150126601058e-08, "logits/chosen": 15.658334732055664, "logits/rejected": 14.856510162353516, "logps/chosen": -4.801113605499268, "logps/rejected": -4.799249172210693, "loss": 4.2511, "rewards/accuracies": 0.5, "rewards/chosen": -48.011138916015625, "rewards/margins": -0.018644332885742188, "rewards/rejected": -47.99249267578125, "step": 6548 }, { "epoch": 0.891748366013072, "grad_norm": 37.28752034018903, "learning_rate": 2.822395150304633e-08, "logits/chosen": 13.77725601196289, "logits/rejected": 14.050874710083008, "logps/chosen": -4.436852931976318, "logps/rejected": -4.874312877655029, "loss": 3.5758, "rewards/accuracies": 0.75, "rewards/chosen": -44.3685302734375, "rewards/margins": 4.374599456787109, "rewards/rejected": -48.743125915527344, "step": 6549 }, { "epoch": 0.8918845315904139, "grad_norm": 46.0302397726929, "learning_rate": 2.815383688536297e-08, "logits/chosen": 14.428596496582031, "logits/rejected": 14.749295234680176, "logps/chosen": -4.637877464294434, "logps/rejected": -5.02396297454834, "loss": 4.0032, "rewards/accuracies": 0.75, "rewards/chosen": -46.3787727355957, "rewards/margins": 3.8608551025390625, "rewards/rejected": -50.239627838134766, "step": 6550 }, { "epoch": 0.892020697167756, "grad_norm": 38.273833815056, "learning_rate": 2.808380628939382e-08, "logits/chosen": 14.761812210083008, "logits/rejected": 14.411946296691895, "logps/chosen": -4.580590724945068, "logps/rejected": -4.378120422363281, "loss": 4.397, "rewards/accuracies": 0.5, "rewards/chosen": -45.805908203125, "rewards/margins": -2.024704933166504, "rewards/rejected": -43.78120422363281, "step": 6551 }, { "epoch": 0.8921568627450981, "grad_norm": 40.80526476857286, "learning_rate": 2.801385973096293e-08, "logits/chosen": 14.193906784057617, "logits/rejected": 14.796161651611328, "logps/chosen": -4.576153755187988, "logps/rejected": -4.975676536560059, "loss": 3.5121, "rewards/accuracies": 0.75, "rewards/chosen": -45.76153564453125, "rewards/margins": 3.995227813720703, "rewards/rejected": -49.75676345825195, "step": 6552 }, { "epoch": 0.8922930283224401, "grad_norm": 41.13347472429537, "learning_rate": 2.794399722587535e-08, "logits/chosen": 14.691766738891602, "logits/rejected": 15.273365020751953, "logps/chosen": -4.629507541656494, "logps/rejected": -4.819700241088867, "loss": 4.0282, "rewards/accuracies": 0.5, "rewards/chosen": -46.295074462890625, "rewards/margins": 1.901926040649414, "rewards/rejected": -48.19700241088867, "step": 6553 }, { "epoch": 0.8924291938997821, "grad_norm": 42.83120463740622, "learning_rate": 2.787421878991698e-08, "logits/chosen": 14.361532211303711, "logits/rejected": 15.134136199951172, "logps/chosen": -4.74995756149292, "logps/rejected": -4.939859390258789, "loss": 4.7214, "rewards/accuracies": 0.75, "rewards/chosen": -47.49957275390625, "rewards/margins": 1.8990192413330078, "rewards/rejected": -49.39859390258789, "step": 6554 }, { "epoch": 0.8925653594771242, "grad_norm": 40.40032048946577, "learning_rate": 2.7804524438854947e-08, "logits/chosen": 13.977506637573242, "logits/rejected": 14.334724426269531, "logps/chosen": -4.529017448425293, "logps/rejected": -4.832776069641113, "loss": 3.7993, "rewards/accuracies": 1.0, "rewards/chosen": -45.29017639160156, "rewards/margins": 3.0375823974609375, "rewards/rejected": -48.3277587890625, "step": 6555 }, { "epoch": 0.8927015250544662, "grad_norm": 39.050514338399665, "learning_rate": 2.773491418843723e-08, "logits/chosen": 14.327739715576172, "logits/rejected": 14.776878356933594, "logps/chosen": -4.837926387786865, "logps/rejected": -4.862831115722656, "loss": 3.8955, "rewards/accuracies": 0.5, "rewards/chosen": -48.37926483154297, "rewards/margins": 0.24904537200927734, "rewards/rejected": -48.62831115722656, "step": 6556 }, { "epoch": 0.8928376906318083, "grad_norm": 40.703710121204345, "learning_rate": 2.7665388054392758e-08, "logits/chosen": 14.061655044555664, "logits/rejected": 14.490994453430176, "logps/chosen": -4.2730326652526855, "logps/rejected": -4.7039947509765625, "loss": 3.9307, "rewards/accuracies": 1.0, "rewards/chosen": -42.730323791503906, "rewards/margins": 4.3096208572387695, "rewards/rejected": -47.039947509765625, "step": 6557 }, { "epoch": 0.8929738562091504, "grad_norm": 40.57670030652575, "learning_rate": 2.7595946052431628e-08, "logits/chosen": 14.28436279296875, "logits/rejected": 14.583799362182617, "logps/chosen": -4.682580947875977, "logps/rejected": -4.702793598175049, "loss": 4.2023, "rewards/accuracies": 0.5, "rewards/chosen": -46.8258056640625, "rewards/margins": 0.20212841033935547, "rewards/rejected": -47.02793884277344, "step": 6558 }, { "epoch": 0.8931100217864923, "grad_norm": 41.010278774395275, "learning_rate": 2.752658819824485e-08, "logits/chosen": 14.120109558105469, "logits/rejected": 14.85757827758789, "logps/chosen": -4.700868606567383, "logps/rejected": -4.965498924255371, "loss": 4.0027, "rewards/accuracies": 0.75, "rewards/chosen": -47.00868225097656, "rewards/margins": 2.6463088989257812, "rewards/rejected": -49.654991149902344, "step": 6559 }, { "epoch": 0.8932461873638344, "grad_norm": 34.795428047412315, "learning_rate": 2.7457314507504326e-08, "logits/chosen": 15.616766929626465, "logits/rejected": 15.344226837158203, "logps/chosen": -4.997537136077881, "logps/rejected": -5.176491737365723, "loss": 3.5571, "rewards/accuracies": 0.75, "rewards/chosen": -49.97536849975586, "rewards/margins": 1.7895469665527344, "rewards/rejected": -51.764915466308594, "step": 6560 }, { "epoch": 0.8933823529411765, "grad_norm": 37.85830980479912, "learning_rate": 2.738812499586305e-08, "logits/chosen": 13.952360153198242, "logits/rejected": 15.057592391967773, "logps/chosen": -4.178728103637695, "logps/rejected": -4.842991352081299, "loss": 3.9217, "rewards/accuracies": 1.0, "rewards/chosen": -41.78727722167969, "rewards/margins": 6.642634391784668, "rewards/rejected": -48.42991638183594, "step": 6561 }, { "epoch": 0.8935185185185185, "grad_norm": 40.89838149419417, "learning_rate": 2.7319019678955046e-08, "logits/chosen": 15.252176284790039, "logits/rejected": 14.763099670410156, "logps/chosen": -4.698614120483398, "logps/rejected": -4.753860950469971, "loss": 3.9653, "rewards/accuracies": 0.5, "rewards/chosen": -46.986141204833984, "rewards/margins": 0.5524673461914062, "rewards/rejected": -47.53860855102539, "step": 6562 }, { "epoch": 0.8936546840958606, "grad_norm": 35.86681425476365, "learning_rate": 2.7249998572395073e-08, "logits/chosen": 14.840599060058594, "logits/rejected": 14.73019790649414, "logps/chosen": -5.173654556274414, "logps/rejected": -5.144780158996582, "loss": 4.0024, "rewards/accuracies": 0.5, "rewards/chosen": -51.73654556274414, "rewards/margins": -0.2887430191040039, "rewards/rejected": -51.44780349731445, "step": 6563 }, { "epoch": 0.8937908496732027, "grad_norm": 40.31536661720972, "learning_rate": 2.718106169177914e-08, "logits/chosen": 14.91802978515625, "logits/rejected": 14.215778350830078, "logps/chosen": -4.50380802154541, "logps/rejected": -4.611621856689453, "loss": 3.8785, "rewards/accuracies": 1.0, "rewards/chosen": -45.038082122802734, "rewards/margins": 1.0781402587890625, "rewards/rejected": -46.11621856689453, "step": 6564 }, { "epoch": 0.8939270152505446, "grad_norm": 40.12024007818816, "learning_rate": 2.711220905268412e-08, "logits/chosen": 14.281917572021484, "logits/rejected": 14.462955474853516, "logps/chosen": -4.78446102142334, "logps/rejected": -4.990388870239258, "loss": 3.7925, "rewards/accuracies": 0.75, "rewards/chosen": -47.84461212158203, "rewards/margins": 2.0592756271362305, "rewards/rejected": -49.90388870239258, "step": 6565 }, { "epoch": 0.8940631808278867, "grad_norm": 47.87583645863469, "learning_rate": 2.7043440670667705e-08, "logits/chosen": 15.692557334899902, "logits/rejected": 15.115021705627441, "logps/chosen": -4.833592891693115, "logps/rejected": -4.603033065795898, "loss": 3.9455, "rewards/accuracies": 0.5, "rewards/chosen": -48.33592987060547, "rewards/margins": -2.3055973052978516, "rewards/rejected": -46.03033447265625, "step": 6566 }, { "epoch": 0.8941993464052288, "grad_norm": 41.3562313321993, "learning_rate": 2.6974756561268754e-08, "logits/chosen": 13.82982063293457, "logits/rejected": 14.201261520385742, "logps/chosen": -4.294280052185059, "logps/rejected": -4.549516677856445, "loss": 3.8042, "rewards/accuracies": 0.75, "rewards/chosen": -42.94279861450195, "rewards/margins": 2.552365303039551, "rewards/rejected": -45.49516296386719, "step": 6567 }, { "epoch": 0.8943355119825708, "grad_norm": 41.15034963417751, "learning_rate": 2.6906156740007115e-08, "logits/chosen": 14.888601303100586, "logits/rejected": 14.722372055053711, "logps/chosen": -4.607912063598633, "logps/rejected": -4.8740692138671875, "loss": 3.6609, "rewards/accuracies": 1.0, "rewards/chosen": -46.079124450683594, "rewards/margins": 2.6615657806396484, "rewards/rejected": -48.740692138671875, "step": 6568 }, { "epoch": 0.8944716775599129, "grad_norm": 46.98749087690615, "learning_rate": 2.683764122238328e-08, "logits/chosen": 13.671453475952148, "logits/rejected": 14.451064109802246, "logps/chosen": -4.41574239730835, "logps/rejected": -4.713226318359375, "loss": 4.2532, "rewards/accuracies": 1.0, "rewards/chosen": -44.15742492675781, "rewards/margins": 2.9748382568359375, "rewards/rejected": -47.13226318359375, "step": 6569 }, { "epoch": 0.8946078431372549, "grad_norm": 43.76968071005333, "learning_rate": 2.676921002387904e-08, "logits/chosen": 13.848316192626953, "logits/rejected": 13.906158447265625, "logps/chosen": -4.027389049530029, "logps/rejected": -4.314399242401123, "loss": 3.9571, "rewards/accuracies": 0.75, "rewards/chosen": -40.273887634277344, "rewards/margins": 2.870100975036621, "rewards/rejected": -43.14398956298828, "step": 6570 }, { "epoch": 0.8947440087145969, "grad_norm": 41.12955778147922, "learning_rate": 2.670086315995701e-08, "logits/chosen": 14.215349197387695, "logits/rejected": 14.512024879455566, "logps/chosen": -4.25843620300293, "logps/rejected": -4.333893775939941, "loss": 3.9113, "rewards/accuracies": 0.5, "rewards/chosen": -42.58435821533203, "rewards/margins": 0.754582405090332, "rewards/rejected": -43.33893966674805, "step": 6571 }, { "epoch": 0.894880174291939, "grad_norm": 41.1905018004193, "learning_rate": 2.6632600646060566e-08, "logits/chosen": 15.038777351379395, "logits/rejected": 15.24653148651123, "logps/chosen": -4.805950164794922, "logps/rejected": -5.108544826507568, "loss": 4.3294, "rewards/accuracies": 0.75, "rewards/chosen": -48.05949783325195, "rewards/margins": 3.025949478149414, "rewards/rejected": -51.08544921875, "step": 6572 }, { "epoch": 0.8950163398692811, "grad_norm": 42.937787819616496, "learning_rate": 2.6564422497614348e-08, "logits/chosen": 13.95383071899414, "logits/rejected": 14.604515075683594, "logps/chosen": -4.636384010314941, "logps/rejected": -4.7582244873046875, "loss": 4.5164, "rewards/accuracies": 0.75, "rewards/chosen": -46.36383819580078, "rewards/margins": 1.2184009552001953, "rewards/rejected": -47.58224105834961, "step": 6573 }, { "epoch": 0.8951525054466231, "grad_norm": 39.81656704725779, "learning_rate": 2.6496328730023766e-08, "logits/chosen": 14.569705963134766, "logits/rejected": 14.735723495483398, "logps/chosen": -4.306543350219727, "logps/rejected": -4.354829788208008, "loss": 3.9423, "rewards/accuracies": 0.5, "rewards/chosen": -43.065433502197266, "rewards/margins": 0.4828615188598633, "rewards/rejected": -43.54829406738281, "step": 6574 }, { "epoch": 0.8952886710239651, "grad_norm": 39.905432123617864, "learning_rate": 2.6428319358675045e-08, "logits/chosen": 14.484014511108398, "logits/rejected": 14.469213485717773, "logps/chosen": -4.34181022644043, "logps/rejected": -4.530344486236572, "loss": 3.6778, "rewards/accuracies": 0.5, "rewards/chosen": -43.41809844970703, "rewards/margins": 1.8853445053100586, "rewards/rejected": -45.303443908691406, "step": 6575 }, { "epoch": 0.8954248366013072, "grad_norm": 42.86714649922698, "learning_rate": 2.6360394398935537e-08, "logits/chosen": 14.140459060668945, "logits/rejected": 14.915777206420898, "logps/chosen": -4.250396728515625, "logps/rejected": -4.929347991943359, "loss": 4.05, "rewards/accuracies": 1.0, "rewards/chosen": -42.503971099853516, "rewards/margins": 6.789508819580078, "rewards/rejected": -49.293479919433594, "step": 6576 }, { "epoch": 0.8955610021786492, "grad_norm": 39.77209584494284, "learning_rate": 2.62925538661535e-08, "logits/chosen": 14.490447998046875, "logits/rejected": 14.917156219482422, "logps/chosen": -4.880655288696289, "logps/rejected": -4.9836812019348145, "loss": 4.0993, "rewards/accuracies": 0.25, "rewards/chosen": -48.806556701660156, "rewards/margins": 1.0302562713623047, "rewards/rejected": -49.83681106567383, "step": 6577 }, { "epoch": 0.8956971677559913, "grad_norm": 38.339589198484816, "learning_rate": 2.6224797775657957e-08, "logits/chosen": 14.903091430664062, "logits/rejected": 14.853509902954102, "logps/chosen": -4.675122261047363, "logps/rejected": -4.747149467468262, "loss": 3.7524, "rewards/accuracies": 0.5, "rewards/chosen": -46.751220703125, "rewards/margins": 0.7202739715576172, "rewards/rejected": -47.471492767333984, "step": 6578 }, { "epoch": 0.8958333333333334, "grad_norm": 40.16076160698346, "learning_rate": 2.6157126142759023e-08, "logits/chosen": 14.831621170043945, "logits/rejected": 14.825502395629883, "logps/chosen": -4.899356842041016, "logps/rejected": -4.803455352783203, "loss": 3.5162, "rewards/accuracies": 0.25, "rewards/chosen": -48.993568420410156, "rewards/margins": -0.9590129852294922, "rewards/rejected": -48.03455352783203, "step": 6579 }, { "epoch": 0.8959694989106753, "grad_norm": 37.65926008139862, "learning_rate": 2.6089538982747748e-08, "logits/chosen": 14.413700103759766, "logits/rejected": 15.415189743041992, "logps/chosen": -4.956892013549805, "logps/rejected": -5.013285160064697, "loss": 3.9824, "rewards/accuracies": 0.5, "rewards/chosen": -49.56892013549805, "rewards/margins": 0.5639305114746094, "rewards/rejected": -50.132850646972656, "step": 6580 }, { "epoch": 0.8961056644880174, "grad_norm": 40.89945951739213, "learning_rate": 2.6022036310895834e-08, "logits/chosen": 14.64291000366211, "logits/rejected": 14.842201232910156, "logps/chosen": -4.792424201965332, "logps/rejected": -4.665085792541504, "loss": 3.9599, "rewards/accuracies": 0.25, "rewards/chosen": -47.92424774169922, "rewards/margins": -1.2733840942382812, "rewards/rejected": -46.65085983276367, "step": 6581 }, { "epoch": 0.8962418300653595, "grad_norm": 38.865558837094724, "learning_rate": 2.5954618142456142e-08, "logits/chosen": 14.812064170837402, "logits/rejected": 14.49820327758789, "logps/chosen": -4.598847389221191, "logps/rejected": -4.724178314208984, "loss": 3.6696, "rewards/accuracies": 0.5, "rewards/chosen": -45.98847198486328, "rewards/margins": 1.2533130645751953, "rewards/rejected": -47.241783142089844, "step": 6582 }, { "epoch": 0.8963779956427015, "grad_norm": 38.61242720720594, "learning_rate": 2.5887284492662397e-08, "logits/chosen": 14.599346160888672, "logits/rejected": 14.998235702514648, "logps/chosen": -4.619811058044434, "logps/rejected": -4.9517292976379395, "loss": 4.1911, "rewards/accuracies": 1.0, "rewards/chosen": -46.19811248779297, "rewards/margins": 3.319180488586426, "rewards/rejected": -49.51729202270508, "step": 6583 }, { "epoch": 0.8965141612200436, "grad_norm": 38.20864638876201, "learning_rate": 2.5820035376729143e-08, "logits/chosen": 14.465511322021484, "logits/rejected": 13.927375793457031, "logps/chosen": -4.404763221740723, "logps/rejected": -4.537230491638184, "loss": 3.9823, "rewards/accuracies": 0.5, "rewards/chosen": -44.04763412475586, "rewards/margins": 1.324671745300293, "rewards/rejected": -45.37230682373047, "step": 6584 }, { "epoch": 0.8966503267973857, "grad_norm": 37.56556010070771, "learning_rate": 2.575287080985191e-08, "logits/chosen": 14.504496574401855, "logits/rejected": 14.74588394165039, "logps/chosen": -4.722018241882324, "logps/rejected": -4.915040016174316, "loss": 3.8621, "rewards/accuracies": 0.75, "rewards/chosen": -47.220184326171875, "rewards/margins": 1.9302139282226562, "rewards/rejected": -49.15039825439453, "step": 6585 }, { "epoch": 0.8967864923747276, "grad_norm": 39.151433829001, "learning_rate": 2.5685790807207098e-08, "logits/chosen": 14.853487014770508, "logits/rejected": 14.579948425292969, "logps/chosen": -4.857764720916748, "logps/rejected": -4.73886775970459, "loss": 3.9595, "rewards/accuracies": 0.5, "rewards/chosen": -48.57765197753906, "rewards/margins": -1.1889724731445312, "rewards/rejected": -47.38867950439453, "step": 6586 }, { "epoch": 0.8969226579520697, "grad_norm": 39.50909434884919, "learning_rate": 2.5618795383951952e-08, "logits/chosen": 12.85440444946289, "logits/rejected": 13.539265632629395, "logps/chosen": -4.163694858551025, "logps/rejected": -4.633372783660889, "loss": 3.8243, "rewards/accuracies": 1.0, "rewards/chosen": -41.63694763183594, "rewards/margins": 4.696778297424316, "rewards/rejected": -46.33372497558594, "step": 6587 }, { "epoch": 0.8970588235294118, "grad_norm": 40.48291735618648, "learning_rate": 2.5551884555224633e-08, "logits/chosen": 14.377402305603027, "logits/rejected": 15.144027709960938, "logps/chosen": -4.555698871612549, "logps/rejected": -4.895991325378418, "loss": 4.3279, "rewards/accuracies": 0.75, "rewards/chosen": -45.55698776245117, "rewards/margins": 3.402923583984375, "rewards/rejected": -48.95991516113281, "step": 6588 }, { "epoch": 0.8971949891067538, "grad_norm": 38.28238910416021, "learning_rate": 2.5485058336144206e-08, "logits/chosen": 14.535296440124512, "logits/rejected": 14.476339340209961, "logps/chosen": -4.465503215789795, "logps/rejected": -4.638671875, "loss": 4.0028, "rewards/accuracies": 0.5, "rewards/chosen": -44.655029296875, "rewards/margins": 1.7316904067993164, "rewards/rejected": -46.38671875, "step": 6589 }, { "epoch": 0.8973311546840959, "grad_norm": 46.24540211656691, "learning_rate": 2.5418316741810674e-08, "logits/chosen": 14.610404968261719, "logits/rejected": 15.39065933227539, "logps/chosen": -4.535118579864502, "logps/rejected": -5.1183271408081055, "loss": 4.3929, "rewards/accuracies": 0.75, "rewards/chosen": -45.3511848449707, "rewards/margins": 5.832084655761719, "rewards/rejected": -51.18326950073242, "step": 6590 }, { "epoch": 0.8974673202614379, "grad_norm": 43.827604746603626, "learning_rate": 2.535165978730478e-08, "logits/chosen": 14.732389450073242, "logits/rejected": 14.911152839660645, "logps/chosen": -4.631199836730957, "logps/rejected": -4.626772880554199, "loss": 4.0229, "rewards/accuracies": 0.5, "rewards/chosen": -46.31199645996094, "rewards/margins": -0.044264793395996094, "rewards/rejected": -46.267730712890625, "step": 6591 }, { "epoch": 0.8976034858387799, "grad_norm": 45.95662437226892, "learning_rate": 2.5285087487688205e-08, "logits/chosen": 14.655227661132812, "logits/rejected": 14.850357055664062, "logps/chosen": -4.916316986083984, "logps/rejected": -5.041952610015869, "loss": 4.4351, "rewards/accuracies": 0.5, "rewards/chosen": -49.163169860839844, "rewards/margins": 1.256361961364746, "rewards/rejected": -50.41952896118164, "step": 6592 }, { "epoch": 0.897739651416122, "grad_norm": 38.33874641846983, "learning_rate": 2.5218599858003586e-08, "logits/chosen": 14.642705917358398, "logits/rejected": 13.633312225341797, "logps/chosen": -4.575044631958008, "logps/rejected": -4.581121444702148, "loss": 3.8766, "rewards/accuracies": 0.5, "rewards/chosen": -45.750450134277344, "rewards/margins": 0.06076526641845703, "rewards/rejected": -45.81121063232422, "step": 6593 }, { "epoch": 0.8978758169934641, "grad_norm": 38.083567482656605, "learning_rate": 2.515219691327428e-08, "logits/chosen": 13.65493392944336, "logits/rejected": 14.033233642578125, "logps/chosen": -4.494999885559082, "logps/rejected": -4.553727626800537, "loss": 4.2077, "rewards/accuracies": 0.5, "rewards/chosen": -44.94999694824219, "rewards/margins": 0.5872812271118164, "rewards/rejected": -45.53727722167969, "step": 6594 }, { "epoch": 0.898011982570806, "grad_norm": 39.044600902794016, "learning_rate": 2.5085878668504555e-08, "logits/chosen": 14.224191665649414, "logits/rejected": 14.72818374633789, "logps/chosen": -4.893850326538086, "logps/rejected": -4.6283416748046875, "loss": 4.0264, "rewards/accuracies": 0.25, "rewards/chosen": -48.93850326538086, "rewards/margins": -2.655088424682617, "rewards/rejected": -46.283416748046875, "step": 6595 }, { "epoch": 0.8981481481481481, "grad_norm": 44.49084194673739, "learning_rate": 2.501964513867967e-08, "logits/chosen": 13.595019340515137, "logits/rejected": 14.391960144042969, "logps/chosen": -4.1092424392700195, "logps/rejected": -4.41451358795166, "loss": 4.2837, "rewards/accuracies": 0.5, "rewards/chosen": -41.09242248535156, "rewards/margins": 3.0527143478393555, "rewards/rejected": -44.145137786865234, "step": 6596 }, { "epoch": 0.8982843137254902, "grad_norm": 38.5370393010718, "learning_rate": 2.49534963387656e-08, "logits/chosen": 14.643280029296875, "logits/rejected": 14.145914077758789, "logps/chosen": -4.87651252746582, "logps/rejected": -4.722027778625488, "loss": 3.9071, "rewards/accuracies": 0.5, "rewards/chosen": -48.76512908935547, "rewards/margins": -1.544846534729004, "rewards/rejected": -47.220279693603516, "step": 6597 }, { "epoch": 0.8984204793028322, "grad_norm": 42.39784682111191, "learning_rate": 2.4887432283709155e-08, "logits/chosen": 14.859199523925781, "logits/rejected": 15.011360168457031, "logps/chosen": -4.771321773529053, "logps/rejected": -5.27288818359375, "loss": 4.2907, "rewards/accuracies": 1.0, "rewards/chosen": -47.713218688964844, "rewards/margins": 5.01566219329834, "rewards/rejected": -52.7288818359375, "step": 6598 }, { "epoch": 0.8985566448801743, "grad_norm": 41.90240160058227, "learning_rate": 2.482145298843812e-08, "logits/chosen": 14.309776306152344, "logits/rejected": 14.620096206665039, "logps/chosen": -4.4253082275390625, "logps/rejected": -4.762847900390625, "loss": 4.1036, "rewards/accuracies": 0.75, "rewards/chosen": -44.253082275390625, "rewards/margins": 3.375398635864258, "rewards/rejected": -47.62847900390625, "step": 6599 }, { "epoch": 0.8986928104575164, "grad_norm": 38.00673966676318, "learning_rate": 2.475555846786106e-08, "logits/chosen": 14.632389068603516, "logits/rejected": 15.035263061523438, "logps/chosen": -4.723912239074707, "logps/rejected": -4.854530334472656, "loss": 3.797, "rewards/accuracies": 0.5, "rewards/chosen": -47.23912048339844, "rewards/margins": 1.3061790466308594, "rewards/rejected": -48.54530334472656, "step": 6600 }, { "epoch": 0.8988289760348583, "grad_norm": 36.45992904103951, "learning_rate": 2.468974873686731e-08, "logits/chosen": 14.27444839477539, "logits/rejected": 14.526531219482422, "logps/chosen": -4.297107219696045, "logps/rejected": -4.706941604614258, "loss": 3.4592, "rewards/accuracies": 0.75, "rewards/chosen": -42.9710693359375, "rewards/margins": 4.098346710205078, "rewards/rejected": -47.06941604614258, "step": 6601 }, { "epoch": 0.8989651416122004, "grad_norm": 40.068693156125576, "learning_rate": 2.4624023810327198e-08, "logits/chosen": 14.623026847839355, "logits/rejected": 13.89012336730957, "logps/chosen": -4.584765911102295, "logps/rejected": -4.465246200561523, "loss": 4.3388, "rewards/accuracies": 0.25, "rewards/chosen": -45.84765625, "rewards/margins": -1.195199966430664, "rewards/rejected": -44.65245819091797, "step": 6602 }, { "epoch": 0.8991013071895425, "grad_norm": 38.14723589497324, "learning_rate": 2.455838370309182e-08, "logits/chosen": 14.485140800476074, "logits/rejected": 13.987716674804688, "logps/chosen": -4.815611362457275, "logps/rejected": -4.6913347244262695, "loss": 3.9525, "rewards/accuracies": 0.25, "rewards/chosen": -48.1561164855957, "rewards/margins": -1.2427682876586914, "rewards/rejected": -46.91334533691406, "step": 6603 }, { "epoch": 0.8992374727668845, "grad_norm": 37.98329577639327, "learning_rate": 2.4492828429993094e-08, "logits/chosen": 14.257696151733398, "logits/rejected": 14.269205093383789, "logps/chosen": -4.800317764282227, "logps/rejected": -4.978524684906006, "loss": 3.8736, "rewards/accuracies": 0.75, "rewards/chosen": -48.00318145751953, "rewards/margins": 1.7820653915405273, "rewards/rejected": -49.785247802734375, "step": 6604 }, { "epoch": 0.8993736383442266, "grad_norm": 43.197936326656425, "learning_rate": 2.4427358005843703e-08, "logits/chosen": 14.841886520385742, "logits/rejected": 14.435258865356445, "logps/chosen": -4.52604866027832, "logps/rejected": -4.4653167724609375, "loss": 3.3364, "rewards/accuracies": 0.5, "rewards/chosen": -45.26049041748047, "rewards/margins": -0.6073217391967773, "rewards/rejected": -44.653167724609375, "step": 6605 }, { "epoch": 0.8995098039215687, "grad_norm": 38.817531816467564, "learning_rate": 2.4361972445437317e-08, "logits/chosen": 14.197367668151855, "logits/rejected": 14.312231063842773, "logps/chosen": -4.5571088790893555, "logps/rejected": -4.663816452026367, "loss": 3.5718, "rewards/accuracies": 0.75, "rewards/chosen": -45.57109069824219, "rewards/margins": 1.0670785903930664, "rewards/rejected": -46.63816833496094, "step": 6606 }, { "epoch": 0.8996459694989106, "grad_norm": 43.43342458263374, "learning_rate": 2.4296671763548348e-08, "logits/chosen": 13.081483840942383, "logits/rejected": 13.98216438293457, "logps/chosen": -4.263247966766357, "logps/rejected": -4.659224510192871, "loss": 4.2733, "rewards/accuracies": 1.0, "rewards/chosen": -42.632484436035156, "rewards/margins": 3.9597654342651367, "rewards/rejected": -46.592247009277344, "step": 6607 }, { "epoch": 0.8997821350762527, "grad_norm": 39.23477554374875, "learning_rate": 2.4231455974931924e-08, "logits/chosen": 14.461889266967773, "logits/rejected": 14.208752632141113, "logps/chosen": -4.513901710510254, "logps/rejected": -4.60907506942749, "loss": 3.6478, "rewards/accuracies": 0.5, "rewards/chosen": -45.139015197753906, "rewards/margins": 0.9517345428466797, "rewards/rejected": -46.09075164794922, "step": 6608 }, { "epoch": 0.8999183006535948, "grad_norm": 38.130359590988824, "learning_rate": 2.416632509432417e-08, "logits/chosen": 13.791816711425781, "logits/rejected": 14.461559295654297, "logps/chosen": -4.482941627502441, "logps/rejected": -4.5637993812561035, "loss": 3.5918, "rewards/accuracies": 0.5, "rewards/chosen": -44.82941818237305, "rewards/margins": 0.8085784912109375, "rewards/rejected": -45.63799285888672, "step": 6609 }, { "epoch": 0.9000544662309368, "grad_norm": 46.047730046997536, "learning_rate": 2.4101279136441978e-08, "logits/chosen": 13.937002182006836, "logits/rejected": 14.743363380432129, "logps/chosen": -4.390401840209961, "logps/rejected": -4.670472145080566, "loss": 3.8523, "rewards/accuracies": 0.75, "rewards/chosen": -43.904014587402344, "rewards/margins": 2.8007030487060547, "rewards/rejected": -46.70471954345703, "step": 6610 }, { "epoch": 0.9001906318082789, "grad_norm": 37.15144116156306, "learning_rate": 2.4036318115982924e-08, "logits/chosen": 13.711630821228027, "logits/rejected": 15.19766616821289, "logps/chosen": -4.2988104820251465, "logps/rejected": -4.753552436828613, "loss": 4.1301, "rewards/accuracies": 1.0, "rewards/chosen": -42.98810577392578, "rewards/margins": 4.547416687011719, "rewards/rejected": -47.5355224609375, "step": 6611 }, { "epoch": 0.9003267973856209, "grad_norm": 40.19341012841084, "learning_rate": 2.3971442047625535e-08, "logits/chosen": 15.477299690246582, "logits/rejected": 14.530801773071289, "logps/chosen": -5.092901229858398, "logps/rejected": -4.864063262939453, "loss": 3.8033, "rewards/accuracies": 0.5, "rewards/chosen": -50.929012298583984, "rewards/margins": -2.2883806228637695, "rewards/rejected": -48.64063262939453, "step": 6612 }, { "epoch": 0.9004629629629629, "grad_norm": 40.132466106252124, "learning_rate": 2.390665094602915e-08, "logits/chosen": 14.661222457885742, "logits/rejected": 14.80990219116211, "logps/chosen": -4.786656379699707, "logps/rejected": -4.609419822692871, "loss": 3.6178, "rewards/accuracies": 0.5, "rewards/chosen": -47.8665657043457, "rewards/margins": -1.772369384765625, "rewards/rejected": -46.09419631958008, "step": 6613 }, { "epoch": 0.900599128540305, "grad_norm": 37.641180325960235, "learning_rate": 2.384194482583375e-08, "logits/chosen": 14.529047012329102, "logits/rejected": 15.083124160766602, "logps/chosen": -4.676849365234375, "logps/rejected": -5.051650047302246, "loss": 4.1351, "rewards/accuracies": 1.0, "rewards/chosen": -46.76849365234375, "rewards/margins": 3.7480030059814453, "rewards/rejected": -50.51649856567383, "step": 6614 }, { "epoch": 0.9007352941176471, "grad_norm": 58.530166204240146, "learning_rate": 2.37773237016603e-08, "logits/chosen": 13.899169921875, "logits/rejected": 14.63033676147461, "logps/chosen": -4.446822166442871, "logps/rejected": -4.8299336433410645, "loss": 4.8077, "rewards/accuracies": 0.75, "rewards/chosen": -44.468223571777344, "rewards/margins": 3.8311119079589844, "rewards/rejected": -48.29933547973633, "step": 6615 }, { "epoch": 0.900871459694989, "grad_norm": 38.69313950432411, "learning_rate": 2.371278758811046e-08, "logits/chosen": 13.704615592956543, "logits/rejected": 14.0199613571167, "logps/chosen": -4.424989700317383, "logps/rejected": -4.515681266784668, "loss": 4.1125, "rewards/accuracies": 0.75, "rewards/chosen": -44.24989700317383, "rewards/margins": 0.906916618347168, "rewards/rejected": -45.15681457519531, "step": 6616 }, { "epoch": 0.9010076252723311, "grad_norm": 39.75840742649546, "learning_rate": 2.3648336499766653e-08, "logits/chosen": 14.11011028289795, "logits/rejected": 14.566298484802246, "logps/chosen": -4.3344621658325195, "logps/rejected": -4.514346122741699, "loss": 4.0925, "rewards/accuracies": 1.0, "rewards/chosen": -43.34462356567383, "rewards/margins": 1.7988395690917969, "rewards/rejected": -45.143463134765625, "step": 6617 }, { "epoch": 0.9011437908496732, "grad_norm": 39.35023555144168, "learning_rate": 2.358397045119216e-08, "logits/chosen": 14.123804092407227, "logits/rejected": 14.625386238098145, "logps/chosen": -4.3057708740234375, "logps/rejected": -4.671093940734863, "loss": 3.4582, "rewards/accuracies": 1.0, "rewards/chosen": -43.05771255493164, "rewards/margins": 3.653226852416992, "rewards/rejected": -46.7109375, "step": 6618 }, { "epoch": 0.9012799564270153, "grad_norm": 38.916626703026026, "learning_rate": 2.3519689456931124e-08, "logits/chosen": 15.1119966506958, "logits/rejected": 15.335296630859375, "logps/chosen": -4.6750688552856445, "logps/rejected": -4.685800552368164, "loss": 3.8193, "rewards/accuracies": 0.5, "rewards/chosen": -46.75069046020508, "rewards/margins": 0.1073160171508789, "rewards/rejected": -46.85800552368164, "step": 6619 }, { "epoch": 0.9014161220043573, "grad_norm": 38.16440211684613, "learning_rate": 2.3455493531508197e-08, "logits/chosen": 13.027193069458008, "logits/rejected": 14.193947792053223, "logps/chosen": -4.370904445648193, "logps/rejected": -4.671872138977051, "loss": 3.9204, "rewards/accuracies": 0.75, "rewards/chosen": -43.70904541015625, "rewards/margins": 3.0096750259399414, "rewards/rejected": -46.718719482421875, "step": 6620 }, { "epoch": 0.9015522875816994, "grad_norm": 34.99789040127783, "learning_rate": 2.3391382689429018e-08, "logits/chosen": 13.563544273376465, "logits/rejected": 14.863956451416016, "logps/chosen": -4.189476013183594, "logps/rejected": -4.884333610534668, "loss": 3.7283, "rewards/accuracies": 1.0, "rewards/chosen": -41.89476013183594, "rewards/margins": 6.948575973510742, "rewards/rejected": -48.84333801269531, "step": 6621 }, { "epoch": 0.9016884531590414, "grad_norm": 38.05332872519425, "learning_rate": 2.3327356945180086e-08, "logits/chosen": 14.529226303100586, "logits/rejected": 14.90028190612793, "logps/chosen": -4.655368804931641, "logps/rejected": -4.761726379394531, "loss": 4.0561, "rewards/accuracies": 0.5, "rewards/chosen": -46.553688049316406, "rewards/margins": 1.0635747909545898, "rewards/rejected": -47.61726379394531, "step": 6622 }, { "epoch": 0.9018246187363834, "grad_norm": 41.86937815665801, "learning_rate": 2.326341631322841e-08, "logits/chosen": 13.697507858276367, "logits/rejected": 14.153308868408203, "logps/chosen": -4.042704105377197, "logps/rejected": -4.598556041717529, "loss": 3.8121, "rewards/accuracies": 1.0, "rewards/chosen": -40.427040100097656, "rewards/margins": 5.558521270751953, "rewards/rejected": -45.98556137084961, "step": 6623 }, { "epoch": 0.9019607843137255, "grad_norm": 48.54304614770112, "learning_rate": 2.3199560808021946e-08, "logits/chosen": 14.647729873657227, "logits/rejected": 14.953508377075195, "logps/chosen": -4.9192914962768555, "logps/rejected": -5.063654899597168, "loss": 3.6431, "rewards/accuracies": 0.75, "rewards/chosen": -49.19291687011719, "rewards/margins": 1.4436349868774414, "rewards/rejected": -50.63655090332031, "step": 6624 }, { "epoch": 0.9020969498910676, "grad_norm": 39.44479908773473, "learning_rate": 2.313579044398941e-08, "logits/chosen": 14.463624954223633, "logits/rejected": 14.764301300048828, "logps/chosen": -4.620830535888672, "logps/rejected": -4.810619354248047, "loss": 3.8047, "rewards/accuracies": 0.75, "rewards/chosen": -46.20830535888672, "rewards/margins": 1.8978872299194336, "rewards/rejected": -48.10619354248047, "step": 6625 }, { "epoch": 0.9022331154684096, "grad_norm": 38.38546306272979, "learning_rate": 2.3072105235540173e-08, "logits/chosen": 14.356908798217773, "logits/rejected": 14.076133728027344, "logps/chosen": -4.588897228240967, "logps/rejected": -4.440986633300781, "loss": 4.1228, "rewards/accuracies": 0.25, "rewards/chosen": -45.88896942138672, "rewards/margins": -1.4791011810302734, "rewards/rejected": -44.40986633300781, "step": 6626 }, { "epoch": 0.9023692810457516, "grad_norm": 46.384053896795045, "learning_rate": 2.3008505197064497e-08, "logits/chosen": 14.77009391784668, "logits/rejected": 14.535196304321289, "logps/chosen": -4.733462810516357, "logps/rejected": -4.702143669128418, "loss": 4.4049, "rewards/accuracies": 0.5, "rewards/chosen": -47.33462905883789, "rewards/margins": -0.3131875991821289, "rewards/rejected": -47.02143859863281, "step": 6627 }, { "epoch": 0.9025054466230937, "grad_norm": 37.824783374648526, "learning_rate": 2.294499034293338e-08, "logits/chosen": 14.192983627319336, "logits/rejected": 14.679815292358398, "logps/chosen": -4.538548469543457, "logps/rejected": -4.772167205810547, "loss": 3.8688, "rewards/accuracies": 1.0, "rewards/chosen": -45.38547897338867, "rewards/margins": 2.3361892700195312, "rewards/rejected": -47.7216682434082, "step": 6628 }, { "epoch": 0.9026416122004357, "grad_norm": 37.48478826214329, "learning_rate": 2.288156068749836e-08, "logits/chosen": 14.763971328735352, "logits/rejected": 15.047365188598633, "logps/chosen": -4.774094581604004, "logps/rejected": -4.810900688171387, "loss": 4.0377, "rewards/accuracies": 0.25, "rewards/chosen": -47.74094009399414, "rewards/margins": 0.36806678771972656, "rewards/rejected": -48.1090087890625, "step": 6629 }, { "epoch": 0.9027777777777778, "grad_norm": 57.379210447599746, "learning_rate": 2.281821624509206e-08, "logits/chosen": 13.406854629516602, "logits/rejected": 13.892509460449219, "logps/chosen": -4.291377067565918, "logps/rejected": -4.444830417633057, "loss": 4.0258, "rewards/accuracies": 0.5, "rewards/chosen": -42.91377258300781, "rewards/margins": 1.534529685974121, "rewards/rejected": -44.44830322265625, "step": 6630 }, { "epoch": 0.9029139433551199, "grad_norm": 45.39027408392939, "learning_rate": 2.275495703002761e-08, "logits/chosen": 14.781567573547363, "logits/rejected": 14.040687561035156, "logps/chosen": -4.6306304931640625, "logps/rejected": -4.745522499084473, "loss": 3.7758, "rewards/accuracies": 0.75, "rewards/chosen": -46.30630111694336, "rewards/margins": 1.14892578125, "rewards/rejected": -47.45522689819336, "step": 6631 }, { "epoch": 0.9030501089324618, "grad_norm": 44.77055788310233, "learning_rate": 2.2691783056598913e-08, "logits/chosen": 14.181381225585938, "logits/rejected": 13.446868896484375, "logps/chosen": -4.671738147735596, "logps/rejected": -4.243743896484375, "loss": 4.0906, "rewards/accuracies": 0.0, "rewards/chosen": -46.717384338378906, "rewards/margins": -4.279947280883789, "rewards/rejected": -42.437435150146484, "step": 6632 }, { "epoch": 0.9031862745098039, "grad_norm": 42.92433640691424, "learning_rate": 2.2628694339080724e-08, "logits/chosen": 13.871393203735352, "logits/rejected": 14.479503631591797, "logps/chosen": -4.3780837059021, "logps/rejected": -4.517055511474609, "loss": 4.3465, "rewards/accuracies": 0.75, "rewards/chosen": -43.78083801269531, "rewards/margins": 1.3897171020507812, "rewards/rejected": -45.170555114746094, "step": 6633 }, { "epoch": 0.903322440087146, "grad_norm": 37.34228998316925, "learning_rate": 2.2565690891728482e-08, "logits/chosen": 13.745033264160156, "logits/rejected": 14.027322769165039, "logps/chosen": -4.297122478485107, "logps/rejected": -4.571649551391602, "loss": 3.635, "rewards/accuracies": 1.0, "rewards/chosen": -42.971221923828125, "rewards/margins": 2.745272636413574, "rewards/rejected": -45.71649932861328, "step": 6634 }, { "epoch": 0.903458605664488, "grad_norm": 40.90070682562208, "learning_rate": 2.250277272877823e-08, "logits/chosen": 14.266908645629883, "logits/rejected": 14.133350372314453, "logps/chosen": -4.340119361877441, "logps/rejected": -4.63205099105835, "loss": 4.3969, "rewards/accuracies": 0.75, "rewards/chosen": -43.40119171142578, "rewards/margins": 2.919318199157715, "rewards/rejected": -46.32051086425781, "step": 6635 }, { "epoch": 0.9035947712418301, "grad_norm": 46.67734419743404, "learning_rate": 2.24399398644469e-08, "logits/chosen": 15.18874740600586, "logits/rejected": 15.28404712677002, "logps/chosen": -4.558150291442871, "logps/rejected": -4.543025016784668, "loss": 4.232, "rewards/accuracies": 0.5, "rewards/chosen": -45.581504821777344, "rewards/margins": -0.15125751495361328, "rewards/rejected": -45.43025207519531, "step": 6636 }, { "epoch": 0.9037309368191722, "grad_norm": 68.71120227430276, "learning_rate": 2.2377192312932157e-08, "logits/chosen": 13.881745338439941, "logits/rejected": 14.054034233093262, "logps/chosen": -4.556504249572754, "logps/rejected": -4.615355968475342, "loss": 4.5332, "rewards/accuracies": 0.5, "rewards/chosen": -45.56504440307617, "rewards/margins": 0.5885162353515625, "rewards/rejected": -46.153560638427734, "step": 6637 }, { "epoch": 0.9038671023965141, "grad_norm": 39.38800291025631, "learning_rate": 2.2314530088412175e-08, "logits/chosen": 14.238834381103516, "logits/rejected": 14.436233520507812, "logps/chosen": -4.655962944030762, "logps/rejected": -4.597442626953125, "loss": 3.8765, "rewards/accuracies": 0.5, "rewards/chosen": -46.55963134765625, "rewards/margins": -0.5852031707763672, "rewards/rejected": -45.97442626953125, "step": 6638 }, { "epoch": 0.9040032679738562, "grad_norm": 38.5625797188029, "learning_rate": 2.225195320504616e-08, "logits/chosen": 13.017328262329102, "logits/rejected": 14.885181427001953, "logps/chosen": -4.183040618896484, "logps/rejected": -5.039066314697266, "loss": 3.9292, "rewards/accuracies": 1.0, "rewards/chosen": -41.830406188964844, "rewards/margins": 8.560256004333496, "rewards/rejected": -50.390663146972656, "step": 6639 }, { "epoch": 0.9041394335511983, "grad_norm": 41.07865706496696, "learning_rate": 2.218946167697382e-08, "logits/chosen": 14.092317581176758, "logits/rejected": 15.463831901550293, "logps/chosen": -4.424388885498047, "logps/rejected": -5.232882022857666, "loss": 4.0664, "rewards/accuracies": 1.0, "rewards/chosen": -44.24388885498047, "rewards/margins": 8.084930419921875, "rewards/rejected": -52.328819274902344, "step": 6640 }, { "epoch": 0.9042755991285403, "grad_norm": 44.18721078931139, "learning_rate": 2.212705551831564e-08, "logits/chosen": 14.627618789672852, "logits/rejected": 14.743572235107422, "logps/chosen": -4.418446063995361, "logps/rejected": -4.6451215744018555, "loss": 3.4949, "rewards/accuracies": 0.5, "rewards/chosen": -44.1844596862793, "rewards/margins": 2.266758918762207, "rewards/rejected": -46.45121765136719, "step": 6641 }, { "epoch": 0.9044117647058824, "grad_norm": 42.45932715499006, "learning_rate": 2.2064734743172742e-08, "logits/chosen": 14.991907119750977, "logits/rejected": 15.133252143859863, "logps/chosen": -4.860195636749268, "logps/rejected": -4.902884006500244, "loss": 3.65, "rewards/accuracies": 0.5, "rewards/chosen": -48.60195541381836, "rewards/margins": 0.42688560485839844, "rewards/rejected": -49.028839111328125, "step": 6642 }, { "epoch": 0.9045479302832244, "grad_norm": 41.684274274821114, "learning_rate": 2.2002499365627146e-08, "logits/chosen": 14.732406616210938, "logits/rejected": 14.889894485473633, "logps/chosen": -4.447436809539795, "logps/rejected": -5.024718284606934, "loss": 4.2373, "rewards/accuracies": 1.0, "rewards/chosen": -44.474365234375, "rewards/margins": 5.772817611694336, "rewards/rejected": -50.2471809387207, "step": 6643 }, { "epoch": 0.9046840958605664, "grad_norm": 36.43241857823037, "learning_rate": 2.1940349399741296e-08, "logits/chosen": 14.11459732055664, "logits/rejected": 14.534244537353516, "logps/chosen": -4.441567420959473, "logps/rejected": -4.700217247009277, "loss": 3.6103, "rewards/accuracies": 0.75, "rewards/chosen": -44.415672302246094, "rewards/margins": 2.586503028869629, "rewards/rejected": -47.002174377441406, "step": 6644 }, { "epoch": 0.9048202614379085, "grad_norm": 37.592970653270754, "learning_rate": 2.1878284859558583e-08, "logits/chosen": 14.186437606811523, "logits/rejected": 14.871918678283691, "logps/chosen": -4.692002773284912, "logps/rejected": -4.851078510284424, "loss": 3.6836, "rewards/accuracies": 0.75, "rewards/chosen": -46.92002868652344, "rewards/margins": 1.5907583236694336, "rewards/rejected": -48.51078796386719, "step": 6645 }, { "epoch": 0.9049564270152506, "grad_norm": 37.3533475878109, "learning_rate": 2.1816305759103072e-08, "logits/chosen": 14.716390609741211, "logits/rejected": 14.93163013458252, "logps/chosen": -4.5667195320129395, "logps/rejected": -4.838412284851074, "loss": 3.7057, "rewards/accuracies": 0.75, "rewards/chosen": -45.66719055175781, "rewards/margins": 2.7169275283813477, "rewards/rejected": -48.384117126464844, "step": 6646 }, { "epoch": 0.9050925925925926, "grad_norm": 41.68127154851244, "learning_rate": 2.1754412112379295e-08, "logits/chosen": 14.55760383605957, "logits/rejected": 14.81739330291748, "logps/chosen": -4.8060221672058105, "logps/rejected": -4.96355676651001, "loss": 4.1893, "rewards/accuracies": 0.5, "rewards/chosen": -48.06022262573242, "rewards/margins": 1.575347900390625, "rewards/rejected": -49.63557052612305, "step": 6647 }, { "epoch": 0.9052287581699346, "grad_norm": 37.642185456908024, "learning_rate": 2.1692603933372733e-08, "logits/chosen": 14.07756233215332, "logits/rejected": 14.458003997802734, "logps/chosen": -4.509278297424316, "logps/rejected": -4.838146686553955, "loss": 3.6909, "rewards/accuracies": 0.75, "rewards/chosen": -45.0927848815918, "rewards/margins": 3.2886829376220703, "rewards/rejected": -48.3814697265625, "step": 6648 }, { "epoch": 0.9053649237472767, "grad_norm": 38.267364490049246, "learning_rate": 2.163088123604946e-08, "logits/chosen": 14.194071769714355, "logits/rejected": 14.840591430664062, "logps/chosen": -4.448932647705078, "logps/rejected": -4.969870567321777, "loss": 3.546, "rewards/accuracies": 0.75, "rewards/chosen": -44.48932647705078, "rewards/margins": 5.209381103515625, "rewards/rejected": -49.698707580566406, "step": 6649 }, { "epoch": 0.9055010893246187, "grad_norm": 43.97034583394479, "learning_rate": 2.1569244034356184e-08, "logits/chosen": 15.057648658752441, "logits/rejected": 14.932363510131836, "logps/chosen": -4.705284118652344, "logps/rejected": -4.7312493324279785, "loss": 3.4182, "rewards/accuracies": 0.75, "rewards/chosen": -47.05284881591797, "rewards/margins": 0.25965023040771484, "rewards/rejected": -47.312496185302734, "step": 6650 }, { "epoch": 0.9056372549019608, "grad_norm": 43.262534875196174, "learning_rate": 2.150769234222034e-08, "logits/chosen": 13.517393112182617, "logits/rejected": 13.27151870727539, "logps/chosen": -4.431731224060059, "logps/rejected": -4.50083065032959, "loss": 3.9763, "rewards/accuracies": 0.5, "rewards/chosen": -44.31731414794922, "rewards/margins": 0.6909952163696289, "rewards/rejected": -45.00830841064453, "step": 6651 }, { "epoch": 0.9057734204793029, "grad_norm": 36.79639511691274, "learning_rate": 2.1446226173550097e-08, "logits/chosen": 14.43223762512207, "logits/rejected": 15.120199203491211, "logps/chosen": -4.621215343475342, "logps/rejected": -4.728836536407471, "loss": 3.9291, "rewards/accuracies": 0.5, "rewards/chosen": -46.21215057373047, "rewards/margins": 1.0762138366699219, "rewards/rejected": -47.288368225097656, "step": 6652 }, { "epoch": 0.9059095860566448, "grad_norm": 41.361272914782475, "learning_rate": 2.1384845542234166e-08, "logits/chosen": 13.649544715881348, "logits/rejected": 15.207844734191895, "logps/chosen": -4.383703708648682, "logps/rejected": -4.810991287231445, "loss": 3.4554, "rewards/accuracies": 1.0, "rewards/chosen": -43.8370361328125, "rewards/margins": 4.272875785827637, "rewards/rejected": -48.10990905761719, "step": 6653 }, { "epoch": 0.9060457516339869, "grad_norm": 41.42015675194983, "learning_rate": 2.132355046214207e-08, "logits/chosen": 13.984773635864258, "logits/rejected": 14.382329940795898, "logps/chosen": -4.517103672027588, "logps/rejected": -4.900843143463135, "loss": 3.8364, "rewards/accuracies": 1.0, "rewards/chosen": -45.17103576660156, "rewards/margins": 3.8373937606811523, "rewards/rejected": -49.00843048095703, "step": 6654 }, { "epoch": 0.906181917211329, "grad_norm": 37.23396669940587, "learning_rate": 2.1262340947123937e-08, "logits/chosen": 14.156228065490723, "logits/rejected": 14.386462211608887, "logps/chosen": -4.566632270812988, "logps/rejected": -4.601788520812988, "loss": 4.0399, "rewards/accuracies": 0.5, "rewards/chosen": -45.66632843017578, "rewards/margins": 0.3515586853027344, "rewards/rejected": -46.01788330078125, "step": 6655 }, { "epoch": 0.906318082788671, "grad_norm": 40.763753035331604, "learning_rate": 2.120121701101052e-08, "logits/chosen": 14.011035919189453, "logits/rejected": 14.548139572143555, "logps/chosen": -4.591405868530273, "logps/rejected": -4.961519241333008, "loss": 3.8517, "rewards/accuracies": 0.75, "rewards/chosen": -45.914058685302734, "rewards/margins": 3.7011356353759766, "rewards/rejected": -49.61519241333008, "step": 6656 }, { "epoch": 0.9064542483660131, "grad_norm": 37.3830874596933, "learning_rate": 2.1140178667613264e-08, "logits/chosen": 14.262838363647461, "logits/rejected": 14.477052688598633, "logps/chosen": -4.415815830230713, "logps/rejected": -4.6376190185546875, "loss": 4.1824, "rewards/accuracies": 0.75, "rewards/chosen": -44.15815734863281, "rewards/margins": 2.2180347442626953, "rewards/rejected": -46.376190185546875, "step": 6657 }, { "epoch": 0.9065904139433552, "grad_norm": 38.396349495926444, "learning_rate": 2.107922593072442e-08, "logits/chosen": 15.113945007324219, "logits/rejected": 13.760435104370117, "logps/chosen": -4.789968490600586, "logps/rejected": -4.647696495056152, "loss": 4.3477, "rewards/accuracies": 0.5, "rewards/chosen": -47.899681091308594, "rewards/margins": -1.4227190017700195, "rewards/rejected": -46.476966857910156, "step": 6658 }, { "epoch": 0.9067265795206971, "grad_norm": 40.31183914382717, "learning_rate": 2.1018358814116577e-08, "logits/chosen": 14.1962890625, "logits/rejected": 14.451309204101562, "logps/chosen": -4.151786804199219, "logps/rejected": -4.589145660400391, "loss": 3.5305, "rewards/accuracies": 0.75, "rewards/chosen": -41.51786422729492, "rewards/margins": 4.373595237731934, "rewards/rejected": -45.891456604003906, "step": 6659 }, { "epoch": 0.9068627450980392, "grad_norm": 39.984200233232514, "learning_rate": 2.095757733154331e-08, "logits/chosen": 14.585416793823242, "logits/rejected": 15.252324104309082, "logps/chosen": -4.887247562408447, "logps/rejected": -4.830795764923096, "loss": 3.419, "rewards/accuracies": 0.5, "rewards/chosen": -48.872474670410156, "rewards/margins": -0.5645198822021484, "rewards/rejected": -48.30795669555664, "step": 6660 }, { "epoch": 0.9069989106753813, "grad_norm": 37.154547051860995, "learning_rate": 2.089688149673865e-08, "logits/chosen": 14.781414031982422, "logits/rejected": 14.956722259521484, "logps/chosen": -4.736313343048096, "logps/rejected": -4.894802570343018, "loss": 3.9228, "rewards/accuracies": 0.5, "rewards/chosen": -47.36313247680664, "rewards/margins": 1.5848922729492188, "rewards/rejected": -48.94802474975586, "step": 6661 }, { "epoch": 0.9071350762527233, "grad_norm": 49.83453563073818, "learning_rate": 2.0836271323417276e-08, "logits/chosen": 14.377619743347168, "logits/rejected": 15.131280899047852, "logps/chosen": -4.741781711578369, "logps/rejected": -4.807672023773193, "loss": 4.0782, "rewards/accuracies": 0.5, "rewards/chosen": -47.417816162109375, "rewards/margins": 0.6589031219482422, "rewards/rejected": -48.07672119140625, "step": 6662 }, { "epoch": 0.9072712418300654, "grad_norm": 41.315473141983425, "learning_rate": 2.077574682527459e-08, "logits/chosen": 14.295820236206055, "logits/rejected": 14.834556579589844, "logps/chosen": -4.660384178161621, "logps/rejected": -5.096901893615723, "loss": 3.3249, "rewards/accuracies": 0.75, "rewards/chosen": -46.603843688964844, "rewards/margins": 4.365174293518066, "rewards/rejected": -50.969017028808594, "step": 6663 }, { "epoch": 0.9074074074074074, "grad_norm": 39.96801184457623, "learning_rate": 2.0715308015986664e-08, "logits/chosen": 14.093171119689941, "logits/rejected": 14.247398376464844, "logps/chosen": -4.452460289001465, "logps/rejected": -4.758256912231445, "loss": 3.9072, "rewards/accuracies": 0.75, "rewards/chosen": -44.52460479736328, "rewards/margins": 3.057967185974121, "rewards/rejected": -47.58256912231445, "step": 6664 }, { "epoch": 0.9075435729847494, "grad_norm": 43.97576858336336, "learning_rate": 2.065495490921001e-08, "logits/chosen": 14.484225273132324, "logits/rejected": 15.153966903686523, "logps/chosen": -4.539984703063965, "logps/rejected": -4.8373122215271, "loss": 4.3559, "rewards/accuracies": 0.5, "rewards/chosen": -45.399845123291016, "rewards/margins": 2.973276138305664, "rewards/rejected": -48.37312316894531, "step": 6665 }, { "epoch": 0.9076797385620915, "grad_norm": 39.422160224843196, "learning_rate": 2.059468751858202e-08, "logits/chosen": 15.012372970581055, "logits/rejected": 15.345521926879883, "logps/chosen": -4.76569128036499, "logps/rejected": -4.9389543533325195, "loss": 3.9379, "rewards/accuracies": 0.75, "rewards/chosen": -47.65691375732422, "rewards/margins": 1.7326288223266602, "rewards/rejected": -49.38954162597656, "step": 6666 }, { "epoch": 0.9078159041394336, "grad_norm": 38.1562037230859, "learning_rate": 2.0534505857720653e-08, "logits/chosen": 14.688116073608398, "logits/rejected": 15.602924346923828, "logps/chosen": -4.4442620277404785, "logps/rejected": -4.834550857543945, "loss": 3.6401, "rewards/accuracies": 0.75, "rewards/chosen": -44.44261932373047, "rewards/margins": 3.9028892517089844, "rewards/rejected": -48.34550857543945, "step": 6667 }, { "epoch": 0.9079520697167756, "grad_norm": 41.714813237894965, "learning_rate": 2.0474409940224313e-08, "logits/chosen": 14.242427825927734, "logits/rejected": 14.381162643432617, "logps/chosen": -4.694311141967773, "logps/rejected": -4.906648635864258, "loss": 4.3307, "rewards/accuracies": 0.75, "rewards/chosen": -46.943115234375, "rewards/margins": 2.12337589263916, "rewards/rejected": -49.066490173339844, "step": 6668 }, { "epoch": 0.9080882352941176, "grad_norm": 39.12312277589371, "learning_rate": 2.041439977967223e-08, "logits/chosen": 14.207738876342773, "logits/rejected": 14.878351211547852, "logps/chosen": -4.531444549560547, "logps/rejected": -4.891800880432129, "loss": 3.7219, "rewards/accuracies": 0.75, "rewards/chosen": -45.314449310302734, "rewards/margins": 3.603555679321289, "rewards/rejected": -48.918006896972656, "step": 6669 }, { "epoch": 0.9082244008714597, "grad_norm": 37.588486415351944, "learning_rate": 2.0354475389624224e-08, "logits/chosen": 14.83617115020752, "logits/rejected": 14.432539939880371, "logps/chosen": -4.702350616455078, "logps/rejected": -4.917006969451904, "loss": 3.8366, "rewards/accuracies": 0.5, "rewards/chosen": -47.023502349853516, "rewards/margins": 2.146566390991211, "rewards/rejected": -49.170066833496094, "step": 6670 }, { "epoch": 0.9083605664488017, "grad_norm": 40.517751160339, "learning_rate": 2.0294636783620667e-08, "logits/chosen": 14.391616821289062, "logits/rejected": 13.901153564453125, "logps/chosen": -4.576769828796387, "logps/rejected": -4.482532978057861, "loss": 4.3211, "rewards/accuracies": 0.5, "rewards/chosen": -45.76769256591797, "rewards/margins": -0.9423637390136719, "rewards/rejected": -44.8253288269043, "step": 6671 }, { "epoch": 0.9084967320261438, "grad_norm": 35.799690450876675, "learning_rate": 2.0234883975182605e-08, "logits/chosen": 14.775218963623047, "logits/rejected": 14.783720016479492, "logps/chosen": -4.666656017303467, "logps/rejected": -4.7618794441223145, "loss": 3.9803, "rewards/accuracies": 0.5, "rewards/chosen": -46.666561126708984, "rewards/margins": 0.9522342681884766, "rewards/rejected": -47.61879348754883, "step": 6672 }, { "epoch": 0.9086328976034859, "grad_norm": 41.759637442762056, "learning_rate": 2.017521697781177e-08, "logits/chosen": 14.204900741577148, "logits/rejected": 13.923337936401367, "logps/chosen": -4.912150859832764, "logps/rejected": -4.645108699798584, "loss": 4.4017, "rewards/accuracies": 0.25, "rewards/chosen": -49.12151336669922, "rewards/margins": -2.6704254150390625, "rewards/rejected": -46.451087951660156, "step": 6673 }, { "epoch": 0.9087690631808278, "grad_norm": 39.398956556391134, "learning_rate": 2.0115635804990228e-08, "logits/chosen": 14.771647453308105, "logits/rejected": 14.875704765319824, "logps/chosen": -4.426039695739746, "logps/rejected": -4.667804718017578, "loss": 4.1211, "rewards/accuracies": 0.75, "rewards/chosen": -44.260398864746094, "rewards/margins": 2.4176454544067383, "rewards/rejected": -46.67804718017578, "step": 6674 }, { "epoch": 0.9089052287581699, "grad_norm": 44.02129397510323, "learning_rate": 2.0056140470180937e-08, "logits/chosen": 14.601219177246094, "logits/rejected": 14.463225364685059, "logps/chosen": -4.762165546417236, "logps/rejected": -4.8690185546875, "loss": 4.1705, "rewards/accuracies": 0.5, "rewards/chosen": -47.62165832519531, "rewards/margins": 1.0685310363769531, "rewards/rejected": -48.690185546875, "step": 6675 }, { "epoch": 0.909041394335512, "grad_norm": 39.19279524447519, "learning_rate": 1.999673098682737e-08, "logits/chosen": 14.160489082336426, "logits/rejected": 14.97188949584961, "logps/chosen": -4.616064071655273, "logps/rejected": -4.893636703491211, "loss": 4.1016, "rewards/accuracies": 0.75, "rewards/chosen": -46.160640716552734, "rewards/margins": 2.775726318359375, "rewards/rejected": -48.93636703491211, "step": 6676 }, { "epoch": 0.909177559912854, "grad_norm": 33.336129421292185, "learning_rate": 1.9937407368353588e-08, "logits/chosen": 13.8707275390625, "logits/rejected": 14.536684036254883, "logps/chosen": -4.085079193115234, "logps/rejected": -4.565314292907715, "loss": 3.4622, "rewards/accuracies": 0.75, "rewards/chosen": -40.850791931152344, "rewards/margins": 4.8023481369018555, "rewards/rejected": -45.653141021728516, "step": 6677 }, { "epoch": 0.9093137254901961, "grad_norm": 40.97097453863987, "learning_rate": 1.98781696281642e-08, "logits/chosen": 14.699369430541992, "logits/rejected": 14.877538681030273, "logps/chosen": -4.396635055541992, "logps/rejected": -4.453059673309326, "loss": 4.2728, "rewards/accuracies": 0.75, "rewards/chosen": -43.966346740722656, "rewards/margins": 0.5642499923706055, "rewards/rejected": -44.53059768676758, "step": 6678 }, { "epoch": 0.9094498910675382, "grad_norm": 42.078911381188824, "learning_rate": 1.9819017779644544e-08, "logits/chosen": 13.775981903076172, "logits/rejected": 15.120414733886719, "logps/chosen": -4.09719181060791, "logps/rejected": -4.650661945343018, "loss": 4.1612, "rewards/accuracies": 1.0, "rewards/chosen": -40.971920013427734, "rewards/margins": 5.534700393676758, "rewards/rejected": -46.506622314453125, "step": 6679 }, { "epoch": 0.9095860566448801, "grad_norm": 36.765579391287275, "learning_rate": 1.9759951836160416e-08, "logits/chosen": 14.330493927001953, "logits/rejected": 15.134754180908203, "logps/chosen": -4.4500346183776855, "logps/rejected": -4.839227676391602, "loss": 3.773, "rewards/accuracies": 0.75, "rewards/chosen": -44.500343322753906, "rewards/margins": 3.891935348510742, "rewards/rejected": -48.39228057861328, "step": 6680 }, { "epoch": 0.9097222222222222, "grad_norm": 38.38878026309617, "learning_rate": 1.9700971811058253e-08, "logits/chosen": 13.911680221557617, "logits/rejected": 14.751995086669922, "logps/chosen": -4.247736930847168, "logps/rejected": -4.616316795349121, "loss": 3.6164, "rewards/accuracies": 0.75, "rewards/chosen": -42.47737121582031, "rewards/margins": 3.6857986450195312, "rewards/rejected": -46.163169860839844, "step": 6681 }, { "epoch": 0.9098583877995643, "grad_norm": 39.7883127834102, "learning_rate": 1.9642077717665128e-08, "logits/chosen": 14.528213500976562, "logits/rejected": 14.039298057556152, "logps/chosen": -4.938255310058594, "logps/rejected": -4.931159019470215, "loss": 4.5302, "rewards/accuracies": 0.5, "rewards/chosen": -49.38255310058594, "rewards/margins": -0.07096290588378906, "rewards/rejected": -49.31159210205078, "step": 6682 }, { "epoch": 0.9099945533769063, "grad_norm": 39.30940408347247, "learning_rate": 1.9583269569288575e-08, "logits/chosen": 13.867523193359375, "logits/rejected": 15.795760154724121, "logps/chosen": -4.22768497467041, "logps/rejected": -4.821250915527344, "loss": 4.085, "rewards/accuracies": 1.0, "rewards/chosen": -42.276851654052734, "rewards/margins": 5.9356536865234375, "rewards/rejected": -48.21250534057617, "step": 6683 }, { "epoch": 0.9101307189542484, "grad_norm": 46.66531121870764, "learning_rate": 1.9524547379216848e-08, "logits/chosen": 14.455528259277344, "logits/rejected": 13.939752578735352, "logps/chosen": -4.643751621246338, "logps/rejected": -4.429627418518066, "loss": 4.0137, "rewards/accuracies": 0.25, "rewards/chosen": -46.43751525878906, "rewards/margins": -2.1412429809570312, "rewards/rejected": -44.29627227783203, "step": 6684 }, { "epoch": 0.9102668845315904, "grad_norm": 45.05830833174043, "learning_rate": 1.9465911160718674e-08, "logits/chosen": 13.77796745300293, "logits/rejected": 14.56385326385498, "logps/chosen": -4.223598003387451, "logps/rejected": -4.615045547485352, "loss": 3.5597, "rewards/accuracies": 1.0, "rewards/chosen": -42.23598098754883, "rewards/margins": 3.9144744873046875, "rewards/rejected": -46.150455474853516, "step": 6685 }, { "epoch": 0.9104030501089324, "grad_norm": 36.576701600409386, "learning_rate": 1.9407360927043403e-08, "logits/chosen": 14.718990325927734, "logits/rejected": 14.224830627441406, "logps/chosen": -4.7580461502075195, "logps/rejected": -4.871520042419434, "loss": 3.6603, "rewards/accuracies": 0.5, "rewards/chosen": -47.58046340942383, "rewards/margins": 1.134735107421875, "rewards/rejected": -48.71519470214844, "step": 6686 }, { "epoch": 0.9105392156862745, "grad_norm": 38.22200397504469, "learning_rate": 1.93488966914209e-08, "logits/chosen": 14.50428581237793, "logits/rejected": 14.672194480895996, "logps/chosen": -4.639297008514404, "logps/rejected": -4.732892990112305, "loss": 3.4436, "rewards/accuracies": 0.75, "rewards/chosen": -46.39297103881836, "rewards/margins": 0.9359607696533203, "rewards/rejected": -47.32893371582031, "step": 6687 }, { "epoch": 0.9106753812636166, "grad_norm": 37.196254614815665, "learning_rate": 1.9290518467061712e-08, "logits/chosen": 15.305910110473633, "logits/rejected": 15.351409912109375, "logps/chosen": -5.041974067687988, "logps/rejected": -5.105476379394531, "loss": 3.8381, "rewards/accuracies": 0.5, "rewards/chosen": -50.419742584228516, "rewards/margins": 0.6350221633911133, "rewards/rejected": -51.05476379394531, "step": 6688 }, { "epoch": 0.9108115468409586, "grad_norm": 43.1698991565658, "learning_rate": 1.923222626715688e-08, "logits/chosen": 14.747334480285645, "logits/rejected": 14.751131057739258, "logps/chosen": -4.983667373657227, "logps/rejected": -4.751679420471191, "loss": 4.2451, "rewards/accuracies": 0.0, "rewards/chosen": -49.836669921875, "rewards/margins": -2.3198795318603516, "rewards/rejected": -47.51679229736328, "step": 6689 }, { "epoch": 0.9109477124183006, "grad_norm": 37.4013037747374, "learning_rate": 1.9174020104877965e-08, "logits/chosen": 14.970132827758789, "logits/rejected": 13.652444839477539, "logps/chosen": -4.958744049072266, "logps/rejected": -4.682132720947266, "loss": 4.0648, "rewards/accuracies": 0.0, "rewards/chosen": -49.58743667602539, "rewards/margins": -2.7661094665527344, "rewards/rejected": -46.821327209472656, "step": 6690 }, { "epoch": 0.9110838779956427, "grad_norm": 37.69248473411495, "learning_rate": 1.9115899993377104e-08, "logits/chosen": 13.848987579345703, "logits/rejected": 14.693204879760742, "logps/chosen": -4.798829555511475, "logps/rejected": -4.937373638153076, "loss": 3.8824, "rewards/accuracies": 0.75, "rewards/chosen": -47.98829650878906, "rewards/margins": 1.385441780090332, "rewards/rejected": -49.37373733520508, "step": 6691 }, { "epoch": 0.9112200435729847, "grad_norm": 39.937040626257726, "learning_rate": 1.9057865945787133e-08, "logits/chosen": 14.357950210571289, "logits/rejected": 14.943767547607422, "logps/chosen": -4.384428977966309, "logps/rejected": -4.6266679763793945, "loss": 4.0517, "rewards/accuracies": 1.0, "rewards/chosen": -43.84429168701172, "rewards/margins": 2.42238712310791, "rewards/rejected": -46.26667785644531, "step": 6692 }, { "epoch": 0.9113562091503268, "grad_norm": 37.500503721471404, "learning_rate": 1.89999179752212e-08, "logits/chosen": 15.132416725158691, "logits/rejected": 15.71971607208252, "logps/chosen": -4.885353088378906, "logps/rejected": -5.05595588684082, "loss": 4.1505, "rewards/accuracies": 0.75, "rewards/chosen": -48.85353088378906, "rewards/margins": 1.7060298919677734, "rewards/rejected": -50.55956268310547, "step": 6693 }, { "epoch": 0.9114923747276689, "grad_norm": 40.04239802356707, "learning_rate": 1.8942056094773196e-08, "logits/chosen": 14.907028198242188, "logits/rejected": 14.517351150512695, "logps/chosen": -4.576353073120117, "logps/rejected": -4.644454002380371, "loss": 4.1834, "rewards/accuracies": 0.5, "rewards/chosen": -45.763526916503906, "rewards/margins": 0.6810121536254883, "rewards/rejected": -46.444541931152344, "step": 6694 }, { "epoch": 0.911628540305011, "grad_norm": 39.32224512203184, "learning_rate": 1.8884280317517453e-08, "logits/chosen": 14.937986373901367, "logits/rejected": 15.341436386108398, "logps/chosen": -4.611069679260254, "logps/rejected": -5.017859935760498, "loss": 4.132, "rewards/accuracies": 0.5, "rewards/chosen": -46.11070251464844, "rewards/margins": 4.067900657653809, "rewards/rejected": -50.1786003112793, "step": 6695 }, { "epoch": 0.9117647058823529, "grad_norm": 76.30584702660228, "learning_rate": 1.8826590656508955e-08, "logits/chosen": 14.34791374206543, "logits/rejected": 14.576253890991211, "logps/chosen": -4.672440052032471, "logps/rejected": -4.802054405212402, "loss": 4.2517, "rewards/accuracies": 0.75, "rewards/chosen": -46.724403381347656, "rewards/margins": 1.296142578125, "rewards/rejected": -48.02054214477539, "step": 6696 }, { "epoch": 0.911900871459695, "grad_norm": 43.15600777523248, "learning_rate": 1.8768987124783054e-08, "logits/chosen": 14.59774112701416, "logits/rejected": 14.957012176513672, "logps/chosen": -4.571650505065918, "logps/rejected": -4.674845218658447, "loss": 3.4893, "rewards/accuracies": 0.5, "rewards/chosen": -45.71650695800781, "rewards/margins": 1.0319452285766602, "rewards/rejected": -46.748451232910156, "step": 6697 }, { "epoch": 0.9120370370370371, "grad_norm": 49.22968188814442, "learning_rate": 1.8711469735355824e-08, "logits/chosen": 15.299612045288086, "logits/rejected": 15.676983833312988, "logps/chosen": -4.944726943969727, "logps/rejected": -4.71635627746582, "loss": 4.3921, "rewards/accuracies": 0.25, "rewards/chosen": -49.44727325439453, "rewards/margins": -2.2837095260620117, "rewards/rejected": -47.16355895996094, "step": 6698 }, { "epoch": 0.9121732026143791, "grad_norm": 40.5892498419451, "learning_rate": 1.86540385012238e-08, "logits/chosen": 14.821139335632324, "logits/rejected": 15.092493057250977, "logps/chosen": -4.805511951446533, "logps/rejected": -5.0139594078063965, "loss": 4.3362, "rewards/accuracies": 0.75, "rewards/chosen": -48.055118560791016, "rewards/margins": 2.084475517272949, "rewards/rejected": -50.13959503173828, "step": 6699 }, { "epoch": 0.9123093681917211, "grad_norm": 44.25515326612357, "learning_rate": 1.8596693435363985e-08, "logits/chosen": 14.168462753295898, "logits/rejected": 13.781710624694824, "logps/chosen": -4.440225601196289, "logps/rejected": -4.513990879058838, "loss": 3.8635, "rewards/accuracies": 0.75, "rewards/chosen": -44.402259826660156, "rewards/margins": 0.7376508712768555, "rewards/rejected": -45.13990783691406, "step": 6700 }, { "epoch": 0.9124455337690632, "grad_norm": 34.760925989313975, "learning_rate": 1.8539434550733967e-08, "logits/chosen": 13.002549171447754, "logits/rejected": 14.43298053741455, "logps/chosen": -4.5478620529174805, "logps/rejected": -4.803436279296875, "loss": 3.6331, "rewards/accuracies": 0.5, "rewards/chosen": -45.47862243652344, "rewards/margins": 2.5557432174682617, "rewards/rejected": -48.03436279296875, "step": 6701 }, { "epoch": 0.9125816993464052, "grad_norm": 37.85370965179547, "learning_rate": 1.848226186027193e-08, "logits/chosen": 13.370316505432129, "logits/rejected": 14.223819732666016, "logps/chosen": -4.304421901702881, "logps/rejected": -4.753778457641602, "loss": 3.5925, "rewards/accuracies": 1.0, "rewards/chosen": -43.044219970703125, "rewards/margins": 4.493570327758789, "rewards/rejected": -47.53778839111328, "step": 6702 }, { "epoch": 0.9127178649237473, "grad_norm": 39.55239116363001, "learning_rate": 1.842517537689652e-08, "logits/chosen": 14.260662078857422, "logits/rejected": 14.88325023651123, "logps/chosen": -4.342813491821289, "logps/rejected": -4.529106140136719, "loss": 3.7599, "rewards/accuracies": 0.75, "rewards/chosen": -43.428131103515625, "rewards/margins": 1.862929344177246, "rewards/rejected": -45.29106140136719, "step": 6703 }, { "epoch": 0.9128540305010894, "grad_norm": 39.61954707318018, "learning_rate": 1.8368175113506834e-08, "logits/chosen": 14.563444137573242, "logits/rejected": 13.787195205688477, "logps/chosen": -4.573540687561035, "logps/rejected": -4.6905198097229, "loss": 3.8272, "rewards/accuracies": 0.75, "rewards/chosen": -45.73540496826172, "rewards/margins": 1.1697912216186523, "rewards/rejected": -46.90519714355469, "step": 6704 }, { "epoch": 0.9129901960784313, "grad_norm": 40.388548813803524, "learning_rate": 1.8311261082982532e-08, "logits/chosen": 14.157377243041992, "logits/rejected": 14.281720161437988, "logps/chosen": -4.415115833282471, "logps/rejected": -4.659545421600342, "loss": 3.6298, "rewards/accuracies": 0.5, "rewards/chosen": -44.151153564453125, "rewards/margins": 2.4442968368530273, "rewards/rejected": -46.59545135498047, "step": 6705 }, { "epoch": 0.9131263616557734, "grad_norm": 40.04331385424006, "learning_rate": 1.8254433298183948e-08, "logits/chosen": 14.084779739379883, "logits/rejected": 14.449684143066406, "logps/chosen": -4.425994873046875, "logps/rejected": -4.389808654785156, "loss": 3.6568, "rewards/accuracies": 0.5, "rewards/chosen": -44.25994873046875, "rewards/margins": -0.3618631362915039, "rewards/rejected": -43.89808654785156, "step": 6706 }, { "epoch": 0.9132625272331155, "grad_norm": 36.732859178637604, "learning_rate": 1.8197691771951652e-08, "logits/chosen": 13.61851692199707, "logits/rejected": 14.418556213378906, "logps/chosen": -4.57712459564209, "logps/rejected": -4.653953552246094, "loss": 3.7963, "rewards/accuracies": 0.5, "rewards/chosen": -45.77124786376953, "rewards/margins": 0.7682905197143555, "rewards/rejected": -46.53953552246094, "step": 6707 }, { "epoch": 0.9133986928104575, "grad_norm": 38.7149176237281, "learning_rate": 1.81410365171069e-08, "logits/chosen": 15.458446502685547, "logits/rejected": 15.326566696166992, "logps/chosen": -5.202343463897705, "logps/rejected": -5.324758529663086, "loss": 3.6918, "rewards/accuracies": 0.75, "rewards/chosen": -52.023433685302734, "rewards/margins": 1.2241506576538086, "rewards/rejected": -53.247581481933594, "step": 6708 }, { "epoch": 0.9135348583877996, "grad_norm": 39.94859984453748, "learning_rate": 1.8084467546451452e-08, "logits/chosen": 13.704292297363281, "logits/rejected": 14.48659896850586, "logps/chosen": -4.340338706970215, "logps/rejected": -4.7651166915893555, "loss": 4.2553, "rewards/accuracies": 0.75, "rewards/chosen": -43.40338134765625, "rewards/margins": 4.247781753540039, "rewards/rejected": -47.65116500854492, "step": 6709 }, { "epoch": 0.9136710239651417, "grad_norm": 38.58843276925828, "learning_rate": 1.8027984872767488e-08, "logits/chosen": 14.224178314208984, "logits/rejected": 14.802621841430664, "logps/chosen": -4.442365646362305, "logps/rejected": -4.779111385345459, "loss": 3.6447, "rewards/accuracies": 0.75, "rewards/chosen": -44.42365264892578, "rewards/margins": 3.367459297180176, "rewards/rejected": -47.791114807128906, "step": 6710 }, { "epoch": 0.9138071895424836, "grad_norm": 43.02765392041565, "learning_rate": 1.797158850881777e-08, "logits/chosen": 14.530865669250488, "logits/rejected": 14.847835540771484, "logps/chosen": -4.65513277053833, "logps/rejected": -4.7193498611450195, "loss": 3.7198, "rewards/accuracies": 0.5, "rewards/chosen": -46.551326751708984, "rewards/margins": 0.6421689987182617, "rewards/rejected": -47.19349670410156, "step": 6711 }, { "epoch": 0.9139433551198257, "grad_norm": 42.588994147772816, "learning_rate": 1.791527846734553e-08, "logits/chosen": 14.716514587402344, "logits/rejected": 15.448022842407227, "logps/chosen": -5.032919406890869, "logps/rejected": -5.245952606201172, "loss": 3.9919, "rewards/accuracies": 0.75, "rewards/chosen": -50.329193115234375, "rewards/margins": 2.1303319931030273, "rewards/rejected": -52.45952606201172, "step": 6712 }, { "epoch": 0.9140795206971678, "grad_norm": 39.55071832593034, "learning_rate": 1.785905476107441e-08, "logits/chosen": 14.378742218017578, "logits/rejected": 14.75570297241211, "logps/chosen": -4.6278605461120605, "logps/rejected": -4.875337600708008, "loss": 3.5836, "rewards/accuracies": 0.75, "rewards/chosen": -46.27860641479492, "rewards/margins": 2.47476863861084, "rewards/rejected": -48.75337219238281, "step": 6713 }, { "epoch": 0.9142156862745098, "grad_norm": 43.64556252456349, "learning_rate": 1.7802917402708696e-08, "logits/chosen": 13.997282028198242, "logits/rejected": 14.611754417419434, "logps/chosen": -4.349860191345215, "logps/rejected": -4.799871444702148, "loss": 3.7942, "rewards/accuracies": 1.0, "rewards/chosen": -43.498600006103516, "rewards/margins": 4.5001115798950195, "rewards/rejected": -47.99871063232422, "step": 6714 }, { "epoch": 0.9143518518518519, "grad_norm": 40.12284673090814, "learning_rate": 1.774686640493308e-08, "logits/chosen": 14.045637130737305, "logits/rejected": 14.48104190826416, "logps/chosen": -4.39753532409668, "logps/rejected": -4.98691463470459, "loss": 3.7795, "rewards/accuracies": 1.0, "rewards/chosen": -43.97534942626953, "rewards/margins": 5.893795013427734, "rewards/rejected": -49.86914825439453, "step": 6715 }, { "epoch": 0.914488017429194, "grad_norm": 55.180222900233936, "learning_rate": 1.7690901780412725e-08, "logits/chosen": 14.699203491210938, "logits/rejected": 15.017452239990234, "logps/chosen": -4.87393856048584, "logps/rejected": -5.1284098625183105, "loss": 3.9655, "rewards/accuracies": 0.75, "rewards/chosen": -48.7393798828125, "rewards/margins": 2.5447158813476562, "rewards/rejected": -51.284095764160156, "step": 6716 }, { "epoch": 0.9146241830065359, "grad_norm": 40.06502779722991, "learning_rate": 1.7635023541793292e-08, "logits/chosen": 14.987491607666016, "logits/rejected": 15.092472076416016, "logps/chosen": -4.81510591506958, "logps/rejected": -4.986644744873047, "loss": 4.2012, "rewards/accuracies": 0.75, "rewards/chosen": -48.15106201171875, "rewards/margins": 1.7153902053833008, "rewards/rejected": -49.86644744873047, "step": 6717 }, { "epoch": 0.914760348583878, "grad_norm": 41.53578154830664, "learning_rate": 1.7579231701701035e-08, "logits/chosen": 14.743254661560059, "logits/rejected": 15.479375839233398, "logps/chosen": -4.731123924255371, "logps/rejected": -4.998051166534424, "loss": 3.8725, "rewards/accuracies": 0.5, "rewards/chosen": -47.311241149902344, "rewards/margins": 2.669269561767578, "rewards/rejected": -49.98051452636719, "step": 6718 }, { "epoch": 0.9148965141612201, "grad_norm": 43.39946052605859, "learning_rate": 1.7523526272742405e-08, "logits/chosen": 13.913610458374023, "logits/rejected": 14.62270736694336, "logps/chosen": -4.441098213195801, "logps/rejected": -4.720676422119141, "loss": 4.19, "rewards/accuracies": 1.0, "rewards/chosen": -44.41098403930664, "rewards/margins": 2.795779228210449, "rewards/rejected": -47.206764221191406, "step": 6719 }, { "epoch": 0.9150326797385621, "grad_norm": 40.5385719835452, "learning_rate": 1.7467907267504623e-08, "logits/chosen": 13.455951690673828, "logits/rejected": 14.025978088378906, "logps/chosen": -4.303513526916504, "logps/rejected": -4.447080135345459, "loss": 4.0166, "rewards/accuracies": 0.75, "rewards/chosen": -43.035133361816406, "rewards/margins": 1.4356651306152344, "rewards/rejected": -44.470802307128906, "step": 6720 }, { "epoch": 0.9151688453159041, "grad_norm": 42.5007685246322, "learning_rate": 1.7412374698555275e-08, "logits/chosen": 14.246532440185547, "logits/rejected": 14.577642440795898, "logps/chosen": -4.708775043487549, "logps/rejected": -4.918924808502197, "loss": 4.3589, "rewards/accuracies": 0.75, "rewards/chosen": -47.08774948120117, "rewards/margins": 2.1014976501464844, "rewards/rejected": -49.189247131347656, "step": 6721 }, { "epoch": 0.9153050108932462, "grad_norm": 45.447414645632286, "learning_rate": 1.735692857844233e-08, "logits/chosen": 14.960556030273438, "logits/rejected": 14.78719711303711, "logps/chosen": -4.84144401550293, "logps/rejected": -4.663674831390381, "loss": 4.2392, "rewards/accuracies": 0.25, "rewards/chosen": -48.4144401550293, "rewards/margins": -1.7776908874511719, "rewards/rejected": -46.636749267578125, "step": 6722 }, { "epoch": 0.9154411764705882, "grad_norm": 38.60494061755957, "learning_rate": 1.73015689196943e-08, "logits/chosen": 13.661266326904297, "logits/rejected": 14.458475112915039, "logps/chosen": -4.407920837402344, "logps/rejected": -4.623102188110352, "loss": 3.811, "rewards/accuracies": 0.75, "rewards/chosen": -44.07920837402344, "rewards/margins": 2.1518115997314453, "rewards/rejected": -46.231021881103516, "step": 6723 }, { "epoch": 0.9155773420479303, "grad_norm": 38.63383756070336, "learning_rate": 1.724629573482028e-08, "logits/chosen": 14.843220710754395, "logits/rejected": 15.29036808013916, "logps/chosen": -4.5356245040893555, "logps/rejected": -5.0467939376831055, "loss": 3.8979, "rewards/accuracies": 0.75, "rewards/chosen": -45.35624694824219, "rewards/margins": 5.111697196960449, "rewards/rejected": -50.46794128417969, "step": 6724 }, { "epoch": 0.9157135076252724, "grad_norm": 37.54405548168016, "learning_rate": 1.7191109036309536e-08, "logits/chosen": 13.519155502319336, "logits/rejected": 13.633915901184082, "logps/chosen": -4.258537292480469, "logps/rejected": -4.5255961418151855, "loss": 3.8336, "rewards/accuracies": 0.75, "rewards/chosen": -42.58537673950195, "rewards/margins": 2.6705875396728516, "rewards/rejected": -45.25596237182617, "step": 6725 }, { "epoch": 0.9158496732026143, "grad_norm": 43.58376222520885, "learning_rate": 1.7136008836632042e-08, "logits/chosen": 14.262438774108887, "logits/rejected": 15.175220489501953, "logps/chosen": -4.480238914489746, "logps/rejected": -4.701069355010986, "loss": 3.9849, "rewards/accuracies": 1.0, "rewards/chosen": -44.802391052246094, "rewards/margins": 2.2083044052124023, "rewards/rejected": -47.01069259643555, "step": 6726 }, { "epoch": 0.9159858387799564, "grad_norm": 41.531974432170486, "learning_rate": 1.7080995148238152e-08, "logits/chosen": 14.454870223999023, "logits/rejected": 14.919777870178223, "logps/chosen": -4.4994282722473145, "logps/rejected": -4.688687324523926, "loss": 3.7318, "rewards/accuracies": 0.5, "rewards/chosen": -44.99428176879883, "rewards/margins": 1.8925933837890625, "rewards/rejected": -46.88687515258789, "step": 6727 }, { "epoch": 0.9161220043572985, "grad_norm": 43.785908706321216, "learning_rate": 1.7026067983558635e-08, "logits/chosen": 14.393524169921875, "logits/rejected": 15.047088623046875, "logps/chosen": -4.382222652435303, "logps/rejected": -4.857817649841309, "loss": 3.9969, "rewards/accuracies": 0.75, "rewards/chosen": -43.82222366333008, "rewards/margins": 4.755951881408691, "rewards/rejected": -48.57817840576172, "step": 6728 }, { "epoch": 0.9162581699346405, "grad_norm": 40.07880607755223, "learning_rate": 1.697122735500476e-08, "logits/chosen": 14.76706314086914, "logits/rejected": 15.210878372192383, "logps/chosen": -4.321791172027588, "logps/rejected": -4.678150177001953, "loss": 3.7112, "rewards/accuracies": 0.75, "rewards/chosen": -43.21791076660156, "rewards/margins": 3.563591957092285, "rewards/rejected": -46.78150177001953, "step": 6729 }, { "epoch": 0.9163943355119826, "grad_norm": 40.44622628906183, "learning_rate": 1.691647327496826e-08, "logits/chosen": 14.11400318145752, "logits/rejected": 14.843334197998047, "logps/chosen": -4.4645676612854, "logps/rejected": -4.512136459350586, "loss": 3.4258, "rewards/accuracies": 0.5, "rewards/chosen": -44.64567947387695, "rewards/margins": 0.47568321228027344, "rewards/rejected": -45.121360778808594, "step": 6730 }, { "epoch": 0.9165305010893247, "grad_norm": 43.10294964916541, "learning_rate": 1.686180575582119e-08, "logits/chosen": 14.5463228225708, "logits/rejected": 14.321369171142578, "logps/chosen": -4.50874662399292, "logps/rejected": -4.486295700073242, "loss": 4.3623, "rewards/accuracies": 0.25, "rewards/chosen": -45.08746337890625, "rewards/margins": -0.22451019287109375, "rewards/rejected": -44.862953186035156, "step": 6731 }, { "epoch": 0.9166666666666666, "grad_norm": 47.73923850944323, "learning_rate": 1.680722480991612e-08, "logits/chosen": 14.375829696655273, "logits/rejected": 14.860774993896484, "logps/chosen": -4.721210479736328, "logps/rejected": -4.892787933349609, "loss": 4.2816, "rewards/accuracies": 0.75, "rewards/chosen": -47.21210479736328, "rewards/margins": 1.7157745361328125, "rewards/rejected": -48.927879333496094, "step": 6732 }, { "epoch": 0.9168028322440087, "grad_norm": 39.37253384475166, "learning_rate": 1.675273044958616e-08, "logits/chosen": 14.516561508178711, "logits/rejected": 14.87299633026123, "logps/chosen": -4.616575241088867, "logps/rejected": -4.971405506134033, "loss": 4.3225, "rewards/accuracies": 0.75, "rewards/chosen": -46.165748596191406, "rewards/margins": 3.5483055114746094, "rewards/rejected": -49.71405029296875, "step": 6733 }, { "epoch": 0.9169389978213508, "grad_norm": 43.85750321017316, "learning_rate": 1.6698322687144707e-08, "logits/chosen": 14.886911392211914, "logits/rejected": 14.302226066589355, "logps/chosen": -4.418221950531006, "logps/rejected": -4.32298469543457, "loss": 4.4339, "rewards/accuracies": 0.25, "rewards/chosen": -44.182220458984375, "rewards/margins": -0.9523782730102539, "rewards/rejected": -43.22984313964844, "step": 6734 }, { "epoch": 0.9170751633986928, "grad_norm": 43.11477810424503, "learning_rate": 1.664400153488561e-08, "logits/chosen": 15.337060928344727, "logits/rejected": 15.43482494354248, "logps/chosen": -5.012417793273926, "logps/rejected": -5.229212284088135, "loss": 3.8934, "rewards/accuracies": 0.75, "rewards/chosen": -50.12417984008789, "rewards/margins": 2.167943000793457, "rewards/rejected": -52.29212188720703, "step": 6735 }, { "epoch": 0.9172113289760349, "grad_norm": 39.122314117648685, "learning_rate": 1.6589767005083276e-08, "logits/chosen": 13.988449096679688, "logits/rejected": 14.513387680053711, "logps/chosen": -4.078283786773682, "logps/rejected": -4.758655071258545, "loss": 3.5804, "rewards/accuracies": 1.0, "rewards/chosen": -40.782840728759766, "rewards/margins": 6.803709030151367, "rewards/rejected": -47.5865478515625, "step": 6736 }, { "epoch": 0.9173474945533769, "grad_norm": 39.27366162882018, "learning_rate": 1.6535619109992305e-08, "logits/chosen": 14.802946090698242, "logits/rejected": 14.126938819885254, "logps/chosen": -4.74223518371582, "logps/rejected": -4.666704177856445, "loss": 4.035, "rewards/accuracies": 0.5, "rewards/chosen": -47.42235565185547, "rewards/margins": -0.7553138732910156, "rewards/rejected": -46.66703796386719, "step": 6737 }, { "epoch": 0.9174836601307189, "grad_norm": 41.14684455929556, "learning_rate": 1.6481557861847973e-08, "logits/chosen": 14.501152992248535, "logits/rejected": 15.168144226074219, "logps/chosen": -4.6578216552734375, "logps/rejected": -4.986384391784668, "loss": 3.8035, "rewards/accuracies": 0.5, "rewards/chosen": -46.57821273803711, "rewards/margins": 3.285634994506836, "rewards/rejected": -49.86384582519531, "step": 6738 }, { "epoch": 0.917619825708061, "grad_norm": 40.09889254409741, "learning_rate": 1.642758327286593e-08, "logits/chosen": 13.564167976379395, "logits/rejected": 13.83112621307373, "logps/chosen": -4.445630073547363, "logps/rejected": -4.391572952270508, "loss": 3.5977, "rewards/accuracies": 0.5, "rewards/chosen": -44.456298828125, "rewards/margins": -0.5405664443969727, "rewards/rejected": -43.915733337402344, "step": 6739 }, { "epoch": 0.9177559912854031, "grad_norm": 37.77616763538305, "learning_rate": 1.637369535524198e-08, "logits/chosen": 14.402725219726562, "logits/rejected": 14.7608642578125, "logps/chosen": -4.552901268005371, "logps/rejected": -4.729959487915039, "loss": 3.4122, "rewards/accuracies": 0.75, "rewards/chosen": -45.529014587402344, "rewards/margins": 1.770578384399414, "rewards/rejected": -47.299591064453125, "step": 6740 }, { "epoch": 0.9178921568627451, "grad_norm": 40.670324054144494, "learning_rate": 1.631989412115269e-08, "logits/chosen": 13.668048858642578, "logits/rejected": 14.210226058959961, "logps/chosen": -4.330451965332031, "logps/rejected": -4.513368606567383, "loss": 4.0238, "rewards/accuracies": 0.75, "rewards/chosen": -43.30451965332031, "rewards/margins": 1.8291730880737305, "rewards/rejected": -45.133689880371094, "step": 6741 }, { "epoch": 0.9180283224400871, "grad_norm": 47.902675819556656, "learning_rate": 1.6266179582754868e-08, "logits/chosen": 13.453388214111328, "logits/rejected": 13.953615188598633, "logps/chosen": -4.291182518005371, "logps/rejected": -4.332525730133057, "loss": 4.3582, "rewards/accuracies": 0.25, "rewards/chosen": -42.91182327270508, "rewards/margins": 0.4134330749511719, "rewards/rejected": -43.32525634765625, "step": 6742 }, { "epoch": 0.9181644880174292, "grad_norm": 38.92309380046988, "learning_rate": 1.62125517521857e-08, "logits/chosen": 14.149930000305176, "logits/rejected": 14.552953720092773, "logps/chosen": -4.4058685302734375, "logps/rejected": -4.875234603881836, "loss": 3.3551, "rewards/accuracies": 1.0, "rewards/chosen": -44.05868148803711, "rewards/margins": 4.693665504455566, "rewards/rejected": -48.752349853515625, "step": 6743 }, { "epoch": 0.9183006535947712, "grad_norm": 40.66466053001712, "learning_rate": 1.615901064156291e-08, "logits/chosen": 13.582513809204102, "logits/rejected": 13.457571029663086, "logps/chosen": -4.0895843505859375, "logps/rejected": -4.16899299621582, "loss": 4.0749, "rewards/accuracies": 0.5, "rewards/chosen": -40.895843505859375, "rewards/margins": 0.7940902709960938, "rewards/rejected": -41.68993377685547, "step": 6744 }, { "epoch": 0.9184368191721133, "grad_norm": 37.82835016645198, "learning_rate": 1.6105556262984556e-08, "logits/chosen": 14.51552963256836, "logits/rejected": 14.9532470703125, "logps/chosen": -4.611753463745117, "logps/rejected": -5.109891891479492, "loss": 3.6896, "rewards/accuracies": 0.75, "rewards/chosen": -46.11753845214844, "rewards/margins": 4.981380462646484, "rewards/rejected": -51.09891891479492, "step": 6745 }, { "epoch": 0.9185729847494554, "grad_norm": 44.98019882477154, "learning_rate": 1.6052188628529017e-08, "logits/chosen": 13.993766784667969, "logits/rejected": 14.520622253417969, "logps/chosen": -4.584465026855469, "logps/rejected": -4.705376625061035, "loss": 4.6205, "rewards/accuracies": 0.5, "rewards/chosen": -45.84464645385742, "rewards/margins": 1.2091178894042969, "rewards/rejected": -47.05376434326172, "step": 6746 }, { "epoch": 0.9187091503267973, "grad_norm": 40.80774048481791, "learning_rate": 1.599890775025523e-08, "logits/chosen": 14.088493347167969, "logits/rejected": 14.424600601196289, "logps/chosen": -4.1953558921813965, "logps/rejected": -4.538059711456299, "loss": 3.7987, "rewards/accuracies": 1.0, "rewards/chosen": -41.95355987548828, "rewards/margins": 3.4270362854003906, "rewards/rejected": -45.38059616088867, "step": 6747 }, { "epoch": 0.9188453159041394, "grad_norm": 37.73457752748561, "learning_rate": 1.594571364020245e-08, "logits/chosen": 15.032058715820312, "logits/rejected": 15.244721412658691, "logps/chosen": -4.680965423583984, "logps/rejected": -4.970244407653809, "loss": 3.9992, "rewards/accuracies": 0.75, "rewards/chosen": -46.809654235839844, "rewards/margins": 2.892788887023926, "rewards/rejected": -49.70244598388672, "step": 6748 }, { "epoch": 0.9189814814814815, "grad_norm": 40.34038846357759, "learning_rate": 1.5892606310390266e-08, "logits/chosen": 14.416505813598633, "logits/rejected": 15.741951942443848, "logps/chosen": -4.782304763793945, "logps/rejected": -5.233119487762451, "loss": 4.2378, "rewards/accuracies": 1.0, "rewards/chosen": -47.82304382324219, "rewards/margins": 4.508150100708008, "rewards/rejected": -52.33119201660156, "step": 6749 }, { "epoch": 0.9191176470588235, "grad_norm": 40.395089106003134, "learning_rate": 1.583958577281872e-08, "logits/chosen": 14.829408645629883, "logits/rejected": 14.896684646606445, "logps/chosen": -4.512566566467285, "logps/rejected": -4.7052693367004395, "loss": 3.9118, "rewards/accuracies": 0.75, "rewards/chosen": -45.12566375732422, "rewards/margins": 1.9270267486572266, "rewards/rejected": -47.05268859863281, "step": 6750 }, { "epoch": 0.9192538126361656, "grad_norm": 49.58094143358589, "learning_rate": 1.5786652039468317e-08, "logits/chosen": 14.721893310546875, "logits/rejected": 14.517229080200195, "logps/chosen": -4.880977630615234, "logps/rejected": -4.860827922821045, "loss": 3.9799, "rewards/accuracies": 0.5, "rewards/chosen": -48.809776306152344, "rewards/margins": -0.20149803161621094, "rewards/rejected": -48.6082763671875, "step": 6751 }, { "epoch": 0.9193899782135077, "grad_norm": 40.74413043219786, "learning_rate": 1.5733805122299803e-08, "logits/chosen": 14.204569816589355, "logits/rejected": 14.594843864440918, "logps/chosen": -4.603950023651123, "logps/rejected": -4.763557434082031, "loss": 3.3337, "rewards/accuracies": 0.75, "rewards/chosen": -46.03949737548828, "rewards/margins": 1.5960712432861328, "rewards/rejected": -47.63557434082031, "step": 6752 }, { "epoch": 0.9195261437908496, "grad_norm": 44.6948209942328, "learning_rate": 1.5681045033254382e-08, "logits/chosen": 14.62969970703125, "logits/rejected": 14.771772384643555, "logps/chosen": -4.560256004333496, "logps/rejected": -4.552734375, "loss": 4.3497, "rewards/accuracies": 0.5, "rewards/chosen": -45.602561950683594, "rewards/margins": -0.07522106170654297, "rewards/rejected": -45.52734375, "step": 6753 }, { "epoch": 0.9196623093681917, "grad_norm": 40.7217873937924, "learning_rate": 1.5628371784253713e-08, "logits/chosen": 14.360027313232422, "logits/rejected": 14.282684326171875, "logps/chosen": -4.420858383178711, "logps/rejected": -4.6315484046936035, "loss": 3.6264, "rewards/accuracies": 0.5, "rewards/chosen": -44.20859146118164, "rewards/margins": 2.1068944931030273, "rewards/rejected": -46.31548309326172, "step": 6754 }, { "epoch": 0.9197984749455338, "grad_norm": 43.88800051157598, "learning_rate": 1.557578538719966e-08, "logits/chosen": 14.526519775390625, "logits/rejected": 15.037096977233887, "logps/chosen": -4.8557915687561035, "logps/rejected": -5.02329158782959, "loss": 3.9851, "rewards/accuracies": 0.75, "rewards/chosen": -48.557918548583984, "rewards/margins": 1.6749944686889648, "rewards/rejected": -50.23291015625, "step": 6755 }, { "epoch": 0.9199346405228758, "grad_norm": 42.20027442867652, "learning_rate": 1.5523285853974532e-08, "logits/chosen": 15.148445129394531, "logits/rejected": 15.100936889648438, "logps/chosen": -4.831739902496338, "logps/rejected": -4.837801933288574, "loss": 4.6495, "rewards/accuracies": 0.5, "rewards/chosen": -48.31739807128906, "rewards/margins": 0.06062030792236328, "rewards/rejected": -48.378021240234375, "step": 6756 }, { "epoch": 0.9200708061002179, "grad_norm": 36.405136502842936, "learning_rate": 1.5470873196441157e-08, "logits/chosen": 14.393030166625977, "logits/rejected": 15.020587921142578, "logps/chosen": -4.788494110107422, "logps/rejected": -5.112163543701172, "loss": 4.0714, "rewards/accuracies": 1.0, "rewards/chosen": -47.88494110107422, "rewards/margins": 3.2366943359375, "rewards/rejected": -51.12163543701172, "step": 6757 }, { "epoch": 0.9202069716775599, "grad_norm": 41.0393042576548, "learning_rate": 1.5418547426442465e-08, "logits/chosen": 14.23851203918457, "logits/rejected": 14.472007751464844, "logps/chosen": -4.685020446777344, "logps/rejected": -4.626200199127197, "loss": 4.0168, "rewards/accuracies": 0.5, "rewards/chosen": -46.85020446777344, "rewards/margins": -0.5882034301757812, "rewards/rejected": -46.262001037597656, "step": 6758 }, { "epoch": 0.9203431372549019, "grad_norm": 41.18927337586993, "learning_rate": 1.536630855580201e-08, "logits/chosen": 13.33159065246582, "logits/rejected": 14.613090515136719, "logps/chosen": -4.2752275466918945, "logps/rejected": -4.615300178527832, "loss": 4.5561, "rewards/accuracies": 0.75, "rewards/chosen": -42.75227355957031, "rewards/margins": 3.400729179382324, "rewards/rejected": -46.15299987792969, "step": 6759 }, { "epoch": 0.920479302832244, "grad_norm": 38.84300494268338, "learning_rate": 1.5314156596323557e-08, "logits/chosen": 14.449403762817383, "logits/rejected": 14.353422164916992, "logps/chosen": -4.5483808517456055, "logps/rejected": -4.695102691650391, "loss": 3.2635, "rewards/accuracies": 0.5, "rewards/chosen": -45.48381042480469, "rewards/margins": 1.4672203063964844, "rewards/rejected": -46.95103073120117, "step": 6760 }, { "epoch": 0.9206154684095861, "grad_norm": 39.42589414486604, "learning_rate": 1.5262091559791234e-08, "logits/chosen": 14.619693756103516, "logits/rejected": 15.299247741699219, "logps/chosen": -4.4525957107543945, "logps/rejected": -4.550215721130371, "loss": 3.8922, "rewards/accuracies": 0.75, "rewards/chosen": -44.52595520019531, "rewards/margins": 0.9761991500854492, "rewards/rejected": -45.50215148925781, "step": 6761 }, { "epoch": 0.920751633986928, "grad_norm": 41.493340994958366, "learning_rate": 1.5210113457969587e-08, "logits/chosen": 13.810285568237305, "logits/rejected": 14.805498123168945, "logps/chosen": -4.535473823547363, "logps/rejected": -4.693753242492676, "loss": 4.0155, "rewards/accuracies": 0.5, "rewards/chosen": -45.354732513427734, "rewards/margins": 1.5827960968017578, "rewards/rejected": -46.937530517578125, "step": 6762 }, { "epoch": 0.9208877995642701, "grad_norm": 40.418716351170076, "learning_rate": 1.5158222302603573e-08, "logits/chosen": 14.014710426330566, "logits/rejected": 15.235708236694336, "logps/chosen": -4.486559867858887, "logps/rejected": -4.981696128845215, "loss": 3.4743, "rewards/accuracies": 1.0, "rewards/chosen": -44.8656005859375, "rewards/margins": 4.951362609863281, "rewards/rejected": -49.81696319580078, "step": 6763 }, { "epoch": 0.9210239651416122, "grad_norm": 44.15135096216269, "learning_rate": 1.5106418105418307e-08, "logits/chosen": 14.60236644744873, "logits/rejected": 14.072538375854492, "logps/chosen": -4.872836112976074, "logps/rejected": -4.6976494789123535, "loss": 4.1536, "rewards/accuracies": 0.5, "rewards/chosen": -48.72835922241211, "rewards/margins": -1.7518653869628906, "rewards/rejected": -46.97649383544922, "step": 6764 }, { "epoch": 0.9211601307189542, "grad_norm": 38.0354306011167, "learning_rate": 1.5054700878119442e-08, "logits/chosen": 14.031573295593262, "logits/rejected": 14.270959854125977, "logps/chosen": -4.658343315124512, "logps/rejected": -4.864550590515137, "loss": 3.5614, "rewards/accuracies": 0.75, "rewards/chosen": -46.583431243896484, "rewards/margins": 2.06207275390625, "rewards/rejected": -48.645503997802734, "step": 6765 }, { "epoch": 0.9212962962962963, "grad_norm": 42.915658905705556, "learning_rate": 1.5003070632392924e-08, "logits/chosen": 14.753662109375, "logits/rejected": 14.856517791748047, "logps/chosen": -4.720126152038574, "logps/rejected": -4.877915382385254, "loss": 3.8469, "rewards/accuracies": 0.75, "rewards/chosen": -47.20125961303711, "rewards/margins": 1.5778942108154297, "rewards/rejected": -48.779151916503906, "step": 6766 }, { "epoch": 0.9214324618736384, "grad_norm": 40.012296818163236, "learning_rate": 1.4951527379904973e-08, "logits/chosen": 13.97922134399414, "logits/rejected": 14.259452819824219, "logps/chosen": -4.5346503257751465, "logps/rejected": -4.684640407562256, "loss": 3.9007, "rewards/accuracies": 0.5, "rewards/chosen": -45.34650421142578, "rewards/margins": 1.4999017715454102, "rewards/rejected": -46.846405029296875, "step": 6767 }, { "epoch": 0.9215686274509803, "grad_norm": 41.703665321125136, "learning_rate": 1.4900071132302272e-08, "logits/chosen": 13.543663024902344, "logits/rejected": 14.287481307983398, "logps/chosen": -4.093902587890625, "logps/rejected": -4.45493221282959, "loss": 3.4214, "rewards/accuracies": 0.75, "rewards/chosen": -40.93902587890625, "rewards/margins": 3.610292434692383, "rewards/rejected": -44.54931640625, "step": 6768 }, { "epoch": 0.9217047930283224, "grad_norm": 38.787922591933516, "learning_rate": 1.4848701901211835e-08, "logits/chosen": 14.215391159057617, "logits/rejected": 14.988428115844727, "logps/chosen": -4.2796173095703125, "logps/rejected": -4.5991129875183105, "loss": 4.1684, "rewards/accuracies": 0.5, "rewards/chosen": -42.796173095703125, "rewards/margins": 3.1949539184570312, "rewards/rejected": -45.99113082885742, "step": 6769 }, { "epoch": 0.9218409586056645, "grad_norm": 38.24444655824122, "learning_rate": 1.479741969824082e-08, "logits/chosen": 14.856103897094727, "logits/rejected": 14.395885467529297, "logps/chosen": -4.471006393432617, "logps/rejected": -4.490234375, "loss": 4.0272, "rewards/accuracies": 0.5, "rewards/chosen": -44.710060119628906, "rewards/margins": 0.19228363037109375, "rewards/rejected": -44.90234375, "step": 6770 }, { "epoch": 0.9219771241830066, "grad_norm": 35.82934840650674, "learning_rate": 1.4746224534976936e-08, "logits/chosen": 14.280436515808105, "logits/rejected": 14.691963195800781, "logps/chosen": -4.568642616271973, "logps/rejected": -4.612913131713867, "loss": 3.9297, "rewards/accuracies": 0.5, "rewards/chosen": -45.686424255371094, "rewards/margins": 0.44271087646484375, "rewards/rejected": -46.12913131713867, "step": 6771 }, { "epoch": 0.9221132897603486, "grad_norm": 38.74398576797103, "learning_rate": 1.4695116422988219e-08, "logits/chosen": 13.735259056091309, "logits/rejected": 14.322728157043457, "logps/chosen": -4.496291160583496, "logps/rejected": -4.759818077087402, "loss": 3.5549, "rewards/accuracies": 0.75, "rewards/chosen": -44.962913513183594, "rewards/margins": 2.6352691650390625, "rewards/rejected": -47.598182678222656, "step": 6772 }, { "epoch": 0.9222494553376906, "grad_norm": 44.24371168285451, "learning_rate": 1.4644095373822851e-08, "logits/chosen": 14.130765914916992, "logits/rejected": 14.300922393798828, "logps/chosen": -4.229194164276123, "logps/rejected": -4.838769435882568, "loss": 4.4012, "rewards/accuracies": 1.0, "rewards/chosen": -42.29193878173828, "rewards/margins": 6.0957536697387695, "rewards/rejected": -48.3876953125, "step": 6773 }, { "epoch": 0.9223856209150327, "grad_norm": 41.391016414418, "learning_rate": 1.4593161399009523e-08, "logits/chosen": 14.257116317749023, "logits/rejected": 14.442955017089844, "logps/chosen": -4.41900634765625, "logps/rejected": -4.644778251647949, "loss": 3.9465, "rewards/accuracies": 0.25, "rewards/chosen": -44.1900634765625, "rewards/margins": 2.257716178894043, "rewards/rejected": -46.447776794433594, "step": 6774 }, { "epoch": 0.9225217864923747, "grad_norm": 43.0638282799226, "learning_rate": 1.4542314510057207e-08, "logits/chosen": 14.155397415161133, "logits/rejected": 14.480408668518066, "logps/chosen": -4.452725410461426, "logps/rejected": -4.513070583343506, "loss": 4.1226, "rewards/accuracies": 0.5, "rewards/chosen": -44.527252197265625, "rewards/margins": 0.6034526824951172, "rewards/rejected": -45.130706787109375, "step": 6775 }, { "epoch": 0.9226579520697168, "grad_norm": 40.208537891737926, "learning_rate": 1.4491554718455157e-08, "logits/chosen": 14.687593460083008, "logits/rejected": 15.454869270324707, "logps/chosen": -4.751197814941406, "logps/rejected": -5.215648651123047, "loss": 3.3113, "rewards/accuracies": 0.75, "rewards/chosen": -47.51197814941406, "rewards/margins": 4.644513130187988, "rewards/rejected": -52.15648651123047, "step": 6776 }, { "epoch": 0.9227941176470589, "grad_norm": 42.64106117243476, "learning_rate": 1.4440882035672907e-08, "logits/chosen": 14.139383316040039, "logits/rejected": 15.01756477355957, "logps/chosen": -4.354538917541504, "logps/rejected": -5.038313865661621, "loss": 4.4994, "rewards/accuracies": 1.0, "rewards/chosen": -43.54539108276367, "rewards/margins": 6.8377485275268555, "rewards/rejected": -50.383140563964844, "step": 6777 }, { "epoch": 0.9229302832244008, "grad_norm": 43.014913053193084, "learning_rate": 1.43902964731605e-08, "logits/chosen": 13.359580993652344, "logits/rejected": 13.690167427062988, "logps/chosen": -4.166553497314453, "logps/rejected": -4.368912696838379, "loss": 3.205, "rewards/accuracies": 1.0, "rewards/chosen": -41.66553497314453, "rewards/margins": 2.023591995239258, "rewards/rejected": -43.689125061035156, "step": 6778 }, { "epoch": 0.9230664488017429, "grad_norm": 38.52025296122173, "learning_rate": 1.4339798042348039e-08, "logits/chosen": 14.619073867797852, "logits/rejected": 14.7689847946167, "logps/chosen": -4.661347389221191, "logps/rejected": -4.72377872467041, "loss": 3.5045, "rewards/accuracies": 0.5, "rewards/chosen": -46.61347579956055, "rewards/margins": 0.6243162155151367, "rewards/rejected": -47.23779296875, "step": 6779 }, { "epoch": 0.923202614379085, "grad_norm": 41.52372549390945, "learning_rate": 1.4289386754646126e-08, "logits/chosen": 14.174428939819336, "logits/rejected": 14.597246170043945, "logps/chosen": -4.397462844848633, "logps/rejected": -4.688570022583008, "loss": 3.6232, "rewards/accuracies": 0.5, "rewards/chosen": -43.974632263183594, "rewards/margins": 2.911069869995117, "rewards/rejected": -46.88570022583008, "step": 6780 }, { "epoch": 0.923338779956427, "grad_norm": 41.55740496805595, "learning_rate": 1.4239062621445608e-08, "logits/chosen": 14.657246589660645, "logits/rejected": 14.568588256835938, "logps/chosen": -4.537900924682617, "logps/rejected": -4.637540340423584, "loss": 3.741, "rewards/accuracies": 0.5, "rewards/chosen": -45.37900924682617, "rewards/margins": 0.9963951110839844, "rewards/rejected": -46.375404357910156, "step": 6781 }, { "epoch": 0.9234749455337691, "grad_norm": 39.6889557351489, "learning_rate": 1.418882565411761e-08, "logits/chosen": 14.04426383972168, "logits/rejected": 15.239315032958984, "logps/chosen": -4.459297180175781, "logps/rejected": -4.905660152435303, "loss": 3.9676, "rewards/accuracies": 0.75, "rewards/chosen": -44.59297561645508, "rewards/margins": 4.463623046875, "rewards/rejected": -49.05659866333008, "step": 6782 }, { "epoch": 0.9236111111111112, "grad_norm": 39.38126663967733, "learning_rate": 1.4138675864013583e-08, "logits/chosen": 14.920934677124023, "logits/rejected": 15.610395431518555, "logps/chosen": -4.787561416625977, "logps/rejected": -5.194040775299072, "loss": 4.0195, "rewards/accuracies": 0.75, "rewards/chosen": -47.87561798095703, "rewards/margins": 4.064789772033691, "rewards/rejected": -51.940406799316406, "step": 6783 }, { "epoch": 0.9237472766884531, "grad_norm": 37.49454102639847, "learning_rate": 1.4088613262465355e-08, "logits/chosen": 14.802899360656738, "logits/rejected": 15.25367259979248, "logps/chosen": -4.6435956954956055, "logps/rejected": -5.044439315795898, "loss": 3.4972, "rewards/accuracies": 0.75, "rewards/chosen": -46.43595504760742, "rewards/margins": 4.008440971374512, "rewards/rejected": -50.44439697265625, "step": 6784 }, { "epoch": 0.9238834422657952, "grad_norm": 36.217766449034876, "learning_rate": 1.4038637860784897e-08, "logits/chosen": 13.527284622192383, "logits/rejected": 14.367330551147461, "logps/chosen": -4.34325647354126, "logps/rejected": -4.477871417999268, "loss": 3.9149, "rewards/accuracies": 0.75, "rewards/chosen": -43.43256378173828, "rewards/margins": 1.346151351928711, "rewards/rejected": -44.778717041015625, "step": 6785 }, { "epoch": 0.9240196078431373, "grad_norm": 38.00882245676396, "learning_rate": 1.3988749670264554e-08, "logits/chosen": 13.884804725646973, "logits/rejected": 14.08978271484375, "logps/chosen": -4.362133026123047, "logps/rejected": -4.509699821472168, "loss": 4.0685, "rewards/accuracies": 0.75, "rewards/chosen": -43.62133026123047, "rewards/margins": 1.475667953491211, "rewards/rejected": -45.09700012207031, "step": 6786 }, { "epoch": 0.9241557734204793, "grad_norm": 39.72752406222456, "learning_rate": 1.393894870217709e-08, "logits/chosen": 14.436391830444336, "logits/rejected": 14.95541000366211, "logps/chosen": -4.5142059326171875, "logps/rejected": -4.912740707397461, "loss": 3.5386, "rewards/accuracies": 0.75, "rewards/chosen": -45.142059326171875, "rewards/margins": 3.985348701477051, "rewards/rejected": -49.127410888671875, "step": 6787 }, { "epoch": 0.9242919389978214, "grad_norm": 38.28045786932095, "learning_rate": 1.3889234967775409e-08, "logits/chosen": 14.193163871765137, "logits/rejected": 14.386473655700684, "logps/chosen": -4.71146297454834, "logps/rejected": -5.015596866607666, "loss": 3.7564, "rewards/accuracies": 0.5, "rewards/chosen": -47.114627838134766, "rewards/margins": 3.0413379669189453, "rewards/rejected": -50.155967712402344, "step": 6788 }, { "epoch": 0.9244281045751634, "grad_norm": 39.72182538951139, "learning_rate": 1.3839608478292664e-08, "logits/chosen": 14.287015914916992, "logits/rejected": 14.388810157775879, "logps/chosen": -4.5903167724609375, "logps/rejected": -4.527106285095215, "loss": 3.6394, "rewards/accuracies": 0.5, "rewards/chosen": -45.903167724609375, "rewards/margins": -0.6321086883544922, "rewards/rejected": -45.271060943603516, "step": 6789 }, { "epoch": 0.9245642701525054, "grad_norm": 41.56015620882312, "learning_rate": 1.3790069244942415e-08, "logits/chosen": 13.982223510742188, "logits/rejected": 14.47907829284668, "logps/chosen": -4.538240909576416, "logps/rejected": -4.6396613121032715, "loss": 3.7046, "rewards/accuracies": 0.5, "rewards/chosen": -45.382408142089844, "rewards/margins": 1.0142021179199219, "rewards/rejected": -46.39661407470703, "step": 6790 }, { "epoch": 0.9247004357298475, "grad_norm": 42.30460151893147, "learning_rate": 1.3740617278918509e-08, "logits/chosen": 14.475563049316406, "logits/rejected": 15.488883972167969, "logps/chosen": -4.533759593963623, "logps/rejected": -4.9028096199035645, "loss": 4.0387, "rewards/accuracies": 1.0, "rewards/chosen": -45.33759689331055, "rewards/margins": 3.6904964447021484, "rewards/rejected": -49.02809143066406, "step": 6791 }, { "epoch": 0.9248366013071896, "grad_norm": 39.75609074304024, "learning_rate": 1.3691252591394897e-08, "logits/chosen": 14.595362663269043, "logits/rejected": 15.568953514099121, "logps/chosen": -4.59514045715332, "logps/rejected": -4.67826509475708, "loss": 4.2794, "rewards/accuracies": 0.5, "rewards/chosen": -45.95140075683594, "rewards/margins": 0.8312540054321289, "rewards/rejected": -46.78265380859375, "step": 6792 }, { "epoch": 0.9249727668845316, "grad_norm": 36.279423739557565, "learning_rate": 1.3641975193526079e-08, "logits/chosen": 14.898665428161621, "logits/rejected": 15.146387100219727, "logps/chosen": -4.535780906677246, "logps/rejected": -4.843574523925781, "loss": 4.0094, "rewards/accuracies": 0.75, "rewards/chosen": -45.357810974121094, "rewards/margins": 3.077932357788086, "rewards/rejected": -48.43574523925781, "step": 6793 }, { "epoch": 0.9251089324618736, "grad_norm": 42.39738824330767, "learning_rate": 1.3592785096446613e-08, "logits/chosen": 13.668695449829102, "logits/rejected": 15.238506317138672, "logps/chosen": -4.589140892028809, "logps/rejected": -4.705063819885254, "loss": 3.5178, "rewards/accuracies": 0.5, "rewards/chosen": -45.89140701293945, "rewards/margins": 1.1592321395874023, "rewards/rejected": -47.05064010620117, "step": 6794 }, { "epoch": 0.9252450980392157, "grad_norm": 62.55882349149826, "learning_rate": 1.3543682311271476e-08, "logits/chosen": 13.781676292419434, "logits/rejected": 14.440010070800781, "logps/chosen": -4.1111650466918945, "logps/rejected": -4.708245277404785, "loss": 4.0956, "rewards/accuracies": 1.0, "rewards/chosen": -41.11164855957031, "rewards/margins": 5.970800399780273, "rewards/rejected": -47.08245086669922, "step": 6795 }, { "epoch": 0.9253812636165577, "grad_norm": 36.950928759906176, "learning_rate": 1.3494666849095748e-08, "logits/chosen": 13.782032012939453, "logits/rejected": 13.281686782836914, "logps/chosen": -4.1746015548706055, "logps/rejected": -4.142761707305908, "loss": 3.803, "rewards/accuracies": 0.25, "rewards/chosen": -41.74601745605469, "rewards/margins": -0.31839942932128906, "rewards/rejected": -41.42761993408203, "step": 6796 }, { "epoch": 0.9255174291938998, "grad_norm": 53.56863132549745, "learning_rate": 1.3445738720994925e-08, "logits/chosen": 14.550243377685547, "logits/rejected": 15.221717834472656, "logps/chosen": -4.746731758117676, "logps/rejected": -4.90366268157959, "loss": 3.8361, "rewards/accuracies": 0.5, "rewards/chosen": -47.46731948852539, "rewards/margins": 1.5693111419677734, "rewards/rejected": -49.03662872314453, "step": 6797 }, { "epoch": 0.9256535947712419, "grad_norm": 44.0248563794313, "learning_rate": 1.3396897938024788e-08, "logits/chosen": 14.162368774414062, "logits/rejected": 14.649189949035645, "logps/chosen": -4.667319297790527, "logps/rejected": -4.926031112670898, "loss": 3.5598, "rewards/accuracies": 0.5, "rewards/chosen": -46.673194885253906, "rewards/margins": 2.587116241455078, "rewards/rejected": -49.260311126708984, "step": 6798 }, { "epoch": 0.9257897603485838, "grad_norm": 39.958887937537746, "learning_rate": 1.3348144511221216e-08, "logits/chosen": 15.157949447631836, "logits/rejected": 14.346200942993164, "logps/chosen": -5.1004862785339355, "logps/rejected": -4.873051643371582, "loss": 3.6566, "rewards/accuracies": 0.25, "rewards/chosen": -51.004859924316406, "rewards/margins": -2.2743425369262695, "rewards/rejected": -48.73051834106445, "step": 6799 }, { "epoch": 0.9259259259259259, "grad_norm": 41.10035108312747, "learning_rate": 1.3299478451600465e-08, "logits/chosen": 15.43431282043457, "logits/rejected": 15.27849006652832, "logps/chosen": -4.880476951599121, "logps/rejected": -5.17603063583374, "loss": 3.3598, "rewards/accuracies": 0.75, "rewards/chosen": -48.80476760864258, "rewards/margins": 2.9555368423461914, "rewards/rejected": -51.76030731201172, "step": 6800 }, { "epoch": 0.926062091503268, "grad_norm": 47.502964327331746, "learning_rate": 1.3250899770159074e-08, "logits/chosen": 14.165899276733398, "logits/rejected": 14.74341869354248, "logps/chosen": -4.390871047973633, "logps/rejected": -4.686620712280273, "loss": 4.4804, "rewards/accuracies": 0.75, "rewards/chosen": -43.908714294433594, "rewards/margins": 2.9574947357177734, "rewards/rejected": -46.866207122802734, "step": 6801 }, { "epoch": 0.92619825708061, "grad_norm": 44.95393430627806, "learning_rate": 1.3202408477873816e-08, "logits/chosen": 14.946892738342285, "logits/rejected": 15.221948623657227, "logps/chosen": -4.481070518493652, "logps/rejected": -4.4543375968933105, "loss": 3.8227, "rewards/accuracies": 0.5, "rewards/chosen": -44.810707092285156, "rewards/margins": -0.2673311233520508, "rewards/rejected": -44.54337692260742, "step": 6802 }, { "epoch": 0.9263344226579521, "grad_norm": 39.69510843884468, "learning_rate": 1.3154004585701662e-08, "logits/chosen": 14.022682189941406, "logits/rejected": 14.978841781616211, "logps/chosen": -4.4859538078308105, "logps/rejected": -5.039568901062012, "loss": 3.7112, "rewards/accuracies": 1.0, "rewards/chosen": -44.859535217285156, "rewards/margins": 5.536149024963379, "rewards/rejected": -50.39568328857422, "step": 6803 }, { "epoch": 0.9264705882352942, "grad_norm": 44.89808874715223, "learning_rate": 1.3105688104579814e-08, "logits/chosen": 14.007524490356445, "logits/rejected": 14.249137878417969, "logps/chosen": -4.797148704528809, "logps/rejected": -4.9081830978393555, "loss": 4.5835, "rewards/accuracies": 0.75, "rewards/chosen": -47.97148895263672, "rewards/margins": 1.1103410720825195, "rewards/rejected": -49.08182907104492, "step": 6804 }, { "epoch": 0.9266067538126361, "grad_norm": 45.13029904699652, "learning_rate": 1.30574590454259e-08, "logits/chosen": 13.87925910949707, "logits/rejected": 15.34712028503418, "logps/chosen": -4.371181964874268, "logps/rejected": -4.938070297241211, "loss": 4.2706, "rewards/accuracies": 1.0, "rewards/chosen": -43.711822509765625, "rewards/margins": 5.668881416320801, "rewards/rejected": -49.380699157714844, "step": 6805 }, { "epoch": 0.9267429193899782, "grad_norm": 43.45751546314249, "learning_rate": 1.30093174191376e-08, "logits/chosen": 14.14355182647705, "logits/rejected": 14.823725700378418, "logps/chosen": -4.569605827331543, "logps/rejected": -4.641613960266113, "loss": 4.1783, "rewards/accuracies": 0.5, "rewards/chosen": -45.69606018066406, "rewards/margins": 0.7200832366943359, "rewards/rejected": -46.416141510009766, "step": 6806 }, { "epoch": 0.9268790849673203, "grad_norm": 38.61309166214179, "learning_rate": 1.296126323659288e-08, "logits/chosen": 15.36563491821289, "logits/rejected": 15.04548454284668, "logps/chosen": -4.907108783721924, "logps/rejected": -5.135040283203125, "loss": 3.8655, "rewards/accuracies": 0.5, "rewards/chosen": -49.07109069824219, "rewards/margins": 2.2793140411376953, "rewards/rejected": -51.35040283203125, "step": 6807 }, { "epoch": 0.9270152505446623, "grad_norm": 39.161195248389085, "learning_rate": 1.2913296508650117e-08, "logits/chosen": 15.018783569335938, "logits/rejected": 14.480110168457031, "logps/chosen": -4.3754730224609375, "logps/rejected": -4.556687831878662, "loss": 4.1099, "rewards/accuracies": 0.75, "rewards/chosen": -43.75473403930664, "rewards/margins": 1.8121442794799805, "rewards/rejected": -45.56687927246094, "step": 6808 }, { "epoch": 0.9271514161220044, "grad_norm": 38.79577184099557, "learning_rate": 1.2865417246147626e-08, "logits/chosen": 14.36564826965332, "logits/rejected": 15.216064453125, "logps/chosen": -4.383671760559082, "logps/rejected": -4.834414482116699, "loss": 3.4288, "rewards/accuracies": 1.0, "rewards/chosen": -43.83672332763672, "rewards/margins": 4.50742244720459, "rewards/rejected": -48.34414291381836, "step": 6809 }, { "epoch": 0.9272875816993464, "grad_norm": 40.23844416114156, "learning_rate": 1.2817625459904214e-08, "logits/chosen": 13.845020294189453, "logits/rejected": 13.350824356079102, "logps/chosen": -4.379499912261963, "logps/rejected": -4.2337212562561035, "loss": 3.9391, "rewards/accuracies": 0.5, "rewards/chosen": -43.79499816894531, "rewards/margins": -1.4577875137329102, "rewards/rejected": -42.33721160888672, "step": 6810 }, { "epoch": 0.9274237472766884, "grad_norm": 40.80675572893171, "learning_rate": 1.2769921160718845e-08, "logits/chosen": 14.94998836517334, "logits/rejected": 15.416877746582031, "logps/chosen": -4.730112075805664, "logps/rejected": -4.801022052764893, "loss": 3.8243, "rewards/accuracies": 0.5, "rewards/chosen": -47.301124572753906, "rewards/margins": 0.7090969085693359, "rewards/rejected": -48.01021957397461, "step": 6811 }, { "epoch": 0.9275599128540305, "grad_norm": 48.87975320292585, "learning_rate": 1.2722304359370628e-08, "logits/chosen": 14.261263847351074, "logits/rejected": 13.987836837768555, "logps/chosen": -4.702539920806885, "logps/rejected": -4.52263879776001, "loss": 3.4219, "rewards/accuracies": 0.5, "rewards/chosen": -47.02539825439453, "rewards/margins": -1.7990131378173828, "rewards/rejected": -45.22638702392578, "step": 6812 }, { "epoch": 0.9276960784313726, "grad_norm": 45.010016279729555, "learning_rate": 1.2674775066619003e-08, "logits/chosen": 14.953679084777832, "logits/rejected": 14.122819900512695, "logps/chosen": -4.677059173583984, "logps/rejected": -4.801542282104492, "loss": 3.3607, "rewards/accuracies": 0.5, "rewards/chosen": -46.770591735839844, "rewards/margins": 1.2448348999023438, "rewards/rejected": -48.01542663574219, "step": 6813 }, { "epoch": 0.9278322440087146, "grad_norm": 42.04399846533379, "learning_rate": 1.2627333293203646e-08, "logits/chosen": 14.205488204956055, "logits/rejected": 14.547310829162598, "logps/chosen": -4.502155303955078, "logps/rejected": -4.692977428436279, "loss": 3.7529, "rewards/accuracies": 0.75, "rewards/chosen": -45.02155303955078, "rewards/margins": 1.9082231521606445, "rewards/rejected": -46.92977523803711, "step": 6814 }, { "epoch": 0.9279684095860566, "grad_norm": 42.87490044942527, "learning_rate": 1.2579979049844336e-08, "logits/chosen": 14.692741394042969, "logits/rejected": 14.41361141204834, "logps/chosen": -4.734740257263184, "logps/rejected": -4.833100318908691, "loss": 3.6064, "rewards/accuracies": 0.75, "rewards/chosen": -47.34740447998047, "rewards/margins": 0.9836006164550781, "rewards/rejected": -48.33100128173828, "step": 6815 }, { "epoch": 0.9281045751633987, "grad_norm": 40.06299642849908, "learning_rate": 1.2532712347241226e-08, "logits/chosen": 14.565803527832031, "logits/rejected": 14.676458358764648, "logps/chosen": -4.334803581237793, "logps/rejected": -4.866718292236328, "loss": 4.2155, "rewards/accuracies": 0.75, "rewards/chosen": -43.34803771972656, "rewards/margins": 5.319147109985352, "rewards/rejected": -48.66718292236328, "step": 6816 }, { "epoch": 0.9282407407407407, "grad_norm": 36.458291900881385, "learning_rate": 1.2485533196074661e-08, "logits/chosen": 14.425840377807617, "logits/rejected": 15.171649932861328, "logps/chosen": -4.69339656829834, "logps/rejected": -4.955080032348633, "loss": 3.6313, "rewards/accuracies": 0.75, "rewards/chosen": -46.93396759033203, "rewards/margins": 2.616830825805664, "rewards/rejected": -49.55080032348633, "step": 6817 }, { "epoch": 0.9283769063180828, "grad_norm": 35.65017128575371, "learning_rate": 1.2438441607005046e-08, "logits/chosen": 14.41445541381836, "logits/rejected": 14.839303970336914, "logps/chosen": -4.4054999351501465, "logps/rejected": -4.4717888832092285, "loss": 3.6186, "rewards/accuracies": 0.75, "rewards/chosen": -44.05500030517578, "rewards/margins": 0.6628856658935547, "rewards/rejected": -44.71788787841797, "step": 6818 }, { "epoch": 0.9285130718954249, "grad_norm": 40.081739119075316, "learning_rate": 1.2391437590673116e-08, "logits/chosen": 14.21588134765625, "logits/rejected": 14.78796672821045, "logps/chosen": -4.326898097991943, "logps/rejected": -4.562078952789307, "loss": 3.7584, "rewards/accuracies": 0.75, "rewards/chosen": -43.26898193359375, "rewards/margins": 2.351808547973633, "rewards/rejected": -45.62078857421875, "step": 6819 }, { "epoch": 0.9286492374727668, "grad_norm": 41.97175833065531, "learning_rate": 1.2344521157699972e-08, "logits/chosen": 14.726675033569336, "logits/rejected": 14.912668228149414, "logps/chosen": -4.524930000305176, "logps/rejected": -4.677855491638184, "loss": 4.1255, "rewards/accuracies": 0.5, "rewards/chosen": -45.249298095703125, "rewards/margins": 1.5292558670043945, "rewards/rejected": -46.77855682373047, "step": 6820 }, { "epoch": 0.9287854030501089, "grad_norm": 40.457075927634435, "learning_rate": 1.2297692318686604e-08, "logits/chosen": 14.555328369140625, "logits/rejected": 15.618692398071289, "logps/chosen": -4.5323486328125, "logps/rejected": -4.991239547729492, "loss": 3.6162, "rewards/accuracies": 1.0, "rewards/chosen": -45.323486328125, "rewards/margins": 4.588907241821289, "rewards/rejected": -49.91239547729492, "step": 6821 }, { "epoch": 0.928921568627451, "grad_norm": 41.573646581920606, "learning_rate": 1.2250951084214412e-08, "logits/chosen": 13.737460136413574, "logits/rejected": 14.702133178710938, "logps/chosen": -4.559569358825684, "logps/rejected": -4.785401821136475, "loss": 3.8975, "rewards/accuracies": 0.75, "rewards/chosen": -45.59569549560547, "rewards/margins": 2.2583255767822266, "rewards/rejected": -47.85401916503906, "step": 6822 }, { "epoch": 0.929057734204793, "grad_norm": 43.892600857341435, "learning_rate": 1.220429746484508e-08, "logits/chosen": 13.716935157775879, "logits/rejected": 14.186607360839844, "logps/chosen": -4.110381126403809, "logps/rejected": -4.578065872192383, "loss": 3.8907, "rewards/accuracies": 1.0, "rewards/chosen": -41.10380935668945, "rewards/margins": 4.676851272583008, "rewards/rejected": -45.78065872192383, "step": 6823 }, { "epoch": 0.9291938997821351, "grad_norm": 42.64097055834925, "learning_rate": 1.2157731471120181e-08, "logits/chosen": 14.365606307983398, "logits/rejected": 14.91728401184082, "logps/chosen": -4.9231414794921875, "logps/rejected": -5.122694969177246, "loss": 4.4036, "rewards/accuracies": 0.75, "rewards/chosen": -49.231414794921875, "rewards/margins": 1.9955310821533203, "rewards/rejected": -51.22694396972656, "step": 6824 }, { "epoch": 0.9293300653594772, "grad_norm": 41.36030071548112, "learning_rate": 1.2111253113561826e-08, "logits/chosen": 14.83449935913086, "logits/rejected": 14.242382049560547, "logps/chosen": -4.664826393127441, "logps/rejected": -4.562865734100342, "loss": 4.2336, "rewards/accuracies": 0.5, "rewards/chosen": -46.64826583862305, "rewards/margins": -1.019606590270996, "rewards/rejected": -45.62865447998047, "step": 6825 }, { "epoch": 0.9294662309368191, "grad_norm": 37.45741475535354, "learning_rate": 1.2064862402672194e-08, "logits/chosen": 14.554238319396973, "logits/rejected": 14.713998794555664, "logps/chosen": -4.584011077880859, "logps/rejected": -4.800250053405762, "loss": 3.4924, "rewards/accuracies": 0.75, "rewards/chosen": -45.840110778808594, "rewards/margins": 2.162385940551758, "rewards/rejected": -48.00250244140625, "step": 6826 }, { "epoch": 0.9296023965141612, "grad_norm": 39.94586860813386, "learning_rate": 1.2018559348933565e-08, "logits/chosen": 14.075782775878906, "logits/rejected": 14.812715530395508, "logps/chosen": -4.540431022644043, "logps/rejected": -4.782310962677002, "loss": 3.5351, "rewards/accuracies": 0.75, "rewards/chosen": -45.40430450439453, "rewards/margins": 2.4188032150268555, "rewards/rejected": -47.8231086730957, "step": 6827 }, { "epoch": 0.9297385620915033, "grad_norm": 38.10070523378203, "learning_rate": 1.197234396280855e-08, "logits/chosen": 14.080676078796387, "logits/rejected": 13.567907333374023, "logps/chosen": -4.485445022583008, "logps/rejected": -4.228311538696289, "loss": 3.6669, "rewards/accuracies": 0.25, "rewards/chosen": -44.85444641113281, "rewards/margins": -2.571333885192871, "rewards/rejected": -42.28311538696289, "step": 6828 }, { "epoch": 0.9298747276688453, "grad_norm": 41.18926533240061, "learning_rate": 1.1926216254739908e-08, "logits/chosen": 14.510300636291504, "logits/rejected": 14.759552001953125, "logps/chosen": -4.89776611328125, "logps/rejected": -4.909085750579834, "loss": 3.895, "rewards/accuracies": 0.5, "rewards/chosen": -48.977664947509766, "rewards/margins": 0.11319255828857422, "rewards/rejected": -49.090858459472656, "step": 6829 }, { "epoch": 0.9300108932461874, "grad_norm": 42.400996010588486, "learning_rate": 1.1880176235150542e-08, "logits/chosen": 14.397014617919922, "logits/rejected": 14.00263786315918, "logps/chosen": -4.549559116363525, "logps/rejected": -4.590410232543945, "loss": 4.0601, "rewards/accuracies": 0.5, "rewards/chosen": -45.4955940246582, "rewards/margins": 0.4085092544555664, "rewards/rejected": -45.90409851074219, "step": 6830 }, { "epoch": 0.9301470588235294, "grad_norm": 40.979328642330515, "learning_rate": 1.1834223914443553e-08, "logits/chosen": 13.974349975585938, "logits/rejected": 14.098871231079102, "logps/chosen": -4.278643608093262, "logps/rejected": -4.538090705871582, "loss": 3.5911, "rewards/accuracies": 1.0, "rewards/chosen": -42.78643798828125, "rewards/margins": 2.5944719314575195, "rewards/rejected": -45.38091278076172, "step": 6831 }, { "epoch": 0.9302832244008714, "grad_norm": 41.13712815543657, "learning_rate": 1.1788359303002326e-08, "logits/chosen": 14.783952713012695, "logits/rejected": 14.26051139831543, "logps/chosen": -4.441555976867676, "logps/rejected": -4.502623081207275, "loss": 3.9743, "rewards/accuracies": 0.5, "rewards/chosen": -44.415557861328125, "rewards/margins": 0.6106719970703125, "rewards/rejected": -45.02622985839844, "step": 6832 }, { "epoch": 0.9304193899782135, "grad_norm": 39.06143336238178, "learning_rate": 1.1742582411190305e-08, "logits/chosen": 14.26192569732666, "logits/rejected": 13.999765396118164, "logps/chosen": -4.18541955947876, "logps/rejected": -4.43023157119751, "loss": 3.6408, "rewards/accuracies": 0.75, "rewards/chosen": -41.85419464111328, "rewards/margins": 2.448122024536133, "rewards/rejected": -44.30231475830078, "step": 6833 }, { "epoch": 0.9305555555555556, "grad_norm": 38.73824725664045, "learning_rate": 1.1696893249351125e-08, "logits/chosen": 15.244970321655273, "logits/rejected": 15.327077865600586, "logps/chosen": -5.001689910888672, "logps/rejected": -4.910971164703369, "loss": 3.5926, "rewards/accuracies": 0.25, "rewards/chosen": -50.01689910888672, "rewards/margins": -0.9071884155273438, "rewards/rejected": -49.109710693359375, "step": 6834 }, { "epoch": 0.9306917211328976, "grad_norm": 44.41020799229731, "learning_rate": 1.1651291827808662e-08, "logits/chosen": 14.330766677856445, "logits/rejected": 14.091971397399902, "logps/chosen": -4.294726371765137, "logps/rejected": -4.37161922454834, "loss": 4.1346, "rewards/accuracies": 0.75, "rewards/chosen": -42.947265625, "rewards/margins": 0.7689275741577148, "rewards/rejected": -43.71619415283203, "step": 6835 }, { "epoch": 0.9308278867102396, "grad_norm": 41.84912172275183, "learning_rate": 1.1605778156866942e-08, "logits/chosen": 15.700007438659668, "logits/rejected": 15.233607292175293, "logps/chosen": -4.615893840789795, "logps/rejected": -5.018581390380859, "loss": 3.8774, "rewards/accuracies": 1.0, "rewards/chosen": -46.158939361572266, "rewards/margins": 4.0268754959106445, "rewards/rejected": -50.185813903808594, "step": 6836 }, { "epoch": 0.9309640522875817, "grad_norm": 39.256988293086984, "learning_rate": 1.1560352246810135e-08, "logits/chosen": 14.469047546386719, "logits/rejected": 14.988479614257812, "logps/chosen": -4.432897090911865, "logps/rejected": -4.482184410095215, "loss": 4.1711, "rewards/accuracies": 0.5, "rewards/chosen": -44.32897186279297, "rewards/margins": 0.4928712844848633, "rewards/rejected": -44.821842193603516, "step": 6837 }, { "epoch": 0.9311002178649237, "grad_norm": 43.29811092922785, "learning_rate": 1.1515014107902654e-08, "logits/chosen": 14.65271282196045, "logits/rejected": 14.963760375976562, "logps/chosen": -4.87800931930542, "logps/rejected": -4.886363983154297, "loss": 4.2091, "rewards/accuracies": 0.5, "rewards/chosen": -48.78009033203125, "rewards/margins": 0.08354377746582031, "rewards/rejected": -48.8636360168457, "step": 6838 }, { "epoch": 0.9312363834422658, "grad_norm": 36.553127737645355, "learning_rate": 1.1469763750388973e-08, "logits/chosen": 14.628763198852539, "logits/rejected": 15.130796432495117, "logps/chosen": -4.500397682189941, "logps/rejected": -4.984757900238037, "loss": 3.784, "rewards/accuracies": 1.0, "rewards/chosen": -45.00397491455078, "rewards/margins": 4.843601226806641, "rewards/rejected": -49.84757995605469, "step": 6839 }, { "epoch": 0.9313725490196079, "grad_norm": 37.44582580525542, "learning_rate": 1.1424601184493753e-08, "logits/chosen": 15.534065246582031, "logits/rejected": 15.369207382202148, "logps/chosen": -5.047945499420166, "logps/rejected": -4.8789472579956055, "loss": 3.7501, "rewards/accuracies": 0.0, "rewards/chosen": -50.479454040527344, "rewards/margins": -1.6899805068969727, "rewards/rejected": -48.78947448730469, "step": 6840 }, { "epoch": 0.9315087145969498, "grad_norm": 39.16814427409169, "learning_rate": 1.1379526420421947e-08, "logits/chosen": 13.748501777648926, "logits/rejected": 14.054055213928223, "logps/chosen": -4.627902030944824, "logps/rejected": -4.592700958251953, "loss": 4.2209, "rewards/accuracies": 0.5, "rewards/chosen": -46.279022216796875, "rewards/margins": -0.35200977325439453, "rewards/rejected": -45.9270133972168, "step": 6841 }, { "epoch": 0.9316448801742919, "grad_norm": 42.56664148742633, "learning_rate": 1.1334539468358473e-08, "logits/chosen": 14.608949661254883, "logits/rejected": 14.821258544921875, "logps/chosen": -4.5617218017578125, "logps/rejected": -4.478631973266602, "loss": 4.3146, "rewards/accuracies": 0.25, "rewards/chosen": -45.617218017578125, "rewards/margins": -0.8308925628662109, "rewards/rejected": -44.78632354736328, "step": 6842 }, { "epoch": 0.931781045751634, "grad_norm": 41.523174459088935, "learning_rate": 1.128964033846853e-08, "logits/chosen": 14.072593688964844, "logits/rejected": 15.077777862548828, "logps/chosen": -4.483545303344727, "logps/rejected": -4.79606819152832, "loss": 3.8555, "rewards/accuracies": 1.0, "rewards/chosen": -44.83545684814453, "rewards/margins": 3.125229835510254, "rewards/rejected": -47.96068572998047, "step": 6843 }, { "epoch": 0.931917211328976, "grad_norm": 44.129639722291394, "learning_rate": 1.1244829040897564e-08, "logits/chosen": 14.634329795837402, "logits/rejected": 14.433828353881836, "logps/chosen": -4.773890972137451, "logps/rejected": -4.788982391357422, "loss": 4.2856, "rewards/accuracies": 0.5, "rewards/chosen": -47.73890686035156, "rewards/margins": 0.15091609954833984, "rewards/rejected": -47.88982391357422, "step": 6844 }, { "epoch": 0.9320533769063181, "grad_norm": 40.85087443661294, "learning_rate": 1.1200105585770847e-08, "logits/chosen": 15.09307861328125, "logits/rejected": 15.009323120117188, "logps/chosen": -4.64769983291626, "logps/rejected": -4.835145473480225, "loss": 4.2182, "rewards/accuracies": 0.75, "rewards/chosen": -46.47699737548828, "rewards/margins": 1.8744564056396484, "rewards/rejected": -48.35145568847656, "step": 6845 }, { "epoch": 0.9321895424836601, "grad_norm": 42.01019705144879, "learning_rate": 1.1155469983194165e-08, "logits/chosen": 14.733348846435547, "logits/rejected": 15.299627304077148, "logps/chosen": -4.322334289550781, "logps/rejected": -4.977728366851807, "loss": 4.1611, "rewards/accuracies": 1.0, "rewards/chosen": -43.22333908081055, "rewards/margins": 6.553940773010254, "rewards/rejected": -49.77728271484375, "step": 6846 }, { "epoch": 0.9323257080610022, "grad_norm": 40.92972057838761, "learning_rate": 1.1110922243253318e-08, "logits/chosen": 13.88548755645752, "logits/rejected": 14.62586784362793, "logps/chosen": -4.2348432540893555, "logps/rejected": -4.602748870849609, "loss": 4.4273, "rewards/accuracies": 1.0, "rewards/chosen": -42.348426818847656, "rewards/margins": 3.6790590286254883, "rewards/rejected": -46.027488708496094, "step": 6847 }, { "epoch": 0.9324618736383442, "grad_norm": 39.56566183855741, "learning_rate": 1.1066462376014118e-08, "logits/chosen": 14.558486938476562, "logits/rejected": 15.356146812438965, "logps/chosen": -4.627528190612793, "logps/rejected": -4.970620155334473, "loss": 3.9024, "rewards/accuracies": 1.0, "rewards/chosen": -46.27528381347656, "rewards/margins": 3.430924415588379, "rewards/rejected": -49.706207275390625, "step": 6848 }, { "epoch": 0.9325980392156863, "grad_norm": 40.53586028797964, "learning_rate": 1.1022090391522709e-08, "logits/chosen": 14.52226734161377, "logits/rejected": 14.142602920532227, "logps/chosen": -4.749634265899658, "logps/rejected": -4.853720664978027, "loss": 4.112, "rewards/accuracies": 0.75, "rewards/chosen": -47.496341705322266, "rewards/margins": 1.0408601760864258, "rewards/rejected": -48.53720474243164, "step": 6849 }, { "epoch": 0.9327342047930284, "grad_norm": 38.828003164877025, "learning_rate": 1.0977806299805292e-08, "logits/chosen": 14.839635848999023, "logits/rejected": 15.320211410522461, "logps/chosen": -4.534858226776123, "logps/rejected": -4.90861701965332, "loss": 3.839, "rewards/accuracies": 1.0, "rewards/chosen": -45.34857940673828, "rewards/margins": 3.7375850677490234, "rewards/rejected": -49.08616638183594, "step": 6850 }, { "epoch": 0.9328703703703703, "grad_norm": 43.74687207518943, "learning_rate": 1.0933610110868263e-08, "logits/chosen": 14.527170181274414, "logits/rejected": 15.682208061218262, "logps/chosen": -4.516470909118652, "logps/rejected": -4.822061538696289, "loss": 4.305, "rewards/accuracies": 0.75, "rewards/chosen": -45.16470718383789, "rewards/margins": 3.0559072494506836, "rewards/rejected": -48.22061538696289, "step": 6851 }, { "epoch": 0.9330065359477124, "grad_norm": 41.21276365900151, "learning_rate": 1.0889501834698033e-08, "logits/chosen": 14.835836410522461, "logits/rejected": 15.126538276672363, "logps/chosen": -4.827478408813477, "logps/rejected": -4.9181365966796875, "loss": 3.6906, "rewards/accuracies": 0.5, "rewards/chosen": -48.274784088134766, "rewards/margins": 0.9065828323364258, "rewards/rejected": -49.181365966796875, "step": 6852 }, { "epoch": 0.9331427015250545, "grad_norm": 40.21806461668645, "learning_rate": 1.0845481481261343e-08, "logits/chosen": 14.5745210647583, "logits/rejected": 14.43498420715332, "logps/chosen": -4.598900318145752, "logps/rejected": -4.542566299438477, "loss": 3.8048, "rewards/accuracies": 0.5, "rewards/chosen": -45.98899841308594, "rewards/margins": -0.5633363723754883, "rewards/rejected": -45.42566680908203, "step": 6853 }, { "epoch": 0.9332788671023965, "grad_norm": 39.492703221600635, "learning_rate": 1.0801549060504855e-08, "logits/chosen": 13.517570495605469, "logits/rejected": 14.888895988464355, "logps/chosen": -4.428736686706543, "logps/rejected": -4.649085998535156, "loss": 3.9898, "rewards/accuracies": 0.75, "rewards/chosen": -44.28736877441406, "rewards/margins": 2.2034921646118164, "rewards/rejected": -46.49085998535156, "step": 6854 }, { "epoch": 0.9334150326797386, "grad_norm": 40.75917455937545, "learning_rate": 1.075770458235552e-08, "logits/chosen": 15.163509368896484, "logits/rejected": 15.090042114257812, "logps/chosen": -4.745706081390381, "logps/rejected": -4.78858757019043, "loss": 3.776, "rewards/accuracies": 0.25, "rewards/chosen": -47.457061767578125, "rewards/margins": 0.42881011962890625, "rewards/rejected": -47.88587188720703, "step": 6855 }, { "epoch": 0.9335511982570807, "grad_norm": 41.7201755276684, "learning_rate": 1.071394805672039e-08, "logits/chosen": 13.982696533203125, "logits/rejected": 14.632956504821777, "logps/chosen": -4.497529983520508, "logps/rejected": -4.656728744506836, "loss": 4.2207, "rewards/accuracies": 0.5, "rewards/chosen": -44.975303649902344, "rewards/margins": 1.5919904708862305, "rewards/rejected": -46.567291259765625, "step": 6856 }, { "epoch": 0.9336873638344226, "grad_norm": 40.7563558965362, "learning_rate": 1.0670279493486489e-08, "logits/chosen": 14.843568801879883, "logits/rejected": 14.896537780761719, "logps/chosen": -4.812166213989258, "logps/rejected": -4.703956604003906, "loss": 3.4942, "rewards/accuracies": 0.25, "rewards/chosen": -48.12165832519531, "rewards/margins": -1.0820951461791992, "rewards/rejected": -47.03956604003906, "step": 6857 }, { "epoch": 0.9338235294117647, "grad_norm": 52.32982750836146, "learning_rate": 1.062669890252117e-08, "logits/chosen": 14.45628833770752, "logits/rejected": 14.322050094604492, "logps/chosen": -4.938828468322754, "logps/rejected": -4.72407341003418, "loss": 4.1424, "rewards/accuracies": 0.0, "rewards/chosen": -49.388282775878906, "rewards/margins": -2.1475486755371094, "rewards/rejected": -47.2407341003418, "step": 6858 }, { "epoch": 0.9339596949891068, "grad_norm": 41.457570502834905, "learning_rate": 1.0583206293671887e-08, "logits/chosen": 14.202119827270508, "logits/rejected": 14.68612003326416, "logps/chosen": -4.432084083557129, "logps/rejected": -4.5424485206604, "loss": 3.9613, "rewards/accuracies": 0.75, "rewards/chosen": -44.32084274291992, "rewards/margins": 1.1036434173583984, "rewards/rejected": -45.42448425292969, "step": 6859 }, { "epoch": 0.9340958605664488, "grad_norm": 38.57756564802796, "learning_rate": 1.0539801676766068e-08, "logits/chosen": 14.267328262329102, "logits/rejected": 14.737190246582031, "logps/chosen": -4.618790626525879, "logps/rejected": -4.649038791656494, "loss": 3.9276, "rewards/accuracies": 0.25, "rewards/chosen": -46.18791198730469, "rewards/margins": 0.3024787902832031, "rewards/rejected": -46.490386962890625, "step": 6860 }, { "epoch": 0.9342320261437909, "grad_norm": 41.09436854828169, "learning_rate": 1.0496485061611338e-08, "logits/chosen": 13.825956344604492, "logits/rejected": 14.420763969421387, "logps/chosen": -4.529779434204102, "logps/rejected": -4.661599159240723, "loss": 3.9451, "rewards/accuracies": 0.5, "rewards/chosen": -45.297794342041016, "rewards/margins": 1.318202018737793, "rewards/rejected": -46.615997314453125, "step": 6861 }, { "epoch": 0.934368191721133, "grad_norm": 39.4937547060977, "learning_rate": 1.0453256457995552e-08, "logits/chosen": 14.94363021850586, "logits/rejected": 14.992246627807617, "logps/chosen": -4.5556721687316895, "logps/rejected": -4.70506477355957, "loss": 3.6938, "rewards/accuracies": 0.75, "rewards/chosen": -45.556724548339844, "rewards/margins": 1.4939231872558594, "rewards/rejected": -47.05064392089844, "step": 6862 }, { "epoch": 0.9345043572984749, "grad_norm": 43.89550002515184, "learning_rate": 1.041011587568641e-08, "logits/chosen": 14.245702743530273, "logits/rejected": 14.539121627807617, "logps/chosen": -4.865324974060059, "logps/rejected": -4.849418640136719, "loss": 3.3592, "rewards/accuracies": 0.5, "rewards/chosen": -48.65324783325195, "rewards/margins": -0.15906238555908203, "rewards/rejected": -48.49418640136719, "step": 6863 }, { "epoch": 0.934640522875817, "grad_norm": 41.460755022979384, "learning_rate": 1.0367063324432023e-08, "logits/chosen": 14.601072311401367, "logits/rejected": 14.997739791870117, "logps/chosen": -4.887643337249756, "logps/rejected": -4.90895414352417, "loss": 4.0183, "rewards/accuracies": 0.5, "rewards/chosen": -48.876434326171875, "rewards/margins": 0.21310710906982422, "rewards/rejected": -49.089542388916016, "step": 6864 }, { "epoch": 0.9347766884531591, "grad_norm": 40.87121657011609, "learning_rate": 1.0324098813960435e-08, "logits/chosen": 14.073471069335938, "logits/rejected": 14.702106475830078, "logps/chosen": -4.455944538116455, "logps/rejected": -4.865817546844482, "loss": 3.9905, "rewards/accuracies": 0.75, "rewards/chosen": -44.5594482421875, "rewards/margins": 4.098729133605957, "rewards/rejected": -48.65817642211914, "step": 6865 }, { "epoch": 0.9349128540305011, "grad_norm": 39.31917196809496, "learning_rate": 1.0281222353979746e-08, "logits/chosen": 14.890850067138672, "logits/rejected": 15.12907600402832, "logps/chosen": -4.930551052093506, "logps/rejected": -4.8759870529174805, "loss": 3.6933, "rewards/accuracies": 0.5, "rewards/chosen": -49.305511474609375, "rewards/margins": -0.5456418991088867, "rewards/rejected": -48.759864807128906, "step": 6866 }, { "epoch": 0.9350490196078431, "grad_norm": 42.14154760460367, "learning_rate": 1.0238433954178338e-08, "logits/chosen": 13.60917854309082, "logits/rejected": 14.327705383300781, "logps/chosen": -4.232654571533203, "logps/rejected": -4.5109968185424805, "loss": 4.195, "rewards/accuracies": 0.75, "rewards/chosen": -42.326541900634766, "rewards/margins": 2.7834272384643555, "rewards/rejected": -45.10997009277344, "step": 6867 }, { "epoch": 0.9351851851851852, "grad_norm": 44.24100313730725, "learning_rate": 1.0195733624224611e-08, "logits/chosen": 14.716154098510742, "logits/rejected": 14.749054908752441, "logps/chosen": -4.675641059875488, "logps/rejected": -4.713588714599609, "loss": 3.99, "rewards/accuracies": 0.75, "rewards/chosen": -46.756412506103516, "rewards/margins": 0.37947559356689453, "rewards/rejected": -47.135887145996094, "step": 6868 }, { "epoch": 0.9353213507625272, "grad_norm": 42.31877511547064, "learning_rate": 1.0153121373766982e-08, "logits/chosen": 14.819331169128418, "logits/rejected": 14.957779884338379, "logps/chosen": -4.577436447143555, "logps/rejected": -4.745970249176025, "loss": 3.319, "rewards/accuracies": 0.5, "rewards/chosen": -45.77436065673828, "rewards/margins": 1.685342788696289, "rewards/rejected": -47.45970153808594, "step": 6869 }, { "epoch": 0.9354575163398693, "grad_norm": 42.029157110208864, "learning_rate": 1.0110597212434102e-08, "logits/chosen": 14.86233139038086, "logits/rejected": 14.909497261047363, "logps/chosen": -4.748946189880371, "logps/rejected": -4.882096290588379, "loss": 3.4418, "rewards/accuracies": 0.5, "rewards/chosen": -47.489463806152344, "rewards/margins": 1.3314990997314453, "rewards/rejected": -48.82096481323242, "step": 6870 }, { "epoch": 0.9355936819172114, "grad_norm": 43.33205232734833, "learning_rate": 1.0068161149834687e-08, "logits/chosen": 14.247238159179688, "logits/rejected": 14.467379570007324, "logps/chosen": -4.571260452270508, "logps/rejected": -4.827784538269043, "loss": 4.2528, "rewards/accuracies": 0.5, "rewards/chosen": -45.71260452270508, "rewards/margins": 2.565239906311035, "rewards/rejected": -48.2778434753418, "step": 6871 }, { "epoch": 0.9357298474945533, "grad_norm": 39.42692744372, "learning_rate": 1.002581319555742e-08, "logits/chosen": 14.904626846313477, "logits/rejected": 15.902175903320312, "logps/chosen": -4.7719926834106445, "logps/rejected": -5.223586082458496, "loss": 3.5688, "rewards/accuracies": 1.0, "rewards/chosen": -47.71992874145508, "rewards/margins": 4.515931129455566, "rewards/rejected": -52.23585891723633, "step": 6872 }, { "epoch": 0.9358660130718954, "grad_norm": 39.76992334626895, "learning_rate": 9.983553359171225e-09, "logits/chosen": 14.738008499145508, "logits/rejected": 15.29068660736084, "logps/chosen": -4.436900615692139, "logps/rejected": -4.948342800140381, "loss": 3.926, "rewards/accuracies": 1.0, "rewards/chosen": -44.36900329589844, "rewards/margins": 5.114419937133789, "rewards/rejected": -49.48342514038086, "step": 6873 }, { "epoch": 0.9360021786492375, "grad_norm": 39.52070287449965, "learning_rate": 9.941381650225089e-09, "logits/chosen": 13.859207153320312, "logits/rejected": 15.268813133239746, "logps/chosen": -4.796553611755371, "logps/rejected": -4.915415287017822, "loss": 4.1455, "rewards/accuracies": 0.5, "rewards/chosen": -47.96553421020508, "rewards/margins": 1.1886186599731445, "rewards/rejected": -49.154151916503906, "step": 6874 }, { "epoch": 0.9361383442265795, "grad_norm": 43.18985787421851, "learning_rate": 9.899298078247965e-09, "logits/chosen": 14.678940773010254, "logits/rejected": 14.73327922821045, "logps/chosen": -4.632227897644043, "logps/rejected": -4.5985798835754395, "loss": 4.2158, "rewards/accuracies": 0.5, "rewards/chosen": -46.32227325439453, "rewards/margins": -0.3364753723144531, "rewards/rejected": -45.985801696777344, "step": 6875 }, { "epoch": 0.9362745098039216, "grad_norm": 41.173566128894045, "learning_rate": 9.857302652749088e-09, "logits/chosen": 14.539851188659668, "logits/rejected": 15.635347366333008, "logps/chosen": -4.487613677978516, "logps/rejected": -4.889524459838867, "loss": 4.0316, "rewards/accuracies": 0.75, "rewards/chosen": -44.87614059448242, "rewards/margins": 4.019106864929199, "rewards/rejected": -48.89524841308594, "step": 6876 }, { "epoch": 0.9364106753812637, "grad_norm": 39.90506856278606, "learning_rate": 9.815395383217628e-09, "logits/chosen": 14.149438858032227, "logits/rejected": 14.797306060791016, "logps/chosen": -4.582365036010742, "logps/rejected": -4.822357654571533, "loss": 3.7856, "rewards/accuracies": 1.0, "rewards/chosen": -45.823646545410156, "rewards/margins": 2.399930953979492, "rewards/rejected": -48.22357940673828, "step": 6877 }, { "epoch": 0.9365468409586056, "grad_norm": 39.11841692489991, "learning_rate": 9.773576279122852e-09, "logits/chosen": 15.195962905883789, "logits/rejected": 15.212480545043945, "logps/chosen": -4.965100288391113, "logps/rejected": -4.74984073638916, "loss": 4.0783, "rewards/accuracies": 0.5, "rewards/chosen": -49.651004791259766, "rewards/margins": -2.1525964736938477, "rewards/rejected": -47.49840545654297, "step": 6878 }, { "epoch": 0.9366830065359477, "grad_norm": 37.6508107119134, "learning_rate": 9.73184534991418e-09, "logits/chosen": 14.915523529052734, "logits/rejected": 14.947142601013184, "logps/chosen": -4.826926231384277, "logps/rejected": -4.805975914001465, "loss": 3.6498, "rewards/accuracies": 0.5, "rewards/chosen": -48.269264221191406, "rewards/margins": -0.20950603485107422, "rewards/rejected": -48.05975341796875, "step": 6879 }, { "epoch": 0.9368191721132898, "grad_norm": 38.898550823220106, "learning_rate": 9.690202605021092e-09, "logits/chosen": 13.979569435119629, "logits/rejected": 14.16922378540039, "logps/chosen": -4.390753269195557, "logps/rejected": -4.661311149597168, "loss": 3.6019, "rewards/accuracies": 1.0, "rewards/chosen": -43.907535552978516, "rewards/margins": 2.7055788040161133, "rewards/rejected": -46.61311340332031, "step": 6880 }, { "epoch": 0.9369553376906318, "grad_norm": 43.657004659676836, "learning_rate": 9.648648053852993e-09, "logits/chosen": 14.50826358795166, "logits/rejected": 15.199434280395508, "logps/chosen": -4.706062316894531, "logps/rejected": -4.965090751647949, "loss": 4.4732, "rewards/accuracies": 0.75, "rewards/chosen": -47.06062316894531, "rewards/margins": 2.590287208557129, "rewards/rejected": -49.650909423828125, "step": 6881 }, { "epoch": 0.9370915032679739, "grad_norm": 38.51980775157875, "learning_rate": 9.607181705799527e-09, "logits/chosen": 14.682638168334961, "logits/rejected": 15.332616806030273, "logps/chosen": -4.772642135620117, "logps/rejected": -4.755458831787109, "loss": 3.8586, "rewards/accuracies": 0.5, "rewards/chosen": -47.72642135620117, "rewards/margins": -0.17183780670166016, "rewards/rejected": -47.55458450317383, "step": 6882 }, { "epoch": 0.9372276688453159, "grad_norm": 37.063456524861024, "learning_rate": 9.565803570230446e-09, "logits/chosen": 14.30976676940918, "logits/rejected": 14.555639266967773, "logps/chosen": -4.377120018005371, "logps/rejected": -4.906467437744141, "loss": 3.537, "rewards/accuracies": 1.0, "rewards/chosen": -43.77119445800781, "rewards/margins": 5.293478965759277, "rewards/rejected": -49.064674377441406, "step": 6883 }, { "epoch": 0.9373638344226579, "grad_norm": 39.88101525630742, "learning_rate": 9.524513656495337e-09, "logits/chosen": 15.007865905761719, "logits/rejected": 14.934394836425781, "logps/chosen": -4.6327080726623535, "logps/rejected": -4.835899829864502, "loss": 3.9246, "rewards/accuracies": 0.75, "rewards/chosen": -46.32707977294922, "rewards/margins": 2.0319175720214844, "rewards/rejected": -48.3589973449707, "step": 6884 }, { "epoch": 0.9375, "grad_norm": 42.05612622029778, "learning_rate": 9.483311973924114e-09, "logits/chosen": 14.324881553649902, "logits/rejected": 14.839325904846191, "logps/chosen": -4.2361626625061035, "logps/rejected": -4.514591217041016, "loss": 4.1067, "rewards/accuracies": 1.0, "rewards/chosen": -42.36162567138672, "rewards/margins": 2.7842845916748047, "rewards/rejected": -45.145912170410156, "step": 6885 }, { "epoch": 0.9376361655773421, "grad_norm": 40.00961947227102, "learning_rate": 9.442198531826573e-09, "logits/chosen": 15.255123138427734, "logits/rejected": 13.890420913696289, "logps/chosen": -4.828721046447754, "logps/rejected": -4.477724552154541, "loss": 4.2041, "rewards/accuracies": 0.25, "rewards/chosen": -48.28721237182617, "rewards/margins": -3.5099668502807617, "rewards/rejected": -44.777244567871094, "step": 6886 }, { "epoch": 0.9377723311546841, "grad_norm": 39.53767963203046, "learning_rate": 9.401173339492708e-09, "logits/chosen": 14.826770782470703, "logits/rejected": 14.15667724609375, "logps/chosen": -4.530658721923828, "logps/rejected": -4.515419960021973, "loss": 4.2289, "rewards/accuracies": 0.25, "rewards/chosen": -45.30658721923828, "rewards/margins": -0.15238475799560547, "rewards/rejected": -45.15420150756836, "step": 6887 }, { "epoch": 0.9379084967320261, "grad_norm": 54.357996084271356, "learning_rate": 9.36023640619239e-09, "logits/chosen": 14.397339820861816, "logits/rejected": 14.83056926727295, "logps/chosen": -4.340290069580078, "logps/rejected": -4.872260570526123, "loss": 3.4944, "rewards/accuracies": 1.0, "rewards/chosen": -43.40290069580078, "rewards/margins": 5.319705009460449, "rewards/rejected": -48.72260284423828, "step": 6888 }, { "epoch": 0.9380446623093682, "grad_norm": 39.57236681041982, "learning_rate": 9.319387741175688e-09, "logits/chosen": 14.459342956542969, "logits/rejected": 14.795378684997559, "logps/chosen": -4.547328948974609, "logps/rejected": -4.929496765136719, "loss": 3.5187, "rewards/accuracies": 0.75, "rewards/chosen": -45.473289489746094, "rewards/margins": 3.8216753005981445, "rewards/rejected": -49.29496765136719, "step": 6889 }, { "epoch": 0.9381808278867102, "grad_norm": 38.89806986109652, "learning_rate": 9.278627353672819e-09, "logits/chosen": 14.874979019165039, "logits/rejected": 14.65833854675293, "logps/chosen": -4.826542854309082, "logps/rejected": -4.766773223876953, "loss": 3.8701, "rewards/accuracies": 0.25, "rewards/chosen": -48.26543045043945, "rewards/margins": -0.5977001190185547, "rewards/rejected": -47.66773223876953, "step": 6890 }, { "epoch": 0.9383169934640523, "grad_norm": 39.66488289243978, "learning_rate": 9.237955252893792e-09, "logits/chosen": 14.63463020324707, "logits/rejected": 14.765775680541992, "logps/chosen": -4.568607807159424, "logps/rejected": -4.963500022888184, "loss": 3.5996, "rewards/accuracies": 0.75, "rewards/chosen": -45.68608093261719, "rewards/margins": 3.9489212036132812, "rewards/rejected": -49.63500213623047, "step": 6891 }, { "epoch": 0.9384531590413944, "grad_norm": 45.25933339199785, "learning_rate": 9.197371448028812e-09, "logits/chosen": 14.656968116760254, "logits/rejected": 14.70706844329834, "logps/chosen": -4.593576431274414, "logps/rejected": -4.605708122253418, "loss": 4.2166, "rewards/accuracies": 0.5, "rewards/chosen": -45.935760498046875, "rewards/margins": 0.12132072448730469, "rewards/rejected": -46.05708312988281, "step": 6892 }, { "epoch": 0.9385893246187363, "grad_norm": 38.78393784201497, "learning_rate": 9.156875948248188e-09, "logits/chosen": 14.699603080749512, "logits/rejected": 15.410831451416016, "logps/chosen": -4.447998523712158, "logps/rejected": -4.996033668518066, "loss": 4.3322, "rewards/accuracies": 1.0, "rewards/chosen": -44.47998046875, "rewards/margins": 5.480353355407715, "rewards/rejected": -49.96033477783203, "step": 6893 }, { "epoch": 0.9387254901960784, "grad_norm": 44.30811146553271, "learning_rate": 9.1164687627022e-09, "logits/chosen": 13.75379753112793, "logits/rejected": 14.198243141174316, "logps/chosen": -4.693772315979004, "logps/rejected": -4.774267196655273, "loss": 4.4574, "rewards/accuracies": 0.5, "rewards/chosen": -46.937721252441406, "rewards/margins": 0.8049497604370117, "rewards/rejected": -47.74266815185547, "step": 6894 }, { "epoch": 0.9388616557734205, "grad_norm": 44.56932999133255, "learning_rate": 9.076149900521191e-09, "logits/chosen": 14.77519416809082, "logits/rejected": 15.098512649536133, "logps/chosen": -4.727291107177734, "logps/rejected": -4.8657121658325195, "loss": 4.4411, "rewards/accuracies": 0.5, "rewards/chosen": -47.27290725708008, "rewards/margins": 1.3842144012451172, "rewards/rejected": -48.65711975097656, "step": 6895 }, { "epoch": 0.9389978213507625, "grad_norm": 38.96768410170039, "learning_rate": 9.03591937081547e-09, "logits/chosen": 14.33939266204834, "logits/rejected": 14.601495742797852, "logps/chosen": -4.759091377258301, "logps/rejected": -4.858890056610107, "loss": 3.9379, "rewards/accuracies": 0.5, "rewards/chosen": -47.59091567993164, "rewards/margins": 0.9979848861694336, "rewards/rejected": -48.58890151977539, "step": 6896 }, { "epoch": 0.9391339869281046, "grad_norm": 38.597559791509546, "learning_rate": 8.995777182675546e-09, "logits/chosen": 13.890559196472168, "logits/rejected": 14.48796272277832, "logps/chosen": -4.641929626464844, "logps/rejected": -4.625478744506836, "loss": 4.0105, "rewards/accuracies": 0.5, "rewards/chosen": -46.41929626464844, "rewards/margins": -0.16451168060302734, "rewards/rejected": -46.254783630371094, "step": 6897 }, { "epoch": 0.9392701525054467, "grad_norm": 41.30917940960229, "learning_rate": 8.955723345171806e-09, "logits/chosen": 15.449750900268555, "logits/rejected": 15.10751724243164, "logps/chosen": -4.8381028175354, "logps/rejected": -4.806998252868652, "loss": 4.2948, "rewards/accuracies": 0.5, "rewards/chosen": -48.38102722167969, "rewards/margins": -0.3110494613647461, "rewards/rejected": -48.069976806640625, "step": 6898 }, { "epoch": 0.9394063180827886, "grad_norm": 41.93046755916974, "learning_rate": 8.91575786735479e-09, "logits/chosen": 15.2882661819458, "logits/rejected": 15.361985206604004, "logps/chosen": -4.519807815551758, "logps/rejected": -4.6652021408081055, "loss": 3.8306, "rewards/accuracies": 1.0, "rewards/chosen": -45.198081970214844, "rewards/margins": 1.4539413452148438, "rewards/rejected": -46.65202331542969, "step": 6899 }, { "epoch": 0.9395424836601307, "grad_norm": 35.44034810878578, "learning_rate": 8.87588075825505e-09, "logits/chosen": 14.650322914123535, "logits/rejected": 14.890657424926758, "logps/chosen": -4.374067306518555, "logps/rejected": -4.43040657043457, "loss": 3.3486, "rewards/accuracies": 0.5, "rewards/chosen": -43.74067687988281, "rewards/margins": 0.5633916854858398, "rewards/rejected": -44.3040657043457, "step": 6900 }, { "epoch": 0.9396786492374728, "grad_norm": 39.89739680540414, "learning_rate": 8.836092026883114e-09, "logits/chosen": 14.148574829101562, "logits/rejected": 14.655221939086914, "logps/chosen": -4.467277526855469, "logps/rejected": -4.381965160369873, "loss": 4.1508, "rewards/accuracies": 0.25, "rewards/chosen": -44.67277526855469, "rewards/margins": -0.8531284332275391, "rewards/rejected": -43.81964874267578, "step": 6901 }, { "epoch": 0.9398148148148148, "grad_norm": 39.05740225228569, "learning_rate": 8.796391682229565e-09, "logits/chosen": 14.89448070526123, "logits/rejected": 14.808318138122559, "logps/chosen": -4.720273971557617, "logps/rejected": -4.827170372009277, "loss": 3.7937, "rewards/accuracies": 0.5, "rewards/chosen": -47.202735900878906, "rewards/margins": 1.0689640045166016, "rewards/rejected": -48.27170181274414, "step": 6902 }, { "epoch": 0.9399509803921569, "grad_norm": 43.06043551521991, "learning_rate": 8.756779733265007e-09, "logits/chosen": 15.120622634887695, "logits/rejected": 14.720327377319336, "logps/chosen": -4.752023696899414, "logps/rejected": -4.849357604980469, "loss": 3.9204, "rewards/accuracies": 0.75, "rewards/chosen": -47.52023696899414, "rewards/margins": 0.9733333587646484, "rewards/rejected": -48.493568420410156, "step": 6903 }, { "epoch": 0.9400871459694989, "grad_norm": 40.17280991954435, "learning_rate": 8.717256188940147e-09, "logits/chosen": 14.712750434875488, "logits/rejected": 14.335649490356445, "logps/chosen": -4.557977676391602, "logps/rejected": -4.632574558258057, "loss": 3.591, "rewards/accuracies": 0.75, "rewards/chosen": -45.579776763916016, "rewards/margins": 0.7459688186645508, "rewards/rejected": -46.32574462890625, "step": 6904 }, { "epoch": 0.9402233115468409, "grad_norm": 37.96884045172625, "learning_rate": 8.677821058185619e-09, "logits/chosen": 14.70233154296875, "logits/rejected": 15.362321853637695, "logps/chosen": -4.799190998077393, "logps/rejected": -5.036003112792969, "loss": 3.482, "rewards/accuracies": 0.75, "rewards/chosen": -47.99190902709961, "rewards/margins": 2.3681230545043945, "rewards/rejected": -50.36003112792969, "step": 6905 }, { "epoch": 0.940359477124183, "grad_norm": 38.91359152945212, "learning_rate": 8.638474349912118e-09, "logits/chosen": 13.526691436767578, "logits/rejected": 14.820650100708008, "logps/chosen": -4.563789367675781, "logps/rejected": -4.983051300048828, "loss": 3.7771, "rewards/accuracies": 0.75, "rewards/chosen": -45.63789749145508, "rewards/margins": 4.192618370056152, "rewards/rejected": -49.83051300048828, "step": 6906 }, { "epoch": 0.9404956427015251, "grad_norm": 37.86797265714078, "learning_rate": 8.59921607301044e-09, "logits/chosen": 14.387469291687012, "logits/rejected": 13.759744644165039, "logps/chosen": -4.599419593811035, "logps/rejected": -4.435003280639648, "loss": 4.0964, "rewards/accuracies": 0.5, "rewards/chosen": -45.99419403076172, "rewards/margins": -1.6441593170166016, "rewards/rejected": -44.35003662109375, "step": 6907 }, { "epoch": 0.940631808278867, "grad_norm": 41.67839547553079, "learning_rate": 8.560046236351137e-09, "logits/chosen": 14.398164749145508, "logits/rejected": 14.771703720092773, "logps/chosen": -4.663217544555664, "logps/rejected": -4.636135101318359, "loss": 4.3209, "rewards/accuracies": 0.25, "rewards/chosen": -46.632171630859375, "rewards/margins": -0.27082252502441406, "rewards/rejected": -46.361351013183594, "step": 6908 }, { "epoch": 0.9407679738562091, "grad_norm": 43.61883298180592, "learning_rate": 8.520964848785084e-09, "logits/chosen": 14.865646362304688, "logits/rejected": 14.658811569213867, "logps/chosen": -4.762360572814941, "logps/rejected": -4.665645122528076, "loss": 3.8798, "rewards/accuracies": 0.5, "rewards/chosen": -47.62360382080078, "rewards/margins": -0.9671516418457031, "rewards/rejected": -46.656455993652344, "step": 6909 }, { "epoch": 0.9409041394335512, "grad_norm": 40.70838690964942, "learning_rate": 8.481971919143082e-09, "logits/chosen": 14.689809799194336, "logits/rejected": 15.20716667175293, "logps/chosen": -4.789660453796387, "logps/rejected": -4.995833396911621, "loss": 3.8898, "rewards/accuracies": 0.75, "rewards/chosen": -47.8966064453125, "rewards/margins": 2.0617246627807617, "rewards/rejected": -49.95832824707031, "step": 6910 }, { "epoch": 0.9410403050108932, "grad_norm": 40.784769431528844, "learning_rate": 8.44306745623582e-09, "logits/chosen": 13.556873321533203, "logits/rejected": 14.20589828491211, "logps/chosen": -4.500094413757324, "logps/rejected": -4.557616710662842, "loss": 3.9109, "rewards/accuracies": 0.5, "rewards/chosen": -45.000946044921875, "rewards/margins": 0.575221061706543, "rewards/rejected": -45.576168060302734, "step": 6911 }, { "epoch": 0.9411764705882353, "grad_norm": 45.45794249159479, "learning_rate": 8.404251468854085e-09, "logits/chosen": 14.705347061157227, "logits/rejected": 14.866937637329102, "logps/chosen": -4.647356986999512, "logps/rejected": -4.903313636779785, "loss": 4.3613, "rewards/accuracies": 0.75, "rewards/chosen": -46.47357177734375, "rewards/margins": 2.559565544128418, "rewards/rejected": -49.03313446044922, "step": 6912 }, { "epoch": 0.9413126361655774, "grad_norm": 40.58463019715252, "learning_rate": 8.365523965768728e-09, "logits/chosen": 14.410346984863281, "logits/rejected": 15.390170097351074, "logps/chosen": -4.811335563659668, "logps/rejected": -4.959157943725586, "loss": 3.7114, "rewards/accuracies": 0.75, "rewards/chosen": -48.11335754394531, "rewards/margins": 1.4782218933105469, "rewards/rejected": -49.59157943725586, "step": 6913 }, { "epoch": 0.9414488017429193, "grad_norm": 42.5815781052585, "learning_rate": 8.326884955730484e-09, "logits/chosen": 14.459924697875977, "logits/rejected": 14.981949806213379, "logps/chosen": -4.677626609802246, "logps/rejected": -4.643613815307617, "loss": 3.5379, "rewards/accuracies": 0.5, "rewards/chosen": -46.77626419067383, "rewards/margins": -0.34012794494628906, "rewards/rejected": -46.436134338378906, "step": 6914 }, { "epoch": 0.9415849673202614, "grad_norm": 36.904132533363644, "learning_rate": 8.288334447470147e-09, "logits/chosen": 14.00714111328125, "logits/rejected": 15.093069076538086, "logps/chosen": -4.422607898712158, "logps/rejected": -4.84562349319458, "loss": 3.8586, "rewards/accuracies": 0.75, "rewards/chosen": -44.22608184814453, "rewards/margins": 4.230157852172852, "rewards/rejected": -48.45623779296875, "step": 6915 }, { "epoch": 0.9417211328976035, "grad_norm": 42.264506498994365, "learning_rate": 8.249872449698659e-09, "logits/chosen": 13.923849105834961, "logits/rejected": 13.809802055358887, "logps/chosen": -4.40172004699707, "logps/rejected": -4.711606025695801, "loss": 4.0165, "rewards/accuracies": 0.75, "rewards/chosen": -44.0172004699707, "rewards/margins": 3.098860740661621, "rewards/rejected": -47.11606216430664, "step": 6916 }, { "epoch": 0.9418572984749455, "grad_norm": 53.65542181117593, "learning_rate": 8.211498971106667e-09, "logits/chosen": 14.322294235229492, "logits/rejected": 14.931089401245117, "logps/chosen": -4.742377281188965, "logps/rejected": -4.912883281707764, "loss": 3.9214, "rewards/accuracies": 0.75, "rewards/chosen": -47.423770904541016, "rewards/margins": 1.7050628662109375, "rewards/rejected": -49.12883377075195, "step": 6917 }, { "epoch": 0.9419934640522876, "grad_norm": 37.775159062524345, "learning_rate": 8.17321402036506e-09, "logits/chosen": 14.743130683898926, "logits/rejected": 15.207781791687012, "logps/chosen": -4.604076385498047, "logps/rejected": -5.1226701736450195, "loss": 3.4919, "rewards/accuracies": 0.75, "rewards/chosen": -46.04076385498047, "rewards/margins": 5.185937881469727, "rewards/rejected": -51.22669982910156, "step": 6918 }, { "epoch": 0.9421296296296297, "grad_norm": 40.708360153336045, "learning_rate": 8.135017606124606e-09, "logits/chosen": 14.765143394470215, "logits/rejected": 14.519890785217285, "logps/chosen": -4.766974925994873, "logps/rejected": -4.575962543487549, "loss": 3.5878, "rewards/accuracies": 0.5, "rewards/chosen": -47.66975021362305, "rewards/margins": -1.910120964050293, "rewards/rejected": -45.75962829589844, "step": 6919 }, { "epoch": 0.9422657952069716, "grad_norm": 38.58749530012595, "learning_rate": 8.096909737016133e-09, "logits/chosen": 14.546841621398926, "logits/rejected": 14.956276893615723, "logps/chosen": -4.6284332275390625, "logps/rejected": -4.75670051574707, "loss": 3.9779, "rewards/accuracies": 0.5, "rewards/chosen": -46.284332275390625, "rewards/margins": 1.282668113708496, "rewards/rejected": -47.56700134277344, "step": 6920 }, { "epoch": 0.9424019607843137, "grad_norm": 39.84052154309118, "learning_rate": 8.058890421650355e-09, "logits/chosen": 15.257102966308594, "logits/rejected": 15.644966125488281, "logps/chosen": -5.117158889770508, "logps/rejected": -5.280501365661621, "loss": 4.3141, "rewards/accuracies": 0.5, "rewards/chosen": -51.171592712402344, "rewards/margins": 1.6334257125854492, "rewards/rejected": -52.805015563964844, "step": 6921 }, { "epoch": 0.9425381263616558, "grad_norm": 42.230781698820074, "learning_rate": 8.020959668618177e-09, "logits/chosen": 13.918365478515625, "logits/rejected": 14.578519821166992, "logps/chosen": -4.675524711608887, "logps/rejected": -4.7821879386901855, "loss": 3.5821, "rewards/accuracies": 0.75, "rewards/chosen": -46.755245208740234, "rewards/margins": 1.0666351318359375, "rewards/rejected": -47.82188034057617, "step": 6922 }, { "epoch": 0.9426742919389978, "grad_norm": 39.9011346395048, "learning_rate": 7.983117486490253e-09, "logits/chosen": 14.047148704528809, "logits/rejected": 14.241689682006836, "logps/chosen": -4.8063859939575195, "logps/rejected": -4.9870147705078125, "loss": 3.6974, "rewards/accuracies": 0.75, "rewards/chosen": -48.06385803222656, "rewards/margins": 1.8062934875488281, "rewards/rejected": -49.87015151977539, "step": 6923 }, { "epoch": 0.9428104575163399, "grad_norm": 40.50845919182845, "learning_rate": 7.94536388381739e-09, "logits/chosen": 14.502426147460938, "logits/rejected": 14.40955638885498, "logps/chosen": -4.640548229217529, "logps/rejected": -4.626596450805664, "loss": 4.3963, "rewards/accuracies": 0.75, "rewards/chosen": -46.40548324584961, "rewards/margins": -0.13951969146728516, "rewards/rejected": -46.26596450805664, "step": 6924 }, { "epoch": 0.9429466230936819, "grad_norm": 40.684115126532745, "learning_rate": 7.90769886913032e-09, "logits/chosen": 14.540708541870117, "logits/rejected": 14.824211120605469, "logps/chosen": -4.708953380584717, "logps/rejected": -4.817347526550293, "loss": 4.2203, "rewards/accuracies": 0.5, "rewards/chosen": -47.089534759521484, "rewards/margins": 1.0839424133300781, "rewards/rejected": -48.17347717285156, "step": 6925 }, { "epoch": 0.943082788671024, "grad_norm": 43.795081718756954, "learning_rate": 7.870122450939742e-09, "logits/chosen": 14.935336112976074, "logits/rejected": 14.082862854003906, "logps/chosen": -4.6342854499816895, "logps/rejected": -4.489043235778809, "loss": 3.8991, "rewards/accuracies": 0.5, "rewards/chosen": -46.342857360839844, "rewards/margins": -1.4524240493774414, "rewards/rejected": -44.89043426513672, "step": 6926 }, { "epoch": 0.943218954248366, "grad_norm": 44.69233535434805, "learning_rate": 7.832634637736379e-09, "logits/chosen": 14.844978332519531, "logits/rejected": 15.244403839111328, "logps/chosen": -4.620631217956543, "logps/rejected": -4.916162014007568, "loss": 4.1933, "rewards/accuracies": 1.0, "rewards/chosen": -46.20630645751953, "rewards/margins": 2.9553117752075195, "rewards/rejected": -49.16162109375, "step": 6927 }, { "epoch": 0.9433551198257081, "grad_norm": 39.93050081553332, "learning_rate": 7.795235437990922e-09, "logits/chosen": 14.386098861694336, "logits/rejected": 14.13528060913086, "logps/chosen": -4.654851913452148, "logps/rejected": -4.582581520080566, "loss": 4.1653, "rewards/accuracies": 0.5, "rewards/chosen": -46.54852294921875, "rewards/margins": -0.7227058410644531, "rewards/rejected": -45.82582092285156, "step": 6928 }, { "epoch": 0.9434912854030502, "grad_norm": 40.58937128401054, "learning_rate": 7.757924860153985e-09, "logits/chosen": 14.43159294128418, "logits/rejected": 14.428677558898926, "logps/chosen": -4.332043647766113, "logps/rejected": -4.537796974182129, "loss": 3.6172, "rewards/accuracies": 0.5, "rewards/chosen": -43.3204345703125, "rewards/margins": 2.0575342178344727, "rewards/rejected": -45.377967834472656, "step": 6929 }, { "epoch": 0.9436274509803921, "grad_norm": 37.39007223182239, "learning_rate": 7.720702912656252e-09, "logits/chosen": 14.402427673339844, "logits/rejected": 14.613900184631348, "logps/chosen": -4.818774223327637, "logps/rejected": -4.8448686599731445, "loss": 3.6219, "rewards/accuracies": 0.5, "rewards/chosen": -48.187744140625, "rewards/margins": 0.2609415054321289, "rewards/rejected": -48.44868469238281, "step": 6930 }, { "epoch": 0.9437636165577342, "grad_norm": 40.96520252639394, "learning_rate": 7.683569603908324e-09, "logits/chosen": 13.824430465698242, "logits/rejected": 13.758073806762695, "logps/chosen": -4.371068000793457, "logps/rejected": -4.478595733642578, "loss": 4.1189, "rewards/accuracies": 0.75, "rewards/chosen": -43.71067810058594, "rewards/margins": 1.0752811431884766, "rewards/rejected": -44.78596115112305, "step": 6931 }, { "epoch": 0.9438997821350763, "grad_norm": 46.5516414925665, "learning_rate": 7.646524942300736e-09, "logits/chosen": 14.663139343261719, "logits/rejected": 14.044795036315918, "logps/chosen": -4.460912227630615, "logps/rejected": -4.722001075744629, "loss": 4.7428, "rewards/accuracies": 0.5, "rewards/chosen": -44.60912322998047, "rewards/margins": 2.6108875274658203, "rewards/rejected": -47.220008850097656, "step": 6932 }, { "epoch": 0.9440359477124183, "grad_norm": 36.895114089317964, "learning_rate": 7.60956893620408e-09, "logits/chosen": 14.495187759399414, "logits/rejected": 15.505158424377441, "logps/chosen": -4.437026023864746, "logps/rejected": -5.177210807800293, "loss": 3.745, "rewards/accuracies": 1.0, "rewards/chosen": -44.370262145996094, "rewards/margins": 7.401843070983887, "rewards/rejected": -51.77210998535156, "step": 6933 }, { "epoch": 0.9441721132897604, "grad_norm": 41.34216613021236, "learning_rate": 7.572701593968877e-09, "logits/chosen": 14.39726448059082, "logits/rejected": 15.803485870361328, "logps/chosen": -4.4929704666137695, "logps/rejected": -4.9328155517578125, "loss": 3.7056, "rewards/accuracies": 1.0, "rewards/chosen": -44.92970275878906, "rewards/margins": 4.398453712463379, "rewards/rejected": -49.328155517578125, "step": 6934 }, { "epoch": 0.9443082788671024, "grad_norm": 39.06082353787393, "learning_rate": 7.53592292392553e-09, "logits/chosen": 14.890640258789062, "logits/rejected": 14.472469329833984, "logps/chosen": -4.61618185043335, "logps/rejected": -4.607041835784912, "loss": 4.1395, "rewards/accuracies": 0.75, "rewards/chosen": -46.16181945800781, "rewards/margins": -0.09140491485595703, "rewards/rejected": -46.07041549682617, "step": 6935 }, { "epoch": 0.9444444444444444, "grad_norm": 37.156380491958174, "learning_rate": 7.499232934384548e-09, "logits/chosen": 14.65505599975586, "logits/rejected": 15.068513870239258, "logps/chosen": -4.577737808227539, "logps/rejected": -5.075055122375488, "loss": 3.5086, "rewards/accuracies": 1.0, "rewards/chosen": -45.77737808227539, "rewards/margins": 4.973174095153809, "rewards/rejected": -50.75054931640625, "step": 6936 }, { "epoch": 0.9445806100217865, "grad_norm": 37.54800533250483, "learning_rate": 7.462631633636407e-09, "logits/chosen": 14.134088516235352, "logits/rejected": 14.542803764343262, "logps/chosen": -4.789592266082764, "logps/rejected": -4.761536598205566, "loss": 3.8013, "rewards/accuracies": 0.5, "rewards/chosen": -47.89592361450195, "rewards/margins": -0.28055763244628906, "rewards/rejected": -47.61536407470703, "step": 6937 }, { "epoch": 0.9447167755991286, "grad_norm": 40.62082834710147, "learning_rate": 7.426119029951294e-09, "logits/chosen": 14.21839427947998, "logits/rejected": 13.960363388061523, "logps/chosen": -4.405997276306152, "logps/rejected": -4.504805564880371, "loss": 4.0075, "rewards/accuracies": 0.75, "rewards/chosen": -44.059974670410156, "rewards/margins": 0.9880819320678711, "rewards/rejected": -45.048057556152344, "step": 6938 }, { "epoch": 0.9448529411764706, "grad_norm": 40.8685007534905, "learning_rate": 7.389695131579676e-09, "logits/chosen": 14.342657089233398, "logits/rejected": 15.716903686523438, "logps/chosen": -4.5369038581848145, "logps/rejected": -4.9596452713012695, "loss": 4.2, "rewards/accuracies": 1.0, "rewards/chosen": -45.36903762817383, "rewards/margins": 4.227418899536133, "rewards/rejected": -49.59645462036133, "step": 6939 }, { "epoch": 0.9449891067538126, "grad_norm": 45.341358988474305, "learning_rate": 7.3533599467518134e-09, "logits/chosen": 14.135231018066406, "logits/rejected": 13.924578666687012, "logps/chosen": -4.481014251708984, "logps/rejected": -4.642436981201172, "loss": 3.3179, "rewards/accuracies": 0.75, "rewards/chosen": -44.810142517089844, "rewards/margins": 1.6142292022705078, "rewards/rejected": -46.42436981201172, "step": 6940 }, { "epoch": 0.9451252723311547, "grad_norm": 47.60453715057827, "learning_rate": 7.317113483677894e-09, "logits/chosen": 14.441288948059082, "logits/rejected": 14.28369426727295, "logps/chosen": -4.6545820236206055, "logps/rejected": -4.680920600891113, "loss": 4.4516, "rewards/accuracies": 0.75, "rewards/chosen": -46.54582214355469, "rewards/margins": 0.2633848190307617, "rewards/rejected": -46.8092041015625, "step": 6941 }, { "epoch": 0.9452614379084967, "grad_norm": 42.17629758618445, "learning_rate": 7.280955750548124e-09, "logits/chosen": 14.304210662841797, "logits/rejected": 15.588254928588867, "logps/chosen": -4.533168315887451, "logps/rejected": -4.871072769165039, "loss": 3.5551, "rewards/accuracies": 0.75, "rewards/chosen": -45.33168411254883, "rewards/margins": 3.379047393798828, "rewards/rejected": -48.71072769165039, "step": 6942 }, { "epoch": 0.9453976034858388, "grad_norm": 38.38952591276401, "learning_rate": 7.24488675553272e-09, "logits/chosen": 14.040246963500977, "logits/rejected": 15.413284301757812, "logps/chosen": -4.704684257507324, "logps/rejected": -4.759145736694336, "loss": 3.7392, "rewards/accuracies": 0.5, "rewards/chosen": -47.046844482421875, "rewards/margins": 0.5446157455444336, "rewards/rejected": -47.591461181640625, "step": 6943 }, { "epoch": 0.9455337690631809, "grad_norm": 38.75504203523704, "learning_rate": 7.208906506781609e-09, "logits/chosen": 14.435882568359375, "logits/rejected": 14.72526741027832, "logps/chosen": -4.533967018127441, "logps/rejected": -4.820422172546387, "loss": 4.1308, "rewards/accuracies": 0.75, "rewards/chosen": -45.33966827392578, "rewards/margins": 2.8645524978637695, "rewards/rejected": -48.2042236328125, "step": 6944 }, { "epoch": 0.9456699346405228, "grad_norm": 47.25867291217441, "learning_rate": 7.173015012424955e-09, "logits/chosen": 15.254264831542969, "logits/rejected": 15.343893051147461, "logps/chosen": -4.783430099487305, "logps/rejected": -4.784517288208008, "loss": 4.4092, "rewards/accuracies": 0.5, "rewards/chosen": -47.83430480957031, "rewards/margins": 0.010863304138183594, "rewards/rejected": -47.84516906738281, "step": 6945 }, { "epoch": 0.9458061002178649, "grad_norm": 42.94808026160014, "learning_rate": 7.137212280572713e-09, "logits/chosen": 14.043670654296875, "logits/rejected": 14.817092895507812, "logps/chosen": -4.616097450256348, "logps/rejected": -4.993957042694092, "loss": 3.8145, "rewards/accuracies": 0.75, "rewards/chosen": -46.160972595214844, "rewards/margins": 3.778597831726074, "rewards/rejected": -49.939571380615234, "step": 6946 }, { "epoch": 0.945942265795207, "grad_norm": 40.82560020641428, "learning_rate": 7.101498319314769e-09, "logits/chosen": 14.566259384155273, "logits/rejected": 14.574076652526855, "logps/chosen": -4.493260383605957, "logps/rejected": -4.831212043762207, "loss": 3.5515, "rewards/accuracies": 0.75, "rewards/chosen": -44.93260192871094, "rewards/margins": 3.3795127868652344, "rewards/rejected": -48.31211471557617, "step": 6947 }, { "epoch": 0.946078431372549, "grad_norm": 39.72201226251723, "learning_rate": 7.0658731367210234e-09, "logits/chosen": 15.050223350524902, "logits/rejected": 14.435979843139648, "logps/chosen": -4.5757551193237305, "logps/rejected": -4.690686225891113, "loss": 4.1991, "rewards/accuracies": 0.5, "rewards/chosen": -45.75755310058594, "rewards/margins": 1.1493091583251953, "rewards/rejected": -46.9068603515625, "step": 6948 }, { "epoch": 0.9462145969498911, "grad_norm": 48.61443884344735, "learning_rate": 7.030336740841303e-09, "logits/chosen": 13.979192733764648, "logits/rejected": 15.227087020874023, "logps/chosen": -4.599586009979248, "logps/rejected": -4.860482692718506, "loss": 3.4389, "rewards/accuracies": 0.5, "rewards/chosen": -45.9958610534668, "rewards/margins": 2.608968734741211, "rewards/rejected": -48.604827880859375, "step": 6949 }, { "epoch": 0.9463507625272332, "grad_norm": 39.37429639583408, "learning_rate": 6.994889139705273e-09, "logits/chosen": 13.619879722595215, "logits/rejected": 14.412906646728516, "logps/chosen": -4.2726922035217285, "logps/rejected": -4.714582443237305, "loss": 3.632, "rewards/accuracies": 0.75, "rewards/chosen": -42.72692108154297, "rewards/margins": 4.418903350830078, "rewards/rejected": -47.14582824707031, "step": 6950 }, { "epoch": 0.9464869281045751, "grad_norm": 39.487154198505884, "learning_rate": 6.959530341322661e-09, "logits/chosen": 14.274092674255371, "logits/rejected": 14.755056381225586, "logps/chosen": -4.24017333984375, "logps/rejected": -4.81760311126709, "loss": 3.732, "rewards/accuracies": 1.0, "rewards/chosen": -42.4017333984375, "rewards/margins": 5.7743024826049805, "rewards/rejected": -48.17603302001953, "step": 6951 }, { "epoch": 0.9466230936819172, "grad_norm": 40.91423880409895, "learning_rate": 6.924260353683075e-09, "logits/chosen": 14.180479049682617, "logits/rejected": 14.814062118530273, "logps/chosen": -4.355475425720215, "logps/rejected": -4.798863887786865, "loss": 4.5034, "rewards/accuracies": 1.0, "rewards/chosen": -43.55475997924805, "rewards/margins": 4.433877944946289, "rewards/rejected": -47.98863983154297, "step": 6952 }, { "epoch": 0.9467592592592593, "grad_norm": 41.01607477283725, "learning_rate": 6.889079184756052e-09, "logits/chosen": 14.533069610595703, "logits/rejected": 13.825885772705078, "logps/chosen": -4.558408737182617, "logps/rejected": -4.527519702911377, "loss": 4.1493, "rewards/accuracies": 0.5, "rewards/chosen": -45.584083557128906, "rewards/margins": -0.30888938903808594, "rewards/rejected": -45.27519607543945, "step": 6953 }, { "epoch": 0.9468954248366013, "grad_norm": 38.06197710192643, "learning_rate": 6.8539868424911e-09, "logits/chosen": 13.492755889892578, "logits/rejected": 15.27834415435791, "logps/chosen": -4.067425727844238, "logps/rejected": -4.562470436096191, "loss": 3.7092, "rewards/accuracies": 0.75, "rewards/chosen": -40.67425537109375, "rewards/margins": 4.950444221496582, "rewards/rejected": -45.62470245361328, "step": 6954 }, { "epoch": 0.9470315904139434, "grad_norm": 41.83014940604524, "learning_rate": 6.818983334817607e-09, "logits/chosen": 13.6204833984375, "logits/rejected": 13.727699279785156, "logps/chosen": -4.359375476837158, "logps/rejected": -4.499605655670166, "loss": 3.2723, "rewards/accuracies": 0.75, "rewards/chosen": -43.59375762939453, "rewards/margins": 1.4022979736328125, "rewards/rejected": -44.996055603027344, "step": 6955 }, { "epoch": 0.9471677559912854, "grad_norm": 39.49511313173944, "learning_rate": 6.784068669644849e-09, "logits/chosen": 13.973175048828125, "logits/rejected": 14.074592590332031, "logps/chosen": -4.587799072265625, "logps/rejected": -4.699361324310303, "loss": 4.0215, "rewards/accuracies": 0.75, "rewards/chosen": -45.877986907958984, "rewards/margins": 1.115621566772461, "rewards/rejected": -46.99361038208008, "step": 6956 }, { "epoch": 0.9473039215686274, "grad_norm": 41.455506048179586, "learning_rate": 6.749242854862158e-09, "logits/chosen": 14.428457260131836, "logits/rejected": 15.402420043945312, "logps/chosen": -4.754877090454102, "logps/rejected": -5.030269145965576, "loss": 3.884, "rewards/accuracies": 0.75, "rewards/chosen": -47.548770904541016, "rewards/margins": 2.7539186477661133, "rewards/rejected": -50.30269241333008, "step": 6957 }, { "epoch": 0.9474400871459695, "grad_norm": 38.055162720774554, "learning_rate": 6.714505898338707e-09, "logits/chosen": 14.320283889770508, "logits/rejected": 14.391521453857422, "logps/chosen": -4.320317268371582, "logps/rejected": -4.205317974090576, "loss": 3.9921, "rewards/accuracies": 0.25, "rewards/chosen": -43.20317077636719, "rewards/margins": -1.1499919891357422, "rewards/rejected": -42.05318069458008, "step": 6958 }, { "epoch": 0.9475762527233116, "grad_norm": 37.1667794965273, "learning_rate": 6.67985780792355e-09, "logits/chosen": 14.806516647338867, "logits/rejected": 14.184979438781738, "logps/chosen": -4.664074897766113, "logps/rejected": -4.635479927062988, "loss": 3.3181, "rewards/accuracies": 0.5, "rewards/chosen": -46.64075469970703, "rewards/margins": -0.28595638275146484, "rewards/rejected": -46.35479736328125, "step": 6959 }, { "epoch": 0.9477124183006536, "grad_norm": 37.320023828427146, "learning_rate": 6.6452985914457135e-09, "logits/chosen": 14.656598091125488, "logits/rejected": 14.290404319763184, "logps/chosen": -4.905762672424316, "logps/rejected": -4.764690399169922, "loss": 4.1977, "rewards/accuracies": 0.5, "rewards/chosen": -49.05762481689453, "rewards/margins": -1.4107236862182617, "rewards/rejected": -47.64690399169922, "step": 6960 }, { "epoch": 0.9478485838779956, "grad_norm": 40.349334353596916, "learning_rate": 6.61082825671424e-09, "logits/chosen": 15.161800384521484, "logits/rejected": 14.714794158935547, "logps/chosen": -4.965652942657471, "logps/rejected": -4.725221157073975, "loss": 4.3806, "rewards/accuracies": 0.0, "rewards/chosen": -49.65652847290039, "rewards/margins": -2.4043197631835938, "rewards/rejected": -47.25221252441406, "step": 6961 }, { "epoch": 0.9479847494553377, "grad_norm": 42.773607446332534, "learning_rate": 6.576446811517833e-09, "logits/chosen": 14.782690048217773, "logits/rejected": 14.965293884277344, "logps/chosen": -4.949070453643799, "logps/rejected": -4.96699333190918, "loss": 4.2642, "rewards/accuracies": 0.5, "rewards/chosen": -49.49070358276367, "rewards/margins": 0.17922687530517578, "rewards/rejected": -49.66992950439453, "step": 6962 }, { "epoch": 0.9481209150326797, "grad_norm": 39.30279980367745, "learning_rate": 6.542154263625388e-09, "logits/chosen": 14.62213134765625, "logits/rejected": 14.881622314453125, "logps/chosen": -4.755795478820801, "logps/rejected": -4.7465386390686035, "loss": 3.7898, "rewards/accuracies": 0.5, "rewards/chosen": -47.55795669555664, "rewards/margins": -0.09257030487060547, "rewards/rejected": -47.46538543701172, "step": 6963 }, { "epoch": 0.9482570806100218, "grad_norm": 42.3968069513557, "learning_rate": 6.507950620785552e-09, "logits/chosen": 14.21585464477539, "logits/rejected": 14.99069595336914, "logps/chosen": -4.4826340675354, "logps/rejected": -4.4393510818481445, "loss": 4.113, "rewards/accuracies": 0.25, "rewards/chosen": -44.82634353637695, "rewards/margins": -0.4328317642211914, "rewards/rejected": -44.39350891113281, "step": 6964 }, { "epoch": 0.9483932461873639, "grad_norm": 39.356428933840625, "learning_rate": 6.473835890726853e-09, "logits/chosen": 14.445474624633789, "logits/rejected": 14.89023494720459, "logps/chosen": -4.628215789794922, "logps/rejected": -4.572786331176758, "loss": 3.7933, "rewards/accuracies": 0.5, "rewards/chosen": -46.28215789794922, "rewards/margins": -0.5542926788330078, "rewards/rejected": -45.72786331176758, "step": 6965 }, { "epoch": 0.9485294117647058, "grad_norm": 41.66030732695347, "learning_rate": 6.439810081157882e-09, "logits/chosen": 15.310930252075195, "logits/rejected": 15.103974342346191, "logps/chosen": -4.835031986236572, "logps/rejected": -5.084517002105713, "loss": 3.3964, "rewards/accuracies": 0.25, "rewards/chosen": -48.350318908691406, "rewards/margins": 2.494852066040039, "rewards/rejected": -50.84517288208008, "step": 6966 }, { "epoch": 0.9486655773420479, "grad_norm": 39.33765358961406, "learning_rate": 6.405873199767065e-09, "logits/chosen": 15.1058988571167, "logits/rejected": 15.200790405273438, "logps/chosen": -4.629334926605225, "logps/rejected": -4.759381294250488, "loss": 3.9603, "rewards/accuracies": 0.5, "rewards/chosen": -46.29335021972656, "rewards/margins": 1.300461769104004, "rewards/rejected": -47.59381103515625, "step": 6967 }, { "epoch": 0.94880174291939, "grad_norm": 114.7776228140305, "learning_rate": 6.3720252542226235e-09, "logits/chosen": 14.796356201171875, "logits/rejected": 15.08449935913086, "logps/chosen": -4.724361419677734, "logps/rejected": -4.910086631774902, "loss": 4.3249, "rewards/accuracies": 0.5, "rewards/chosen": -47.243614196777344, "rewards/margins": 1.8572492599487305, "rewards/rejected": -49.10086441040039, "step": 6968 }, { "epoch": 0.948937908496732, "grad_norm": 40.77554044459881, "learning_rate": 6.338266252172841e-09, "logits/chosen": 14.933581352233887, "logits/rejected": 15.465599060058594, "logps/chosen": -4.832008361816406, "logps/rejected": -4.690426826477051, "loss": 4.0611, "rewards/accuracies": 0.5, "rewards/chosen": -48.3200798034668, "rewards/margins": -1.4158153533935547, "rewards/rejected": -46.904266357421875, "step": 6969 }, { "epoch": 0.9490740740740741, "grad_norm": 41.04003979444776, "learning_rate": 6.304596201245926e-09, "logits/chosen": 14.32411003112793, "logits/rejected": 14.691167831420898, "logps/chosen": -4.699557304382324, "logps/rejected": -4.616560935974121, "loss": 3.5824, "rewards/accuracies": 0.5, "rewards/chosen": -46.995574951171875, "rewards/margins": -0.8299636840820312, "rewards/rejected": -46.165611267089844, "step": 6970 }, { "epoch": 0.9492102396514162, "grad_norm": 39.55062846813179, "learning_rate": 6.271015109049704e-09, "logits/chosen": 14.48359489440918, "logits/rejected": 13.956182479858398, "logps/chosen": -4.619436264038086, "logps/rejected": -4.4319915771484375, "loss": 4.399, "rewards/accuracies": 0.25, "rewards/chosen": -46.194358825683594, "rewards/margins": -1.8744468688964844, "rewards/rejected": -44.319915771484375, "step": 6971 }, { "epoch": 0.9493464052287581, "grad_norm": 40.79098094027648, "learning_rate": 6.237522983172283e-09, "logits/chosen": 13.972463607788086, "logits/rejected": 15.214662551879883, "logps/chosen": -4.332825183868408, "logps/rejected": -4.684988021850586, "loss": 3.4521, "rewards/accuracies": 0.75, "rewards/chosen": -43.328250885009766, "rewards/margins": 3.5216283798217773, "rewards/rejected": -46.84988021850586, "step": 6972 }, { "epoch": 0.9494825708061002, "grad_norm": 41.45669489948733, "learning_rate": 6.204119831181432e-09, "logits/chosen": 14.16844654083252, "logits/rejected": 15.001518249511719, "logps/chosen": -4.395936965942383, "logps/rejected": -4.859579086303711, "loss": 3.8731, "rewards/accuracies": 0.75, "rewards/chosen": -43.95936584472656, "rewards/margins": 4.6364240646362305, "rewards/rejected": -48.59579086303711, "step": 6973 }, { "epoch": 0.9496187363834423, "grad_norm": 42.90358202119108, "learning_rate": 6.1708056606248e-09, "logits/chosen": 15.06429386138916, "logits/rejected": 15.351175308227539, "logps/chosen": -4.656231880187988, "logps/rejected": -4.5190534591674805, "loss": 3.8539, "rewards/accuracies": 0.5, "rewards/chosen": -46.56232452392578, "rewards/margins": -1.371786117553711, "rewards/rejected": -45.19053649902344, "step": 6974 }, { "epoch": 0.9497549019607843, "grad_norm": 41.54265814432412, "learning_rate": 6.137580479030058e-09, "logits/chosen": 13.830204010009766, "logits/rejected": 13.914033889770508, "logps/chosen": -4.610662460327148, "logps/rejected": -4.588081359863281, "loss": 4.2164, "rewards/accuracies": 0.5, "rewards/chosen": -46.106624603271484, "rewards/margins": -0.2258129119873047, "rewards/rejected": -45.88081359863281, "step": 6975 }, { "epoch": 0.9498910675381264, "grad_norm": 41.338097184725214, "learning_rate": 6.104444293904753e-09, "logits/chosen": 14.247068405151367, "logits/rejected": 15.199302673339844, "logps/chosen": -4.8002214431762695, "logps/rejected": -5.290060043334961, "loss": 3.9362, "rewards/accuracies": 1.0, "rewards/chosen": -48.00221252441406, "rewards/margins": 4.898388862609863, "rewards/rejected": -52.90060043334961, "step": 6976 }, { "epoch": 0.9500272331154684, "grad_norm": 40.50445185228423, "learning_rate": 6.071397112736187e-09, "logits/chosen": 14.410661697387695, "logits/rejected": 14.49437427520752, "logps/chosen": -4.616929531097412, "logps/rejected": -4.749493598937988, "loss": 3.9982, "rewards/accuracies": 0.5, "rewards/chosen": -46.16929626464844, "rewards/margins": 1.3256425857543945, "rewards/rejected": -47.49494171142578, "step": 6977 }, { "epoch": 0.9501633986928104, "grad_norm": 42.58458492303347, "learning_rate": 6.038438942991719e-09, "logits/chosen": 14.384659767150879, "logits/rejected": 15.520013809204102, "logps/chosen": -4.671627998352051, "logps/rejected": -4.879166603088379, "loss": 4.3454, "rewards/accuracies": 0.75, "rewards/chosen": -46.716278076171875, "rewards/margins": 2.0753917694091797, "rewards/rejected": -48.79166793823242, "step": 6978 }, { "epoch": 0.9502995642701525, "grad_norm": 38.196412022274515, "learning_rate": 6.005569792118459e-09, "logits/chosen": 15.150991439819336, "logits/rejected": 14.970928192138672, "logps/chosen": -4.205298900604248, "logps/rejected": -4.144295692443848, "loss": 3.7254, "rewards/accuracies": 0.25, "rewards/chosen": -42.05299377441406, "rewards/margins": -0.6100339889526367, "rewards/rejected": -41.44295883178711, "step": 6979 }, { "epoch": 0.9504357298474946, "grad_norm": 38.38130082766438, "learning_rate": 5.972789667543532e-09, "logits/chosen": 14.96230697631836, "logits/rejected": 15.126863479614258, "logps/chosen": -4.443577766418457, "logps/rejected": -4.697322368621826, "loss": 3.856, "rewards/accuracies": 0.5, "rewards/chosen": -44.43578338623047, "rewards/margins": 2.537442207336426, "rewards/rejected": -46.97322082519531, "step": 6980 }, { "epoch": 0.9505718954248366, "grad_norm": 43.803146650902775, "learning_rate": 5.940098576673813e-09, "logits/chosen": 15.00241756439209, "logits/rejected": 14.923534393310547, "logps/chosen": -4.606821060180664, "logps/rejected": -4.807290554046631, "loss": 3.7632, "rewards/accuracies": 0.5, "rewards/chosen": -46.068206787109375, "rewards/margins": 2.004696846008301, "rewards/rejected": -48.072906494140625, "step": 6981 }, { "epoch": 0.9507080610021786, "grad_norm": 38.97129108132798, "learning_rate": 5.90749652689615e-09, "logits/chosen": 14.94330883026123, "logits/rejected": 15.083907127380371, "logps/chosen": -4.702566146850586, "logps/rejected": -4.78769588470459, "loss": 3.8495, "rewards/accuracies": 0.5, "rewards/chosen": -47.02566146850586, "rewards/margins": 0.8513002395629883, "rewards/rejected": -47.87696075439453, "step": 6982 }, { "epoch": 0.9508442265795207, "grad_norm": 38.699123641547686, "learning_rate": 5.874983525577315e-09, "logits/chosen": 13.753753662109375, "logits/rejected": 14.785408020019531, "logps/chosen": -4.517684459686279, "logps/rejected": -4.980532169342041, "loss": 3.5922, "rewards/accuracies": 1.0, "rewards/chosen": -45.17684555053711, "rewards/margins": 4.628475189208984, "rewards/rejected": -49.805320739746094, "step": 6983 }, { "epoch": 0.9509803921568627, "grad_norm": 39.749542861961835, "learning_rate": 5.842559580063744e-09, "logits/chosen": 15.11677360534668, "logits/rejected": 14.422679901123047, "logps/chosen": -4.701061725616455, "logps/rejected": -4.537603378295898, "loss": 3.4165, "rewards/accuracies": 0.25, "rewards/chosen": -47.0106201171875, "rewards/margins": -1.634587287902832, "rewards/rejected": -45.37602996826172, "step": 6984 }, { "epoch": 0.9511165577342048, "grad_norm": 41.274115209450265, "learning_rate": 5.810224697681976e-09, "logits/chosen": 14.670340538024902, "logits/rejected": 14.091806411743164, "logps/chosen": -4.826621055603027, "logps/rejected": -4.5348405838012695, "loss": 3.8613, "rewards/accuracies": 0.25, "rewards/chosen": -48.266212463378906, "rewards/margins": -2.9178037643432617, "rewards/rejected": -45.34840393066406, "step": 6985 }, { "epoch": 0.9512527233115469, "grad_norm": 40.39181410145695, "learning_rate": 5.777978885738432e-09, "logits/chosen": 14.866765022277832, "logits/rejected": 15.447219848632812, "logps/chosen": -4.699483871459961, "logps/rejected": -5.345129013061523, "loss": 3.9239, "rewards/accuracies": 0.75, "rewards/chosen": -46.994834899902344, "rewards/margins": 6.456454277038574, "rewards/rejected": -53.451290130615234, "step": 6986 }, { "epoch": 0.9513888888888888, "grad_norm": 42.15616883450297, "learning_rate": 5.745822151519153e-09, "logits/chosen": 13.848555564880371, "logits/rejected": 15.553028106689453, "logps/chosen": -4.419795036315918, "logps/rejected": -4.958348274230957, "loss": 3.9728, "rewards/accuracies": 0.75, "rewards/chosen": -44.19795227050781, "rewards/margins": 5.385534286499023, "rewards/rejected": -49.5834846496582, "step": 6987 }, { "epoch": 0.9515250544662309, "grad_norm": 40.10747823479988, "learning_rate": 5.71375450229028e-09, "logits/chosen": 14.253594398498535, "logits/rejected": 14.995607376098633, "logps/chosen": -4.9736738204956055, "logps/rejected": -5.127289772033691, "loss": 3.9145, "rewards/accuracies": 0.75, "rewards/chosen": -49.73673629760742, "rewards/margins": 1.536158561706543, "rewards/rejected": -51.27289581298828, "step": 6988 }, { "epoch": 0.951661220043573, "grad_norm": 46.70987868911285, "learning_rate": 5.6817759452978394e-09, "logits/chosen": 14.9063138961792, "logits/rejected": 13.377825736999512, "logps/chosen": -4.494808197021484, "logps/rejected": -4.337233543395996, "loss": 4.5159, "rewards/accuracies": 0.25, "rewards/chosen": -44.948081970214844, "rewards/margins": -1.5757484436035156, "rewards/rejected": -43.37233352661133, "step": 6989 }, { "epoch": 0.951797385620915, "grad_norm": 38.86274941868131, "learning_rate": 5.649886487767563e-09, "logits/chosen": 14.30221176147461, "logits/rejected": 14.851198196411133, "logps/chosen": -4.855650424957275, "logps/rejected": -5.118870258331299, "loss": 3.9096, "rewards/accuracies": 1.0, "rewards/chosen": -48.55650329589844, "rewards/margins": 2.632199287414551, "rewards/rejected": -51.18870544433594, "step": 6990 }, { "epoch": 0.9519335511982571, "grad_norm": 35.226045939777194, "learning_rate": 5.618086136905154e-09, "logits/chosen": 13.385282516479492, "logits/rejected": 15.059356689453125, "logps/chosen": -4.536096572875977, "logps/rejected": -5.017175197601318, "loss": 3.6849, "rewards/accuracies": 1.0, "rewards/chosen": -45.3609619140625, "rewards/margins": 4.810789108276367, "rewards/rejected": -50.1717529296875, "step": 6991 }, { "epoch": 0.9520697167755992, "grad_norm": 43.53806774963723, "learning_rate": 5.586374899896195e-09, "logits/chosen": 14.320905685424805, "logits/rejected": 14.425682067871094, "logps/chosen": -4.37640905380249, "logps/rejected": -4.560941696166992, "loss": 4.4133, "rewards/accuracies": 0.75, "rewards/chosen": -43.76409149169922, "rewards/margins": 1.8453264236450195, "rewards/rejected": -45.60941696166992, "step": 6992 }, { "epoch": 0.9522058823529411, "grad_norm": 42.758010500481085, "learning_rate": 5.554752783906114e-09, "logits/chosen": 14.321447372436523, "logits/rejected": 14.081243515014648, "logps/chosen": -5.022600173950195, "logps/rejected": -4.941246509552002, "loss": 3.9063, "rewards/accuracies": 0.25, "rewards/chosen": -50.22600555419922, "rewards/margins": -0.8135404586791992, "rewards/rejected": -49.4124641418457, "step": 6993 }, { "epoch": 0.9523420479302832, "grad_norm": 38.972949861313325, "learning_rate": 5.523219796080081e-09, "logits/chosen": 13.486303329467773, "logits/rejected": 14.825115203857422, "logps/chosen": -4.399753570556641, "logps/rejected": -4.720682144165039, "loss": 3.8829, "rewards/accuracies": 0.75, "rewards/chosen": -43.997535705566406, "rewards/margins": 3.209282875061035, "rewards/rejected": -47.206817626953125, "step": 6994 }, { "epoch": 0.9524782135076253, "grad_norm": 41.50615256773239, "learning_rate": 5.491775943543375e-09, "logits/chosen": 13.00597858428955, "logits/rejected": 14.617019653320312, "logps/chosen": -4.357303619384766, "logps/rejected": -4.635395050048828, "loss": 4.0813, "rewards/accuracies": 0.75, "rewards/chosen": -43.573036193847656, "rewards/margins": 2.780914306640625, "rewards/rejected": -46.35395431518555, "step": 6995 }, { "epoch": 0.9526143790849673, "grad_norm": 39.88400232022276, "learning_rate": 5.460421233400936e-09, "logits/chosen": 14.540641784667969, "logits/rejected": 14.583101272583008, "logps/chosen": -4.519233703613281, "logps/rejected": -4.654693603515625, "loss": 3.8927, "rewards/accuracies": 0.5, "rewards/chosen": -45.19233703613281, "rewards/margins": 1.354604721069336, "rewards/rejected": -46.546939849853516, "step": 6996 }, { "epoch": 0.9527505446623094, "grad_norm": 41.19915663466931, "learning_rate": 5.429155672737584e-09, "logits/chosen": 14.621637344360352, "logits/rejected": 15.072973251342773, "logps/chosen": -4.557406425476074, "logps/rejected": -4.5590362548828125, "loss": 4.1146, "rewards/accuracies": 0.75, "rewards/chosen": -45.574066162109375, "rewards/margins": 0.01629352569580078, "rewards/rejected": -45.59035873413086, "step": 6997 }, { "epoch": 0.9528867102396514, "grad_norm": 41.553700892288, "learning_rate": 5.397979268618069e-09, "logits/chosen": 13.925911903381348, "logits/rejected": 14.076868057250977, "logps/chosen": -4.285484313964844, "logps/rejected": -4.5278849601745605, "loss": 3.971, "rewards/accuracies": 1.0, "rewards/chosen": -42.85484313964844, "rewards/margins": 2.4240102767944336, "rewards/rejected": -45.27885055541992, "step": 6998 }, { "epoch": 0.9530228758169934, "grad_norm": 37.58828828081402, "learning_rate": 5.366892028086933e-09, "logits/chosen": 13.776557922363281, "logits/rejected": 14.528009414672852, "logps/chosen": -4.311418056488037, "logps/rejected": -4.648680210113525, "loss": 3.9062, "rewards/accuracies": 0.75, "rewards/chosen": -43.11418151855469, "rewards/margins": 3.372617721557617, "rewards/rejected": -46.48680114746094, "step": 6999 }, { "epoch": 0.9531590413943355, "grad_norm": 39.17643568858108, "learning_rate": 5.335893958168647e-09, "logits/chosen": 14.097101211547852, "logits/rejected": 15.705452919006348, "logps/chosen": -4.59108304977417, "logps/rejected": -4.8288655281066895, "loss": 4.115, "rewards/accuracies": 0.5, "rewards/chosen": -45.91082763671875, "rewards/margins": 2.3778276443481445, "rewards/rejected": -48.288658142089844, "step": 7000 }, { "epoch": 0.9532952069716776, "grad_norm": 39.87090206414779, "learning_rate": 5.304985065867429e-09, "logits/chosen": 14.351726531982422, "logits/rejected": 14.450958251953125, "logps/chosen": -5.148684501647949, "logps/rejected": -5.158295154571533, "loss": 3.8023, "rewards/accuracies": 0.5, "rewards/chosen": -51.486839294433594, "rewards/margins": 0.09610939025878906, "rewards/rejected": -51.58295440673828, "step": 7001 }, { "epoch": 0.9534313725490197, "grad_norm": 40.41396387542753, "learning_rate": 5.274165358167426e-09, "logits/chosen": 14.833662033081055, "logits/rejected": 14.688238143920898, "logps/chosen": -4.645627975463867, "logps/rejected": -4.628594398498535, "loss": 4.3239, "rewards/accuracies": 0.5, "rewards/chosen": -46.45628356933594, "rewards/margins": -0.1703357696533203, "rewards/rejected": -46.285945892333984, "step": 7002 }, { "epoch": 0.9535675381263616, "grad_norm": 41.33516066074658, "learning_rate": 5.2434348420326235e-09, "logits/chosen": 13.908647537231445, "logits/rejected": 14.290565490722656, "logps/chosen": -4.488908290863037, "logps/rejected": -4.6857194900512695, "loss": 3.9044, "rewards/accuracies": 0.75, "rewards/chosen": -44.88908386230469, "rewards/margins": 1.9681081771850586, "rewards/rejected": -46.85719299316406, "step": 7003 }, { "epoch": 0.9537037037037037, "grad_norm": 41.70543615487641, "learning_rate": 5.212793524406755e-09, "logits/chosen": 14.848690032958984, "logits/rejected": 14.956998825073242, "logps/chosen": -5.009612083435059, "logps/rejected": -4.766623497009277, "loss": 4.2407, "rewards/accuracies": 0.25, "rewards/chosen": -50.09612274169922, "rewards/margins": -2.429882049560547, "rewards/rejected": -47.66624069213867, "step": 7004 }, { "epoch": 0.9538398692810458, "grad_norm": 42.671642094887616, "learning_rate": 5.182241412213573e-09, "logits/chosen": 14.186906814575195, "logits/rejected": 14.004749298095703, "logps/chosen": -4.483489990234375, "logps/rejected": -4.3346405029296875, "loss": 4.2072, "rewards/accuracies": 0.5, "rewards/chosen": -44.834896087646484, "rewards/margins": -1.488490104675293, "rewards/rejected": -43.346405029296875, "step": 7005 }, { "epoch": 0.9539760348583878, "grad_norm": 40.14934330712475, "learning_rate": 5.151778512356531e-09, "logits/chosen": 14.674162864685059, "logits/rejected": 14.589107513427734, "logps/chosen": -4.890546798706055, "logps/rejected": -4.894114017486572, "loss": 3.6626, "rewards/accuracies": 0.25, "rewards/chosen": -48.90546417236328, "rewards/margins": 0.035676002502441406, "rewards/rejected": -48.941139221191406, "step": 7006 }, { "epoch": 0.9541122004357299, "grad_norm": 43.381210432632564, "learning_rate": 5.1214048317190115e-09, "logits/chosen": 14.880182266235352, "logits/rejected": 14.862592697143555, "logps/chosen": -4.950556755065918, "logps/rejected": -5.000620365142822, "loss": 4.2647, "rewards/accuracies": 0.75, "rewards/chosen": -49.50556945800781, "rewards/margins": 0.5006370544433594, "rewards/rejected": -50.006202697753906, "step": 7007 }, { "epoch": 0.954248366013072, "grad_norm": 43.895102027356835, "learning_rate": 5.0911203771641045e-09, "logits/chosen": 14.178789138793945, "logits/rejected": 14.392619132995605, "logps/chosen": -4.655361652374268, "logps/rejected": -4.676178455352783, "loss": 3.4177, "rewards/accuracies": 0.5, "rewards/chosen": -46.553619384765625, "rewards/margins": 0.20817089080810547, "rewards/rejected": -46.76178741455078, "step": 7008 }, { "epoch": 0.9543845315904139, "grad_norm": 37.4285956719146, "learning_rate": 5.0609251555349566e-09, "logits/chosen": 15.284245491027832, "logits/rejected": 15.574049949645996, "logps/chosen": -4.902977466583252, "logps/rejected": -5.218541145324707, "loss": 3.7437, "rewards/accuracies": 1.0, "rewards/chosen": -49.02977752685547, "rewards/margins": 3.155632972717285, "rewards/rejected": -52.18540954589844, "step": 7009 }, { "epoch": 0.954520697167756, "grad_norm": 40.76375368162527, "learning_rate": 5.030819173654333e-09, "logits/chosen": 14.476784706115723, "logits/rejected": 15.132364273071289, "logps/chosen": -4.533343315124512, "logps/rejected": -4.908575057983398, "loss": 4.3996, "rewards/accuracies": 0.5, "rewards/chosen": -45.33342742919922, "rewards/margins": 3.752321243286133, "rewards/rejected": -49.08575439453125, "step": 7010 }, { "epoch": 0.9546568627450981, "grad_norm": 40.24199974483904, "learning_rate": 5.000802438324969e-09, "logits/chosen": 14.786249160766602, "logits/rejected": 14.243288040161133, "logps/chosen": -4.794635772705078, "logps/rejected": -4.820162296295166, "loss": 3.8014, "rewards/accuracies": 0.25, "rewards/chosen": -47.94635772705078, "rewards/margins": 0.2552671432495117, "rewards/rejected": -48.201622009277344, "step": 7011 }, { "epoch": 0.9547930283224401, "grad_norm": 38.29787144942348, "learning_rate": 4.970874956329396e-09, "logits/chosen": 13.7445707321167, "logits/rejected": 13.866933822631836, "logps/chosen": -4.21444845199585, "logps/rejected": -4.681936264038086, "loss": 4.1834, "rewards/accuracies": 1.0, "rewards/chosen": -42.14448547363281, "rewards/margins": 4.67487907409668, "rewards/rejected": -46.81936264038086, "step": 7012 }, { "epoch": 0.9549291938997821, "grad_norm": 38.0917089377554, "learning_rate": 4.941036734430026e-09, "logits/chosen": 13.824827194213867, "logits/rejected": 14.48539924621582, "logps/chosen": -4.381443023681641, "logps/rejected": -4.715538024902344, "loss": 3.9601, "rewards/accuracies": 1.0, "rewards/chosen": -43.81443405151367, "rewards/margins": 3.340945243835449, "rewards/rejected": -47.15538024902344, "step": 7013 }, { "epoch": 0.9550653594771242, "grad_norm": 39.34053287091028, "learning_rate": 4.9112877793689335e-09, "logits/chosen": 14.844993591308594, "logits/rejected": 14.77430534362793, "logps/chosen": -4.130035877227783, "logps/rejected": -4.635982036590576, "loss": 3.7699, "rewards/accuracies": 1.0, "rewards/chosen": -41.30036163330078, "rewards/margins": 5.059460639953613, "rewards/rejected": -46.35982131958008, "step": 7014 }, { "epoch": 0.9552015250544662, "grad_norm": 41.125788087339885, "learning_rate": 4.881628097868207e-09, "logits/chosen": 14.96231460571289, "logits/rejected": 14.703889846801758, "logps/chosen": -4.753214359283447, "logps/rejected": -4.760523319244385, "loss": 4.1912, "rewards/accuracies": 0.5, "rewards/chosen": -47.532142639160156, "rewards/margins": 0.07309150695800781, "rewards/rejected": -47.60523223876953, "step": 7015 }, { "epoch": 0.9553376906318083, "grad_norm": 40.91949106087633, "learning_rate": 4.85205769662973e-09, "logits/chosen": 14.415285110473633, "logits/rejected": 13.875951766967773, "logps/chosen": -4.658352375030518, "logps/rejected": -4.6921257972717285, "loss": 3.9058, "rewards/accuracies": 0.25, "rewards/chosen": -46.583526611328125, "rewards/margins": 0.3377342224121094, "rewards/rejected": -46.92125701904297, "step": 7016 }, { "epoch": 0.9554738562091504, "grad_norm": 38.7090316823393, "learning_rate": 4.822576582335092e-09, "logits/chosen": 14.401528358459473, "logits/rejected": 15.009117126464844, "logps/chosen": -4.837705135345459, "logps/rejected": -5.061465263366699, "loss": 4.1012, "rewards/accuracies": 0.75, "rewards/chosen": -48.377052307128906, "rewards/margins": 2.237603187561035, "rewards/rejected": -50.614654541015625, "step": 7017 }, { "epoch": 0.9556100217864923, "grad_norm": 38.45802165715454, "learning_rate": 4.793184761645852e-09, "logits/chosen": 14.141905784606934, "logits/rejected": 14.588571548461914, "logps/chosen": -4.561117649078369, "logps/rejected": -4.8684773445129395, "loss": 3.601, "rewards/accuracies": 0.75, "rewards/chosen": -45.611175537109375, "rewards/margins": 3.0735979080200195, "rewards/rejected": -48.684776306152344, "step": 7018 }, { "epoch": 0.9557461873638344, "grad_norm": 42.050646267540664, "learning_rate": 4.763882241203365e-09, "logits/chosen": 14.806598663330078, "logits/rejected": 14.960201263427734, "logps/chosen": -5.062134265899658, "logps/rejected": -4.993791103363037, "loss": 3.6192, "rewards/accuracies": 0.5, "rewards/chosen": -50.621341705322266, "rewards/margins": -0.6834287643432617, "rewards/rejected": -49.93791198730469, "step": 7019 }, { "epoch": 0.9558823529411765, "grad_norm": 40.42221413439386, "learning_rate": 4.7346690276286905e-09, "logits/chosen": 14.614829063415527, "logits/rejected": 15.317000389099121, "logps/chosen": -4.625405788421631, "logps/rejected": -4.866567611694336, "loss": 4.1218, "rewards/accuracies": 0.75, "rewards/chosen": -46.254058837890625, "rewards/margins": 2.411618232727051, "rewards/rejected": -48.66567611694336, "step": 7020 }, { "epoch": 0.9560185185185185, "grad_norm": 39.15502345506796, "learning_rate": 4.705545127522903e-09, "logits/chosen": 14.98819351196289, "logits/rejected": 15.089320182800293, "logps/chosen": -4.546985626220703, "logps/rejected": -4.797011375427246, "loss": 3.7754, "rewards/accuracies": 0.75, "rewards/chosen": -45.46985626220703, "rewards/margins": 2.5002574920654297, "rewards/rejected": -47.97011184692383, "step": 7021 }, { "epoch": 0.9561546840958606, "grad_norm": 40.26944789677095, "learning_rate": 4.676510547466695e-09, "logits/chosen": 14.671411514282227, "logits/rejected": 15.262746810913086, "logps/chosen": -4.911282062530518, "logps/rejected": -4.997372150421143, "loss": 3.7954, "rewards/accuracies": 0.25, "rewards/chosen": -49.112823486328125, "rewards/margins": 0.86090087890625, "rewards/rejected": -49.973724365234375, "step": 7022 }, { "epoch": 0.9562908496732027, "grad_norm": 41.00658339820309, "learning_rate": 4.6475652940207275e-09, "logits/chosen": 14.328184127807617, "logits/rejected": 14.709579467773438, "logps/chosen": -4.591658592224121, "logps/rejected": -5.074642181396484, "loss": 3.9283, "rewards/accuracies": 0.75, "rewards/chosen": -45.91658401489258, "rewards/margins": 4.829839706420898, "rewards/rejected": -50.74642562866211, "step": 7023 }, { "epoch": 0.9564270152505446, "grad_norm": 42.90820208946307, "learning_rate": 4.618709373725371e-09, "logits/chosen": 13.559494972229004, "logits/rejected": 14.077381134033203, "logps/chosen": -4.575595855712891, "logps/rejected": -4.561445713043213, "loss": 4.2949, "rewards/accuracies": 0.25, "rewards/chosen": -45.755958557128906, "rewards/margins": -0.14150619506835938, "rewards/rejected": -45.61445617675781, "step": 7024 }, { "epoch": 0.9565631808278867, "grad_norm": 35.71846359922732, "learning_rate": 4.589942793100921e-09, "logits/chosen": 14.816619873046875, "logits/rejected": 13.971323013305664, "logps/chosen": -4.740578651428223, "logps/rejected": -4.6799726486206055, "loss": 3.4651, "rewards/accuracies": 0.5, "rewards/chosen": -47.405792236328125, "rewards/margins": -0.6060676574707031, "rewards/rejected": -46.799720764160156, "step": 7025 }, { "epoch": 0.9566993464052288, "grad_norm": 40.93392176695496, "learning_rate": 4.561265558647376e-09, "logits/chosen": 14.352359771728516, "logits/rejected": 14.989513397216797, "logps/chosen": -4.840331077575684, "logps/rejected": -4.994019508361816, "loss": 4.0737, "rewards/accuracies": 0.75, "rewards/chosen": -48.4033088684082, "rewards/margins": 1.5368871688842773, "rewards/rejected": -49.9401969909668, "step": 7026 }, { "epoch": 0.9568355119825708, "grad_norm": 41.41741903879197, "learning_rate": 4.5326776768445766e-09, "logits/chosen": 14.742095947265625, "logits/rejected": 14.988550186157227, "logps/chosen": -4.460971355438232, "logps/rejected": -4.632378578186035, "loss": 4.0201, "rewards/accuracies": 0.5, "rewards/chosen": -44.60971450805664, "rewards/margins": 1.7140684127807617, "rewards/rejected": -46.32378387451172, "step": 7027 }, { "epoch": 0.9569716775599129, "grad_norm": 41.19242600760124, "learning_rate": 4.504179154152243e-09, "logits/chosen": 15.046594619750977, "logits/rejected": 14.615849494934082, "logps/chosen": -4.772922992706299, "logps/rejected": -4.619785308837891, "loss": 4.3747, "rewards/accuracies": 0.25, "rewards/chosen": -47.72922897338867, "rewards/margins": -1.5313758850097656, "rewards/rejected": -46.197853088378906, "step": 7028 }, { "epoch": 0.9571078431372549, "grad_norm": 44.73376262512313, "learning_rate": 4.475769997009848e-09, "logits/chosen": 15.302845001220703, "logits/rejected": 14.807893753051758, "logps/chosen": -4.748691558837891, "logps/rejected": -4.646885395050049, "loss": 3.7709, "rewards/accuracies": 0.5, "rewards/chosen": -47.48691177368164, "rewards/margins": -1.0180587768554688, "rewards/rejected": -46.46885299682617, "step": 7029 }, { "epoch": 0.9572440087145969, "grad_norm": 39.804811076680956, "learning_rate": 4.447450211836612e-09, "logits/chosen": 14.597155570983887, "logits/rejected": 14.012823104858398, "logps/chosen": -4.876562595367432, "logps/rejected": -4.904824733734131, "loss": 4.1352, "rewards/accuracies": 0.5, "rewards/chosen": -48.765628814697266, "rewards/margins": 0.2826194763183594, "rewards/rejected": -49.048248291015625, "step": 7030 }, { "epoch": 0.957380174291939, "grad_norm": 42.74944384397219, "learning_rate": 4.419219805031727e-09, "logits/chosen": 14.284894943237305, "logits/rejected": 14.309670448303223, "logps/chosen": -4.733154296875, "logps/rejected": -4.854448318481445, "loss": 4.0484, "rewards/accuracies": 0.25, "rewards/chosen": -47.33154296875, "rewards/margins": 1.2129392623901367, "rewards/rejected": -48.54448318481445, "step": 7031 }, { "epoch": 0.9575163398692811, "grad_norm": 37.16079683655764, "learning_rate": 4.3910787829740006e-09, "logits/chosen": 14.629907608032227, "logits/rejected": 14.163741111755371, "logps/chosen": -4.637502193450928, "logps/rejected": -4.915584564208984, "loss": 3.7975, "rewards/accuracies": 0.5, "rewards/chosen": -46.37501907348633, "rewards/margins": 2.7808265686035156, "rewards/rejected": -49.155845642089844, "step": 7032 }, { "epoch": 0.9576525054466231, "grad_norm": 38.39494009593774, "learning_rate": 4.36302715202217e-09, "logits/chosen": 14.648920059204102, "logits/rejected": 14.238555908203125, "logps/chosen": -4.487100601196289, "logps/rejected": -4.640517234802246, "loss": 3.7645, "rewards/accuracies": 0.5, "rewards/chosen": -44.871009826660156, "rewards/margins": 1.5341691970825195, "rewards/rejected": -46.405174255371094, "step": 7033 }, { "epoch": 0.9577886710239651, "grad_norm": 37.33612044770045, "learning_rate": 4.3350649185147196e-09, "logits/chosen": 13.632423400878906, "logits/rejected": 14.859746932983398, "logps/chosen": -4.215184211730957, "logps/rejected": -4.679158687591553, "loss": 3.3832, "rewards/accuracies": 1.0, "rewards/chosen": -42.15184020996094, "rewards/margins": 4.639748573303223, "rewards/rejected": -46.791587829589844, "step": 7034 }, { "epoch": 0.9579248366013072, "grad_norm": 42.68669367793745, "learning_rate": 4.307192088769973e-09, "logits/chosen": 14.732065200805664, "logits/rejected": 14.91379165649414, "logps/chosen": -4.712080001831055, "logps/rejected": -4.879834175109863, "loss": 3.4317, "rewards/accuracies": 0.75, "rewards/chosen": -47.12079620361328, "rewards/margins": 1.677546501159668, "rewards/rejected": -48.798343658447266, "step": 7035 }, { "epoch": 0.9580610021786492, "grad_norm": 41.74663271219429, "learning_rate": 4.2794086690859595e-09, "logits/chosen": 14.018997192382812, "logits/rejected": 14.96043586730957, "logps/chosen": -4.534564018249512, "logps/rejected": -5.013264179229736, "loss": 3.4726, "rewards/accuracies": 1.0, "rewards/chosen": -45.34564208984375, "rewards/margins": 4.786996841430664, "rewards/rejected": -50.13263702392578, "step": 7036 }, { "epoch": 0.9581971677559913, "grad_norm": 40.24249011228047, "learning_rate": 4.25171466574068e-09, "logits/chosen": 14.114824295043945, "logits/rejected": 14.613359451293945, "logps/chosen": -4.7700910568237305, "logps/rejected": -5.032415866851807, "loss": 4.1724, "rewards/accuracies": 0.5, "rewards/chosen": -47.700904846191406, "rewards/margins": 2.6232471466064453, "rewards/rejected": -50.324153900146484, "step": 7037 }, { "epoch": 0.9583333333333334, "grad_norm": 39.65320411807361, "learning_rate": 4.224110084991705e-09, "logits/chosen": 14.18734073638916, "logits/rejected": 13.84119987487793, "logps/chosen": -4.745879650115967, "logps/rejected": -4.734287261962891, "loss": 3.6437, "rewards/accuracies": 0.5, "rewards/chosen": -47.45879364013672, "rewards/margins": -0.1159219741821289, "rewards/rejected": -47.342872619628906, "step": 7038 }, { "epoch": 0.9584694989106753, "grad_norm": 37.685731371634645, "learning_rate": 4.1965949330765805e-09, "logits/chosen": 14.376228332519531, "logits/rejected": 14.86707878112793, "logps/chosen": -4.957677364349365, "logps/rejected": -4.921161651611328, "loss": 3.4125, "rewards/accuracies": 0.25, "rewards/chosen": -49.57677459716797, "rewards/margins": -0.3651552200317383, "rewards/rejected": -49.21161651611328, "step": 7039 }, { "epoch": 0.9586056644880174, "grad_norm": 39.366908695903255, "learning_rate": 4.169169216212598e-09, "logits/chosen": 14.049312591552734, "logits/rejected": 14.537785530090332, "logps/chosen": -4.455395221710205, "logps/rejected": -4.651092529296875, "loss": 4.2504, "rewards/accuracies": 0.75, "rewards/chosen": -44.553951263427734, "rewards/margins": 1.9569692611694336, "rewards/rejected": -46.510921478271484, "step": 7040 }, { "epoch": 0.9587418300653595, "grad_norm": 40.64602924186137, "learning_rate": 4.141832940596757e-09, "logits/chosen": 14.647745132446289, "logits/rejected": 14.584199905395508, "logps/chosen": -4.704082489013672, "logps/rejected": -4.994548797607422, "loss": 3.983, "rewards/accuracies": 0.75, "rewards/chosen": -47.04082489013672, "rewards/margins": 2.9046621322631836, "rewards/rejected": -49.94548797607422, "step": 7041 }, { "epoch": 0.9588779956427015, "grad_norm": 42.232117410146515, "learning_rate": 4.114586112405982e-09, "logits/chosen": 14.490713119506836, "logits/rejected": 14.865974426269531, "logps/chosen": -4.657441139221191, "logps/rejected": -5.004116058349609, "loss": 4.375, "rewards/accuracies": 0.75, "rewards/chosen": -46.57440948486328, "rewards/margins": 3.4667463302612305, "rewards/rejected": -50.041160583496094, "step": 7042 }, { "epoch": 0.9590141612200436, "grad_norm": 40.234416698571636, "learning_rate": 4.08742873779695e-09, "logits/chosen": 14.48361587524414, "logits/rejected": 14.036502838134766, "logps/chosen": -4.743844032287598, "logps/rejected": -4.661059379577637, "loss": 3.8308, "rewards/accuracies": 0.25, "rewards/chosen": -47.43844223022461, "rewards/margins": -0.8278493881225586, "rewards/rejected": -46.610591888427734, "step": 7043 }, { "epoch": 0.9591503267973857, "grad_norm": 40.189063168479024, "learning_rate": 4.06036082290595e-09, "logits/chosen": 14.696159362792969, "logits/rejected": 14.847160339355469, "logps/chosen": -4.86192512512207, "logps/rejected": -4.796876907348633, "loss": 4.0415, "rewards/accuracies": 0.5, "rewards/chosen": -48.6192512512207, "rewards/margins": -0.6504793167114258, "rewards/rejected": -47.968772888183594, "step": 7044 }, { "epoch": 0.9592864923747276, "grad_norm": 38.957365666778315, "learning_rate": 4.033382373849337e-09, "logits/chosen": 13.994377136230469, "logits/rejected": 15.391517639160156, "logps/chosen": -4.593860149383545, "logps/rejected": -4.9487385749816895, "loss": 3.244, "rewards/accuracies": 0.75, "rewards/chosen": -45.9385986328125, "rewards/margins": 3.5487852096557617, "rewards/rejected": -49.487388610839844, "step": 7045 }, { "epoch": 0.9594226579520697, "grad_norm": 43.891063375041895, "learning_rate": 4.0064933967230766e-09, "logits/chosen": 13.485774993896484, "logits/rejected": 14.347137451171875, "logps/chosen": -4.073921203613281, "logps/rejected": -4.602784156799316, "loss": 3.883, "rewards/accuracies": 1.0, "rewards/chosen": -40.73921203613281, "rewards/margins": 5.288629531860352, "rewards/rejected": -46.02784729003906, "step": 7046 }, { "epoch": 0.9595588235294118, "grad_norm": 41.549301482085866, "learning_rate": 3.979693897602976e-09, "logits/chosen": 14.707815170288086, "logits/rejected": 15.026105880737305, "logps/chosen": -4.741298675537109, "logps/rejected": -4.913121700286865, "loss": 3.8895, "rewards/accuracies": 0.5, "rewards/chosen": -47.412986755371094, "rewards/margins": 1.7182302474975586, "rewards/rejected": -49.13121795654297, "step": 7047 }, { "epoch": 0.9596949891067538, "grad_norm": 37.919831056208494, "learning_rate": 3.952983882544503e-09, "logits/chosen": 14.057472229003906, "logits/rejected": 14.736885070800781, "logps/chosen": -4.273599147796631, "logps/rejected": -4.822695255279541, "loss": 3.772, "rewards/accuracies": 1.0, "rewards/chosen": -42.735992431640625, "rewards/margins": 5.490962982177734, "rewards/rejected": -48.226951599121094, "step": 7048 }, { "epoch": 0.9598311546840959, "grad_norm": 50.177765867743815, "learning_rate": 3.92636335758314e-09, "logits/chosen": 13.717718124389648, "logits/rejected": 14.6273775100708, "logps/chosen": -4.249604225158691, "logps/rejected": -4.611209869384766, "loss": 3.665, "rewards/accuracies": 0.75, "rewards/chosen": -42.49604034423828, "rewards/margins": 3.6160573959350586, "rewards/rejected": -46.112098693847656, "step": 7049 }, { "epoch": 0.9599673202614379, "grad_norm": 41.77471949998786, "learning_rate": 3.899832328733943e-09, "logits/chosen": 14.517280578613281, "logits/rejected": 14.449535369873047, "logps/chosen": -4.93391227722168, "logps/rejected": -4.8760986328125, "loss": 3.7423, "rewards/accuracies": 0.5, "rewards/chosen": -49.3391227722168, "rewards/margins": -0.5781354904174805, "rewards/rejected": -48.760990142822266, "step": 7050 }, { "epoch": 0.9601034858387799, "grad_norm": 39.55801190702682, "learning_rate": 3.873390801991805e-09, "logits/chosen": 13.774656295776367, "logits/rejected": 14.511516571044922, "logps/chosen": -4.141382217407227, "logps/rejected": -4.632431983947754, "loss": 3.6199, "rewards/accuracies": 1.0, "rewards/chosen": -41.413822174072266, "rewards/margins": 4.910496711730957, "rewards/rejected": -46.324317932128906, "step": 7051 }, { "epoch": 0.960239651416122, "grad_norm": 35.57550577005852, "learning_rate": 3.8470387833314574e-09, "logits/chosen": 13.758261680603027, "logits/rejected": 14.016584396362305, "logps/chosen": -4.027861595153809, "logps/rejected": -4.62504768371582, "loss": 3.9172, "rewards/accuracies": 1.0, "rewards/chosen": -40.27861404418945, "rewards/margins": 5.971865653991699, "rewards/rejected": -46.25048065185547, "step": 7052 }, { "epoch": 0.9603758169934641, "grad_norm": 40.22353002315888, "learning_rate": 3.820776278707294e-09, "logits/chosen": 14.051261901855469, "logits/rejected": 14.403793334960938, "logps/chosen": -4.240579605102539, "logps/rejected": -4.459552764892578, "loss": 4.1882, "rewards/accuracies": 0.5, "rewards/chosen": -42.405792236328125, "rewards/margins": 2.1897363662719727, "rewards/rejected": -44.59552764892578, "step": 7053 }, { "epoch": 0.960511982570806, "grad_norm": 40.938382607419356, "learning_rate": 3.794603294053633e-09, "logits/chosen": 14.354595184326172, "logits/rejected": 14.389059066772461, "logps/chosen": -4.571020126342773, "logps/rejected": -4.683408737182617, "loss": 4.2364, "rewards/accuracies": 0.5, "rewards/chosen": -45.710205078125, "rewards/margins": 1.1238813400268555, "rewards/rejected": -46.834083557128906, "step": 7054 }, { "epoch": 0.9606481481481481, "grad_norm": 38.79517166607877, "learning_rate": 3.768519835284412e-09, "logits/chosen": 15.456225395202637, "logits/rejected": 15.867897033691406, "logps/chosen": -4.666091442108154, "logps/rejected": -5.480698585510254, "loss": 3.764, "rewards/accuracies": 1.0, "rewards/chosen": -46.660911560058594, "rewards/margins": 8.146074295043945, "rewards/rejected": -54.80698776245117, "step": 7055 }, { "epoch": 0.9607843137254902, "grad_norm": 39.67812783573395, "learning_rate": 3.742525908293403e-09, "logits/chosen": 14.269903182983398, "logits/rejected": 15.302400588989258, "logps/chosen": -4.4862565994262695, "logps/rejected": -5.240904808044434, "loss": 3.6947, "rewards/accuracies": 1.0, "rewards/chosen": -44.86256408691406, "rewards/margins": 7.54648494720459, "rewards/rejected": -52.40904998779297, "step": 7056 }, { "epoch": 0.9609204793028322, "grad_norm": 44.473175779144704, "learning_rate": 3.7166215189541328e-09, "logits/chosen": 14.36410140991211, "logits/rejected": 14.659128189086914, "logps/chosen": -4.691257476806641, "logps/rejected": -4.672649383544922, "loss": 3.4296, "rewards/accuracies": 0.5, "rewards/chosen": -46.91257858276367, "rewards/margins": -0.18608474731445312, "rewards/rejected": -46.72649383544922, "step": 7057 }, { "epoch": 0.9610566448801743, "grad_norm": 42.146107285842966, "learning_rate": 3.690806673120006e-09, "logits/chosen": 14.52623176574707, "logits/rejected": 14.535223007202148, "logps/chosen": -4.49434757232666, "logps/rejected": -4.613465309143066, "loss": 4.021, "rewards/accuracies": 0.5, "rewards/chosen": -44.94347381591797, "rewards/margins": 1.1911811828613281, "rewards/rejected": -46.1346549987793, "step": 7058 }, { "epoch": 0.9611928104575164, "grad_norm": 42.83698162348109, "learning_rate": 3.6650813766239573e-09, "logits/chosen": 14.085101127624512, "logits/rejected": 13.949865341186523, "logps/chosen": -4.297475814819336, "logps/rejected": -4.4478559494018555, "loss": 3.855, "rewards/accuracies": 0.75, "rewards/chosen": -42.974761962890625, "rewards/margins": 1.5037994384765625, "rewards/rejected": -44.47856140136719, "step": 7059 }, { "epoch": 0.9613289760348583, "grad_norm": 40.72677393583307, "learning_rate": 3.6394456352789815e-09, "logits/chosen": 14.151598930358887, "logits/rejected": 15.144012451171875, "logps/chosen": -4.543048858642578, "logps/rejected": -4.910614013671875, "loss": 3.7728, "rewards/accuracies": 0.75, "rewards/chosen": -45.43048858642578, "rewards/margins": 3.675654411315918, "rewards/rejected": -49.106143951416016, "step": 7060 }, { "epoch": 0.9614651416122004, "grad_norm": 37.090794168925655, "learning_rate": 3.6138994548776003e-09, "logits/chosen": 14.810517311096191, "logits/rejected": 14.837089538574219, "logps/chosen": -4.507694721221924, "logps/rejected": -4.877435684204102, "loss": 3.7964, "rewards/accuracies": 0.75, "rewards/chosen": -45.07695007324219, "rewards/margins": 3.6974096298217773, "rewards/rejected": -48.774356842041016, "step": 7061 }, { "epoch": 0.9616013071895425, "grad_norm": 45.176857738249595, "learning_rate": 3.588442841192174e-09, "logits/chosen": 14.384366035461426, "logits/rejected": 14.427361488342285, "logps/chosen": -4.413649559020996, "logps/rejected": -4.672752857208252, "loss": 3.5202, "rewards/accuracies": 0.75, "rewards/chosen": -44.13649368286133, "rewards/margins": 2.5910348892211914, "rewards/rejected": -46.7275276184082, "step": 7062 }, { "epoch": 0.9617374727668845, "grad_norm": 40.359815373481446, "learning_rate": 3.5630757999748574e-09, "logits/chosen": 13.992366790771484, "logits/rejected": 13.497723579406738, "logps/chosen": -4.519432067871094, "logps/rejected": -4.601056098937988, "loss": 4.2626, "rewards/accuracies": 0.75, "rewards/chosen": -45.19432067871094, "rewards/margins": 0.8162374496459961, "rewards/rejected": -46.01055908203125, "step": 7063 }, { "epoch": 0.9618736383442266, "grad_norm": 39.661428985878466, "learning_rate": 3.5377983369575536e-09, "logits/chosen": 14.834615707397461, "logits/rejected": 15.777219772338867, "logps/chosen": -4.813264846801758, "logps/rejected": -5.053819179534912, "loss": 4.1044, "rewards/accuracies": 0.75, "rewards/chosen": -48.13264465332031, "rewards/margins": 2.4055471420288086, "rewards/rejected": -50.53819274902344, "step": 7064 }, { "epoch": 0.9620098039215687, "grad_norm": 40.06830461498541, "learning_rate": 3.5126104578519165e-09, "logits/chosen": 14.788606643676758, "logits/rejected": 13.999845504760742, "logps/chosen": -4.612316131591797, "logps/rejected": -4.311226844787598, "loss": 4.3381, "rewards/accuracies": 0.25, "rewards/chosen": -46.12316131591797, "rewards/margins": -3.0108890533447266, "rewards/rejected": -43.112274169921875, "step": 7065 }, { "epoch": 0.9621459694989106, "grad_norm": 43.6149804015799, "learning_rate": 3.487512168349305e-09, "logits/chosen": 14.589742660522461, "logits/rejected": 14.607783317565918, "logps/chosen": -4.8396148681640625, "logps/rejected": -4.594042778015137, "loss": 4.1086, "rewards/accuracies": 0.5, "rewards/chosen": -48.396148681640625, "rewards/margins": -2.455716133117676, "rewards/rejected": -45.9404296875, "step": 7066 }, { "epoch": 0.9622821350762527, "grad_norm": 39.921997713578655, "learning_rate": 3.4625034741210034e-09, "logits/chosen": 13.811594009399414, "logits/rejected": 14.582194328308105, "logps/chosen": -4.4901652336120605, "logps/rejected": -4.786173343658447, "loss": 3.6199, "rewards/accuracies": 1.0, "rewards/chosen": -44.90165328979492, "rewards/margins": 2.9600791931152344, "rewards/rejected": -47.861732482910156, "step": 7067 }, { "epoch": 0.9624183006535948, "grad_norm": 36.304507944159404, "learning_rate": 3.437584380817782e-09, "logits/chosen": 14.861213684082031, "logits/rejected": 15.475271224975586, "logps/chosen": -4.45070743560791, "logps/rejected": -4.791491985321045, "loss": 3.5474, "rewards/accuracies": 0.75, "rewards/chosen": -44.50707244873047, "rewards/margins": 3.4078474044799805, "rewards/rejected": -47.914920806884766, "step": 7068 }, { "epoch": 0.9625544662309368, "grad_norm": 39.93118483461618, "learning_rate": 3.412754894070424e-09, "logits/chosen": 14.639785766601562, "logits/rejected": 14.660120964050293, "logps/chosen": -4.611581802368164, "logps/rejected": -4.477898120880127, "loss": 4.0115, "rewards/accuracies": 0.25, "rewards/chosen": -46.115814208984375, "rewards/margins": -1.3368349075317383, "rewards/rejected": -44.77898025512695, "step": 7069 }, { "epoch": 0.9626906318082789, "grad_norm": 44.68798380821348, "learning_rate": 3.3880150194892877e-09, "logits/chosen": 13.816949844360352, "logits/rejected": 14.037002563476562, "logps/chosen": -4.570904731750488, "logps/rejected": -4.751583576202393, "loss": 4.5256, "rewards/accuracies": 0.5, "rewards/chosen": -45.70904541015625, "rewards/margins": 1.806793212890625, "rewards/rejected": -47.515838623046875, "step": 7070 }, { "epoch": 0.9628267973856209, "grad_norm": 40.005697327831214, "learning_rate": 3.363364762664611e-09, "logits/chosen": 14.457828521728516, "logits/rejected": 15.728584289550781, "logps/chosen": -4.691264629364014, "logps/rejected": -5.352257251739502, "loss": 3.4147, "rewards/accuracies": 1.0, "rewards/chosen": -46.91265106201172, "rewards/margins": 6.609926223754883, "rewards/rejected": -53.52257537841797, "step": 7071 }, { "epoch": 0.9629629629629629, "grad_norm": 39.64804475719241, "learning_rate": 3.338804129166295e-09, "logits/chosen": 14.93734359741211, "logits/rejected": 14.777875900268555, "logps/chosen": -4.669565200805664, "logps/rejected": -4.514032363891602, "loss": 3.7734, "rewards/accuracies": 0.5, "rewards/chosen": -46.69565200805664, "rewards/margins": -1.5553321838378906, "rewards/rejected": -45.14031982421875, "step": 7072 }, { "epoch": 0.963099128540305, "grad_norm": 40.20545651330915, "learning_rate": 3.314333124544033e-09, "logits/chosen": 14.020560264587402, "logits/rejected": 14.418479919433594, "logps/chosen": -4.431333541870117, "logps/rejected": -4.5100531578063965, "loss": 4.4467, "rewards/accuracies": 0.75, "rewards/chosen": -44.313331604003906, "rewards/margins": 0.7871980667114258, "rewards/rejected": -45.10053253173828, "step": 7073 }, { "epoch": 0.9632352941176471, "grad_norm": 40.03751299418246, "learning_rate": 3.289951754327225e-09, "logits/chosen": 14.541499137878418, "logits/rejected": 15.324586868286133, "logps/chosen": -4.516319751739502, "logps/rejected": -4.676990032196045, "loss": 4.2185, "rewards/accuracies": 1.0, "rewards/chosen": -45.1631965637207, "rewards/margins": 1.6067018508911133, "rewards/rejected": -46.769901275634766, "step": 7074 }, { "epoch": 0.963371459694989, "grad_norm": 40.0715350401495, "learning_rate": 3.265660024025063e-09, "logits/chosen": 14.296198844909668, "logits/rejected": 14.878108978271484, "logps/chosen": -4.569100379943848, "logps/rejected": -4.641207695007324, "loss": 3.7798, "rewards/accuracies": 0.75, "rewards/chosen": -45.69100570678711, "rewards/margins": 0.7210750579833984, "rewards/rejected": -46.41208267211914, "step": 7075 }, { "epoch": 0.9635076252723311, "grad_norm": 41.93681638808718, "learning_rate": 3.2414579391264464e-09, "logits/chosen": 13.707018852233887, "logits/rejected": 14.74234390258789, "logps/chosen": -4.318779945373535, "logps/rejected": -4.837105751037598, "loss": 4.2307, "rewards/accuracies": 1.0, "rewards/chosen": -43.18779754638672, "rewards/margins": 5.183260917663574, "rewards/rejected": -48.371055603027344, "step": 7076 }, { "epoch": 0.9636437908496732, "grad_norm": 37.69894397787685, "learning_rate": 3.2173455051000665e-09, "logits/chosen": 14.820084571838379, "logits/rejected": 14.759872436523438, "logps/chosen": -4.735481262207031, "logps/rejected": -4.4408769607543945, "loss": 3.7971, "rewards/accuracies": 0.25, "rewards/chosen": -47.35481643676758, "rewards/margins": -2.946046829223633, "rewards/rejected": -44.40877151489258, "step": 7077 }, { "epoch": 0.9637799564270153, "grad_norm": 41.741060091524886, "learning_rate": 3.1933227273942763e-09, "logits/chosen": 14.145637512207031, "logits/rejected": 14.704258918762207, "logps/chosen": -4.491244316101074, "logps/rejected": -4.58928108215332, "loss": 3.8447, "rewards/accuracies": 1.0, "rewards/chosen": -44.91244125366211, "rewards/margins": 0.9803657531738281, "rewards/rejected": -45.89280700683594, "step": 7078 }, { "epoch": 0.9639161220043573, "grad_norm": 43.82295768254937, "learning_rate": 3.1693896114372677e-09, "logits/chosen": 13.750524520874023, "logits/rejected": 14.581949234008789, "logps/chosen": -4.377962112426758, "logps/rejected": -4.710566997528076, "loss": 3.8531, "rewards/accuracies": 0.5, "rewards/chosen": -43.77961730957031, "rewards/margins": 3.3260536193847656, "rewards/rejected": -47.105674743652344, "step": 7079 }, { "epoch": 0.9640522875816994, "grad_norm": 42.499375808608185, "learning_rate": 3.145546162636936e-09, "logits/chosen": 14.269725799560547, "logits/rejected": 14.53935718536377, "logps/chosen": -4.63529634475708, "logps/rejected": -4.697786331176758, "loss": 4.2746, "rewards/accuracies": 0.5, "rewards/chosen": -46.352962493896484, "rewards/margins": 0.6248989105224609, "rewards/rejected": -46.97785949707031, "step": 7080 }, { "epoch": 0.9641884531590414, "grad_norm": 42.09407149414657, "learning_rate": 3.1217923863808395e-09, "logits/chosen": 14.683492660522461, "logits/rejected": 15.148159980773926, "logps/chosen": -4.649874687194824, "logps/rejected": -4.980627536773682, "loss": 4.0204, "rewards/accuracies": 1.0, "rewards/chosen": -46.498748779296875, "rewards/margins": 3.307527542114258, "rewards/rejected": -49.8062744140625, "step": 7081 }, { "epoch": 0.9643246187363834, "grad_norm": 36.877936925173614, "learning_rate": 3.0981282880364167e-09, "logits/chosen": 13.87332534790039, "logits/rejected": 14.381084442138672, "logps/chosen": -4.192017078399658, "logps/rejected": -4.611020565032959, "loss": 4.0309, "rewards/accuracies": 1.0, "rewards/chosen": -41.92017364501953, "rewards/margins": 4.190034866333008, "rewards/rejected": -46.110206604003906, "step": 7082 }, { "epoch": 0.9644607843137255, "grad_norm": 38.05952363412219, "learning_rate": 3.074553872950725e-09, "logits/chosen": 15.164342880249023, "logits/rejected": 15.122979164123535, "logps/chosen": -5.067268371582031, "logps/rejected": -5.402165412902832, "loss": 3.6852, "rewards/accuracies": 1.0, "rewards/chosen": -50.67268753051758, "rewards/margins": 3.3489675521850586, "rewards/rejected": -54.02165222167969, "step": 7083 }, { "epoch": 0.9645969498910676, "grad_norm": 38.14882435248676, "learning_rate": 3.051069146450569e-09, "logits/chosen": 14.126025199890137, "logits/rejected": 14.631877899169922, "logps/chosen": -4.850951194763184, "logps/rejected": -4.938002586364746, "loss": 3.8984, "rewards/accuracies": 0.5, "rewards/chosen": -48.50951385498047, "rewards/margins": 0.8705120086669922, "rewards/rejected": -49.380027770996094, "step": 7084 }, { "epoch": 0.9647331154684096, "grad_norm": 40.1246097687896, "learning_rate": 3.027674113842593e-09, "logits/chosen": 14.434840202331543, "logits/rejected": 14.756160736083984, "logps/chosen": -4.45025634765625, "logps/rejected": -4.870367527008057, "loss": 3.6942, "rewards/accuracies": 1.0, "rewards/chosen": -44.5025634765625, "rewards/margins": 4.201107025146484, "rewards/rejected": -48.70367431640625, "step": 7085 }, { "epoch": 0.9648692810457516, "grad_norm": 38.04871026363336, "learning_rate": 3.004368780413058e-09, "logits/chosen": 14.366917610168457, "logits/rejected": 15.16165542602539, "logps/chosen": -4.580654144287109, "logps/rejected": -5.00154972076416, "loss": 3.6942, "rewards/accuracies": 0.75, "rewards/chosen": -45.806541442871094, "rewards/margins": 4.208952903747559, "rewards/rejected": -50.01549530029297, "step": 7086 }, { "epoch": 0.9650054466230937, "grad_norm": 40.966040288715995, "learning_rate": 2.981153151427973e-09, "logits/chosen": 14.021088600158691, "logits/rejected": 13.8348388671875, "logps/chosen": -4.456433296203613, "logps/rejected": -4.420598030090332, "loss": 3.8036, "rewards/accuracies": 0.25, "rewards/chosen": -44.5643310546875, "rewards/margins": -0.3583507537841797, "rewards/rejected": -44.20597839355469, "step": 7087 }, { "epoch": 0.9651416122004357, "grad_norm": 41.37951410067213, "learning_rate": 2.9580272321331423e-09, "logits/chosen": 14.876035690307617, "logits/rejected": 14.39660358428955, "logps/chosen": -5.012880802154541, "logps/rejected": -4.931624412536621, "loss": 3.9547, "rewards/accuracies": 0.5, "rewards/chosen": -50.128807067871094, "rewards/margins": -0.8125667572021484, "rewards/rejected": -49.31624221801758, "step": 7088 }, { "epoch": 0.9652777777777778, "grad_norm": 38.9899430438368, "learning_rate": 2.9349910277540304e-09, "logits/chosen": 14.333271026611328, "logits/rejected": 14.743885040283203, "logps/chosen": -4.441214561462402, "logps/rejected": -4.511737823486328, "loss": 3.5327, "rewards/accuracies": 0.5, "rewards/chosen": -44.41214370727539, "rewards/margins": 0.705235481262207, "rewards/rejected": -45.11737823486328, "step": 7089 }, { "epoch": 0.9654139433551199, "grad_norm": 42.249777195126214, "learning_rate": 2.9120445434958507e-09, "logits/chosen": 14.366244316101074, "logits/rejected": 14.867504119873047, "logps/chosen": -4.567327976226807, "logps/rejected": -4.858969688415527, "loss": 4.0287, "rewards/accuracies": 0.5, "rewards/chosen": -45.67327880859375, "rewards/margins": 2.9164161682128906, "rewards/rejected": -48.589698791503906, "step": 7090 }, { "epoch": 0.9655501089324618, "grad_norm": 39.976620493621105, "learning_rate": 2.8891877845436118e-09, "logits/chosen": 13.684560775756836, "logits/rejected": 14.036020278930664, "logps/chosen": -4.30527400970459, "logps/rejected": -4.5848388671875, "loss": 3.923, "rewards/accuracies": 0.75, "rewards/chosen": -43.05274200439453, "rewards/margins": 2.795651435852051, "rewards/rejected": -45.848392486572266, "step": 7091 }, { "epoch": 0.9656862745098039, "grad_norm": 36.77848529853867, "learning_rate": 2.866420756061938e-09, "logits/chosen": 14.097814559936523, "logits/rejected": 15.087301254272461, "logps/chosen": -4.480469703674316, "logps/rejected": -4.843416213989258, "loss": 3.6731, "rewards/accuracies": 1.0, "rewards/chosen": -44.80469512939453, "rewards/margins": 3.6294679641723633, "rewards/rejected": -48.43416213989258, "step": 7092 }, { "epoch": 0.965822440087146, "grad_norm": 39.19054779760959, "learning_rate": 2.8437434631952027e-09, "logits/chosen": 14.432297706604004, "logits/rejected": 14.855489730834961, "logps/chosen": -4.655653953552246, "logps/rejected": -4.863851547241211, "loss": 3.5093, "rewards/accuracies": 0.75, "rewards/chosen": -46.556541442871094, "rewards/margins": 2.0819740295410156, "rewards/rejected": -48.638511657714844, "step": 7093 }, { "epoch": 0.965958605664488, "grad_norm": 41.672778156433665, "learning_rate": 2.821155911067574e-09, "logits/chosen": 13.373456001281738, "logits/rejected": 15.736555099487305, "logps/chosen": -4.255465507507324, "logps/rejected": -5.187613010406494, "loss": 3.6585, "rewards/accuracies": 1.0, "rewards/chosen": -42.554656982421875, "rewards/margins": 9.321474075317383, "rewards/rejected": -51.876129150390625, "step": 7094 }, { "epoch": 0.9660947712418301, "grad_norm": 42.74614077697422, "learning_rate": 2.7986581047828805e-09, "logits/chosen": 13.932117462158203, "logits/rejected": 13.934799194335938, "logps/chosen": -4.277427673339844, "logps/rejected": -4.452114105224609, "loss": 4.2634, "rewards/accuracies": 0.5, "rewards/chosen": -42.77427673339844, "rewards/margins": 1.7468605041503906, "rewards/rejected": -44.521141052246094, "step": 7095 }, { "epoch": 0.9662309368191722, "grad_norm": 36.955688034484304, "learning_rate": 2.7762500494247e-09, "logits/chosen": 13.940530776977539, "logits/rejected": 14.532905578613281, "logps/chosen": -4.393857002258301, "logps/rejected": -4.887537002563477, "loss": 3.6404, "rewards/accuracies": 0.75, "rewards/chosen": -43.938568115234375, "rewards/margins": 4.936800956726074, "rewards/rejected": -48.8753662109375, "step": 7096 }, { "epoch": 0.9663671023965141, "grad_norm": 42.4731154982413, "learning_rate": 2.753931750056271e-09, "logits/chosen": 14.671092987060547, "logits/rejected": 15.338861465454102, "logps/chosen": -4.935314178466797, "logps/rejected": -5.07540225982666, "loss": 3.9778, "rewards/accuracies": 0.5, "rewards/chosen": -49.35314178466797, "rewards/margins": 1.4008769989013672, "rewards/rejected": -50.75402069091797, "step": 7097 }, { "epoch": 0.9665032679738562, "grad_norm": 40.79767949906788, "learning_rate": 2.7317032117206705e-09, "logits/chosen": 14.627256393432617, "logits/rejected": 14.9744291305542, "logps/chosen": -4.762292861938477, "logps/rejected": -5.033435821533203, "loss": 3.7226, "rewards/accuracies": 0.75, "rewards/chosen": -47.62293243408203, "rewards/margins": 2.7114267349243164, "rewards/rejected": -50.33435821533203, "step": 7098 }, { "epoch": 0.9666394335511983, "grad_norm": 37.687199917142934, "learning_rate": 2.7095644394405925e-09, "logits/chosen": 13.255828857421875, "logits/rejected": 14.25437068939209, "logps/chosen": -4.365100860595703, "logps/rejected": -4.861478805541992, "loss": 3.5519, "rewards/accuracies": 1.0, "rewards/chosen": -43.65100860595703, "rewards/margins": 4.963783264160156, "rewards/rejected": -48.61479187011719, "step": 7099 }, { "epoch": 0.9667755991285403, "grad_norm": 52.508674743563915, "learning_rate": 2.687515438218435e-09, "logits/chosen": 14.374427795410156, "logits/rejected": 15.16098690032959, "logps/chosen": -4.510486602783203, "logps/rejected": -4.750787734985352, "loss": 3.7033, "rewards/accuracies": 0.75, "rewards/chosen": -45.10486602783203, "rewards/margins": 2.4030141830444336, "rewards/rejected": -47.50788116455078, "step": 7100 }, { "epoch": 0.9669117647058824, "grad_norm": 41.895947271338414, "learning_rate": 2.6655562130363463e-09, "logits/chosen": 14.17490005493164, "logits/rejected": 14.301156997680664, "logps/chosen": -4.615248680114746, "logps/rejected": -4.482049942016602, "loss": 4.113, "rewards/accuracies": 0.25, "rewards/chosen": -46.15248489379883, "rewards/margins": -1.3319873809814453, "rewards/rejected": -44.82049560546875, "step": 7101 }, { "epoch": 0.9670479302832244, "grad_norm": 41.388284899331765, "learning_rate": 2.6436867688563127e-09, "logits/chosen": 13.888713836669922, "logits/rejected": 14.053945541381836, "logps/chosen": -4.807778835296631, "logps/rejected": -4.8642578125, "loss": 4.6046, "rewards/accuracies": 0.75, "rewards/chosen": -48.07778549194336, "rewards/margins": 0.564788818359375, "rewards/rejected": -48.642574310302734, "step": 7102 }, { "epoch": 0.9671840958605664, "grad_norm": 40.039134811016005, "learning_rate": 2.6219071106197587e-09, "logits/chosen": 13.969476699829102, "logits/rejected": 14.650026321411133, "logps/chosen": -4.638684272766113, "logps/rejected": -4.678380012512207, "loss": 4.0319, "rewards/accuracies": 0.5, "rewards/chosen": -46.3868408203125, "rewards/margins": 0.3969612121582031, "rewards/rejected": -46.78380584716797, "step": 7103 }, { "epoch": 0.9673202614379085, "grad_norm": 41.83129540778656, "learning_rate": 2.600217243248082e-09, "logits/chosen": 13.397884368896484, "logits/rejected": 13.66269302368164, "logps/chosen": -4.44477653503418, "logps/rejected": -4.447572708129883, "loss": 4.1369, "rewards/accuracies": 0.5, "rewards/chosen": -44.4477653503418, "rewards/margins": 0.027959823608398438, "rewards/rejected": -44.47572326660156, "step": 7104 }, { "epoch": 0.9674564270152506, "grad_norm": 40.31380818164233, "learning_rate": 2.5786171716422943e-09, "logits/chosen": 14.65341854095459, "logits/rejected": 15.36949348449707, "logps/chosen": -4.683457851409912, "logps/rejected": -4.942768096923828, "loss": 3.4704, "rewards/accuracies": 0.75, "rewards/chosen": -46.83457565307617, "rewards/margins": 2.593106269836426, "rewards/rejected": -49.42768096923828, "step": 7105 }, { "epoch": 0.9675925925925926, "grad_norm": 40.06869962317526, "learning_rate": 2.5571069006830704e-09, "logits/chosen": 14.90707778930664, "logits/rejected": 15.041557312011719, "logps/chosen": -4.748205184936523, "logps/rejected": -4.873459815979004, "loss": 3.7813, "rewards/accuracies": 0.75, "rewards/chosen": -47.482051849365234, "rewards/margins": 1.252542495727539, "rewards/rejected": -48.734596252441406, "step": 7106 }, { "epoch": 0.9677287581699346, "grad_norm": 38.720469799618904, "learning_rate": 2.5356864352307882e-09, "logits/chosen": 14.37060546875, "logits/rejected": 14.377311706542969, "logps/chosen": -4.0343122482299805, "logps/rejected": -4.126463890075684, "loss": 3.3098, "rewards/accuracies": 0.75, "rewards/chosen": -40.34312438964844, "rewards/margins": 0.9215154647827148, "rewards/rejected": -41.26464080810547, "step": 7107 }, { "epoch": 0.9678649237472767, "grad_norm": 42.704452334642866, "learning_rate": 2.5143557801256655e-09, "logits/chosen": 13.287071228027344, "logits/rejected": 13.523296356201172, "logps/chosen": -4.359006881713867, "logps/rejected": -4.527671813964844, "loss": 4.4451, "rewards/accuracies": 0.75, "rewards/chosen": -43.59007263183594, "rewards/margins": 1.6866493225097656, "rewards/rejected": -45.27671813964844, "step": 7108 }, { "epoch": 0.9680010893246187, "grad_norm": 42.29452662522005, "learning_rate": 2.493114940187491e-09, "logits/chosen": 14.851190567016602, "logits/rejected": 14.88746452331543, "logps/chosen": -4.640892028808594, "logps/rejected": -4.774192810058594, "loss": 4.4359, "rewards/accuracies": 0.75, "rewards/chosen": -46.40892791748047, "rewards/margins": 1.3330020904541016, "rewards/rejected": -47.74192810058594, "step": 7109 }, { "epoch": 0.9681372549019608, "grad_norm": 40.33797052215763, "learning_rate": 2.4719639202158026e-09, "logits/chosen": 15.177450180053711, "logits/rejected": 15.378666877746582, "logps/chosen": -4.423898220062256, "logps/rejected": -4.819788455963135, "loss": 3.3515, "rewards/accuracies": 0.75, "rewards/chosen": -44.238983154296875, "rewards/margins": 3.9589052200317383, "rewards/rejected": -48.1978874206543, "step": 7110 }, { "epoch": 0.9682734204793029, "grad_norm": 41.482748799544886, "learning_rate": 2.4509027249898893e-09, "logits/chosen": 14.719079971313477, "logits/rejected": 15.018718719482422, "logps/chosen": -4.904869556427002, "logps/rejected": -4.92076301574707, "loss": 4.1648, "rewards/accuracies": 0.5, "rewards/chosen": -49.04869842529297, "rewards/margins": 0.1589336395263672, "rewards/rejected": -49.2076301574707, "step": 7111 }, { "epoch": 0.9684095860566448, "grad_norm": 41.01777239608425, "learning_rate": 2.4299313592687e-09, "logits/chosen": 14.572548866271973, "logits/rejected": 14.543625831604004, "logps/chosen": -4.949648857116699, "logps/rejected": -5.034099578857422, "loss": 3.8488, "rewards/accuracies": 0.5, "rewards/chosen": -49.496490478515625, "rewards/margins": 0.8445062637329102, "rewards/rejected": -50.34099578857422, "step": 7112 }, { "epoch": 0.9685457516339869, "grad_norm": 50.55738198503732, "learning_rate": 2.4090498277908433e-09, "logits/chosen": 13.794086456298828, "logits/rejected": 13.785189628601074, "logps/chosen": -4.642012119293213, "logps/rejected": -4.6306633949279785, "loss": 4.4276, "rewards/accuracies": 0.25, "rewards/chosen": -46.42012023925781, "rewards/margins": -0.11348819732666016, "rewards/rejected": -46.30663299560547, "step": 7113 }, { "epoch": 0.968681917211329, "grad_norm": 44.86374397591845, "learning_rate": 2.3882581352747235e-09, "logits/chosen": 14.033493995666504, "logits/rejected": 15.242758750915527, "logps/chosen": -4.75816535949707, "logps/rejected": -4.994040489196777, "loss": 3.6899, "rewards/accuracies": 1.0, "rewards/chosen": -47.58164978027344, "rewards/margins": 2.3587560653686523, "rewards/rejected": -49.940406799316406, "step": 7114 }, { "epoch": 0.968818082788671, "grad_norm": 43.601789574516005, "learning_rate": 2.3675562864183595e-09, "logits/chosen": 14.579099655151367, "logits/rejected": 15.14980411529541, "logps/chosen": -4.627874851226807, "logps/rejected": -4.9401726722717285, "loss": 4.2086, "rewards/accuracies": 0.5, "rewards/chosen": -46.27874755859375, "rewards/margins": 3.1229801177978516, "rewards/rejected": -49.40172576904297, "step": 7115 }, { "epoch": 0.9689542483660131, "grad_norm": 38.84960107393457, "learning_rate": 2.346944285899477e-09, "logits/chosen": 14.989486694335938, "logits/rejected": 15.300920486450195, "logps/chosen": -4.523639678955078, "logps/rejected": -4.913210391998291, "loss": 3.1378, "rewards/accuracies": 0.75, "rewards/chosen": -45.23639678955078, "rewards/margins": 3.8957080841064453, "rewards/rejected": -49.13210678100586, "step": 7116 }, { "epoch": 0.9690904139433552, "grad_norm": 45.95312519421911, "learning_rate": 2.3264221383755942e-09, "logits/chosen": 14.746070861816406, "logits/rejected": 14.532705307006836, "logps/chosen": -4.790998458862305, "logps/rejected": -4.654694080352783, "loss": 4.5475, "rewards/accuracies": 0.25, "rewards/chosen": -47.90998840332031, "rewards/margins": -1.3630428314208984, "rewards/rejected": -46.54694366455078, "step": 7117 }, { "epoch": 0.9692265795206971, "grad_norm": 41.662332924584334, "learning_rate": 2.3059898484838468e-09, "logits/chosen": 14.637359619140625, "logits/rejected": 15.889765739440918, "logps/chosen": -4.734259128570557, "logps/rejected": -5.2036638259887695, "loss": 3.3474, "rewards/accuracies": 1.0, "rewards/chosen": -47.34259033203125, "rewards/margins": 4.694049835205078, "rewards/rejected": -52.03664016723633, "step": 7118 }, { "epoch": 0.9693627450980392, "grad_norm": 38.15730689398938, "learning_rate": 2.2856474208410305e-09, "logits/chosen": 13.716503143310547, "logits/rejected": 15.443414688110352, "logps/chosen": -4.524313926696777, "logps/rejected": -5.079825401306152, "loss": 3.4715, "rewards/accuracies": 0.75, "rewards/chosen": -45.243141174316406, "rewards/margins": 5.555112838745117, "rewards/rejected": -50.798255920410156, "step": 7119 }, { "epoch": 0.9694989106753813, "grad_norm": 40.61701297260157, "learning_rate": 2.2653948600437346e-09, "logits/chosen": 14.507010459899902, "logits/rejected": 14.154691696166992, "logps/chosen": -4.619473934173584, "logps/rejected": -4.553082466125488, "loss": 3.9414, "rewards/accuracies": 0.5, "rewards/chosen": -46.194740295410156, "rewards/margins": -0.6639118194580078, "rewards/rejected": -45.53083038330078, "step": 7120 }, { "epoch": 0.9696350762527233, "grad_norm": 43.51839441848799, "learning_rate": 2.24523217066821e-09, "logits/chosen": 14.80798053741455, "logits/rejected": 14.940875053405762, "logps/chosen": -4.95676326751709, "logps/rejected": -4.851929187774658, "loss": 3.8531, "rewards/accuracies": 0.75, "rewards/chosen": -49.56763458251953, "rewards/margins": -1.04833984375, "rewards/rejected": -48.51929473876953, "step": 7121 }, { "epoch": 0.9697712418300654, "grad_norm": 40.1219469263127, "learning_rate": 2.2251593572703233e-09, "logits/chosen": 14.49350357055664, "logits/rejected": 14.465263366699219, "logps/chosen": -4.754855155944824, "logps/rejected": -5.046475410461426, "loss": 3.66, "rewards/accuracies": 0.75, "rewards/chosen": -47.548553466796875, "rewards/margins": 2.9162073135375977, "rewards/rejected": -50.464759826660156, "step": 7122 }, { "epoch": 0.9699074074074074, "grad_norm": 45.111092552107934, "learning_rate": 2.2051764243856907e-09, "logits/chosen": 14.296686172485352, "logits/rejected": 14.485876083374023, "logps/chosen": -4.393174171447754, "logps/rejected": -4.64896297454834, "loss": 3.9682, "rewards/accuracies": 0.75, "rewards/chosen": -43.93173599243164, "rewards/margins": 2.557894706726074, "rewards/rejected": -46.48963165283203, "step": 7123 }, { "epoch": 0.9700435729847494, "grad_norm": 41.29193454242862, "learning_rate": 2.185283376529723e-09, "logits/chosen": 14.570077896118164, "logits/rejected": 14.679798126220703, "logps/chosen": -4.648348808288574, "logps/rejected": -4.860649108886719, "loss": 3.7507, "rewards/accuracies": 0.75, "rewards/chosen": -46.483489990234375, "rewards/margins": 2.1230030059814453, "rewards/rejected": -48.60649490356445, "step": 7124 }, { "epoch": 0.9701797385620915, "grad_norm": 39.05184238678071, "learning_rate": 2.1654802181972686e-09, "logits/chosen": 14.535222053527832, "logits/rejected": 15.390724182128906, "logps/chosen": -4.5543107986450195, "logps/rejected": -5.122418403625488, "loss": 3.8296, "rewards/accuracies": 0.75, "rewards/chosen": -45.54310607910156, "rewards/margins": 5.681081771850586, "rewards/rejected": -51.22418975830078, "step": 7125 }, { "epoch": 0.9703159041394336, "grad_norm": 42.21825589927819, "learning_rate": 2.1457669538631485e-09, "logits/chosen": 15.493257522583008, "logits/rejected": 15.492305755615234, "logps/chosen": -4.659900665283203, "logps/rejected": -4.870733261108398, "loss": 4.2657, "rewards/accuracies": 0.5, "rewards/chosen": -46.59900665283203, "rewards/margins": 2.108321189880371, "rewards/rejected": -48.70732879638672, "step": 7126 }, { "epoch": 0.9704520697167756, "grad_norm": 39.06508643432874, "learning_rate": 2.126143587981666e-09, "logits/chosen": 13.854948043823242, "logits/rejected": 14.065220832824707, "logps/chosen": -4.430172920227051, "logps/rejected": -4.59506368637085, "loss": 3.6569, "rewards/accuracies": 0.75, "rewards/chosen": -44.301734924316406, "rewards/margins": 1.6489019393920898, "rewards/rejected": -45.95063781738281, "step": 7127 }, { "epoch": 0.9705882352941176, "grad_norm": 39.668653682551266, "learning_rate": 2.106610124986874e-09, "logits/chosen": 14.257537841796875, "logits/rejected": 14.601303100585938, "logps/chosen": -4.5266618728637695, "logps/rejected": -4.7787604331970215, "loss": 3.4836, "rewards/accuracies": 0.75, "rewards/chosen": -45.26661682128906, "rewards/margins": 2.520984649658203, "rewards/rejected": -47.78760528564453, "step": 7128 }, { "epoch": 0.9707244008714597, "grad_norm": 38.1557289170659, "learning_rate": 2.0871665692925755e-09, "logits/chosen": 14.868453979492188, "logits/rejected": 15.152847290039062, "logps/chosen": -4.746260166168213, "logps/rejected": -5.020027160644531, "loss": 4.0504, "rewards/accuracies": 0.75, "rewards/chosen": -47.46260070800781, "rewards/margins": 2.7376651763916016, "rewards/rejected": -50.20026779174805, "step": 7129 }, { "epoch": 0.9708605664488017, "grad_norm": 37.24923047793195, "learning_rate": 2.0678129252921894e-09, "logits/chosen": 14.86085319519043, "logits/rejected": 14.802360534667969, "logps/chosen": -4.5395026206970215, "logps/rejected": -4.715598106384277, "loss": 3.1316, "rewards/accuracies": 0.5, "rewards/chosen": -45.395023345947266, "rewards/margins": 1.7609586715698242, "rewards/rejected": -47.155982971191406, "step": 7130 }, { "epoch": 0.9709967320261438, "grad_norm": 42.27086078107276, "learning_rate": 2.048549197358751e-09, "logits/chosen": 13.676765441894531, "logits/rejected": 14.373880386352539, "logps/chosen": -4.46767520904541, "logps/rejected": -5.009225368499756, "loss": 4.1922, "rewards/accuracies": 1.0, "rewards/chosen": -44.67675018310547, "rewards/margins": 5.415502548217773, "rewards/rejected": -50.092254638671875, "step": 7131 }, { "epoch": 0.9711328976034859, "grad_norm": 43.71738414426599, "learning_rate": 2.029375389845178e-09, "logits/chosen": 14.928289413452148, "logits/rejected": 15.09421157836914, "logps/chosen": -4.996260643005371, "logps/rejected": -4.906569957733154, "loss": 4.0188, "rewards/accuracies": 0.25, "rewards/chosen": -49.962608337402344, "rewards/margins": -0.8969087600708008, "rewards/rejected": -49.065696716308594, "step": 7132 }, { "epoch": 0.9712690631808278, "grad_norm": 38.06844701958223, "learning_rate": 2.0102915070838724e-09, "logits/chosen": 14.14809799194336, "logits/rejected": 14.455900192260742, "logps/chosen": -4.128705978393555, "logps/rejected": -4.348330020904541, "loss": 3.602, "rewards/accuracies": 0.75, "rewards/chosen": -41.28705978393555, "rewards/margins": 2.1962366104125977, "rewards/rejected": -43.483299255371094, "step": 7133 }, { "epoch": 0.9714052287581699, "grad_norm": 39.12006253184097, "learning_rate": 1.9912975533869836e-09, "logits/chosen": 15.0552978515625, "logits/rejected": 14.610766410827637, "logps/chosen": -4.962779998779297, "logps/rejected": -4.907815933227539, "loss": 4.1368, "rewards/accuracies": 0.25, "rewards/chosen": -49.62779998779297, "rewards/margins": -0.5496387481689453, "rewards/rejected": -49.078163146972656, "step": 7134 }, { "epoch": 0.971541394335512, "grad_norm": 43.33975706274306, "learning_rate": 1.9723935330464126e-09, "logits/chosen": 15.174615859985352, "logits/rejected": 15.186403274536133, "logps/chosen": -4.936762809753418, "logps/rejected": -4.622802734375, "loss": 4.114, "rewards/accuracies": 0.0, "rewards/chosen": -49.36763000488281, "rewards/margins": -3.1396055221557617, "rewards/rejected": -46.22802734375, "step": 7135 }, { "epoch": 0.971677559912854, "grad_norm": 39.62915415713999, "learning_rate": 1.9535794503336756e-09, "logits/chosen": 14.605981826782227, "logits/rejected": 14.606672286987305, "logps/chosen": -4.578235626220703, "logps/rejected": -4.775303363800049, "loss": 4.2715, "rewards/accuracies": 0.75, "rewards/chosen": -45.78235626220703, "rewards/margins": 1.9706754684448242, "rewards/rejected": -47.75303649902344, "step": 7136 }, { "epoch": 0.9718137254901961, "grad_norm": 38.185061592783136, "learning_rate": 1.93485530949995e-09, "logits/chosen": 14.28243637084961, "logits/rejected": 14.41622543334961, "logps/chosen": -4.569971084594727, "logps/rejected": -4.834777355194092, "loss": 4.0471, "rewards/accuracies": 0.75, "rewards/chosen": -45.699710845947266, "rewards/margins": 2.648061752319336, "rewards/rejected": -48.347774505615234, "step": 7137 }, { "epoch": 0.9719498910675382, "grad_norm": 44.015037708341886, "learning_rate": 1.916221114776073e-09, "logits/chosen": 14.930432319641113, "logits/rejected": 14.734682083129883, "logps/chosen": -4.427038192749023, "logps/rejected": -4.812264442443848, "loss": 4.1087, "rewards/accuracies": 1.0, "rewards/chosen": -44.2703857421875, "rewards/margins": 3.8522653579711914, "rewards/rejected": -48.12264633178711, "step": 7138 }, { "epoch": 0.9720860566448801, "grad_norm": 35.95873103129894, "learning_rate": 1.897676870372633e-09, "logits/chosen": 15.017149925231934, "logits/rejected": 15.078498840332031, "logps/chosen": -4.4976043701171875, "logps/rejected": -4.939210891723633, "loss": 3.3603, "rewards/accuracies": 0.75, "rewards/chosen": -44.976043701171875, "rewards/margins": 4.416065216064453, "rewards/rejected": -49.39210891723633, "step": 7139 }, { "epoch": 0.9722222222222222, "grad_norm": 39.09301649726991, "learning_rate": 1.8792225804798776e-09, "logits/chosen": 14.680669784545898, "logits/rejected": 14.380184173583984, "logps/chosen": -4.546194553375244, "logps/rejected": -4.766903400421143, "loss": 3.7089, "rewards/accuracies": 0.5, "rewards/chosen": -45.46194839477539, "rewards/margins": 2.2070865631103516, "rewards/rejected": -47.669036865234375, "step": 7140 }, { "epoch": 0.9723583877995643, "grad_norm": 44.09616814857598, "learning_rate": 1.8608582492676716e-09, "logits/chosen": 15.254446029663086, "logits/rejected": 14.662444114685059, "logps/chosen": -4.895313739776611, "logps/rejected": -4.6870317459106445, "loss": 4.4513, "rewards/accuracies": 0.5, "rewards/chosen": -48.9531364440918, "rewards/margins": -2.082821846008301, "rewards/rejected": -46.87031555175781, "step": 7141 }, { "epoch": 0.9724945533769063, "grad_norm": 44.204294491199796, "learning_rate": 1.8425838808855843e-09, "logits/chosen": 14.01030158996582, "logits/rejected": 14.148988723754883, "logps/chosen": -4.06182861328125, "logps/rejected": -4.495942115783691, "loss": 3.8856, "rewards/accuracies": 0.75, "rewards/chosen": -40.6182861328125, "rewards/margins": 4.341135025024414, "rewards/rejected": -44.95942306518555, "step": 7142 }, { "epoch": 0.9726307189542484, "grad_norm": 39.55463532859084, "learning_rate": 1.8243994794628459e-09, "logits/chosen": 14.543094635009766, "logits/rejected": 14.760082244873047, "logps/chosen": -4.343843460083008, "logps/rejected": -4.637004852294922, "loss": 3.4483, "rewards/accuracies": 0.75, "rewards/chosen": -43.43843078613281, "rewards/margins": 2.9316225051879883, "rewards/rejected": -46.370052337646484, "step": 7143 }, { "epoch": 0.9727668845315904, "grad_norm": 40.95946322690579, "learning_rate": 1.8063050491084364e-09, "logits/chosen": 14.87154769897461, "logits/rejected": 15.174827575683594, "logps/chosen": -4.570509433746338, "logps/rejected": -4.842497825622559, "loss": 3.7473, "rewards/accuracies": 0.5, "rewards/chosen": -45.70509338378906, "rewards/margins": 2.7198877334594727, "rewards/rejected": -48.42498016357422, "step": 7144 }, { "epoch": 0.9729030501089324, "grad_norm": 35.14662457746003, "learning_rate": 1.7883005939109075e-09, "logits/chosen": 14.332271575927734, "logits/rejected": 14.73403549194336, "logps/chosen": -4.8252668380737305, "logps/rejected": -5.0733489990234375, "loss": 3.3721, "rewards/accuracies": 1.0, "rewards/chosen": -48.25266647338867, "rewards/margins": 2.4808225631713867, "rewards/rejected": -50.733489990234375, "step": 7145 }, { "epoch": 0.9730392156862745, "grad_norm": 43.02300399581941, "learning_rate": 1.770386117938516e-09, "logits/chosen": 13.552467346191406, "logits/rejected": 14.288063049316406, "logps/chosen": -4.581313133239746, "logps/rejected": -4.83814811706543, "loss": 4.0887, "rewards/accuracies": 0.75, "rewards/chosen": -45.81312942504883, "rewards/margins": 2.5683507919311523, "rewards/rejected": -48.3814811706543, "step": 7146 }, { "epoch": 0.9731753812636166, "grad_norm": 40.2940620902827, "learning_rate": 1.7525616252391351e-09, "logits/chosen": 14.264396667480469, "logits/rejected": 14.218864440917969, "logps/chosen": -4.652407646179199, "logps/rejected": -4.569772243499756, "loss": 3.5962, "rewards/accuracies": 0.5, "rewards/chosen": -46.524078369140625, "rewards/margins": -0.82635498046875, "rewards/rejected": -45.697723388671875, "step": 7147 }, { "epoch": 0.9733115468409586, "grad_norm": 41.003715141902035, "learning_rate": 1.7348271198404318e-09, "logits/chosen": 14.765264511108398, "logits/rejected": 15.051224708557129, "logps/chosen": -4.882497787475586, "logps/rejected": -4.82839298248291, "loss": 3.5667, "rewards/accuracies": 0.5, "rewards/chosen": -48.824981689453125, "rewards/margins": -0.5410499572753906, "rewards/rejected": -48.28392791748047, "step": 7148 }, { "epoch": 0.9734477124183006, "grad_norm": 47.601387378920265, "learning_rate": 1.7171826057496452e-09, "logits/chosen": 14.587406158447266, "logits/rejected": 14.764595031738281, "logps/chosen": -4.482367992401123, "logps/rejected": -4.461599349975586, "loss": 3.9834, "rewards/accuracies": 0.5, "rewards/chosen": -44.82367706298828, "rewards/margins": -0.20768260955810547, "rewards/rejected": -44.615997314453125, "step": 7149 }, { "epoch": 0.9735838779956427, "grad_norm": 41.6258899801344, "learning_rate": 1.699628086953675e-09, "logits/chosen": 14.897089958190918, "logits/rejected": 14.616456985473633, "logps/chosen": -4.6285505294799805, "logps/rejected": -4.7614426612854, "loss": 4.1757, "rewards/accuracies": 1.0, "rewards/chosen": -46.28550720214844, "rewards/margins": 1.3289165496826172, "rewards/rejected": -47.61442565917969, "step": 7150 }, { "epoch": 0.9737200435729847, "grad_norm": 40.673401079972415, "learning_rate": 1.6821635674191259e-09, "logits/chosen": 14.371550559997559, "logits/rejected": 14.481864929199219, "logps/chosen": -4.351190567016602, "logps/rejected": -4.476224899291992, "loss": 3.7366, "rewards/accuracies": 0.75, "rewards/chosen": -43.51190948486328, "rewards/margins": 1.2503385543823242, "rewards/rejected": -44.762245178222656, "step": 7151 }, { "epoch": 0.9738562091503268, "grad_norm": 39.702191266099426, "learning_rate": 1.6647890510922191e-09, "logits/chosen": 14.833871841430664, "logits/rejected": 14.257524490356445, "logps/chosen": -4.529163837432861, "logps/rejected": -4.476554870605469, "loss": 3.7418, "rewards/accuracies": 0.5, "rewards/chosen": -45.2916374206543, "rewards/margins": -0.5260915756225586, "rewards/rejected": -44.76554489135742, "step": 7152 }, { "epoch": 0.9739923747276689, "grad_norm": 41.6648249273775, "learning_rate": 1.6475045418989253e-09, "logits/chosen": 13.907180786132812, "logits/rejected": 14.28860855102539, "logps/chosen": -4.560328960418701, "logps/rejected": -4.654163837432861, "loss": 4.0883, "rewards/accuracies": 0.5, "rewards/chosen": -45.60328674316406, "rewards/margins": 0.9383516311645508, "rewards/rejected": -46.54164123535156, "step": 7153 }, { "epoch": 0.974128540305011, "grad_norm": 40.70228324347684, "learning_rate": 1.630310043744787e-09, "logits/chosen": 13.995743751525879, "logits/rejected": 15.433309555053711, "logps/chosen": -4.781109809875488, "logps/rejected": -5.179266452789307, "loss": 4.0622, "rewards/accuracies": 0.75, "rewards/chosen": -47.81109619140625, "rewards/margins": 3.9815673828125, "rewards/rejected": -51.79266357421875, "step": 7154 }, { "epoch": 0.9742647058823529, "grad_norm": 44.96450338389998, "learning_rate": 1.6132055605150518e-09, "logits/chosen": 14.086746215820312, "logits/rejected": 14.283807754516602, "logps/chosen": -4.495896339416504, "logps/rejected": -4.638523101806641, "loss": 3.8554, "rewards/accuracies": 0.5, "rewards/chosen": -44.958961486816406, "rewards/margins": 1.4262714385986328, "rewards/rejected": -46.38523483276367, "step": 7155 }, { "epoch": 0.974400871459695, "grad_norm": 42.74888800894088, "learning_rate": 1.5961910960746282e-09, "logits/chosen": 14.049365997314453, "logits/rejected": 14.948307037353516, "logps/chosen": -4.652321815490723, "logps/rejected": -4.688824653625488, "loss": 4.0243, "rewards/accuracies": 0.5, "rewards/chosen": -46.523216247558594, "rewards/margins": 0.3650321960449219, "rewards/rejected": -46.88824462890625, "step": 7156 }, { "epoch": 0.9745370370370371, "grad_norm": 43.27798016710719, "learning_rate": 1.5792666542680855e-09, "logits/chosen": 13.903295516967773, "logits/rejected": 13.925651550292969, "logps/chosen": -4.5025105476379395, "logps/rejected": -4.519534587860107, "loss": 4.2661, "rewards/accuracies": 0.25, "rewards/chosen": -45.025108337402344, "rewards/margins": 0.1702404022216797, "rewards/rejected": -45.19534683227539, "step": 7157 }, { "epoch": 0.9746732026143791, "grad_norm": 41.263192810307295, "learning_rate": 1.5624322389196087e-09, "logits/chosen": 14.014183044433594, "logits/rejected": 15.429350852966309, "logps/chosen": -4.70175838470459, "logps/rejected": -5.15187931060791, "loss": 4.1158, "rewards/accuracies": 1.0, "rewards/chosen": -47.017581939697266, "rewards/margins": 4.501206398010254, "rewards/rejected": -51.51879119873047, "step": 7158 }, { "epoch": 0.9748093681917211, "grad_norm": 38.113903236956105, "learning_rate": 1.5456878538330443e-09, "logits/chosen": 15.134970664978027, "logits/rejected": 15.519476890563965, "logps/chosen": -4.580025672912598, "logps/rejected": -4.966307640075684, "loss": 3.8346, "rewards/accuracies": 1.0, "rewards/chosen": -45.800254821777344, "rewards/margins": 3.8628158569335938, "rewards/rejected": -49.66307067871094, "step": 7159 }, { "epoch": 0.9749455337690632, "grad_norm": 41.19903890001209, "learning_rate": 1.529033502792032e-09, "logits/chosen": 15.264154434204102, "logits/rejected": 14.834969520568848, "logps/chosen": -4.561457633972168, "logps/rejected": -4.644071102142334, "loss": 3.9264, "rewards/accuracies": 0.5, "rewards/chosen": -45.61457824707031, "rewards/margins": 0.8261318206787109, "rewards/rejected": -46.440711975097656, "step": 7160 }, { "epoch": 0.9750816993464052, "grad_norm": 37.13483054297759, "learning_rate": 1.5124691895596508e-09, "logits/chosen": 14.11154556274414, "logits/rejected": 15.354394912719727, "logps/chosen": -4.420771598815918, "logps/rejected": -4.929144859313965, "loss": 3.9538, "rewards/accuracies": 1.0, "rewards/chosen": -44.20771408081055, "rewards/margins": 5.083731651306152, "rewards/rejected": -49.29144287109375, "step": 7161 }, { "epoch": 0.9752178649237473, "grad_norm": 38.08084416405564, "learning_rate": 1.4959949178787734e-09, "logits/chosen": 14.034543991088867, "logits/rejected": 14.349836349487305, "logps/chosen": -4.513601303100586, "logps/rejected": -4.484244346618652, "loss": 3.8288, "rewards/accuracies": 0.5, "rewards/chosen": -45.136009216308594, "rewards/margins": -0.2935676574707031, "rewards/rejected": -44.84244155883789, "step": 7162 }, { "epoch": 0.9753540305010894, "grad_norm": 36.848759792481964, "learning_rate": 1.4796106914719332e-09, "logits/chosen": 14.169353485107422, "logits/rejected": 14.694730758666992, "logps/chosen": -4.5796990394592285, "logps/rejected": -4.79356575012207, "loss": 3.7491, "rewards/accuracies": 0.5, "rewards/chosen": -45.79698944091797, "rewards/margins": 2.138669967651367, "rewards/rejected": -47.93566131591797, "step": 7163 }, { "epoch": 0.9754901960784313, "grad_norm": 40.482336221966285, "learning_rate": 1.4633165140412796e-09, "logits/chosen": 14.55104923248291, "logits/rejected": 14.15871810913086, "logps/chosen": -4.49574613571167, "logps/rejected": -4.800559043884277, "loss": 3.9011, "rewards/accuracies": 0.75, "rewards/chosen": -44.95745849609375, "rewards/margins": 3.048128128051758, "rewards/rejected": -48.00558853149414, "step": 7164 }, { "epoch": 0.9756263616557734, "grad_norm": 43.229961967278804, "learning_rate": 1.4471123892685789e-09, "logits/chosen": 14.390853881835938, "logits/rejected": 14.850532531738281, "logps/chosen": -4.882924556732178, "logps/rejected": -4.825822830200195, "loss": 3.6593, "rewards/accuracies": 0.5, "rewards/chosen": -48.829246520996094, "rewards/margins": -0.5710172653198242, "rewards/rejected": -48.25822830200195, "step": 7165 }, { "epoch": 0.9757625272331155, "grad_norm": 40.23330866587597, "learning_rate": 1.430998320815302e-09, "logits/chosen": 14.423341751098633, "logits/rejected": 15.280759811401367, "logps/chosen": -4.448869228363037, "logps/rejected": -4.539032936096191, "loss": 3.923, "rewards/accuracies": 0.75, "rewards/chosen": -44.48869323730469, "rewards/margins": 0.9016389846801758, "rewards/rejected": -45.39033126831055, "step": 7166 }, { "epoch": 0.9758986928104575, "grad_norm": 38.97368234774421, "learning_rate": 1.4149743123225365e-09, "logits/chosen": 14.995336532592773, "logits/rejected": 15.111381530761719, "logps/chosen": -4.671911239624023, "logps/rejected": -4.886270999908447, "loss": 3.5229, "rewards/accuracies": 0.75, "rewards/chosen": -46.71910858154297, "rewards/margins": 2.1435985565185547, "rewards/rejected": -48.862709045410156, "step": 7167 }, { "epoch": 0.9760348583877996, "grad_norm": 41.44916614362591, "learning_rate": 1.3990403674111194e-09, "logits/chosen": 14.193169593811035, "logits/rejected": 14.234879493713379, "logps/chosen": -4.068879127502441, "logps/rejected": -4.278127670288086, "loss": 3.5104, "rewards/accuracies": 0.75, "rewards/chosen": -40.68879318237305, "rewards/margins": 2.092484474182129, "rewards/rejected": -42.781280517578125, "step": 7168 }, { "epoch": 0.9761710239651417, "grad_norm": 35.58734513556725, "learning_rate": 1.3831964896813709e-09, "logits/chosen": 14.591601371765137, "logits/rejected": 15.055782318115234, "logps/chosen": -4.5204362869262695, "logps/rejected": -4.720789909362793, "loss": 3.5853, "rewards/accuracies": 0.5, "rewards/chosen": -45.20436477661133, "rewards/margins": 2.003537178039551, "rewards/rejected": -47.20790100097656, "step": 7169 }, { "epoch": 0.9763071895424836, "grad_norm": 41.91777909572443, "learning_rate": 1.36744268271336e-09, "logits/chosen": 15.074102401733398, "logits/rejected": 13.835187911987305, "logps/chosen": -4.754690170288086, "logps/rejected": -4.296356201171875, "loss": 4.3153, "rewards/accuracies": 0.25, "rewards/chosen": -47.546897888183594, "rewards/margins": -4.583338737487793, "rewards/rejected": -42.96356201171875, "step": 7170 }, { "epoch": 0.9764433551198257, "grad_norm": 40.497304217975255, "learning_rate": 1.3517789500668175e-09, "logits/chosen": 14.571382522583008, "logits/rejected": 15.024125099182129, "logps/chosen": -4.700214862823486, "logps/rejected": -4.78654670715332, "loss": 3.9966, "rewards/accuracies": 0.75, "rewards/chosen": -47.00214767456055, "rewards/margins": 0.8633184432983398, "rewards/rejected": -47.8654670715332, "step": 7171 }, { "epoch": 0.9765795206971678, "grad_norm": 36.18200958014597, "learning_rate": 1.3362052952810897e-09, "logits/chosen": 14.237228393554688, "logits/rejected": 14.675989151000977, "logps/chosen": -4.311258316040039, "logps/rejected": -4.768850326538086, "loss": 3.702, "rewards/accuracies": 0.75, "rewards/chosen": -43.11258316040039, "rewards/margins": 4.575920104980469, "rewards/rejected": -47.68850326538086, "step": 7172 }, { "epoch": 0.9767156862745098, "grad_norm": 41.86626030946709, "learning_rate": 1.3207217218751398e-09, "logits/chosen": 14.873943328857422, "logits/rejected": 15.199045181274414, "logps/chosen": -4.3818817138671875, "logps/rejected": -4.730325698852539, "loss": 3.9638, "rewards/accuracies": 0.75, "rewards/chosen": -43.818817138671875, "rewards/margins": 3.484438896179199, "rewards/rejected": -47.303253173828125, "step": 7173 }, { "epoch": 0.9768518518518519, "grad_norm": 40.305399772326005, "learning_rate": 1.30532823334768e-09, "logits/chosen": 14.52467155456543, "logits/rejected": 15.022659301757812, "logps/chosen": -4.59080696105957, "logps/rejected": -5.014441013336182, "loss": 3.9027, "rewards/accuracies": 1.0, "rewards/chosen": -45.90806579589844, "rewards/margins": 4.2363433837890625, "rewards/rejected": -50.1444091796875, "step": 7174 }, { "epoch": 0.976988017429194, "grad_norm": 42.04858111860931, "learning_rate": 1.290024833176906e-09, "logits/chosen": 14.941054344177246, "logits/rejected": 14.814348220825195, "logps/chosen": -4.487031936645508, "logps/rejected": -4.714178085327148, "loss": 3.9114, "rewards/accuracies": 0.5, "rewards/chosen": -44.870323181152344, "rewards/margins": 2.271458625793457, "rewards/rejected": -47.14177703857422, "step": 7175 }, { "epoch": 0.9771241830065359, "grad_norm": 39.563781201217246, "learning_rate": 1.274811524820807e-09, "logits/chosen": 13.706406593322754, "logits/rejected": 14.299062728881836, "logps/chosen": -4.450936317443848, "logps/rejected": -4.740800857543945, "loss": 3.8258, "rewards/accuracies": 0.75, "rewards/chosen": -44.509361267089844, "rewards/margins": 2.8986473083496094, "rewards/rejected": -47.40800857543945, "step": 7176 }, { "epoch": 0.977260348583878, "grad_norm": 46.384321882533555, "learning_rate": 1.2596883117169444e-09, "logits/chosen": 13.991025924682617, "logits/rejected": 14.667304992675781, "logps/chosen": -4.4208831787109375, "logps/rejected": -4.741369247436523, "loss": 4.249, "rewards/accuracies": 1.0, "rewards/chosen": -44.208831787109375, "rewards/margins": 3.2048606872558594, "rewards/rejected": -47.413692474365234, "step": 7177 }, { "epoch": 0.9773965141612201, "grad_norm": 38.97202396731434, "learning_rate": 1.2446551972825403e-09, "logits/chosen": 14.228862762451172, "logits/rejected": 14.464369773864746, "logps/chosen": -4.268473148345947, "logps/rejected": -4.489284038543701, "loss": 3.6139, "rewards/accuracies": 1.0, "rewards/chosen": -42.684730529785156, "rewards/margins": 2.2081079483032227, "rewards/rejected": -44.89284133911133, "step": 7178 }, { "epoch": 0.9775326797385621, "grad_norm": 41.34834453662257, "learning_rate": 1.2297121849143889e-09, "logits/chosen": 14.54240894317627, "logits/rejected": 13.92704963684082, "logps/chosen": -4.341203689575195, "logps/rejected": -4.04205322265625, "loss": 4.1394, "rewards/accuracies": 0.5, "rewards/chosen": -43.41204071044922, "rewards/margins": -2.991506576538086, "rewards/rejected": -40.4205322265625, "step": 7179 }, { "epoch": 0.9776688453159041, "grad_norm": 37.85016363814113, "learning_rate": 1.2148592779890776e-09, "logits/chosen": 14.30095386505127, "logits/rejected": 14.121894836425781, "logps/chosen": -4.614675521850586, "logps/rejected": -4.752043724060059, "loss": 3.8796, "rewards/accuracies": 0.5, "rewards/chosen": -46.146759033203125, "rewards/margins": 1.3736791610717773, "rewards/rejected": -47.52043533325195, "step": 7180 }, { "epoch": 0.9778050108932462, "grad_norm": 39.739803584620155, "learning_rate": 1.2000964798627222e-09, "logits/chosen": 14.448884963989258, "logits/rejected": 14.483773231506348, "logps/chosen": -4.531450271606445, "logps/rejected": -4.649538993835449, "loss": 4.1788, "rewards/accuracies": 0.5, "rewards/chosen": -45.31450271606445, "rewards/margins": 1.1808929443359375, "rewards/rejected": -46.49539566040039, "step": 7181 }, { "epoch": 0.9779411764705882, "grad_norm": 40.98199945028119, "learning_rate": 1.1854237938710098e-09, "logits/chosen": 14.012107849121094, "logits/rejected": 14.813349723815918, "logps/chosen": -4.617559432983398, "logps/rejected": -4.894975662231445, "loss": 3.827, "rewards/accuracies": 0.75, "rewards/chosen": -46.17559051513672, "rewards/margins": 2.7741641998291016, "rewards/rejected": -48.94975662231445, "step": 7182 }, { "epoch": 0.9780773420479303, "grad_norm": 43.500328257196465, "learning_rate": 1.1708412233294662e-09, "logits/chosen": 15.061052322387695, "logits/rejected": 14.936596870422363, "logps/chosen": -4.462407112121582, "logps/rejected": -4.631772041320801, "loss": 3.9289, "rewards/accuracies": 0.25, "rewards/chosen": -44.62406921386719, "rewards/margins": 1.6936521530151367, "rewards/rejected": -46.317718505859375, "step": 7183 }, { "epoch": 0.9782135076252724, "grad_norm": 42.71960140413861, "learning_rate": 1.1563487715331445e-09, "logits/chosen": 13.833526611328125, "logits/rejected": 14.937277793884277, "logps/chosen": -4.519692420959473, "logps/rejected": -4.842917442321777, "loss": 3.7815, "rewards/accuracies": 0.75, "rewards/chosen": -45.196929931640625, "rewards/margins": 3.2322473526000977, "rewards/rejected": -48.429176330566406, "step": 7184 }, { "epoch": 0.9783496732026143, "grad_norm": 42.267018862125184, "learning_rate": 1.1419464417566692e-09, "logits/chosen": 14.599405288696289, "logits/rejected": 14.780157089233398, "logps/chosen": -4.254611968994141, "logps/rejected": -4.3138427734375, "loss": 4.2285, "rewards/accuracies": 0.5, "rewards/chosen": -42.546119689941406, "rewards/margins": 0.5923070907592773, "rewards/rejected": -43.138423919677734, "step": 7185 }, { "epoch": 0.9784858387799564, "grad_norm": 40.55346755315985, "learning_rate": 1.1276342372543713e-09, "logits/chosen": 14.309186935424805, "logits/rejected": 14.471939086914062, "logps/chosen": -4.442936420440674, "logps/rejected": -4.77529239654541, "loss": 3.0324, "rewards/accuracies": 0.5, "rewards/chosen": -44.42936706542969, "rewards/margins": 3.3235626220703125, "rewards/rejected": -47.752925872802734, "step": 7186 }, { "epoch": 0.9786220043572985, "grad_norm": 41.10566606187016, "learning_rate": 1.113412161260241e-09, "logits/chosen": 14.86190128326416, "logits/rejected": 14.881521224975586, "logps/chosen": -4.755636692047119, "logps/rejected": -4.824117183685303, "loss": 4.1761, "rewards/accuracies": 0.5, "rewards/chosen": -47.556365966796875, "rewards/margins": 0.6848058700561523, "rewards/rejected": -48.241172790527344, "step": 7187 }, { "epoch": 0.9787581699346405, "grad_norm": 40.754063307376235, "learning_rate": 1.0992802169878411e-09, "logits/chosen": 14.41498851776123, "logits/rejected": 15.060626983642578, "logps/chosen": -4.516014099121094, "logps/rejected": -4.818767070770264, "loss": 4.134, "rewards/accuracies": 0.75, "rewards/chosen": -45.16014099121094, "rewards/margins": 3.027529716491699, "rewards/rejected": -48.18767166137695, "step": 7188 }, { "epoch": 0.9788943355119826, "grad_norm": 43.71748386687192, "learning_rate": 1.08523840763044e-09, "logits/chosen": 14.698528289794922, "logits/rejected": 15.346142768859863, "logps/chosen": -4.460789680480957, "logps/rejected": -4.989891052246094, "loss": 4.0678, "rewards/accuracies": 1.0, "rewards/chosen": -44.6078987121582, "rewards/margins": 5.291008949279785, "rewards/rejected": -49.89890670776367, "step": 7189 }, { "epoch": 0.9790305010893247, "grad_norm": 40.60377478706291, "learning_rate": 1.0712867363608769e-09, "logits/chosen": 15.730581283569336, "logits/rejected": 14.82904052734375, "logps/chosen": -5.030542373657227, "logps/rejected": -5.07093620300293, "loss": 3.3836, "rewards/accuracies": 0.5, "rewards/chosen": -50.305419921875, "rewards/margins": 0.40393829345703125, "rewards/rejected": -50.70935821533203, "step": 7190 }, { "epoch": 0.9791666666666666, "grad_norm": 36.5684139016085, "learning_rate": 1.0574252063316524e-09, "logits/chosen": 14.096186637878418, "logits/rejected": 14.229292869567871, "logps/chosen": -4.462973594665527, "logps/rejected": -4.90067720413208, "loss": 3.6912, "rewards/accuracies": 1.0, "rewards/chosen": -44.629737854003906, "rewards/margins": 4.377036094665527, "rewards/rejected": -49.00677490234375, "step": 7191 }, { "epoch": 0.9793028322440087, "grad_norm": 40.62719354487317, "learning_rate": 1.043653820674928e-09, "logits/chosen": 14.630343437194824, "logits/rejected": 14.681072235107422, "logps/chosen": -4.714321136474609, "logps/rejected": -4.803161144256592, "loss": 4.1386, "rewards/accuracies": 0.25, "rewards/chosen": -47.143211364746094, "rewards/margins": 0.8883991241455078, "rewards/rejected": -48.03160858154297, "step": 7192 }, { "epoch": 0.9794389978213508, "grad_norm": 37.82457984961381, "learning_rate": 1.029972582502392e-09, "logits/chosen": 14.713886260986328, "logits/rejected": 15.229071617126465, "logps/chosen": -4.306416988372803, "logps/rejected": -4.928360939025879, "loss": 3.7193, "rewards/accuracies": 1.0, "rewards/chosen": -43.064170837402344, "rewards/margins": 6.219444274902344, "rewards/rejected": -49.28361129760742, "step": 7193 }, { "epoch": 0.9795751633986928, "grad_norm": 40.89187083386489, "learning_rate": 1.0163814949054827e-09, "logits/chosen": 14.292874336242676, "logits/rejected": 14.717378616333008, "logps/chosen": -4.676456451416016, "logps/rejected": -4.873289108276367, "loss": 3.7635, "rewards/accuracies": 0.75, "rewards/chosen": -46.76456069946289, "rewards/margins": 1.9683284759521484, "rewards/rejected": -48.732887268066406, "step": 7194 }, { "epoch": 0.9797113289760349, "grad_norm": 51.23037536308695, "learning_rate": 1.0028805609552104e-09, "logits/chosen": 14.098055839538574, "logits/rejected": 14.568483352661133, "logps/chosen": -4.612259864807129, "logps/rejected": -4.752493381500244, "loss": 4.1885, "rewards/accuracies": 0.75, "rewards/chosen": -46.122596740722656, "rewards/margins": 1.4023370742797852, "rewards/rejected": -47.52493667602539, "step": 7195 }, { "epoch": 0.9798474945533769, "grad_norm": 36.2485037167104, "learning_rate": 9.894697837022015e-10, "logits/chosen": 14.241981506347656, "logits/rejected": 14.064201354980469, "logps/chosen": -4.475018501281738, "logps/rejected": -4.433455467224121, "loss": 3.7177, "rewards/accuracies": 0.5, "rewards/chosen": -44.750186920166016, "rewards/margins": -0.4156312942504883, "rewards/rejected": -44.334556579589844, "step": 7196 }, { "epoch": 0.9799836601307189, "grad_norm": 38.056708983724945, "learning_rate": 9.761491661767429e-10, "logits/chosen": 15.474552154541016, "logits/rejected": 15.017608642578125, "logps/chosen": -5.185541152954102, "logps/rejected": -4.923976898193359, "loss": 3.5478, "rewards/accuracies": 0.25, "rewards/chosen": -51.85541534423828, "rewards/margins": -2.6156511306762695, "rewards/rejected": -49.23976516723633, "step": 7197 }, { "epoch": 0.980119825708061, "grad_norm": 41.46534576465641, "learning_rate": 9.629187113887828e-10, "logits/chosen": 14.844549179077148, "logits/rejected": 14.922603607177734, "logps/chosen": -4.534108638763428, "logps/rejected": -4.718569278717041, "loss": 3.7889, "rewards/accuracies": 0.75, "rewards/chosen": -45.341087341308594, "rewards/margins": 1.8446035385131836, "rewards/rejected": -47.185691833496094, "step": 7198 }, { "epoch": 0.9802559912854031, "grad_norm": 40.6125095129116, "learning_rate": 9.49778422327796e-10, "logits/chosen": 13.939702033996582, "logits/rejected": 14.854854583740234, "logps/chosen": -4.36369514465332, "logps/rejected": -4.7405476570129395, "loss": 3.9274, "rewards/accuracies": 1.0, "rewards/chosen": -43.63694763183594, "rewards/margins": 3.768526077270508, "rewards/rejected": -47.405479431152344, "step": 7199 }, { "epoch": 0.9803921568627451, "grad_norm": 41.31756405942845, "learning_rate": 9.367283019629635e-10, "logits/chosen": 14.135892868041992, "logits/rejected": 14.161003112792969, "logps/chosen": -4.527080059051514, "logps/rejected": -4.443401336669922, "loss": 4.4329, "rewards/accuracies": 0.25, "rewards/chosen": -45.27079772949219, "rewards/margins": -0.8367862701416016, "rewards/rejected": -44.43401336669922, "step": 7200 }, { "epoch": 0.9805283224400871, "grad_norm": 41.488825112027065, "learning_rate": 9.237683532430817e-10, "logits/chosen": 14.292152404785156, "logits/rejected": 14.503396034240723, "logps/chosen": -4.873505592346191, "logps/rejected": -4.763700485229492, "loss": 3.7757, "rewards/accuracies": 0.5, "rewards/chosen": -48.73505401611328, "rewards/margins": -1.098048210144043, "rewards/rejected": -47.63700866699219, "step": 7201 }, { "epoch": 0.9806644880174292, "grad_norm": 45.88833047688116, "learning_rate": 9.108985790964752e-10, "logits/chosen": 14.831541061401367, "logits/rejected": 14.991228103637695, "logps/chosen": -4.659448623657227, "logps/rejected": -4.997481346130371, "loss": 4.5656, "rewards/accuracies": 0.75, "rewards/chosen": -46.594486236572266, "rewards/margins": 3.380324363708496, "rewards/rejected": -49.974815368652344, "step": 7202 }, { "epoch": 0.9808006535947712, "grad_norm": 40.04508809330817, "learning_rate": 8.981189824313062e-10, "logits/chosen": 14.096708297729492, "logits/rejected": 14.608087539672852, "logps/chosen": -4.338228225708008, "logps/rejected": -4.670108795166016, "loss": 4.444, "rewards/accuracies": 0.75, "rewards/chosen": -43.382286071777344, "rewards/margins": 3.3187999725341797, "rewards/rejected": -46.70108413696289, "step": 7203 }, { "epoch": 0.9809368191721133, "grad_norm": 39.47157668728499, "learning_rate": 8.854295661351318e-10, "logits/chosen": 15.044147491455078, "logits/rejected": 15.18160629272461, "logps/chosen": -4.717445373535156, "logps/rejected": -4.790109634399414, "loss": 4.1492, "rewards/accuracies": 0.5, "rewards/chosen": -47.17445373535156, "rewards/margins": 0.7266378402709961, "rewards/rejected": -47.901092529296875, "step": 7204 }, { "epoch": 0.9810729847494554, "grad_norm": 42.102515260356576, "learning_rate": 8.728303330752585e-10, "logits/chosen": 14.729959487915039, "logits/rejected": 14.789575576782227, "logps/chosen": -4.731164932250977, "logps/rejected": -4.786391258239746, "loss": 3.3477, "rewards/accuracies": 0.75, "rewards/chosen": -47.31165313720703, "rewards/margins": 0.5522613525390625, "rewards/rejected": -47.863914489746094, "step": 7205 }, { "epoch": 0.9812091503267973, "grad_norm": 39.43529787835775, "learning_rate": 8.603212860985642e-10, "logits/chosen": 13.937725067138672, "logits/rejected": 14.565281867980957, "logps/chosen": -4.605886459350586, "logps/rejected": -4.810176849365234, "loss": 3.7445, "rewards/accuracies": 0.75, "rewards/chosen": -46.058860778808594, "rewards/margins": 2.04290771484375, "rewards/rejected": -48.10177230834961, "step": 7206 }, { "epoch": 0.9813453159041394, "grad_norm": 41.18669836177372, "learning_rate": 8.479024280316328e-10, "logits/chosen": 14.235544204711914, "logits/rejected": 14.213770866394043, "logps/chosen": -4.593463897705078, "logps/rejected": -4.4588398933410645, "loss": 4.0902, "rewards/accuracies": 0.25, "rewards/chosen": -45.93463897705078, "rewards/margins": -1.3462400436401367, "rewards/rejected": -44.58839797973633, "step": 7207 }, { "epoch": 0.9814814814814815, "grad_norm": 43.908850516051295, "learning_rate": 8.355737616805747e-10, "logits/chosen": 14.665771484375, "logits/rejected": 15.267234802246094, "logps/chosen": -4.605749130249023, "logps/rejected": -4.909934043884277, "loss": 4.3174, "rewards/accuracies": 0.75, "rewards/chosen": -46.0574951171875, "rewards/margins": 3.0418500900268555, "rewards/rejected": -49.099342346191406, "step": 7208 }, { "epoch": 0.9816176470588235, "grad_norm": 38.35702701680739, "learning_rate": 8.233352898311619e-10, "logits/chosen": 15.060791015625, "logits/rejected": 15.216035842895508, "logps/chosen": -4.609872817993164, "logps/rejected": -4.6761369705200195, "loss": 3.5821, "rewards/accuracies": 0.5, "rewards/chosen": -46.098724365234375, "rewards/margins": 0.6626405715942383, "rewards/rejected": -46.76136779785156, "step": 7209 }, { "epoch": 0.9817538126361656, "grad_norm": 42.17175124392833, "learning_rate": 8.111870152487377e-10, "logits/chosen": 14.605375289916992, "logits/rejected": 14.852237701416016, "logps/chosen": -4.705412864685059, "logps/rejected": -4.5994133949279785, "loss": 3.818, "rewards/accuracies": 0.25, "rewards/chosen": -47.05412673950195, "rewards/margins": -1.0599918365478516, "rewards/rejected": -45.99413299560547, "step": 7210 }, { "epoch": 0.9818899782135077, "grad_norm": 40.26109727571596, "learning_rate": 7.991289406783508e-10, "logits/chosen": 14.276086807250977, "logits/rejected": 14.021612167358398, "logps/chosen": -4.552299499511719, "logps/rejected": -4.583194732666016, "loss": 3.7462, "rewards/accuracies": 0.5, "rewards/chosen": -45.52299118041992, "rewards/margins": 0.3089570999145508, "rewards/rejected": -45.831947326660156, "step": 7211 }, { "epoch": 0.9820261437908496, "grad_norm": 39.220559080570744, "learning_rate": 7.871610688446217e-10, "logits/chosen": 14.888983726501465, "logits/rejected": 14.767145156860352, "logps/chosen": -5.162002086639404, "logps/rejected": -5.046187400817871, "loss": 3.5526, "rewards/accuracies": 0.25, "rewards/chosen": -51.620018005371094, "rewards/margins": -1.1581449508666992, "rewards/rejected": -50.461875915527344, "step": 7212 }, { "epoch": 0.9821623093681917, "grad_norm": 41.59728042202377, "learning_rate": 7.75283402451743e-10, "logits/chosen": 13.099493980407715, "logits/rejected": 13.933368682861328, "logps/chosen": -4.289217472076416, "logps/rejected": -4.535038948059082, "loss": 3.9697, "rewards/accuracies": 0.75, "rewards/chosen": -42.892173767089844, "rewards/margins": 2.4582109451293945, "rewards/rejected": -45.35038757324219, "step": 7213 }, { "epoch": 0.9822984749455338, "grad_norm": 42.258561543789085, "learning_rate": 7.63495944183612e-10, "logits/chosen": 15.36867904663086, "logits/rejected": 14.651960372924805, "logps/chosen": -4.899413108825684, "logps/rejected": -4.61229133605957, "loss": 3.8825, "rewards/accuracies": 0.25, "rewards/chosen": -48.99413299560547, "rewards/margins": -2.871212959289551, "rewards/rejected": -46.12291717529297, "step": 7214 }, { "epoch": 0.9824346405228758, "grad_norm": 39.411492905108815, "learning_rate": 7.517986967036982e-10, "logits/chosen": 14.443159103393555, "logits/rejected": 13.329185485839844, "logps/chosen": -4.608614921569824, "logps/rejected": -4.375432968139648, "loss": 4.3924, "rewards/accuracies": 0.25, "rewards/chosen": -46.086151123046875, "rewards/margins": -2.3318233489990234, "rewards/rejected": -43.75432586669922, "step": 7215 }, { "epoch": 0.9825708061002179, "grad_norm": 39.81978998883706, "learning_rate": 7.401916626550875e-10, "logits/chosen": 13.66431713104248, "logits/rejected": 14.795076370239258, "logps/chosen": -4.159261703491211, "logps/rejected": -4.62517786026001, "loss": 3.5647, "rewards/accuracies": 1.0, "rewards/chosen": -41.592613220214844, "rewards/margins": 4.65916633605957, "rewards/rejected": -46.25177764892578, "step": 7216 }, { "epoch": 0.9827069716775599, "grad_norm": 42.122694348494704, "learning_rate": 7.286748446605262e-10, "logits/chosen": 14.770343780517578, "logits/rejected": 14.727681159973145, "logps/chosen": -4.6369218826293945, "logps/rejected": -4.897407054901123, "loss": 3.2935, "rewards/accuracies": 0.5, "rewards/chosen": -46.36922073364258, "rewards/margins": 2.604851722717285, "rewards/rejected": -48.97407150268555, "step": 7217 }, { "epoch": 0.9828431372549019, "grad_norm": 38.503375728141805, "learning_rate": 7.172482453222439e-10, "logits/chosen": 14.864738464355469, "logits/rejected": 14.852219581604004, "logps/chosen": -4.857632160186768, "logps/rejected": -4.748438835144043, "loss": 4.1444, "rewards/accuracies": 0.5, "rewards/chosen": -48.576324462890625, "rewards/margins": -1.091933250427246, "rewards/rejected": -47.48439025878906, "step": 7218 }, { "epoch": 0.982979302832244, "grad_norm": 41.402609103395996, "learning_rate": 7.059118672222642e-10, "logits/chosen": 14.815744400024414, "logits/rejected": 15.421473503112793, "logps/chosen": -4.754974365234375, "logps/rejected": -5.109169006347656, "loss": 3.9538, "rewards/accuracies": 0.75, "rewards/chosen": -47.54974365234375, "rewards/margins": 3.5419464111328125, "rewards/rejected": -51.09169006347656, "step": 7219 }, { "epoch": 0.9831154684095861, "grad_norm": 41.13050682553323, "learning_rate": 6.94665712922049e-10, "logits/chosen": 13.20142936706543, "logits/rejected": 14.518238067626953, "logps/chosen": -4.416541576385498, "logps/rejected": -4.995004653930664, "loss": 3.4763, "rewards/accuracies": 1.0, "rewards/chosen": -44.16541290283203, "rewards/margins": 5.784631729125977, "rewards/rejected": -49.95004653930664, "step": 7220 }, { "epoch": 0.983251633986928, "grad_norm": 38.78418469747277, "learning_rate": 6.835097849628547e-10, "logits/chosen": 14.83973217010498, "logits/rejected": 14.544975280761719, "logps/chosen": -4.61400032043457, "logps/rejected": -4.478962421417236, "loss": 3.7622, "rewards/accuracies": 0.5, "rewards/chosen": -46.14000701904297, "rewards/margins": -1.350381851196289, "rewards/rejected": -44.78962326049805, "step": 7221 }, { "epoch": 0.9833877995642701, "grad_norm": 42.56183572548476, "learning_rate": 6.724440858653757e-10, "logits/chosen": 15.117168426513672, "logits/rejected": 15.15919303894043, "logps/chosen": -4.552878379821777, "logps/rejected": -4.661356449127197, "loss": 3.5943, "rewards/accuracies": 0.75, "rewards/chosen": -45.528785705566406, "rewards/margins": 1.0847806930541992, "rewards/rejected": -46.613563537597656, "step": 7222 }, { "epoch": 0.9835239651416122, "grad_norm": 40.563382937864425, "learning_rate": 6.614686181300566e-10, "logits/chosen": 14.529844284057617, "logits/rejected": 14.64085578918457, "logps/chosen": -4.908627986907959, "logps/rejected": -5.072410583496094, "loss": 3.5106, "rewards/accuracies": 0.75, "rewards/chosen": -49.086280822753906, "rewards/margins": 1.6378288269042969, "rewards/rejected": -50.7241096496582, "step": 7223 }, { "epoch": 0.9836601307189542, "grad_norm": 39.76212623301305, "learning_rate": 6.505833842368247e-10, "logits/chosen": 14.295036315917969, "logits/rejected": 14.186222076416016, "logps/chosen": -4.322572708129883, "logps/rejected": -4.465093612670898, "loss": 3.6703, "rewards/accuracies": 0.5, "rewards/chosen": -43.225730895996094, "rewards/margins": 1.425210952758789, "rewards/rejected": -44.65093994140625, "step": 7224 }, { "epoch": 0.9837962962962963, "grad_norm": 40.86881505054338, "learning_rate": 6.397883866453568e-10, "logits/chosen": 13.861579895019531, "logits/rejected": 14.885932922363281, "logps/chosen": -4.546813011169434, "logps/rejected": -4.787265777587891, "loss": 3.8014, "rewards/accuracies": 0.75, "rewards/chosen": -45.46813201904297, "rewards/margins": 2.404529571533203, "rewards/rejected": -47.872657775878906, "step": 7225 }, { "epoch": 0.9839324618736384, "grad_norm": 39.665918184261244, "learning_rate": 6.290836277948575e-10, "logits/chosen": 13.994497299194336, "logits/rejected": 14.413810729980469, "logps/chosen": -4.549081802368164, "logps/rejected": -4.8033552169799805, "loss": 3.6559, "rewards/accuracies": 0.75, "rewards/chosen": -45.49081802368164, "rewards/margins": 2.5427331924438477, "rewards/rejected": -48.03355026245117, "step": 7226 }, { "epoch": 0.9840686274509803, "grad_norm": 47.42940734977029, "learning_rate": 6.184691101041473e-10, "logits/chosen": 15.111013412475586, "logits/rejected": 15.453561782836914, "logps/chosen": -4.894777297973633, "logps/rejected": -4.9277215003967285, "loss": 4.3262, "rewards/accuracies": 0.5, "rewards/chosen": -48.94777297973633, "rewards/margins": 0.32944297790527344, "rewards/rejected": -49.27721405029297, "step": 7227 }, { "epoch": 0.9842047930283224, "grad_norm": 37.188606037478046, "learning_rate": 6.079448359716632e-10, "logits/chosen": 14.206330299377441, "logits/rejected": 15.123655319213867, "logps/chosen": -4.333986282348633, "logps/rejected": -4.564969062805176, "loss": 3.4301, "rewards/accuracies": 0.75, "rewards/chosen": -43.33986282348633, "rewards/margins": 2.3098249435424805, "rewards/rejected": -45.649688720703125, "step": 7228 }, { "epoch": 0.9843409586056645, "grad_norm": 37.27223998981778, "learning_rate": 5.975108077754587e-10, "logits/chosen": 15.186405181884766, "logits/rejected": 15.263692855834961, "logps/chosen": -4.826942443847656, "logps/rejected": -5.285730838775635, "loss": 3.8405, "rewards/accuracies": 1.0, "rewards/chosen": -48.26942825317383, "rewards/margins": 4.587878227233887, "rewards/rejected": -52.85730743408203, "step": 7229 }, { "epoch": 0.9844771241830066, "grad_norm": 40.2462359077693, "learning_rate": 5.871670278731588e-10, "logits/chosen": 14.208669662475586, "logits/rejected": 14.559160232543945, "logps/chosen": -4.526454448699951, "logps/rejected": -4.661660194396973, "loss": 4.1199, "rewards/accuracies": 0.5, "rewards/chosen": -45.26454162597656, "rewards/margins": 1.3520574569702148, "rewards/rejected": -46.616600036621094, "step": 7230 }, { "epoch": 0.9846132897603486, "grad_norm": 45.55441570782506, "learning_rate": 5.769134986020497e-10, "logits/chosen": 13.766366958618164, "logits/rejected": 14.296112060546875, "logps/chosen": -4.114415168762207, "logps/rejected": -4.602565765380859, "loss": 3.1482, "rewards/accuracies": 0.75, "rewards/chosen": -41.14414596557617, "rewards/margins": 4.881512641906738, "rewards/rejected": -46.025657653808594, "step": 7231 }, { "epoch": 0.9847494553376906, "grad_norm": 38.96846300301403, "learning_rate": 5.667502222789889e-10, "logits/chosen": 14.181686401367188, "logits/rejected": 14.653902053833008, "logps/chosen": -4.516643047332764, "logps/rejected": -4.907232761383057, "loss": 3.3611, "rewards/accuracies": 1.0, "rewards/chosen": -45.16643524169922, "rewards/margins": 3.9058923721313477, "rewards/rejected": -49.072322845458984, "step": 7232 }, { "epoch": 0.9848856209150327, "grad_norm": 41.75382919859723, "learning_rate": 5.56677201200495e-10, "logits/chosen": 13.723864555358887, "logits/rejected": 14.010980606079102, "logps/chosen": -3.983452558517456, "logps/rejected": -4.167458534240723, "loss": 4.0956, "rewards/accuracies": 0.75, "rewards/chosen": -39.83452606201172, "rewards/margins": 1.840057373046875, "rewards/rejected": -41.674583435058594, "step": 7233 }, { "epoch": 0.9850217864923747, "grad_norm": 40.80160501175271, "learning_rate": 5.466944376426142e-10, "logits/chosen": 14.0723876953125, "logits/rejected": 14.717437744140625, "logps/chosen": -4.616504192352295, "logps/rejected": -4.604147434234619, "loss": 3.8557, "rewards/accuracies": 0.5, "rewards/chosen": -46.16504669189453, "rewards/margins": -0.12357234954833984, "rewards/rejected": -46.041473388671875, "step": 7234 }, { "epoch": 0.9851579520697168, "grad_norm": 42.98273739681485, "learning_rate": 5.36801933861053e-10, "logits/chosen": 14.432641983032227, "logits/rejected": 14.136341094970703, "logps/chosen": -4.577507019042969, "logps/rejected": -4.513722896575928, "loss": 4.3985, "rewards/accuracies": 0.25, "rewards/chosen": -45.77507019042969, "rewards/margins": -0.6378440856933594, "rewards/rejected": -45.137229919433594, "step": 7235 }, { "epoch": 0.9852941176470589, "grad_norm": 41.7923211170664, "learning_rate": 5.269996920910458e-10, "logits/chosen": 14.8983154296875, "logits/rejected": 15.454214096069336, "logps/chosen": -4.885666847229004, "logps/rejected": -4.987980842590332, "loss": 4.6309, "rewards/accuracies": 0.5, "rewards/chosen": -48.856666564941406, "rewards/margins": 1.023141860961914, "rewards/rejected": -49.87981033325195, "step": 7236 }, { "epoch": 0.9854302832244008, "grad_norm": 42.81911916780628, "learning_rate": 5.172877145475763e-10, "logits/chosen": 15.067538261413574, "logits/rejected": 14.921699523925781, "logps/chosen": -4.843853950500488, "logps/rejected": -4.688928604125977, "loss": 4.3905, "rewards/accuracies": 0.25, "rewards/chosen": -48.438533782958984, "rewards/margins": -1.5492496490478516, "rewards/rejected": -46.889286041259766, "step": 7237 }, { "epoch": 0.9855664488017429, "grad_norm": 41.46930288737108, "learning_rate": 5.076660034250668e-10, "logits/chosen": 15.02062702178955, "logits/rejected": 15.155290603637695, "logps/chosen": -4.8655314445495605, "logps/rejected": -4.847360610961914, "loss": 3.9574, "rewards/accuracies": 0.5, "rewards/chosen": -48.655311584472656, "rewards/margins": -0.18170642852783203, "rewards/rejected": -48.473609924316406, "step": 7238 }, { "epoch": 0.985702614379085, "grad_norm": 42.3033712863109, "learning_rate": 4.981345608976894e-10, "logits/chosen": 14.09815788269043, "logits/rejected": 13.900534629821777, "logps/chosen": -4.209305286407471, "logps/rejected": -4.339505672454834, "loss": 3.843, "rewards/accuracies": 1.0, "rewards/chosen": -42.093055725097656, "rewards/margins": 1.3020009994506836, "rewards/rejected": -43.39505386352539, "step": 7239 }, { "epoch": 0.985838779956427, "grad_norm": 39.634527749215515, "learning_rate": 4.886933891191436e-10, "logits/chosen": 14.536476135253906, "logits/rejected": 14.875638008117676, "logps/chosen": -4.640113830566406, "logps/rejected": -4.596251964569092, "loss": 3.7599, "rewards/accuracies": 0.5, "rewards/chosen": -46.40113830566406, "rewards/margins": -0.43861961364746094, "rewards/rejected": -45.96251678466797, "step": 7240 }, { "epoch": 0.9859749455337691, "grad_norm": 40.625833632375, "learning_rate": 4.793424902227005e-10, "logits/chosen": 14.287691116333008, "logits/rejected": 14.867239952087402, "logps/chosen": -4.579915523529053, "logps/rejected": -4.906278610229492, "loss": 4.4583, "rewards/accuracies": 0.75, "rewards/chosen": -45.799156188964844, "rewards/margins": 3.2636280059814453, "rewards/rejected": -49.06278610229492, "step": 7241 }, { "epoch": 0.9861111111111112, "grad_norm": 40.72888198638332, "learning_rate": 4.700818663212924e-10, "logits/chosen": 14.19179916381836, "logits/rejected": 14.575572967529297, "logps/chosen": -4.513550281524658, "logps/rejected": -4.712471008300781, "loss": 3.2437, "rewards/accuracies": 0.75, "rewards/chosen": -45.135501861572266, "rewards/margins": 1.989206314086914, "rewards/rejected": -47.12471008300781, "step": 7242 }, { "epoch": 0.9862472766884531, "grad_norm": 41.771348385913655, "learning_rate": 4.609115195074231e-10, "logits/chosen": 14.328302383422852, "logits/rejected": 14.876815795898438, "logps/chosen": -4.510499477386475, "logps/rejected": -5.053783416748047, "loss": 3.8865, "rewards/accuracies": 1.0, "rewards/chosen": -45.10499572753906, "rewards/margins": 5.43283748626709, "rewards/rejected": -50.53783416748047, "step": 7243 }, { "epoch": 0.9863834422657952, "grad_norm": 41.70373336210594, "learning_rate": 4.5183145185321294e-10, "logits/chosen": 14.144752502441406, "logits/rejected": 14.249992370605469, "logps/chosen": -4.434708595275879, "logps/rejected": -4.759544849395752, "loss": 3.7019, "rewards/accuracies": 0.75, "rewards/chosen": -44.34708786010742, "rewards/margins": 3.2483606338500977, "rewards/rejected": -47.59545135498047, "step": 7244 }, { "epoch": 0.9865196078431373, "grad_norm": 42.28690286076105, "learning_rate": 4.4284166541039834e-10, "logits/chosen": 14.337484359741211, "logits/rejected": 14.593663215637207, "logps/chosen": -4.260341167449951, "logps/rejected": -4.724745750427246, "loss": 4.1353, "rewards/accuracies": 1.0, "rewards/chosen": -42.60341262817383, "rewards/margins": 4.644045829772949, "rewards/rejected": -47.247459411621094, "step": 7245 }, { "epoch": 0.9866557734204793, "grad_norm": 42.772873929233654, "learning_rate": 4.339421622102879e-10, "logits/chosen": 14.073616027832031, "logits/rejected": 14.802450180053711, "logps/chosen": -4.387800693511963, "logps/rejected": -4.693601131439209, "loss": 4.1155, "rewards/accuracies": 0.75, "rewards/chosen": -43.87800598144531, "rewards/margins": 3.058004379272461, "rewards/rejected": -46.936012268066406, "step": 7246 }, { "epoch": 0.9867919389978214, "grad_norm": 40.71803701723684, "learning_rate": 4.251329442638063e-10, "logits/chosen": 14.60995864868164, "logits/rejected": 14.676337242126465, "logps/chosen": -4.886858940124512, "logps/rejected": -5.045407772064209, "loss": 3.5093, "rewards/accuracies": 0.75, "rewards/chosen": -48.86859130859375, "rewards/margins": 1.5854864120483398, "rewards/rejected": -50.454078674316406, "step": 7247 }, { "epoch": 0.9869281045751634, "grad_norm": 41.3686406593861, "learning_rate": 4.164140135614058e-10, "logits/chosen": 15.085468292236328, "logits/rejected": 15.16472339630127, "logps/chosen": -5.026862144470215, "logps/rejected": -5.002931118011475, "loss": 3.4386, "rewards/accuracies": 0.5, "rewards/chosen": -50.268619537353516, "rewards/margins": -0.23931026458740234, "rewards/rejected": -50.02931213378906, "step": 7248 }, { "epoch": 0.9870642701525054, "grad_norm": 41.62906475240944, "learning_rate": 4.0778537207328824e-10, "logits/chosen": 14.599651336669922, "logits/rejected": 14.960211753845215, "logps/chosen": -4.752711296081543, "logps/rejected": -4.88017463684082, "loss": 4.1639, "rewards/accuracies": 0.5, "rewards/chosen": -47.52711486816406, "rewards/margins": 1.2746295928955078, "rewards/rejected": -48.80174255371094, "step": 7249 }, { "epoch": 0.9872004357298475, "grad_norm": 40.837879206775206, "learning_rate": 3.992470217491384e-10, "logits/chosen": 14.851791381835938, "logits/rejected": 14.852987289428711, "logps/chosen": -4.911834716796875, "logps/rejected": -4.761960029602051, "loss": 4.0084, "rewards/accuracies": 0.25, "rewards/chosen": -49.11834716796875, "rewards/margins": -1.4987468719482422, "rewards/rejected": -47.619598388671875, "step": 7250 }, { "epoch": 0.9873366013071896, "grad_norm": 37.507437862565695, "learning_rate": 3.90798964518213e-10, "logits/chosen": 14.470006942749023, "logits/rejected": 14.992963790893555, "logps/chosen": -4.404091835021973, "logps/rejected": -4.759634971618652, "loss": 3.7047, "rewards/accuracies": 1.0, "rewards/chosen": -44.040916442871094, "rewards/margins": 3.5554380416870117, "rewards/rejected": -47.59635543823242, "step": 7251 }, { "epoch": 0.9874727668845316, "grad_norm": 43.670970130663385, "learning_rate": 3.8244120228951847e-10, "logits/chosen": 13.90853500366211, "logits/rejected": 14.653860092163086, "logps/chosen": -4.7365522384643555, "logps/rejected": -4.853865146636963, "loss": 4.4079, "rewards/accuracies": 0.5, "rewards/chosen": -47.36552047729492, "rewards/margins": 1.1731338500976562, "rewards/rejected": -48.53865432739258, "step": 7252 }, { "epoch": 0.9876089324618736, "grad_norm": 39.81457658989952, "learning_rate": 3.7417373695149965e-10, "logits/chosen": 14.380455017089844, "logits/rejected": 15.11246395111084, "logps/chosen": -4.619741916656494, "logps/rejected": -4.7359619140625, "loss": 3.7889, "rewards/accuracies": 0.75, "rewards/chosen": -46.197418212890625, "rewards/margins": 1.1622018814086914, "rewards/rejected": -47.359619140625, "step": 7253 }, { "epoch": 0.9877450980392157, "grad_norm": 42.66112568127543, "learning_rate": 3.659965703722179e-10, "logits/chosen": 14.439481735229492, "logits/rejected": 15.264669418334961, "logps/chosen": -4.528053283691406, "logps/rejected": -4.883596420288086, "loss": 3.5815, "rewards/accuracies": 0.75, "rewards/chosen": -45.28053665161133, "rewards/margins": 3.555429458618164, "rewards/rejected": -48.835968017578125, "step": 7254 }, { "epoch": 0.9878812636165577, "grad_norm": 44.77074791773598, "learning_rate": 3.5790970439943946e-10, "logits/chosen": 14.377830505371094, "logits/rejected": 14.60382080078125, "logps/chosen": -4.664548873901367, "logps/rejected": -4.728376388549805, "loss": 3.5737, "rewards/accuracies": 0.75, "rewards/chosen": -46.64548873901367, "rewards/margins": 0.6382732391357422, "rewards/rejected": -47.28376007080078, "step": 7255 }, { "epoch": 0.9880174291938998, "grad_norm": 39.72637597983066, "learning_rate": 3.499131408604583e-10, "logits/chosen": 14.621172904968262, "logits/rejected": 15.427791595458984, "logps/chosen": -4.57291316986084, "logps/rejected": -5.267697334289551, "loss": 3.7375, "rewards/accuracies": 1.0, "rewards/chosen": -45.72913360595703, "rewards/margins": 6.947841644287109, "rewards/rejected": -52.676971435546875, "step": 7256 }, { "epoch": 0.9881535947712419, "grad_norm": 38.84362879885385, "learning_rate": 3.420068815621402e-10, "logits/chosen": 13.945566177368164, "logits/rejected": 13.798726081848145, "logps/chosen": -4.554104328155518, "logps/rejected": -4.571695804595947, "loss": 3.5071, "rewards/accuracies": 0.5, "rewards/chosen": -45.541046142578125, "rewards/margins": 0.17591476440429688, "rewards/rejected": -45.716957092285156, "step": 7257 }, { "epoch": 0.9882897603485838, "grad_norm": 42.3006257713931, "learning_rate": 3.3419092829096717e-10, "logits/chosen": 14.692913055419922, "logits/rejected": 15.067886352539062, "logps/chosen": -4.722585678100586, "logps/rejected": -4.8617963790893555, "loss": 4.3684, "rewards/accuracies": 0.75, "rewards/chosen": -47.225860595703125, "rewards/margins": 1.392104148864746, "rewards/rejected": -48.61796569824219, "step": 7258 }, { "epoch": 0.9884259259259259, "grad_norm": 37.62419397475307, "learning_rate": 3.2646528281303765e-10, "logits/chosen": 14.136566162109375, "logits/rejected": 15.032232284545898, "logps/chosen": -4.4536285400390625, "logps/rejected": -4.7956671714782715, "loss": 3.4267, "rewards/accuracies": 0.75, "rewards/chosen": -44.536285400390625, "rewards/margins": 3.42038631439209, "rewards/rejected": -47.95667266845703, "step": 7259 }, { "epoch": 0.988562091503268, "grad_norm": 39.65446411019145, "learning_rate": 3.1882994687397746e-10, "logits/chosen": 16.13947296142578, "logits/rejected": 15.462409019470215, "logps/chosen": -5.024813175201416, "logps/rejected": -5.03508186340332, "loss": 4.2666, "rewards/accuracies": 0.25, "rewards/chosen": -50.248130798339844, "rewards/margins": 0.10268497467041016, "rewards/rejected": -50.3508186340332, "step": 7260 }, { "epoch": 0.98869825708061, "grad_norm": 43.74530419372878, "learning_rate": 3.112849221991176e-10, "logits/chosen": 15.09463882446289, "logits/rejected": 14.277074813842773, "logps/chosen": -4.6082682609558105, "logps/rejected": -4.655241966247559, "loss": 3.9034, "rewards/accuracies": 0.75, "rewards/chosen": -46.082679748535156, "rewards/margins": 0.46973609924316406, "rewards/rejected": -46.55241775512695, "step": 7261 }, { "epoch": 0.9888344226579521, "grad_norm": 37.02931384166522, "learning_rate": 3.038302104932722e-10, "logits/chosen": 14.421666145324707, "logits/rejected": 14.424585342407227, "logps/chosen": -4.431539058685303, "logps/rejected": -4.50550651550293, "loss": 3.6368, "rewards/accuracies": 0.5, "rewards/chosen": -44.315391540527344, "rewards/margins": 0.7396745681762695, "rewards/rejected": -45.05506896972656, "step": 7262 }, { "epoch": 0.9889705882352942, "grad_norm": 38.64107730624619, "learning_rate": 2.964658134409159e-10, "logits/chosen": 14.924298286437988, "logits/rejected": 14.736703872680664, "logps/chosen": -4.651212692260742, "logps/rejected": -4.77729606628418, "loss": 3.2276, "rewards/accuracies": 0.5, "rewards/chosen": -46.51213073730469, "rewards/margins": 1.2608318328857422, "rewards/rejected": -47.77295684814453, "step": 7263 }, { "epoch": 0.9891067538126361, "grad_norm": 40.933509021832506, "learning_rate": 2.8919173270609554e-10, "logits/chosen": 14.801990509033203, "logits/rejected": 15.283446311950684, "logps/chosen": -4.711305618286133, "logps/rejected": -4.950862884521484, "loss": 4.286, "rewards/accuracies": 0.75, "rewards/chosen": -47.11305618286133, "rewards/margins": 2.395573616027832, "rewards/rejected": -49.508628845214844, "step": 7264 }, { "epoch": 0.9892429193899782, "grad_norm": 41.87167590244156, "learning_rate": 2.82007969932474e-10, "logits/chosen": 15.017751693725586, "logits/rejected": 14.522697448730469, "logps/chosen": -4.99163293838501, "logps/rejected": -4.783879280090332, "loss": 4.4847, "rewards/accuracies": 0.25, "rewards/chosen": -49.91632843017578, "rewards/margins": -2.077530860900879, "rewards/rejected": -47.83879852294922, "step": 7265 }, { "epoch": 0.9893790849673203, "grad_norm": 38.824892152938546, "learning_rate": 2.7491452674324176e-10, "logits/chosen": 15.022520065307617, "logits/rejected": 14.864837646484375, "logps/chosen": -4.816107749938965, "logps/rejected": -4.76186466217041, "loss": 3.4163, "rewards/accuracies": 0.25, "rewards/chosen": -48.161075592041016, "rewards/margins": -0.5424308776855469, "rewards/rejected": -47.61864471435547, "step": 7266 }, { "epoch": 0.9895152505446623, "grad_norm": 44.593161029585595, "learning_rate": 2.6791140474120565e-10, "logits/chosen": 14.587650299072266, "logits/rejected": 14.66323184967041, "logps/chosen": -4.626064300537109, "logps/rejected": -4.594334602355957, "loss": 3.5177, "rewards/accuracies": 0.5, "rewards/chosen": -46.26063919067383, "rewards/margins": -0.3172922134399414, "rewards/rejected": -45.94334411621094, "step": 7267 }, { "epoch": 0.9896514161220044, "grad_norm": 38.76778224538637, "learning_rate": 2.6099860550883314e-10, "logits/chosen": 14.59267520904541, "logits/rejected": 15.126218795776367, "logps/chosen": -4.621217727661133, "logps/rejected": -4.83461856842041, "loss": 4.1592, "rewards/accuracies": 0.5, "rewards/chosen": -46.212181091308594, "rewards/margins": 2.134005546569824, "rewards/rejected": -48.34618377685547, "step": 7268 }, { "epoch": 0.9897875816993464, "grad_norm": 135.0019894801917, "learning_rate": 2.5417613060807476e-10, "logits/chosen": 14.544944763183594, "logits/rejected": 14.327634811401367, "logps/chosen": -4.70235538482666, "logps/rejected": -4.447091579437256, "loss": 3.9085, "rewards/accuracies": 0.25, "rewards/chosen": -47.023555755615234, "rewards/margins": -2.5526371002197266, "rewards/rejected": -44.470916748046875, "step": 7269 }, { "epoch": 0.9899237472766884, "grad_norm": 40.81145446159236, "learning_rate": 2.474439815805862e-10, "logits/chosen": 13.563305854797363, "logits/rejected": 14.400150299072266, "logps/chosen": -4.44166374206543, "logps/rejected": -4.810604572296143, "loss": 3.8228, "rewards/accuracies": 1.0, "rewards/chosen": -44.41663360595703, "rewards/margins": 3.6894121170043945, "rewards/rejected": -48.106048583984375, "step": 7270 }, { "epoch": 0.9900599128540305, "grad_norm": 36.93847681665558, "learning_rate": 2.408021599475063e-10, "logits/chosen": 14.823698043823242, "logits/rejected": 14.983291625976562, "logps/chosen": -4.521012306213379, "logps/rejected": -4.890575408935547, "loss": 4.1735, "rewards/accuracies": 0.75, "rewards/chosen": -45.21012496948242, "rewards/margins": 3.6956300735473633, "rewards/rejected": -48.90575408935547, "step": 7271 }, { "epoch": 0.9901960784313726, "grad_norm": 40.845960120729, "learning_rate": 2.342506672095901e-10, "logits/chosen": 14.604353904724121, "logits/rejected": 14.63047981262207, "logps/chosen": -4.5078125, "logps/rejected": -4.498221397399902, "loss": 3.7133, "rewards/accuracies": 0.5, "rewards/chosen": -45.078125, "rewards/margins": -0.09591388702392578, "rewards/rejected": -44.98221206665039, "step": 7272 }, { "epoch": 0.9903322440087146, "grad_norm": 39.0056347667212, "learning_rate": 2.2778950484725333e-10, "logits/chosen": 14.523406028747559, "logits/rejected": 14.457988739013672, "logps/chosen": -4.648493766784668, "logps/rejected": -4.893775939941406, "loss": 3.8806, "rewards/accuracies": 0.5, "rewards/chosen": -46.48493957519531, "rewards/margins": 2.45281982421875, "rewards/rejected": -48.9377555847168, "step": 7273 }, { "epoch": 0.9904684095860566, "grad_norm": 42.02341631536633, "learning_rate": 2.2141867432043937e-10, "logits/chosen": 14.163642883300781, "logits/rejected": 15.055435180664062, "logps/chosen": -4.700526237487793, "logps/rejected": -4.9620184898376465, "loss": 3.8025, "rewards/accuracies": 0.75, "rewards/chosen": -47.00526428222656, "rewards/margins": 2.614919662475586, "rewards/rejected": -49.62018585205078, "step": 7274 }, { "epoch": 0.9906045751633987, "grad_norm": 40.78187055118124, "learning_rate": 2.1513817706870774e-10, "logits/chosen": 14.801440238952637, "logits/rejected": 14.323225021362305, "logps/chosen": -4.714398384094238, "logps/rejected": -4.707846164703369, "loss": 3.6904, "rewards/accuracies": 0.5, "rewards/chosen": -47.14398193359375, "rewards/margins": -0.0655202865600586, "rewards/rejected": -47.078460693359375, "step": 7275 }, { "epoch": 0.9907407407407407, "grad_norm": 41.59823253770183, "learning_rate": 2.0894801451110111e-10, "logits/chosen": 14.14171314239502, "logits/rejected": 14.656160354614258, "logps/chosen": -4.602765083312988, "logps/rejected": -4.847578525543213, "loss": 3.4276, "rewards/accuracies": 0.5, "rewards/chosen": -46.02764892578125, "rewards/margins": 2.448136329650879, "rewards/rejected": -48.47578430175781, "step": 7276 }, { "epoch": 0.9908769063180828, "grad_norm": 43.57486468914208, "learning_rate": 2.0284818804641167e-10, "logits/chosen": 14.048513412475586, "logits/rejected": 14.566572189331055, "logps/chosen": -4.469097137451172, "logps/rejected": -4.633275032043457, "loss": 4.3125, "rewards/accuracies": 0.75, "rewards/chosen": -44.69097137451172, "rewards/margins": 1.6417818069458008, "rewards/rejected": -46.3327522277832, "step": 7277 }, { "epoch": 0.9910130718954249, "grad_norm": 40.158630112291235, "learning_rate": 1.9683869905295912e-10, "logits/chosen": 13.934173583984375, "logits/rejected": 14.1093111038208, "logps/chosen": -3.955904245376587, "logps/rejected": -4.325420379638672, "loss": 3.2172, "rewards/accuracies": 0.75, "rewards/chosen": -39.559043884277344, "rewards/margins": 3.6951589584350586, "rewards/rejected": -43.25420379638672, "step": 7278 }, { "epoch": 0.9911492374727668, "grad_norm": 38.31337594724944, "learning_rate": 1.9091954888859063e-10, "logits/chosen": 14.44451904296875, "logits/rejected": 14.723690032958984, "logps/chosen": -4.277691841125488, "logps/rejected": -4.7554426193237305, "loss": 3.903, "rewards/accuracies": 1.0, "rewards/chosen": -42.776920318603516, "rewards/margins": 4.777504920959473, "rewards/rejected": -47.55442428588867, "step": 7279 }, { "epoch": 0.9912854030501089, "grad_norm": 49.43760055340028, "learning_rate": 1.8509073889081407e-10, "logits/chosen": 14.875988006591797, "logits/rejected": 14.954635620117188, "logps/chosen": -4.822821617126465, "logps/rejected": -5.0197296142578125, "loss": 4.5903, "rewards/accuracies": 0.5, "rewards/chosen": -48.228214263916016, "rewards/margins": 1.969080924987793, "rewards/rejected": -50.197296142578125, "step": 7280 }, { "epoch": 0.991421568627451, "grad_norm": 40.447197782049514, "learning_rate": 1.793522703766648e-10, "logits/chosen": 14.426047325134277, "logits/rejected": 14.601417541503906, "logps/chosen": -4.467830181121826, "logps/rejected": -4.803588390350342, "loss": 3.5283, "rewards/accuracies": 1.0, "rewards/chosen": -44.67829895019531, "rewards/margins": 3.357585906982422, "rewards/rejected": -48.035884857177734, "step": 7281 }, { "epoch": 0.991557734204793, "grad_norm": 36.54367233701446, "learning_rate": 1.7370414464283888e-10, "logits/chosen": 14.32984447479248, "logits/rejected": 14.475364685058594, "logps/chosen": -4.4530029296875, "logps/rejected": -4.634222030639648, "loss": 3.857, "rewards/accuracies": 0.5, "rewards/chosen": -44.530029296875, "rewards/margins": 1.8121919631958008, "rewards/rejected": -46.342220306396484, "step": 7282 }, { "epoch": 0.9916938997821351, "grad_norm": 42.64632363476468, "learning_rate": 1.6814636296555996e-10, "logits/chosen": 14.91750717163086, "logits/rejected": 15.15666389465332, "logps/chosen": -4.723906517028809, "logps/rejected": -4.841268539428711, "loss": 3.8046, "rewards/accuracies": 0.75, "rewards/chosen": -47.23906707763672, "rewards/margins": 1.1736164093017578, "rewards/rejected": -48.412681579589844, "step": 7283 }, { "epoch": 0.9918300653594772, "grad_norm": 47.858080238717314, "learning_rate": 1.6267892660066784e-10, "logits/chosen": 13.988517761230469, "logits/rejected": 14.533860206604004, "logps/chosen": -4.402315139770508, "logps/rejected": -4.834930896759033, "loss": 4.2224, "rewards/accuracies": 1.0, "rewards/chosen": -44.023155212402344, "rewards/margins": 4.326154708862305, "rewards/rejected": -48.349308013916016, "step": 7284 }, { "epoch": 0.9919662309368191, "grad_norm": 47.33325394158772, "learning_rate": 1.5730183678352992e-10, "logits/chosen": 14.718713760375977, "logits/rejected": 15.112567901611328, "logps/chosen": -4.336574077606201, "logps/rejected": -4.82415771484375, "loss": 4.4511, "rewards/accuracies": 1.0, "rewards/chosen": -43.36574172973633, "rewards/margins": 4.875838279724121, "rewards/rejected": -48.2415771484375, "step": 7285 }, { "epoch": 0.9921023965141612, "grad_norm": 40.47925058515426, "learning_rate": 1.520150947292187e-10, "logits/chosen": 14.514701843261719, "logits/rejected": 15.224906921386719, "logps/chosen": -4.3421783447265625, "logps/rejected": -4.990913391113281, "loss": 4.076, "rewards/accuracies": 1.0, "rewards/chosen": -43.421783447265625, "rewards/margins": 6.487353324890137, "rewards/rejected": -49.90913772583008, "step": 7286 }, { "epoch": 0.9922385620915033, "grad_norm": 40.24722532332051, "learning_rate": 1.468187016322453e-10, "logits/chosen": 14.549158096313477, "logits/rejected": 14.269372940063477, "logps/chosen": -4.387895107269287, "logps/rejected": -4.551682472229004, "loss": 3.9234, "rewards/accuracies": 0.75, "rewards/chosen": -43.87895202636719, "rewards/margins": 1.6378717422485352, "rewards/rejected": -45.516822814941406, "step": 7287 }, { "epoch": 0.9923747276688453, "grad_norm": 40.704527045598596, "learning_rate": 1.4171265866678162e-10, "logits/chosen": 15.196821212768555, "logits/rejected": 15.135480880737305, "logps/chosen": -4.718228340148926, "logps/rejected": -4.836625576019287, "loss": 4.2789, "rewards/accuracies": 0.5, "rewards/chosen": -47.182289123535156, "rewards/margins": 1.1839685440063477, "rewards/rejected": -48.36625671386719, "step": 7288 }, { "epoch": 0.9925108932461874, "grad_norm": 41.62715812418893, "learning_rate": 1.3669696698661582e-10, "logits/chosen": 15.158884048461914, "logits/rejected": 14.584991455078125, "logps/chosen": -4.809236526489258, "logps/rejected": -4.798376083374023, "loss": 4.3873, "rewards/accuracies": 0.5, "rewards/chosen": -48.09236145019531, "rewards/margins": -0.10860347747802734, "rewards/rejected": -47.98375701904297, "step": 7289 }, { "epoch": 0.9926470588235294, "grad_norm": 40.56074160997684, "learning_rate": 1.3177162772510796e-10, "logits/chosen": 14.677645683288574, "logits/rejected": 14.725057601928711, "logps/chosen": -4.599620819091797, "logps/rejected": -4.873373508453369, "loss": 3.4794, "rewards/accuracies": 0.75, "rewards/chosen": -45.99620819091797, "rewards/margins": 2.737527847290039, "rewards/rejected": -48.73373794555664, "step": 7290 }, { "epoch": 0.9927832244008714, "grad_norm": 41.492650937331334, "learning_rate": 1.2693664199514566e-10, "logits/chosen": 14.399825096130371, "logits/rejected": 14.757469177246094, "logps/chosen": -4.419162750244141, "logps/rejected": -4.501372814178467, "loss": 4.2606, "rewards/accuracies": 0.5, "rewards/chosen": -44.191627502441406, "rewards/margins": 0.8220996856689453, "rewards/rejected": -45.013729095458984, "step": 7291 }, { "epoch": 0.9929193899782135, "grad_norm": 39.11946818059699, "learning_rate": 1.221920108891883e-10, "logits/chosen": 14.914331436157227, "logits/rejected": 14.378353118896484, "logps/chosen": -4.810732841491699, "logps/rejected": -4.616584300994873, "loss": 3.848, "rewards/accuracies": 0.25, "rewards/chosen": -48.10732650756836, "rewards/margins": -1.941483497619629, "rewards/rejected": -46.16584396362305, "step": 7292 }, { "epoch": 0.9930555555555556, "grad_norm": 40.64346374040557, "learning_rate": 1.175377354794005e-10, "logits/chosen": 14.820854187011719, "logits/rejected": 15.264437675476074, "logps/chosen": -4.536302089691162, "logps/rejected": -4.792859077453613, "loss": 4.0688, "rewards/accuracies": 0.75, "rewards/chosen": -45.36301803588867, "rewards/margins": 2.565572738647461, "rewards/rejected": -47.928592681884766, "step": 7293 }, { "epoch": 0.9931917211328976, "grad_norm": 39.58428378707257, "learning_rate": 1.129738168174299e-10, "logits/chosen": 15.363706588745117, "logits/rejected": 14.956182479858398, "logps/chosen": -4.962254524230957, "logps/rejected": -4.750219345092773, "loss": 3.951, "rewards/accuracies": 0.25, "rewards/chosen": -49.6225471496582, "rewards/margins": -2.120356559753418, "rewards/rejected": -47.50218963623047, "step": 7294 }, { "epoch": 0.9933278867102396, "grad_norm": 38.65963531416175, "learning_rate": 1.0850025593449608e-10, "logits/chosen": 14.780792236328125, "logits/rejected": 14.585029602050781, "logps/chosen": -5.0374836921691895, "logps/rejected": -4.916891098022461, "loss": 3.833, "rewards/accuracies": 0.5, "rewards/chosen": -50.374839782714844, "rewards/margins": -1.2059288024902344, "rewards/rejected": -49.168907165527344, "step": 7295 }, { "epoch": 0.9934640522875817, "grad_norm": 39.905643530649776, "learning_rate": 1.0411705384147928e-10, "logits/chosen": 14.964298248291016, "logits/rejected": 15.447059631347656, "logps/chosen": -4.777900218963623, "logps/rejected": -5.21467399597168, "loss": 4.1366, "rewards/accuracies": 0.75, "rewards/chosen": -47.77900314331055, "rewards/margins": 4.367739677429199, "rewards/rejected": -52.14674377441406, "step": 7296 }, { "epoch": 0.9936002178649237, "grad_norm": 40.35112888249991, "learning_rate": 9.982421152878728e-11, "logits/chosen": 14.64855670928955, "logits/rejected": 14.446102142333984, "logps/chosen": -4.532573223114014, "logps/rejected": -4.529888153076172, "loss": 3.6698, "rewards/accuracies": 0.5, "rewards/chosen": -45.32572937011719, "rewards/margins": -0.02685070037841797, "rewards/rejected": -45.29887771606445, "step": 7297 }, { "epoch": 0.9937363834422658, "grad_norm": 51.82039553136796, "learning_rate": 9.562172996644413e-11, "logits/chosen": 14.24578857421875, "logits/rejected": 14.199302673339844, "logps/chosen": -4.378569602966309, "logps/rejected": -4.585386276245117, "loss": 3.9775, "rewards/accuracies": 0.75, "rewards/chosen": -43.78569793701172, "rewards/margins": 2.0681686401367188, "rewards/rejected": -45.85386657714844, "step": 7298 }, { "epoch": 0.9938725490196079, "grad_norm": 37.942023887248006, "learning_rate": 9.150961010400138e-11, "logits/chosen": 14.280217170715332, "logits/rejected": 15.346075057983398, "logps/chosen": -4.459582328796387, "logps/rejected": -4.925350666046143, "loss": 3.7469, "rewards/accuracies": 0.75, "rewards/chosen": -44.5958251953125, "rewards/margins": 4.657684326171875, "rewards/rejected": -49.253509521484375, "step": 7299 }, { "epoch": 0.9940087145969498, "grad_norm": 38.500239959859634, "learning_rate": 8.748785287062688e-11, "logits/chosen": 14.818262100219727, "logits/rejected": 15.036091804504395, "logps/chosen": -4.595972061157227, "logps/rejected": -4.79238224029541, "loss": 3.9748, "rewards/accuracies": 0.5, "rewards/chosen": -45.959716796875, "rewards/margins": 1.964106559753418, "rewards/rejected": -47.923824310302734, "step": 7300 }, { "epoch": 0.9941448801742919, "grad_norm": 35.797989229765896, "learning_rate": 8.355645917506038e-11, "logits/chosen": 14.409626960754395, "logits/rejected": 14.699498176574707, "logps/chosen": -4.558819770812988, "logps/rejected": -4.770484924316406, "loss": 3.7757, "rewards/accuracies": 0.5, "rewards/chosen": -45.58819580078125, "rewards/margins": 2.1166553497314453, "rewards/rejected": -47.70485305786133, "step": 7301 }, { "epoch": 0.994281045751634, "grad_norm": 42.435198598475864, "learning_rate": 7.971542990570235e-11, "logits/chosen": 13.798765182495117, "logits/rejected": 14.254314422607422, "logps/chosen": -4.332787036895752, "logps/rejected": -4.6398234367370605, "loss": 3.8321, "rewards/accuracies": 0.75, "rewards/chosen": -43.32786560058594, "rewards/margins": 3.070366859436035, "rewards/rejected": -46.398231506347656, "step": 7302 }, { "epoch": 0.994417211328976, "grad_norm": 40.7786702687652, "learning_rate": 7.596476593039191e-11, "logits/chosen": 14.260108947753906, "logits/rejected": 14.459524154663086, "logps/chosen": -4.610584259033203, "logps/rejected": -4.637551307678223, "loss": 4.2982, "rewards/accuracies": 0.5, "rewards/chosen": -46.10584259033203, "rewards/margins": 0.2696695327758789, "rewards/rejected": -46.375511169433594, "step": 7303 }, { "epoch": 0.9945533769063181, "grad_norm": 38.86604068942003, "learning_rate": 7.230446809667334e-11, "logits/chosen": 13.992219924926758, "logits/rejected": 14.79633903503418, "logps/chosen": -4.357625961303711, "logps/rejected": -4.817853927612305, "loss": 3.9559, "rewards/accuracies": 0.75, "rewards/chosen": -43.576263427734375, "rewards/margins": 4.602276802062988, "rewards/rejected": -48.17853927612305, "step": 7304 }, { "epoch": 0.9946895424836601, "grad_norm": 40.20783066967132, "learning_rate": 6.873453723157396e-11, "logits/chosen": 15.008984565734863, "logits/rejected": 15.240389823913574, "logps/chosen": -4.7401275634765625, "logps/rejected": -4.6999382972717285, "loss": 3.9489, "rewards/accuracies": 0.5, "rewards/chosen": -47.40127944946289, "rewards/margins": -0.40189647674560547, "rewards/rejected": -46.99938201904297, "step": 7305 }, { "epoch": 0.9948257080610022, "grad_norm": 40.53197485339444, "learning_rate": 6.525497414178183e-11, "logits/chosen": 13.351837158203125, "logits/rejected": 14.486833572387695, "logps/chosen": -4.218064308166504, "logps/rejected": -4.505767822265625, "loss": 3.8824, "rewards/accuracies": 0.75, "rewards/chosen": -42.180641174316406, "rewards/margins": 2.8770370483398438, "rewards/rejected": -45.05767822265625, "step": 7306 }, { "epoch": 0.9949618736383442, "grad_norm": 40.35689577234063, "learning_rate": 6.186577961351247e-11, "logits/chosen": 14.815193176269531, "logits/rejected": 14.358926773071289, "logps/chosen": -4.458112716674805, "logps/rejected": -4.829065322875977, "loss": 4.0026, "rewards/accuracies": 0.75, "rewards/chosen": -44.58112716674805, "rewards/margins": 3.709527015686035, "rewards/rejected": -48.290653228759766, "step": 7307 }, { "epoch": 0.9950980392156863, "grad_norm": 40.27113591685947, "learning_rate": 5.856695441259774e-11, "logits/chosen": 14.980592727661133, "logits/rejected": 13.33244514465332, "logps/chosen": -4.882611274719238, "logps/rejected": -4.43282413482666, "loss": 4.2254, "rewards/accuracies": 0.25, "rewards/chosen": -48.82611846923828, "rewards/margins": -4.497876167297363, "rewards/rejected": -44.328243255615234, "step": 7308 }, { "epoch": 0.9952342047930284, "grad_norm": 36.72608951760733, "learning_rate": 5.5358499284396956e-11, "logits/chosen": 15.1612548828125, "logits/rejected": 15.166647911071777, "logps/chosen": -4.652132034301758, "logps/rejected": -4.8822174072265625, "loss": 3.7142, "rewards/accuracies": 0.75, "rewards/chosen": -46.521324157714844, "rewards/margins": 2.3008508682250977, "rewards/rejected": -48.822174072265625, "step": 7309 }, { "epoch": 0.9953703703703703, "grad_norm": 42.020553291112925, "learning_rate": 5.224041495397458e-11, "logits/chosen": 13.903036117553711, "logits/rejected": 13.55396842956543, "logps/chosen": -4.3814215660095215, "logps/rejected": -4.427040100097656, "loss": 4.459, "rewards/accuracies": 0.75, "rewards/chosen": -43.81421661376953, "rewards/margins": 0.4561786651611328, "rewards/rejected": -44.2703971862793, "step": 7310 }, { "epoch": 0.9955065359477124, "grad_norm": 40.08946131325485, "learning_rate": 4.9212702125789317e-11, "logits/chosen": 14.39371109008789, "logits/rejected": 14.678483963012695, "logps/chosen": -4.508871078491211, "logps/rejected": -5.049667835235596, "loss": 3.9114, "rewards/accuracies": 1.0, "rewards/chosen": -45.088706970214844, "rewards/margins": 5.4079694747924805, "rewards/rejected": -50.49667739868164, "step": 7311 }, { "epoch": 0.9956427015250545, "grad_norm": 38.06772327906981, "learning_rate": 4.6275361484049426e-11, "logits/chosen": 14.506356239318848, "logits/rejected": 14.175155639648438, "logps/chosen": -4.729473114013672, "logps/rejected": -4.708045959472656, "loss": 4.1539, "rewards/accuracies": 0.5, "rewards/chosen": -47.29473114013672, "rewards/margins": -0.21426868438720703, "rewards/rejected": -47.08046340942383, "step": 7312 }, { "epoch": 0.9957788671023965, "grad_norm": 43.177542022070256, "learning_rate": 4.342839369244622e-11, "logits/chosen": 14.833551406860352, "logits/rejected": 14.3448486328125, "logps/chosen": -4.590737342834473, "logps/rejected": -4.440727710723877, "loss": 4.2607, "rewards/accuracies": 0.25, "rewards/chosen": -45.907371520996094, "rewards/margins": -1.5000953674316406, "rewards/rejected": -44.40727233886719, "step": 7313 }, { "epoch": 0.9959150326797386, "grad_norm": 38.59218079901813, "learning_rate": 4.0671799394242925e-11, "logits/chosen": 14.819326400756836, "logits/rejected": 14.734611511230469, "logps/chosen": -4.646764278411865, "logps/rejected": -4.619941711425781, "loss": 3.4875, "rewards/accuracies": 0.5, "rewards/chosen": -46.46764373779297, "rewards/margins": -0.2682228088378906, "rewards/rejected": -46.19941711425781, "step": 7314 }, { "epoch": 0.9960511982570807, "grad_norm": 42.08400003329124, "learning_rate": 3.800557921236347e-11, "logits/chosen": 14.40906047821045, "logits/rejected": 14.53378677368164, "logps/chosen": -4.631627082824707, "logps/rejected": -4.805278778076172, "loss": 3.6916, "rewards/accuracies": 0.75, "rewards/chosen": -46.3162727355957, "rewards/margins": 1.7365150451660156, "rewards/rejected": -48.05278778076172, "step": 7315 }, { "epoch": 0.9961873638344226, "grad_norm": 42.93377422491057, "learning_rate": 3.542973374925928e-11, "logits/chosen": 14.074183464050293, "logits/rejected": 15.076099395751953, "logps/chosen": -4.639565467834473, "logps/rejected": -5.064151763916016, "loss": 3.7667, "rewards/accuracies": 1.0, "rewards/chosen": -46.395652770996094, "rewards/margins": 4.24586296081543, "rewards/rejected": -50.64151382446289, "step": 7316 }, { "epoch": 0.9963235294117647, "grad_norm": 35.90964671158095, "learning_rate": 3.294426358690927e-11, "logits/chosen": 14.20781135559082, "logits/rejected": 13.865547180175781, "logps/chosen": -4.4285054206848145, "logps/rejected": -4.455842018127441, "loss": 3.5031, "rewards/accuracies": 0.5, "rewards/chosen": -44.28505325317383, "rewards/margins": 0.27336788177490234, "rewards/rejected": -44.55842208862305, "step": 7317 }, { "epoch": 0.9964596949891068, "grad_norm": 35.282646305908884, "learning_rate": 3.0549169286997466e-11, "logits/chosen": 14.469970703125, "logits/rejected": 14.628273010253906, "logps/chosen": -4.013133525848389, "logps/rejected": -4.533455848693848, "loss": 3.8432, "rewards/accuracies": 0.75, "rewards/chosen": -40.1313362121582, "rewards/margins": 5.203225135803223, "rewards/rejected": -45.334564208984375, "step": 7318 }, { "epoch": 0.9965958605664488, "grad_norm": 37.07793020071741, "learning_rate": 2.8244451390646573e-11, "logits/chosen": 14.10767936706543, "logits/rejected": 14.57154655456543, "logps/chosen": -4.664904594421387, "logps/rejected": -4.808395862579346, "loss": 3.8693, "rewards/accuracies": 0.75, "rewards/chosen": -46.649044036865234, "rewards/margins": 1.4349174499511719, "rewards/rejected": -48.083961486816406, "step": 7319 }, { "epoch": 0.9967320261437909, "grad_norm": 42.71615140936315, "learning_rate": 2.6030110418684416e-11, "logits/chosen": 14.462919235229492, "logits/rejected": 14.425313949584961, "logps/chosen": -4.642862319946289, "logps/rejected": -4.739243507385254, "loss": 4.1685, "rewards/accuracies": 0.75, "rewards/chosen": -46.42862319946289, "rewards/margins": 0.9638128280639648, "rewards/rejected": -47.39244079589844, "step": 7320 }, { "epoch": 0.996868191721133, "grad_norm": 37.849084951797764, "learning_rate": 2.3906146871421895e-11, "logits/chosen": 13.880636215209961, "logits/rejected": 15.549806594848633, "logps/chosen": -4.432492256164551, "logps/rejected": -5.017186641693115, "loss": 3.7585, "rewards/accuracies": 1.0, "rewards/chosen": -44.324920654296875, "rewards/margins": 5.846945762634277, "rewards/rejected": -50.17186737060547, "step": 7321 }, { "epoch": 0.9970043572984749, "grad_norm": 40.35730910064567, "learning_rate": 2.1872561228830634e-11, "logits/chosen": 14.893348693847656, "logits/rejected": 15.519269943237305, "logps/chosen": -4.412153244018555, "logps/rejected": -4.6597981452941895, "loss": 3.937, "rewards/accuracies": 0.75, "rewards/chosen": -44.12152862548828, "rewards/margins": 2.4764528274536133, "rewards/rejected": -46.597984313964844, "step": 7322 }, { "epoch": 0.997140522875817, "grad_norm": 38.748991580046265, "learning_rate": 1.9929353950365323e-11, "logits/chosen": 14.806018829345703, "logits/rejected": 14.60831069946289, "logps/chosen": -5.028933048248291, "logps/rejected": -4.844614505767822, "loss": 4.1809, "rewards/accuracies": 0.25, "rewards/chosen": -50.28933334350586, "rewards/margins": -1.8431882858276367, "rewards/rejected": -48.446144104003906, "step": 7323 }, { "epoch": 0.9972766884531591, "grad_norm": 39.02414662252122, "learning_rate": 1.807652547514138e-11, "logits/chosen": 14.907622337341309, "logits/rejected": 14.847326278686523, "logps/chosen": -4.782749176025391, "logps/rejected": -4.792338848114014, "loss": 3.6036, "rewards/accuracies": 0.75, "rewards/chosen": -47.827491760253906, "rewards/margins": 0.09589767456054688, "rewards/rejected": -47.92338562011719, "step": 7324 }, { "epoch": 0.9974128540305011, "grad_norm": 41.83619278900753, "learning_rate": 1.63140762218017e-11, "logits/chosen": 14.610345840454102, "logits/rejected": 15.866451263427734, "logps/chosen": -4.5461554527282715, "logps/rejected": -4.945163726806641, "loss": 3.5994, "rewards/accuracies": 0.75, "rewards/chosen": -45.46155548095703, "rewards/margins": 3.9900779724121094, "rewards/rejected": -49.451629638671875, "step": 7325 }, { "epoch": 0.9975490196078431, "grad_norm": 41.948646490329224, "learning_rate": 1.4642006588605483e-11, "logits/chosen": 14.5538330078125, "logits/rejected": 15.025680541992188, "logps/chosen": -4.65265417098999, "logps/rejected": -5.117241382598877, "loss": 4.2115, "rewards/accuracies": 0.75, "rewards/chosen": -46.52653884887695, "rewards/margins": 4.645874977111816, "rewards/rejected": -51.17241668701172, "step": 7326 }, { "epoch": 0.9976851851851852, "grad_norm": 82.5467972358, "learning_rate": 1.3060316953339424e-11, "logits/chosen": 13.758556365966797, "logits/rejected": 14.656793594360352, "logps/chosen": -4.3975419998168945, "logps/rejected": -4.863970756530762, "loss": 4.7497, "rewards/accuracies": 1.0, "rewards/chosen": -43.97542190551758, "rewards/margins": 4.66428279876709, "rewards/rejected": -48.639705657958984, "step": 7327 }, { "epoch": 0.9978213507625272, "grad_norm": 40.04281653700382, "learning_rate": 1.1569007673450925e-11, "logits/chosen": 15.040964126586914, "logits/rejected": 15.726921081542969, "logps/chosen": -4.693436622619629, "logps/rejected": -5.191343307495117, "loss": 3.2424, "rewards/accuracies": 0.75, "rewards/chosen": -46.93436813354492, "rewards/margins": 4.979065895080566, "rewards/rejected": -51.91343688964844, "step": 7328 }, { "epoch": 0.9979575163398693, "grad_norm": 42.65839102284206, "learning_rate": 1.0168079085870474e-11, "logits/chosen": 14.096973419189453, "logits/rejected": 14.034303665161133, "logps/chosen": -4.388217926025391, "logps/rejected": -4.603744983673096, "loss": 4.1367, "rewards/accuracies": 0.75, "rewards/chosen": -43.88218307495117, "rewards/margins": 2.1552658081054688, "rewards/rejected": -46.037445068359375, "step": 7329 }, { "epoch": 0.9980936819172114, "grad_norm": 38.304408935352015, "learning_rate": 8.857531507144856e-12, "logits/chosen": 13.784433364868164, "logits/rejected": 14.932424545288086, "logps/chosen": -4.527565956115723, "logps/rejected": -4.840949058532715, "loss": 4.4264, "rewards/accuracies": 0.5, "rewards/chosen": -45.275665283203125, "rewards/margins": 3.1338281631469727, "rewards/rejected": -48.40949249267578, "step": 7330 }, { "epoch": 0.9982298474945533, "grad_norm": 40.93791345263137, "learning_rate": 7.637365233437165e-12, "logits/chosen": 14.866808891296387, "logits/rejected": 14.74190616607666, "logps/chosen": -4.492611408233643, "logps/rejected": -4.530085563659668, "loss": 3.6769, "rewards/accuracies": 0.5, "rewards/chosen": -44.926116943359375, "rewards/margins": 0.3747434616088867, "rewards/rejected": -45.30085754394531, "step": 7331 }, { "epoch": 0.9983660130718954, "grad_norm": 39.24557761886659, "learning_rate": 6.507580540437985e-12, "logits/chosen": 15.168508529663086, "logits/rejected": 15.925056457519531, "logps/chosen": -4.745734214782715, "logps/rejected": -5.182828426361084, "loss": 4.3888, "rewards/accuracies": 1.0, "rewards/chosen": -47.45734405517578, "rewards/margins": 4.370939254760742, "rewards/rejected": -51.828285217285156, "step": 7332 }, { "epoch": 0.9985021786492375, "grad_norm": 40.92743052606246, "learning_rate": 5.4681776834097915e-12, "logits/chosen": 14.401067733764648, "logits/rejected": 15.91611099243164, "logps/chosen": -4.3133931159973145, "logps/rejected": -4.714920997619629, "loss": 3.6798, "rewards/accuracies": 0.75, "rewards/chosen": -43.13393020629883, "rewards/margins": 4.015280723571777, "rewards/rejected": -47.14921188354492, "step": 7333 }, { "epoch": 0.9986383442265795, "grad_norm": 36.81468870272292, "learning_rate": 4.519156897275777e-12, "logits/chosen": 14.841726303100586, "logits/rejected": 14.13094711303711, "logps/chosen": -4.499431610107422, "logps/rejected": -4.358752250671387, "loss": 3.6184, "rewards/accuracies": 0.5, "rewards/chosen": -44.994319915771484, "rewards/margins": -1.406794548034668, "rewards/rejected": -43.5875244140625, "step": 7334 }, { "epoch": 0.9987745098039216, "grad_norm": 44.02730451266736, "learning_rate": 3.660518396397805e-12, "logits/chosen": 14.097829818725586, "logits/rejected": 13.821464538574219, "logps/chosen": -4.415345191955566, "logps/rejected": -4.391105651855469, "loss": 3.357, "rewards/accuracies": 0.5, "rewards/chosen": -44.15345001220703, "rewards/margins": -0.24238872528076172, "rewards/rejected": -43.91106033325195, "step": 7335 }, { "epoch": 0.9989106753812637, "grad_norm": 42.719049423357944, "learning_rate": 2.8922623748428577e-12, "logits/chosen": 14.27830982208252, "logits/rejected": 14.38129997253418, "logps/chosen": -4.521233558654785, "logps/rejected": -4.428078651428223, "loss": 4.4576, "rewards/accuracies": 0.5, "rewards/chosen": -45.21233367919922, "rewards/margins": -0.9315481185913086, "rewards/rejected": -44.280784606933594, "step": 7336 }, { "epoch": 0.9990468409586056, "grad_norm": 44.0066139679667, "learning_rate": 2.2143890062054084e-12, "logits/chosen": 14.635753631591797, "logits/rejected": 15.169696807861328, "logps/chosen": -4.737799644470215, "logps/rejected": -4.928046226501465, "loss": 4.512, "rewards/accuracies": 0.5, "rewards/chosen": -47.37799835205078, "rewards/margins": 1.9024639129638672, "rewards/rejected": -49.280460357666016, "step": 7337 }, { "epoch": 0.9991830065359477, "grad_norm": 38.20734169085157, "learning_rate": 1.6268984436074163e-12, "logits/chosen": 14.05511474609375, "logits/rejected": 14.106461524963379, "logps/chosen": -4.421632766723633, "logps/rejected": -4.621228218078613, "loss": 3.6574, "rewards/accuracies": 0.75, "rewards/chosen": -44.21632766723633, "rewards/margins": 1.9959545135498047, "rewards/rejected": -46.2122802734375, "step": 7338 }, { "epoch": 0.9993191721132898, "grad_norm": 39.044343657900534, "learning_rate": 1.1297908198315553e-12, "logits/chosen": 14.001029968261719, "logits/rejected": 15.340462684631348, "logps/chosen": -4.392358779907227, "logps/rejected": -4.791111946105957, "loss": 3.4691, "rewards/accuracies": 0.75, "rewards/chosen": -43.923583984375, "rewards/margins": 3.987532615661621, "rewards/rejected": -47.91111755371094, "step": 7339 }, { "epoch": 0.9994553376906318, "grad_norm": 44.69621684420622, "learning_rate": 7.230662472323957e-13, "logits/chosen": 14.853174209594727, "logits/rejected": 14.980582237243652, "logps/chosen": -4.742242813110352, "logps/rejected": -5.015073299407959, "loss": 4.2667, "rewards/accuracies": 0.75, "rewards/chosen": -47.422428131103516, "rewards/margins": 2.728304862976074, "rewards/rejected": -50.150733947753906, "step": 7340 }, { "epoch": 0.9995915032679739, "grad_norm": 40.24924468279431, "learning_rate": 4.067248176919946e-13, "logits/chosen": 14.594698905944824, "logits/rejected": 15.227733612060547, "logps/chosen": -4.636050701141357, "logps/rejected": -5.03708553314209, "loss": 4.3197, "rewards/accuracies": 1.0, "rewards/chosen": -46.360504150390625, "rewards/margins": 4.01035213470459, "rewards/rejected": -50.37085723876953, "step": 7341 }, { "epoch": 0.9997276688453159, "grad_norm": 41.319553796539765, "learning_rate": 1.8076660266430622e-13, "logits/chosen": 14.643289566040039, "logits/rejected": 14.775410652160645, "logps/chosen": -4.647680759429932, "logps/rejected": -4.723776340484619, "loss": 4.2326, "rewards/accuracies": 0.75, "rewards/chosen": -46.476806640625, "rewards/margins": 0.760955810546875, "rewards/rejected": -47.237762451171875, "step": 7342 }, { "epoch": 0.9998638344226579, "grad_norm": 42.93988687635372, "learning_rate": 4.519165321958951e-14, "logits/chosen": 13.98783016204834, "logits/rejected": 14.517656326293945, "logps/chosen": -4.781124114990234, "logps/rejected": -4.95522403717041, "loss": 4.3694, "rewards/accuracies": 1.0, "rewards/chosen": -47.81124496459961, "rewards/margins": 1.7409992218017578, "rewards/rejected": -49.552242279052734, "step": 7343 }, { "epoch": 1.0, "grad_norm": 45.231449668211056, "learning_rate": 0.0, "logits/chosen": 13.54207992553711, "logits/rejected": 14.750669479370117, "logps/chosen": -4.330604553222656, "logps/rejected": -4.70823335647583, "loss": 3.9729, "rewards/accuracies": 1.0, "rewards/chosen": -43.30604553222656, "rewards/margins": 3.776291847229004, "rewards/rejected": -47.08233642578125, "step": 7344 }, { "epoch": 1.0, "eval_logits/chosen": 14.471516609191895, "eval_logits/rejected": 14.6997652053833, "eval_logps/chosen": -4.607059001922607, "eval_logps/rejected": -4.770802974700928, "eval_loss": 3.942422866821289, "eval_rewards/accuracies": 0.6459948420524597, "eval_rewards/chosen": -46.07059097290039, "eval_rewards/margins": 1.6374331712722778, "eval_rewards/rejected": -47.7080192565918, "eval_runtime": 487.8451, "eval_samples_per_second": 101.415, "eval_steps_per_second": 1.587, "step": 7344 }, { "epoch": 1.0, "step": 7344, "total_flos": 4181538261958656.0, "train_loss": 4.105394068821323, "train_runtime": 31326.6745, "train_samples_per_second": 30.007, "train_steps_per_second": 0.234 } ], "logging_steps": 1.0, "max_steps": 7344, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4181538261958656.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }