{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.996, "eval_steps": 500, "global_step": 83, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012, "grad_norm": 0.8640882968902588, "learning_rate": 5.555555555555555e-08, "logits/chosen": -1.495650291442871, "logits/rejected": -1.3535889387130737, "logps/chosen": -0.10173828899860382, "logps/rejected": -0.08792766183614731, "loss": 0.8717, "rewards/accuracies": 0.3333333730697632, "rewards/chosen": -0.20347657799720764, "rewards/margins": -0.027621246874332428, "rewards/rejected": -0.17585532367229462, "step": 1 }, { "epoch": 0.024, "grad_norm": 0.979682981967926, "learning_rate": 1.111111111111111e-07, "logits/chosen": -1.443913221359253, "logits/rejected": -1.3641024827957153, "logps/chosen": -0.10752908140420914, "logps/rejected": -0.08240076899528503, "loss": 0.8846, "rewards/accuracies": 0.2187500149011612, "rewards/chosen": -0.21505816280841827, "rewards/margins": -0.0502566322684288, "rewards/rejected": -0.16480153799057007, "step": 2 }, { "epoch": 0.036, "grad_norm": 0.9309831857681274, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -1.4732969999313354, "logits/rejected": -1.376213550567627, "logps/chosen": -0.10481980443000793, "logps/rejected": -0.07933054119348526, "loss": 0.8848, "rewards/accuracies": 0.28125, "rewards/chosen": -0.20963960886001587, "rewards/margins": -0.050978537648916245, "rewards/rejected": -0.15866108238697052, "step": 3 }, { "epoch": 0.048, "grad_norm": 0.9682196974754333, "learning_rate": 2.222222222222222e-07, "logits/chosen": -1.5285106897354126, "logits/rejected": -1.3950246572494507, "logps/chosen": -0.11225953698158264, "logps/rejected": -0.08694332838058472, "loss": 0.8849, "rewards/accuracies": 0.3229166865348816, "rewards/chosen": -0.22451907396316528, "rewards/margins": -0.05063238739967346, "rewards/rejected": -0.17388665676116943, "step": 4 }, { "epoch": 0.06, "grad_norm": 1.0820338726043701, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -1.494052767753601, "logits/rejected": -1.3356661796569824, "logps/chosen": -0.12428013980388641, "logps/rejected": -0.09185128659009933, "loss": 0.8968, "rewards/accuracies": 0.322916716337204, "rewards/chosen": -0.24856027960777283, "rewards/margins": -0.06485769897699356, "rewards/rejected": -0.18370257318019867, "step": 5 }, { "epoch": 0.072, "grad_norm": 1.0573477745056152, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.4825295209884644, "logits/rejected": -1.3897716999053955, "logps/chosen": -0.11380095779895782, "logps/rejected": -0.08133920282125473, "loss": 0.8938, "rewards/accuracies": 0.3020833432674408, "rewards/chosen": -0.22760191559791565, "rewards/margins": -0.06492353230714798, "rewards/rejected": -0.16267840564250946, "step": 6 }, { "epoch": 0.084, "grad_norm": 0.9294664263725281, "learning_rate": 3.888888888888889e-07, "logits/chosen": -1.5021216869354248, "logits/rejected": -1.3295238018035889, "logps/chosen": -0.099081851541996, "logps/rejected": -0.08119318634271622, "loss": 0.8761, "rewards/accuracies": 0.2916666567325592, "rewards/chosen": -0.198163703083992, "rewards/margins": -0.03577733412384987, "rewards/rejected": -0.16238637268543243, "step": 7 }, { "epoch": 0.096, "grad_norm": 1.0329580307006836, "learning_rate": 4.444444444444444e-07, "logits/chosen": -1.5048794746398926, "logits/rejected": -1.377795934677124, "logps/chosen": -0.12124787271022797, "logps/rejected": -0.08464659005403519, "loss": 0.9015, "rewards/accuracies": 0.3125, "rewards/chosen": -0.24249574542045593, "rewards/margins": -0.07320256531238556, "rewards/rejected": -0.16929318010807037, "step": 8 }, { "epoch": 0.108, "grad_norm": 0.9767515063285828, "learning_rate": 5e-07, "logits/chosen": -1.5186768770217896, "logits/rejected": -1.3877286911010742, "logps/chosen": -0.10737170279026031, "logps/rejected": -0.0823729932308197, "loss": 0.8847, "rewards/accuracies": 0.3020833432674408, "rewards/chosen": -0.21474340558052063, "rewards/margins": -0.04999742656946182, "rewards/rejected": -0.1647459864616394, "step": 9 }, { "epoch": 0.12, "grad_norm": 0.9653743505477905, "learning_rate": 4.997747415511704e-07, "logits/chosen": -1.4915320873260498, "logits/rejected": -1.3450301885604858, "logps/chosen": -0.10813666880130768, "logps/rejected": -0.08507634699344635, "loss": 0.8822, "rewards/accuracies": 0.2291666716337204, "rewards/chosen": -0.21627333760261536, "rewards/margins": -0.04612065851688385, "rewards/rejected": -0.1701526939868927, "step": 10 }, { "epoch": 0.132, "grad_norm": 0.9533908367156982, "learning_rate": 4.990993721356315e-07, "logits/chosen": -1.4848368167877197, "logits/rejected": -1.3539990186691284, "logps/chosen": -0.11860796809196472, "logps/rejected": -0.08763512969017029, "loss": 0.8931, "rewards/accuracies": 0.25, "rewards/chosen": -0.23721593618392944, "rewards/margins": -0.061945684254169464, "rewards/rejected": -0.17527025938034058, "step": 11 }, { "epoch": 0.144, "grad_norm": 0.9020726084709167, "learning_rate": 4.979751088147191e-07, "logits/chosen": -1.4897594451904297, "logits/rejected": -1.3646252155303955, "logps/chosen": -0.09957993775606155, "logps/rejected": -0.08218874782323837, "loss": 0.8759, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": -0.1991598755121231, "rewards/margins": -0.034782394766807556, "rewards/rejected": -0.16437749564647675, "step": 12 }, { "epoch": 0.156, "grad_norm": 1.0148029327392578, "learning_rate": 4.964039775869271e-07, "logits/chosen": -1.4809292554855347, "logits/rejected": -1.3655236959457397, "logps/chosen": -0.11366236209869385, "logps/rejected": -0.0875682383775711, "loss": 0.8868, "rewards/accuracies": 0.3645833432674408, "rewards/chosen": -0.2273247241973877, "rewards/margins": -0.05218825116753578, "rewards/rejected": -0.1751364767551422, "step": 13 }, { "epoch": 0.168, "grad_norm": 1.334598422050476, "learning_rate": 4.943888097369216e-07, "logits/chosen": -1.4804601669311523, "logits/rejected": -1.3518431186676025, "logps/chosen": -0.13359910249710083, "logps/rejected": -0.08674132823944092, "loss": 0.9236, "rewards/accuracies": 0.2395833432674408, "rewards/chosen": -0.26719820499420166, "rewards/margins": -0.09371551126241684, "rewards/rejected": -0.17348265647888184, "step": 14 }, { "epoch": 0.18, "grad_norm": 0.9834568500518799, "learning_rate": 4.919332367333748e-07, "logits/chosen": -1.4974645376205444, "logits/rejected": -1.3400018215179443, "logps/chosen": -0.10907550156116486, "logps/rejected": -0.08122590184211731, "loss": 0.8879, "rewards/accuracies": 0.21875, "rewards/chosen": -0.2181510031223297, "rewards/margins": -0.05569921433925629, "rewards/rejected": -0.16245180368423462, "step": 15 }, { "epoch": 0.192, "grad_norm": 0.9361069798469543, "learning_rate": 4.890416836848127e-07, "logits/chosen": -1.4610369205474854, "logits/rejected": -1.369497299194336, "logps/chosen": -0.10335300862789154, "logps/rejected": -0.08525583893060684, "loss": 0.8765, "rewards/accuracies": 0.3541666865348816, "rewards/chosen": -0.20670601725578308, "rewards/margins": -0.03619435429573059, "rewards/rejected": -0.17051167786121368, "step": 16 }, { "epoch": 0.204, "grad_norm": 0.910740077495575, "learning_rate": 4.85719361365271e-07, "logits/chosen": -1.5087649822235107, "logits/rejected": -1.3652905225753784, "logps/chosen": -0.09960392862558365, "logps/rejected": -0.08507402241230011, "loss": 0.8719, "rewards/accuracies": 0.3541666865348816, "rewards/chosen": -0.1992078572511673, "rewards/margins": -0.029059793800115585, "rewards/rejected": -0.17014804482460022, "step": 17 }, { "epoch": 0.216, "grad_norm": 0.9233217239379883, "learning_rate": 4.819722568241273e-07, "logits/chosen": -1.4695565700531006, "logits/rejected": -1.336183786392212, "logps/chosen": -0.10343371331691742, "logps/rejected": -0.0857667475938797, "loss": 0.8769, "rewards/accuracies": 0.4166666865348816, "rewards/chosen": -0.20686742663383484, "rewards/margins": -0.03533393144607544, "rewards/rejected": -0.1715334951877594, "step": 18 }, { "epoch": 0.228, "grad_norm": 0.8909419775009155, "learning_rate": 4.778071225970339e-07, "logits/chosen": -1.4759807586669922, "logits/rejected": -1.3495370149612427, "logps/chosen": -0.10235996544361115, "logps/rejected": -0.0851350948214531, "loss": 0.8757, "rewards/accuracies": 0.3229166567325592, "rewards/chosen": -0.2047199308872223, "rewards/margins": -0.03444972261786461, "rewards/rejected": -0.1702701896429062, "step": 19 }, { "epoch": 0.24, "grad_norm": 0.9479613304138184, "learning_rate": 4.732314645373921e-07, "logits/chosen": -1.4673627614974976, "logits/rejected": -1.3651877641677856, "logps/chosen": -0.10552840679883957, "logps/rejected": -0.07911674678325653, "loss": 0.886, "rewards/accuracies": 0.28125, "rewards/chosen": -0.21105681359767914, "rewards/margins": -0.05282333493232727, "rewards/rejected": -0.15823349356651306, "step": 20 }, { "epoch": 0.252, "grad_norm": 0.854266345500946, "learning_rate": 4.68253528290297e-07, "logits/chosen": -1.4877924919128418, "logits/rejected": -1.324515461921692, "logps/chosen": -0.1002732366323471, "logps/rejected": -0.09079625457525253, "loss": 0.8664, "rewards/accuracies": 0.3645833432674408, "rewards/chosen": -0.2005464732646942, "rewards/margins": -0.018953965976834297, "rewards/rejected": -0.18159250915050507, "step": 21 }, { "epoch": 0.264, "grad_norm": 0.9627916812896729, "learning_rate": 4.6288228443332776e-07, "logits/chosen": -1.466447114944458, "logits/rejected": -1.3485172986984253, "logps/chosen": -0.10300128906965256, "logps/rejected": -0.08484764397144318, "loss": 0.8763, "rewards/accuracies": 0.3333333730697632, "rewards/chosen": -0.20600257813930511, "rewards/margins": -0.03630730137228966, "rewards/rejected": -0.16969528794288635, "step": 22 }, { "epoch": 0.276, "grad_norm": 0.8689938187599182, "learning_rate": 4.571274123109605e-07, "logits/chosen": -1.4124252796173096, "logits/rejected": -1.3001394271850586, "logps/chosen": -0.10588695108890533, "logps/rejected": -0.08438257873058319, "loss": 0.8809, "rewards/accuracies": 0.2395833283662796, "rewards/chosen": -0.21177390217781067, "rewards/margins": -0.043008752167224884, "rewards/rejected": -0.16876515746116638, "step": 23 }, { "epoch": 0.288, "grad_norm": 0.9801927208900452, "learning_rate": 4.5099928259173514e-07, "logits/chosen": -1.4522674083709717, "logits/rejected": -1.3461543321609497, "logps/chosen": -0.1134054884314537, "logps/rejected": -0.08870639652013779, "loss": 0.886, "rewards/accuracies": 0.2499999850988388, "rewards/chosen": -0.2268109768629074, "rewards/margins": -0.049398161470890045, "rewards/rejected": -0.17741279304027557, "step": 24 }, { "epoch": 0.3, "grad_norm": 0.9185568690299988, "learning_rate": 4.4450893857960984e-07, "logits/chosen": -1.4601349830627441, "logits/rejected": -1.273384928703308, "logps/chosen": -0.10715562850236893, "logps/rejected": -0.09089934825897217, "loss": 0.8746, "rewards/accuracies": 0.375, "rewards/chosen": -0.21431125700473785, "rewards/margins": -0.03251257538795471, "rewards/rejected": -0.18179869651794434, "step": 25 }, { "epoch": 0.312, "grad_norm": 0.8864983320236206, "learning_rate": 4.3766807631318105e-07, "logits/chosen": -1.438947081565857, "logits/rejected": -1.3086212873458862, "logps/chosen": -0.10850708931684494, "logps/rejected": -0.0892128050327301, "loss": 0.8781, "rewards/accuracies": 0.3229166865348816, "rewards/chosen": -0.21701417863368988, "rewards/margins": -0.038588590919971466, "rewards/rejected": -0.1784256100654602, "step": 26 }, { "epoch": 0.324, "grad_norm": 0.9518214464187622, "learning_rate": 4.3048902348863106e-07, "logits/chosen": -1.4363845586776733, "logits/rejected": -1.3129128217697144, "logps/chosen": -0.10809854418039322, "logps/rejected": -0.08249183744192123, "loss": 0.885, "rewards/accuracies": 0.2812500298023224, "rewards/chosen": -0.21619708836078644, "rewards/margins": -0.05121342092752457, "rewards/rejected": -0.16498367488384247, "step": 27 }, { "epoch": 0.336, "grad_norm": 0.9291889667510986, "learning_rate": 4.2298471724438653e-07, "logits/chosen": -1.4329116344451904, "logits/rejected": -1.3157219886779785, "logps/chosen": -0.10641689598560333, "logps/rejected": -0.0836237370967865, "loss": 0.8825, "rewards/accuracies": 0.3541666567325592, "rewards/chosen": -0.21283379197120667, "rewards/margins": -0.04558631405234337, "rewards/rejected": -0.167247474193573, "step": 28 }, { "epoch": 0.348, "grad_norm": 0.8883063197135925, "learning_rate": 4.151686808475203e-07, "logits/chosen": -1.4277637004852295, "logits/rejected": -1.2657394409179688, "logps/chosen": -0.1044892817735672, "logps/rejected": -0.08842873573303223, "loss": 0.8741, "rewards/accuracies": 0.34375, "rewards/chosen": -0.2089785635471344, "rewards/margins": -0.03212107717990875, "rewards/rejected": -0.17685747146606445, "step": 29 }, { "epoch": 0.36, "grad_norm": 0.9316148161888123, "learning_rate": 4.070549993239106e-07, "logits/chosen": -1.3881316184997559, "logits/rejected": -1.2497737407684326, "logps/chosen": -0.11017285287380219, "logps/rejected": -0.0868031308054924, "loss": 0.8842, "rewards/accuracies": 0.3645833432674408, "rewards/chosen": -0.22034570574760437, "rewards/margins": -0.046739429235458374, "rewards/rejected": -0.1736062616109848, "step": 30 }, { "epoch": 0.372, "grad_norm": 0.8768561482429504, "learning_rate": 3.9865829407607166e-07, "logits/chosen": -1.4147083759307861, "logits/rejected": -1.2981321811676025, "logps/chosen": -0.10104362666606903, "logps/rejected": -0.09066756814718246, "loss": 0.8673, "rewards/accuracies": 0.3437500298023224, "rewards/chosen": -0.20208725333213806, "rewards/margins": -0.020752109587192535, "rewards/rejected": -0.18133513629436493, "step": 31 }, { "epoch": 0.384, "grad_norm": 1.0077239274978638, "learning_rate": 3.8999369653439883e-07, "logits/chosen": -1.4155137538909912, "logits/rejected": -1.2743993997573853, "logps/chosen": -0.10902610421180725, "logps/rejected": -0.0863211378455162, "loss": 0.8827, "rewards/accuracies": 0.3020833432674408, "rewards/chosen": -0.2180522084236145, "rewards/margins": -0.0454099103808403, "rewards/rejected": -0.1726422756910324, "step": 32 }, { "epoch": 0.396, "grad_norm": 0.9207706451416016, "learning_rate": 3.810768208893079e-07, "logits/chosen": -1.3758422136306763, "logits/rejected": -1.2915358543395996, "logps/chosen": -0.1034766435623169, "logps/rejected": -0.07977995276451111, "loss": 0.883, "rewards/accuracies": 0.3020833432674408, "rewards/chosen": -0.2069532871246338, "rewards/margins": -0.047393374145030975, "rewards/rejected": -0.15955990552902222, "step": 33 }, { "epoch": 0.408, "grad_norm": 0.9844294190406799, "learning_rate": 3.7192373595340865e-07, "logits/chosen": -1.442295789718628, "logits/rejected": -1.2863086462020874, "logps/chosen": -0.10710459202528, "logps/rejected": -0.08674141019582748, "loss": 0.8796, "rewards/accuracies": 0.3020833432674408, "rewards/chosen": -0.21420918405056, "rewards/margins": -0.04072638228535652, "rewards/rejected": -0.17348282039165497, "step": 34 }, { "epoch": 0.42, "grad_norm": 0.9682800769805908, "learning_rate": 3.625509362044183e-07, "logits/chosen": -1.3701705932617188, "logits/rejected": -1.2656924724578857, "logps/chosen": -0.11000403016805649, "logps/rejected": -0.08980907499790192, "loss": 0.8803, "rewards/accuracies": 0.3854166567325592, "rewards/chosen": -0.22000806033611298, "rewards/margins": -0.04038992151618004, "rewards/rejected": -0.17961814999580383, "step": 35 }, { "epoch": 0.432, "grad_norm": 0.9004039168357849, "learning_rate": 3.529753120609982e-07, "logits/chosen": -1.4099225997924805, "logits/rejected": -1.252682089805603, "logps/chosen": -0.09707480669021606, "logps/rejected": -0.08859608322381973, "loss": 0.8653, "rewards/accuracies": 0.4479166865348816, "rewards/chosen": -0.19414961338043213, "rewards/margins": -0.016957445070147514, "rewards/rejected": -0.17719216644763947, "step": 36 }, { "epoch": 0.444, "grad_norm": 0.9568714499473572, "learning_rate": 3.4321411944507714e-07, "logits/chosen": -1.3612836599349976, "logits/rejected": -1.2257184982299805, "logps/chosen": -0.10623904317617416, "logps/rejected": -0.09100136905908585, "loss": 0.8734, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.21247808635234833, "rewards/margins": -0.030475351959466934, "rewards/rejected": -0.1820027381181717, "step": 37 }, { "epoch": 0.456, "grad_norm": 0.8658465147018433, "learning_rate": 3.332849486855144e-07, "logits/chosen": -1.381141185760498, "logits/rejected": -1.2495661973953247, "logps/chosen": -0.09047922492027283, "logps/rejected": -0.07975338399410248, "loss": 0.8676, "rewards/accuracies": 0.3958333432674408, "rewards/chosen": -0.18095844984054565, "rewards/margins": -0.021451696753501892, "rewards/rejected": -0.15950676798820496, "step": 38 }, { "epoch": 0.468, "grad_norm": 0.9716626405715942, "learning_rate": 3.2320569281913754e-07, "logits/chosen": -1.3790628910064697, "logits/rejected": -1.251636028289795, "logps/chosen": -0.10486049205064774, "logps/rejected": -0.08277000486850739, "loss": 0.883, "rewards/accuracies": 0.3437500298023224, "rewards/chosen": -0.20972098410129547, "rewards/margins": -0.0441809706389904, "rewards/rejected": -0.16554000973701477, "step": 39 }, { "epoch": 0.48, "grad_norm": 0.948078989982605, "learning_rate": 3.129945153462813e-07, "logits/chosen": -1.3937729597091675, "logits/rejected": -1.2695637941360474, "logps/chosen": -0.09631801396608353, "logps/rejected": -0.08585619181394577, "loss": 0.8673, "rewards/accuracies": 0.3854166865348816, "rewards/chosen": -0.19263602793216705, "rewards/margins": -0.020923633128404617, "rewards/rejected": -0.17171238362789154, "step": 40 }, { "epoch": 0.492, "grad_norm": 0.9445565342903137, "learning_rate": 3.0266981749893154e-07, "logits/chosen": -1.3988057374954224, "logits/rejected": -1.2703980207443237, "logps/chosen": -0.09746824949979782, "logps/rejected": -0.0910995602607727, "loss": 0.8624, "rewards/accuracies": 0.40625, "rewards/chosen": -0.19493649899959564, "rewards/margins": -0.012737366370856762, "rewards/rejected": -0.1821991205215454, "step": 41 }, { "epoch": 0.504, "grad_norm": 1.080769419670105, "learning_rate": 2.922502050804623e-07, "logits/chosen": -1.4094312191009521, "logits/rejected": -1.247127890586853, "logps/chosen": -0.09884171932935715, "logps/rejected": -0.08831708878278732, "loss": 0.8677, "rewards/accuracies": 0.3958333730697632, "rewards/chosen": -0.1976834386587143, "rewards/margins": -0.02104926109313965, "rewards/rejected": -0.17663417756557465, "step": 42 }, { "epoch": 0.516, "grad_norm": 0.9974801540374756, "learning_rate": 2.8175445493671966e-07, "logits/chosen": -1.3815157413482666, "logits/rejected": -1.2270005941390991, "logps/chosen": -0.10922063887119293, "logps/rejected": -0.0900418609380722, "loss": 0.8809, "rewards/accuracies": 0.4583333730697632, "rewards/chosen": -0.21844127774238586, "rewards/margins": -0.03835754841566086, "rewards/rejected": -0.1800837218761444, "step": 43 }, { "epoch": 0.528, "grad_norm": 0.9864545464515686, "learning_rate": 2.712014811188773e-07, "logits/chosen": -1.3654570579528809, "logits/rejected": -1.2366658449172974, "logps/chosen": -0.1124953106045723, "logps/rejected": -0.0964164212346077, "loss": 0.8745, "rewards/accuracies": 0.375, "rewards/chosen": -0.2249906212091446, "rewards/margins": -0.0321577824652195, "rewards/rejected": -0.1928328424692154, "step": 44 }, { "epoch": 0.54, "grad_norm": 0.9367190599441528, "learning_rate": 2.606103007990371e-07, "logits/chosen": -1.3880029916763306, "logits/rejected": -1.2659260034561157, "logps/chosen": -0.09874889254570007, "logps/rejected": -0.09255427867174149, "loss": 0.8628, "rewards/accuracies": 0.4062500298023224, "rewards/chosen": -0.19749778509140015, "rewards/margins": -0.012389198876917362, "rewards/rejected": -0.18510855734348297, "step": 45 }, { "epoch": 0.552, "grad_norm": 0.9902623891830444, "learning_rate": 2.5e-07, "logits/chosen": -1.3836193084716797, "logits/rejected": -1.2308557033538818, "logps/chosen": -0.09668231755495071, "logps/rejected": -0.08627666532993317, "loss": 0.867, "rewards/accuracies": 0.40625, "rewards/chosen": -0.19336463510990143, "rewards/margins": -0.020811304450035095, "rewards/rejected": -0.17255333065986633, "step": 46 }, { "epoch": 0.564, "grad_norm": 1.0337759256362915, "learning_rate": 2.3938969920096296e-07, "logits/chosen": -1.3623703718185425, "logits/rejected": -1.2246556282043457, "logps/chosen": -0.11106079071760178, "logps/rejected": -0.09072640538215637, "loss": 0.8814, "rewards/accuracies": 0.4479166865348816, "rewards/chosen": -0.22212158143520355, "rewards/margins": -0.040668785572052, "rewards/rejected": -0.18145281076431274, "step": 47 }, { "epoch": 0.576, "grad_norm": 0.984899640083313, "learning_rate": 2.2879851888112278e-07, "logits/chosen": -1.3421802520751953, "logits/rejected": -1.198030710220337, "logps/chosen": -0.10126922279596329, "logps/rejected": -0.09068246185779572, "loss": 0.8687, "rewards/accuracies": 0.3958333730697632, "rewards/chosen": -0.20253844559192657, "rewards/margins": -0.021173518151044846, "rewards/rejected": -0.18136492371559143, "step": 48 }, { "epoch": 0.588, "grad_norm": 1.0588122606277466, "learning_rate": 2.182455450632803e-07, "logits/chosen": -1.3651273250579834, "logits/rejected": -1.2209078073501587, "logps/chosen": -0.10214084386825562, "logps/rejected": -0.09154266119003296, "loss": 0.8675, "rewards/accuracies": 0.4166666865348816, "rewards/chosen": -0.20428168773651123, "rewards/margins": -0.02119637280702591, "rewards/rejected": -0.18308532238006592, "step": 49 }, { "epoch": 0.6, "grad_norm": 1.0050568580627441, "learning_rate": 2.0774979491953776e-07, "logits/chosen": -1.3634512424468994, "logits/rejected": -1.1983171701431274, "logps/chosen": -0.09622293710708618, "logps/rejected": -0.09168636053800583, "loss": 0.8605, "rewards/accuracies": 0.4479166865348816, "rewards/chosen": -0.19244587421417236, "rewards/margins": -0.009073152206838131, "rewards/rejected": -0.18337272107601166, "step": 50 }, { "epoch": 0.612, "grad_norm": 1.003779411315918, "learning_rate": 1.973301825010685e-07, "logits/chosen": -1.377637267112732, "logits/rejected": -1.2013237476348877, "logps/chosen": -0.09089501947164536, "logps/rejected": -0.09312086552381516, "loss": 0.8524, "rewards/accuracies": 0.5520833730697632, "rewards/chosen": -0.1817900389432907, "rewards/margins": 0.004451685585081577, "rewards/rejected": -0.1862417310476303, "step": 51 }, { "epoch": 0.624, "grad_norm": 1.0647156238555908, "learning_rate": 1.8700548465371873e-07, "logits/chosen": -1.3391690254211426, "logits/rejected": -1.182291030883789, "logps/chosen": -0.10368049144744873, "logps/rejected": -0.09800291061401367, "loss": 0.8624, "rewards/accuracies": 0.4895833134651184, "rewards/chosen": -0.20736098289489746, "rewards/margins": -0.011355183087289333, "rewards/rejected": -0.19600582122802734, "step": 52 }, { "epoch": 0.636, "grad_norm": 1.2042913436889648, "learning_rate": 1.767943071808624e-07, "logits/chosen": -1.3675341606140137, "logits/rejected": -1.2020319700241089, "logps/chosen": -0.10732070356607437, "logps/rejected": -0.09919053316116333, "loss": 0.871, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": -0.21464140713214874, "rewards/margins": -0.01626037061214447, "rewards/rejected": -0.19838106632232666, "step": 53 }, { "epoch": 0.648, "grad_norm": 0.9795694351196289, "learning_rate": 1.667150513144856e-07, "logits/chosen": -1.3203659057617188, "logits/rejected": -1.1565968990325928, "logps/chosen": -0.09574580192565918, "logps/rejected": -0.09623640775680542, "loss": 0.8547, "rewards/accuracies": 0.5520833730697632, "rewards/chosen": -0.19149160385131836, "rewards/margins": 0.000981215387582779, "rewards/rejected": -0.19247281551361084, "step": 54 }, { "epoch": 0.66, "grad_norm": 1.0732730627059937, "learning_rate": 1.5678588055492286e-07, "logits/chosen": -1.2979214191436768, "logits/rejected": -1.150040626525879, "logps/chosen": -0.10067766904830933, "logps/rejected": -0.1018737256526947, "loss": 0.8545, "rewards/accuracies": 0.5104166865348816, "rewards/chosen": -0.20135533809661865, "rewards/margins": 0.0023921187967061996, "rewards/rejected": -0.2037474513053894, "step": 55 }, { "epoch": 0.672, "grad_norm": 0.9587694406509399, "learning_rate": 1.4702468793900186e-07, "logits/chosen": -1.3606462478637695, "logits/rejected": -1.1573679447174072, "logps/chosen": -0.09973961114883423, "logps/rejected": -0.10131655633449554, "loss": 0.8535, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19947922229766846, "rewards/margins": 0.003153874073177576, "rewards/rejected": -0.2026331126689911, "step": 56 }, { "epoch": 0.684, "grad_norm": 1.0398699045181274, "learning_rate": 1.3744906379558164e-07, "logits/chosen": -1.3375906944274902, "logits/rejected": -1.1461068391799927, "logps/chosen": -0.10326816886663437, "logps/rejected": -0.10188017785549164, "loss": 0.857, "rewards/accuracies": 0.4687500298023224, "rewards/chosen": -0.20653633773326874, "rewards/margins": -0.002775975503027439, "rewards/rejected": -0.20376035571098328, "step": 57 }, { "epoch": 0.696, "grad_norm": 1.0405802726745605, "learning_rate": 1.280762640465914e-07, "logits/chosen": -1.3368816375732422, "logits/rejected": -1.1643409729003906, "logps/chosen": -0.09207028895616531, "logps/rejected": -0.09347332268953323, "loss": 0.8534, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": -0.18414057791233063, "rewards/margins": 0.002806063275784254, "rewards/rejected": -0.18694664537906647, "step": 58 }, { "epoch": 0.708, "grad_norm": 1.0253372192382812, "learning_rate": 1.189231791106921e-07, "logits/chosen": -1.2978026866912842, "logits/rejected": -1.13652765750885, "logps/chosen": -0.10593652725219727, "logps/rejected": -0.10165742039680481, "loss": 0.8609, "rewards/accuracies": 0.479166716337204, "rewards/chosen": -0.21187305450439453, "rewards/margins": -0.008558189496397972, "rewards/rejected": -0.20331484079360962, "step": 59 }, { "epoch": 0.72, "grad_norm": 1.2529534101486206, "learning_rate": 1.1000630346560116e-07, "logits/chosen": -1.3010480403900146, "logits/rejected": -1.133022665977478, "logps/chosen": -0.11126932501792908, "logps/rejected": -0.09786901623010635, "loss": 0.8738, "rewards/accuracies": 0.4895833432674408, "rewards/chosen": -0.22253865003585815, "rewards/margins": -0.026800617575645447, "rewards/rejected": -0.1957380324602127, "step": 60 }, { "epoch": 0.732, "grad_norm": 0.9133132696151733, "learning_rate": 1.0134170592392836e-07, "logits/chosen": -1.3394014835357666, "logits/rejected": -1.1518943309783936, "logps/chosen": -0.09960196912288666, "logps/rejected": -0.10244160890579224, "loss": 0.8521, "rewards/accuracies": 0.5104166865348816, "rewards/chosen": -0.19920393824577332, "rewards/margins": 0.005679287016391754, "rewards/rejected": -0.20488321781158447, "step": 61 }, { "epoch": 0.744, "grad_norm": 1.0934334993362427, "learning_rate": 9.29450006760894e-08, "logits/chosen": -1.3294323682785034, "logits/rejected": -1.1431366205215454, "logps/chosen": -0.10188900679349899, "logps/rejected": -0.09653942286968231, "loss": 0.8616, "rewards/accuracies": 0.5, "rewards/chosen": -0.20377801358699799, "rewards/margins": -0.010699168778955936, "rewards/rejected": -0.19307884573936462, "step": 62 }, { "epoch": 0.756, "grad_norm": 0.9888376593589783, "learning_rate": 8.483131915247967e-08, "logits/chosen": -1.3347070217132568, "logits/rejected": -1.167306661605835, "logps/chosen": -0.10506478697061539, "logps/rejected": -0.1007775291800499, "loss": 0.8603, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": -0.21012957394123077, "rewards/margins": -0.008574524894356728, "rewards/rejected": -0.2015550583600998, "step": 63 }, { "epoch": 0.768, "grad_norm": 0.9486240744590759, "learning_rate": 7.701528275561347e-08, "logits/chosen": -1.3588067293167114, "logits/rejected": -1.1604478359222412, "logps/chosen": -0.0977708101272583, "logps/rejected": -0.09753303229808807, "loss": 0.8555, "rewards/accuracies": 0.5312500596046448, "rewards/chosen": -0.1955416202545166, "rewards/margins": -0.0004755451809614897, "rewards/rejected": -0.19506606459617615, "step": 64 }, { "epoch": 0.78, "grad_norm": 1.033793568611145, "learning_rate": 6.951097651136889e-08, "logits/chosen": -1.3951979875564575, "logits/rejected": -1.125361680984497, "logps/chosen": -0.10307514667510986, "logps/rejected": -0.1052648052573204, "loss": 0.8525, "rewards/accuracies": 0.4895833432674408, "rewards/chosen": -0.20615029335021973, "rewards/margins": 0.0043793064542114735, "rewards/rejected": -0.2105296105146408, "step": 65 }, { "epoch": 0.792, "grad_norm": 1.0308159589767456, "learning_rate": 6.233192368681889e-08, "logits/chosen": -1.3253390789031982, "logits/rejected": -1.1783477067947388, "logps/chosen": -0.09217476844787598, "logps/rejected": -0.09416632354259491, "loss": 0.8527, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18434953689575195, "rewards/margins": 0.003983109723776579, "rewards/rejected": -0.18833264708518982, "step": 66 }, { "epoch": 0.804, "grad_norm": 1.1081663370132446, "learning_rate": 5.5491061420390174e-08, "logits/chosen": -1.2740880250930786, "logits/rejected": -1.152024269104004, "logps/chosen": -0.10923092067241669, "logps/rejected": -0.10176312923431396, "loss": 0.8642, "rewards/accuracies": 0.5104166865348816, "rewards/chosen": -0.21846184134483337, "rewards/margins": -0.01493558008223772, "rewards/rejected": -0.20352625846862793, "step": 67 }, { "epoch": 0.816, "grad_norm": 1.0150699615478516, "learning_rate": 4.900071740826489e-08, "logits/chosen": -1.348282814025879, "logits/rejected": -1.1383142471313477, "logps/chosen": -0.10064545273780823, "logps/rejected": -0.10860613733530045, "loss": 0.8459, "rewards/accuracies": 0.6041667461395264, "rewards/chosen": -0.20129090547561646, "rewards/margins": 0.01592138595879078, "rewards/rejected": -0.2172122746706009, "step": 68 }, { "epoch": 0.828, "grad_norm": 1.123404860496521, "learning_rate": 4.287258768903948e-08, "logits/chosen": -1.3442084789276123, "logits/rejected": -1.146917462348938, "logps/chosen": -0.10861781984567642, "logps/rejected": -0.10071661323308945, "loss": 0.8688, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": -0.21723563969135284, "rewards/margins": -0.015802428126335144, "rewards/rejected": -0.2014332264661789, "step": 69 }, { "epoch": 0.84, "grad_norm": 0.9981204867362976, "learning_rate": 3.7117715566672176e-08, "logits/chosen": -1.3524047136306763, "logits/rejected": -1.1452308893203735, "logps/chosen": -0.10508691519498825, "logps/rejected": -0.10403452813625336, "loss": 0.8568, "rewards/accuracies": 0.5729166865348816, "rewards/chosen": -0.2101738303899765, "rewards/margins": -0.0021047808695584536, "rewards/rejected": -0.2080690562725067, "step": 70 }, { "epoch": 0.852, "grad_norm": 1.0365066528320312, "learning_rate": 3.174647170970296e-08, "logits/chosen": -1.3592528104782104, "logits/rejected": -1.1138660907745361, "logps/chosen": -0.10330555588006973, "logps/rejected": -0.1064281240105629, "loss": 0.8524, "rewards/accuracies": 0.5729166865348816, "rewards/chosen": -0.20661111176013947, "rewards/margins": 0.006245152093470097, "rewards/rejected": -0.2128562480211258, "step": 71 }, { "epoch": 0.864, "grad_norm": 1.076464295387268, "learning_rate": 2.6768535462607905e-08, "logits/chosen": -1.3058511018753052, "logits/rejected": -1.126082420349121, "logps/chosen": -0.10569944232702255, "logps/rejected": -0.11077728122472763, "loss": 0.8494, "rewards/accuracies": 0.5, "rewards/chosen": -0.2113988846540451, "rewards/margins": 0.01015565823763609, "rewards/rejected": -0.22155456244945526, "step": 72 }, { "epoch": 0.876, "grad_norm": 0.9264240264892578, "learning_rate": 2.2192877402966048e-08, "logits/chosen": -1.3490333557128906, "logits/rejected": -1.1134393215179443, "logps/chosen": -0.10448520630598068, "logps/rejected": -0.11283887922763824, "loss": 0.8457, "rewards/accuracies": 0.6145833730697632, "rewards/chosen": -0.20897041261196136, "rewards/margins": 0.016707373782992363, "rewards/rejected": -0.2256777584552765, "step": 73 }, { "epoch": 0.888, "grad_norm": 1.067218542098999, "learning_rate": 1.8027743175872662e-08, "logits/chosen": -1.345249891281128, "logits/rejected": -1.1063634157180786, "logps/chosen": -0.11241614818572998, "logps/rejected": -0.11045221984386444, "loss": 0.8579, "rewards/accuracies": 0.5104166865348816, "rewards/chosen": -0.22483229637145996, "rewards/margins": -0.003927857149392366, "rewards/rejected": -0.22090443968772888, "step": 74 }, { "epoch": 0.9, "grad_norm": 1.0710501670837402, "learning_rate": 1.4280638634728948e-08, "logits/chosen": -1.3244132995605469, "logits/rejected": -1.1485953330993652, "logps/chosen": -0.11838357150554657, "logps/rejected": -0.10926854610443115, "loss": 0.8683, "rewards/accuracies": 0.47916674613952637, "rewards/chosen": -0.23676714301109314, "rewards/margins": -0.018230034038424492, "rewards/rejected": -0.2185370922088623, "step": 75 }, { "epoch": 0.912, "grad_norm": 1.1065670251846313, "learning_rate": 1.0958316315187289e-08, "logits/chosen": -1.317086100578308, "logits/rejected": -1.1413421630859375, "logps/chosen": -0.10257872194051743, "logps/rejected": -0.10372138023376465, "loss": 0.8542, "rewards/accuracies": 0.5416666269302368, "rewards/chosen": -0.20515744388103485, "rewards/margins": 0.0022853193804621696, "rewards/rejected": -0.2074427604675293, "step": 76 }, { "epoch": 0.924, "grad_norm": 1.0825769901275635, "learning_rate": 8.066763266625282e-09, "logits/chosen": -1.3573386669158936, "logits/rejected": -1.109717607498169, "logps/chosen": -0.10758916288614273, "logps/rejected": -0.10599493980407715, "loss": 0.8597, "rewards/accuracies": 0.6250000596046448, "rewards/chosen": -0.21517832577228546, "rewards/margins": -0.0031884238123893738, "rewards/rejected": -0.2119898796081543, "step": 77 }, { "epoch": 0.936, "grad_norm": 0.9156083464622498, "learning_rate": 5.611190263078463e-09, "logits/chosen": -1.309991717338562, "logits/rejected": -1.1210401058197021, "logps/chosen": -0.0973024070262909, "logps/rejected": -0.09729278087615967, "loss": 0.8551, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": -0.1946048140525818, "rewards/margins": -1.9263941794633865e-05, "rewards/rejected": -0.19458556175231934, "step": 78 }, { "epoch": 0.948, "grad_norm": 0.9994163513183594, "learning_rate": 3.5960224130728858e-09, "logits/chosen": -1.3023028373718262, "logits/rejected": -1.1124149560928345, "logps/chosen": -0.09809858351945877, "logps/rejected": -0.1080770194530487, "loss": 0.8436, "rewards/accuracies": 0.6354166865348816, "rewards/chosen": -0.19619716703891754, "rewards/margins": 0.01995689421892166, "rewards/rejected": -0.2161540389060974, "step": 79 }, { "epoch": 0.96, "grad_norm": 0.9938739538192749, "learning_rate": 2.0248911852807917e-09, "logits/chosen": -1.3484057188034058, "logits/rejected": -1.0871906280517578, "logps/chosen": -0.11684219539165497, "logps/rejected": -0.11286689341068268, "loss": 0.8601, "rewards/accuracies": 0.5104166865348816, "rewards/chosen": -0.23368439078330994, "rewards/margins": -0.007950600236654282, "rewards/rejected": -0.22573378682136536, "step": 80 }, { "epoch": 0.972, "grad_norm": 0.9481876492500305, "learning_rate": 9.006278643683696e-10, "logits/chosen": -1.3169337511062622, "logits/rejected": -1.1376291513442993, "logps/chosen": -0.09936561435461044, "logps/rejected": -0.10312428325414658, "loss": 0.8508, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": -0.1987312287092209, "rewards/margins": 0.007517362013459206, "rewards/rejected": -0.20624856650829315, "step": 81 }, { "epoch": 0.984, "grad_norm": 1.0620635747909546, "learning_rate": 2.2525844882964606e-10, "logits/chosen": -1.3467659950256348, "logits/rejected": -1.1462361812591553, "logps/chosen": -0.10522940754890442, "logps/rejected": -0.10153805464506149, "loss": 0.8594, "rewards/accuracies": 0.4687500298023224, "rewards/chosen": -0.21045881509780884, "rewards/margins": -0.007382689975202084, "rewards/rejected": -0.20307610929012299, "step": 82 }, { "epoch": 0.996, "grad_norm": 1.0221400260925293, "learning_rate": 0.0, "logits/chosen": -1.313194751739502, "logits/rejected": -1.164141297340393, "logps/chosen": -0.10121379047632217, "logps/rejected": -0.1018705815076828, "loss": 0.8549, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": -0.20242758095264435, "rewards/margins": 0.0013135506305843592, "rewards/rejected": -0.2037411630153656, "step": 83 } ], "logging_steps": 1, "max_steps": 83, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 305651334512640.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }