diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10242 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6793, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014721036360959813, + "grad_norm": 29.71694655784319, + "learning_rate": 7.352941176470588e-10, + "logits/chosen": -3.0151314735412598, + "logits/rejected": -2.7100119590759277, + "logps/chosen": -416.9676208496094, + "logps/rejected": -87.89925384521484, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0014721036360959812, + "grad_norm": 34.535653275362776, + "learning_rate": 7.352941176470588e-09, + "logits/chosen": -2.727412223815918, + "logits/rejected": -2.6961448192596436, + "logps/chosen": -329.0840148925781, + "logps/rejected": -229.00257873535156, + "loss": 0.6936, + "rewards/accuracies": 0.40740740299224854, + "rewards/chosen": -0.0005627116188406944, + "rewards/margins": -0.00010472683788975701, + "rewards/rejected": -0.00045798480277881026, + "step": 10 + }, + { + "epoch": 0.0029442072721919624, + "grad_norm": 29.908710096281844, + "learning_rate": 1.4705882352941176e-08, + "logits/chosen": -2.7378716468811035, + "logits/rejected": -2.7686526775360107, + "logps/chosen": -257.4278564453125, + "logps/rejected": -231.632080078125, + "loss": 0.6931, + "rewards/accuracies": 0.5333333015441895, + "rewards/chosen": -0.0004971561720594764, + "rewards/margins": -0.00041913543827831745, + "rewards/rejected": -7.802071195328608e-05, + "step": 20 + }, + { + "epoch": 0.004416310908287944, + "grad_norm": 27.09919921453225, + "learning_rate": 2.2058823529411764e-08, + "logits/chosen": -2.8206026554107666, + "logits/rejected": -2.8088622093200684, + "logps/chosen": -202.65711975097656, + "logps/rejected": -208.73812866210938, + "loss": 0.6931, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 0.00027185474755242467, + "rewards/margins": -0.00018912216182798147, + "rewards/rejected": 0.00046097690938040614, + "step": 30 + }, + { + "epoch": 0.005888414544383925, + "grad_norm": 27.7092064836425, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -2.7895326614379883, + "logits/rejected": -2.736685276031494, + "logps/chosen": -244.48049926757812, + "logps/rejected": -392.7850646972656, + "loss": 0.6939, + "rewards/accuracies": 0.36666661500930786, + "rewards/chosen": -0.0002304160298081115, + "rewards/margins": -0.0012445717584341764, + "rewards/rejected": 0.0010141555685549974, + "step": 40 + }, + { + "epoch": 0.007360518180479906, + "grad_norm": 37.93822174845386, + "learning_rate": 3.676470588235294e-08, + "logits/chosen": -2.7564077377319336, + "logits/rejected": -2.781769275665283, + "logps/chosen": -273.69781494140625, + "logps/rejected": -217.20272827148438, + "loss": 0.693, + "rewards/accuracies": 0.43333330750465393, + "rewards/chosen": 0.00022456480655819178, + "rewards/margins": 0.00015951957902871072, + "rewards/rejected": 6.504535849671811e-05, + "step": 50 + }, + { + "epoch": 0.008832621816575887, + "grad_norm": 34.57174340457277, + "learning_rate": 4.411764705882353e-08, + "logits/chosen": -2.8400521278381348, + "logits/rejected": -2.829881191253662, + "logps/chosen": -320.5490417480469, + "logps/rejected": -330.97857666015625, + "loss": 0.6931, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": 0.001756187528371811, + "rewards/margins": 0.0016124255489557981, + "rewards/rejected": 0.00014376183389686048, + "step": 60 + }, + { + "epoch": 0.010304725452671868, + "grad_norm": 30.530910743543075, + "learning_rate": 5.147058823529411e-08, + "logits/chosen": -2.6563546657562256, + "logits/rejected": -2.630021572113037, + "logps/chosen": -247.36734008789062, + "logps/rejected": -297.99725341796875, + "loss": 0.6929, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 0.0011637945426627994, + "rewards/margins": 0.0010188401211053133, + "rewards/rejected": 0.00014495445066131651, + "step": 70 + }, + { + "epoch": 0.01177682908876785, + "grad_norm": 29.416834589159407, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -2.8869853019714355, + "logits/rejected": -2.8161821365356445, + "logps/chosen": -254.61740112304688, + "logps/rejected": -228.11074829101562, + "loss": 0.6926, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 0.0010433337884023786, + "rewards/margins": 0.0009213036973960698, + "rewards/rejected": 0.00012202991638332605, + "step": 80 + }, + { + "epoch": 0.01324893272486383, + "grad_norm": 28.09685138024903, + "learning_rate": 6.617647058823529e-08, + "logits/chosen": -2.891179084777832, + "logits/rejected": -2.8563899993896484, + "logps/chosen": -297.3827819824219, + "logps/rejected": -230.2034912109375, + "loss": 0.6923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00202867086045444, + "rewards/margins": 0.0011681572068482637, + "rewards/rejected": 0.0008605137700214982, + "step": 90 + }, + { + "epoch": 0.014721036360959812, + "grad_norm": 29.49011374462076, + "learning_rate": 7.352941176470588e-08, + "logits/chosen": -2.863232374191284, + "logits/rejected": -2.786543607711792, + "logps/chosen": -256.5393981933594, + "logps/rejected": -211.46884155273438, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0021780661772936583, + "rewards/margins": 0.0021833530627191067, + "rewards/rejected": -5.28654800291406e-06, + "step": 100 + }, + { + "epoch": 0.016193139997055794, + "grad_norm": 28.81884553022271, + "learning_rate": 8.088235294117647e-08, + "logits/chosen": -2.9011025428771973, + "logits/rejected": -2.868657112121582, + "logps/chosen": -357.44110107421875, + "logps/rejected": -284.5263977050781, + "loss": 0.691, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": 0.007345405872911215, + "rewards/margins": 0.0037629674188792706, + "rewards/rejected": 0.0035824389196932316, + "step": 110 + }, + { + "epoch": 0.017665243633151775, + "grad_norm": 36.10593854637803, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -2.7670390605926514, + "logits/rejected": -2.7676827907562256, + "logps/chosen": -250.05722045898438, + "logps/rejected": -270.88385009765625, + "loss": 0.6926, + "rewards/accuracies": 0.4333333373069763, + "rewards/chosen": 0.0035268026404082775, + "rewards/margins": -0.0004414740833453834, + "rewards/rejected": 0.0039682770147919655, + "step": 120 + }, + { + "epoch": 0.019137347269247755, + "grad_norm": 32.85577262609588, + "learning_rate": 9.558823529411763e-08, + "logits/chosen": -2.840751886367798, + "logits/rejected": -2.810009717941284, + "logps/chosen": -284.97601318359375, + "logps/rejected": -217.7137908935547, + "loss": 0.6911, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": 0.009775317274034023, + "rewards/margins": 0.005667726509273052, + "rewards/rejected": 0.004107590764760971, + "step": 130 + }, + { + "epoch": 0.020609450905343735, + "grad_norm": 32.37082512191622, + "learning_rate": 1.0294117647058822e-07, + "logits/chosen": -2.876713275909424, + "logits/rejected": -2.8541202545166016, + "logps/chosen": -194.10369873046875, + "logps/rejected": -192.15049743652344, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.011691203340888023, + "rewards/margins": 0.007254776544868946, + "rewards/rejected": 0.004436427261680365, + "step": 140 + }, + { + "epoch": 0.022081554541439716, + "grad_norm": 32.713515745439416, + "learning_rate": 1.1029411764705881e-07, + "logits/chosen": -2.83595609664917, + "logits/rejected": -2.730562925338745, + "logps/chosen": -262.994873046875, + "logps/rejected": -223.1656494140625, + "loss": 0.6898, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": 0.01583312824368477, + "rewards/margins": 0.0072602806612849236, + "rewards/rejected": 0.00857284665107727, + "step": 150 + }, + { + "epoch": 0.0235536581775357, + "grad_norm": 26.630848668083054, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -2.871014356613159, + "logits/rejected": -2.7833645343780518, + "logps/chosen": -224.0769500732422, + "logps/rejected": -213.2074737548828, + "loss": 0.6885, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": 0.015150552615523338, + "rewards/margins": 0.0032093841582536697, + "rewards/rejected": 0.011941169388592243, + "step": 160 + }, + { + "epoch": 0.02502576181363168, + "grad_norm": 42.18336051297135, + "learning_rate": 1.25e-07, + "logits/chosen": -2.8010082244873047, + "logits/rejected": -2.8119544982910156, + "logps/chosen": -282.47772216796875, + "logps/rejected": -246.34597778320312, + "loss": 0.6866, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": 0.03179505467414856, + "rewards/margins": 0.016844308003783226, + "rewards/rejected": 0.014950746670365334, + "step": 170 + }, + { + "epoch": 0.02649786544972766, + "grad_norm": 32.631074502994984, + "learning_rate": 1.3235294117647057e-07, + "logits/chosen": -2.873687982559204, + "logits/rejected": -2.865361452102661, + "logps/chosen": -198.8431396484375, + "logps/rejected": -216.23617553710938, + "loss": 0.6861, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": 0.031059423461556435, + "rewards/margins": 0.01364596001803875, + "rewards/rejected": 0.017413463443517685, + "step": 180 + }, + { + "epoch": 0.02796996908582364, + "grad_norm": 27.73329421596593, + "learning_rate": 1.3970588235294117e-07, + "logits/chosen": -2.844758987426758, + "logits/rejected": -2.853353977203369, + "logps/chosen": -238.2511749267578, + "logps/rejected": -254.4850616455078, + "loss": 0.6842, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": 0.03200041130185127, + "rewards/margins": 0.012178768403828144, + "rewards/rejected": 0.019821647554636, + "step": 190 + }, + { + "epoch": 0.029442072721919624, + "grad_norm": 30.97495494751351, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -2.879666805267334, + "logits/rejected": -2.810473680496216, + "logps/chosen": -319.57366943359375, + "logps/rejected": -248.18765258789062, + "loss": 0.6824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04438554495573044, + "rewards/margins": 0.02779371663928032, + "rewards/rejected": 0.01659182831645012, + "step": 200 + }, + { + "epoch": 0.030914176358015605, + "grad_norm": 34.91623113017306, + "learning_rate": 1.5441176470588236e-07, + "logits/chosen": -2.7581286430358887, + "logits/rejected": -2.750488758087158, + "logps/chosen": -250.46127319335938, + "logps/rejected": -204.35153198242188, + "loss": 0.676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04570077359676361, + "rewards/margins": 0.029080908745527267, + "rewards/rejected": 0.016619862988591194, + "step": 210 + }, + { + "epoch": 0.03238627999411159, + "grad_norm": 41.31561569841407, + "learning_rate": 1.6176470588235293e-07, + "logits/chosen": -2.755706787109375, + "logits/rejected": -2.7304928302764893, + "logps/chosen": -241.81375122070312, + "logps/rejected": -188.73704528808594, + "loss": 0.6707, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.042121924459934235, + "rewards/margins": 0.03466057404875755, + "rewards/rejected": 0.007461345288902521, + "step": 220 + }, + { + "epoch": 0.033858383630207566, + "grad_norm": 35.51770057406444, + "learning_rate": 1.6911764705882354e-07, + "logits/chosen": -2.813176155090332, + "logits/rejected": -2.7579312324523926, + "logps/chosen": -346.13604736328125, + "logps/rejected": -255.13241577148438, + "loss": 0.6761, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 0.06457143276929855, + "rewards/margins": 0.04571588709950447, + "rewards/rejected": 0.01885554939508438, + "step": 230 + }, + { + "epoch": 0.03533048726630355, + "grad_norm": 24.8754627333027, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -2.8333022594451904, + "logits/rejected": -2.7649919986724854, + "logps/chosen": -259.2283630371094, + "logps/rejected": -253.58511352539062, + "loss": 0.6749, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": 0.038435179740190506, + "rewards/margins": 0.04031110554933548, + "rewards/rejected": -0.0018759273225441575, + "step": 240 + }, + { + "epoch": 0.036802590902399526, + "grad_norm": 31.815597060594076, + "learning_rate": 1.8382352941176472e-07, + "logits/chosen": -2.6839306354522705, + "logits/rejected": -2.724191188812256, + "logps/chosen": -238.89254760742188, + "logps/rejected": -240.45687866210938, + "loss": 0.6756, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.033027391880750656, + "rewards/margins": 0.04784069582819939, + "rewards/rejected": -0.014813309535384178, + "step": 250 + }, + { + "epoch": 0.03827469453849551, + "grad_norm": 29.651634745240163, + "learning_rate": 1.9117647058823527e-07, + "logits/chosen": -2.8824594020843506, + "logits/rejected": -2.77995228767395, + "logps/chosen": -266.725341796875, + "logps/rejected": -222.70541381835938, + "loss": 0.6727, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -0.005157656967639923, + "rewards/margins": 0.012122317217290401, + "rewards/rejected": -0.01727997325360775, + "step": 260 + }, + { + "epoch": 0.039746798174591494, + "grad_norm": 26.82257436725992, + "learning_rate": 1.9852941176470587e-07, + "logits/chosen": -2.8001790046691895, + "logits/rejected": -2.848738193511963, + "logps/chosen": -292.6630859375, + "logps/rejected": -280.6173400878906, + "loss": 0.6592, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02509542740881443, + "rewards/margins": 0.03882851451635361, + "rewards/rejected": -0.013733088970184326, + "step": 270 + }, + { + "epoch": 0.04121890181068747, + "grad_norm": 30.8834323285742, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -2.8407390117645264, + "logits/rejected": -2.7819266319274902, + "logps/chosen": -315.1943359375, + "logps/rejected": -288.6021728515625, + "loss": 0.6535, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.059492699801921844, + "rewards/margins": 0.07932732254266739, + "rewards/rejected": -0.019834617152810097, + "step": 280 + }, + { + "epoch": 0.042691005446783455, + "grad_norm": 38.738234060325915, + "learning_rate": 2.1323529411764705e-07, + "logits/chosen": -2.8302204608917236, + "logits/rejected": -2.8276419639587402, + "logps/chosen": -268.3429870605469, + "logps/rejected": -226.09255981445312, + "loss": 0.6629, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": 0.042978741228580475, + "rewards/margins": 0.09501025825738907, + "rewards/rejected": -0.05203152820467949, + "step": 290 + }, + { + "epoch": 0.04416310908287943, + "grad_norm": 32.49191663797436, + "learning_rate": 2.2058823529411763e-07, + "logits/chosen": -2.8209404945373535, + "logits/rejected": -2.797398328781128, + "logps/chosen": -271.92620849609375, + "logps/rejected": -247.6468048095703, + "loss": 0.658, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -0.038401711732149124, + "rewards/margins": 0.06523202359676361, + "rewards/rejected": -0.10363372415304184, + "step": 300 + }, + { + "epoch": 0.045635212718975415, + "grad_norm": 45.56762739634673, + "learning_rate": 2.2794117647058823e-07, + "logits/chosen": -2.9067940711975098, + "logits/rejected": -2.8422508239746094, + "logps/chosen": -309.2583312988281, + "logps/rejected": -269.35235595703125, + "loss": 0.6644, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0022108617704361677, + "rewards/margins": 0.10749582201242447, + "rewards/rejected": -0.10528495162725449, + "step": 310 + }, + { + "epoch": 0.0471073163550714, + "grad_norm": 37.97344033808417, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -2.7672407627105713, + "logits/rejected": -2.7463204860687256, + "logps/chosen": -258.83441162109375, + "logps/rejected": -268.62652587890625, + "loss": 0.6417, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -0.042268022894859314, + "rewards/margins": 0.13044223189353943, + "rewards/rejected": -0.17271023988723755, + "step": 320 + }, + { + "epoch": 0.048579419991167376, + "grad_norm": 44.21640841651385, + "learning_rate": 2.426470588235294e-07, + "logits/chosen": -2.805454730987549, + "logits/rejected": -2.7683205604553223, + "logps/chosen": -234.83816528320312, + "logps/rejected": -213.6456756591797, + "loss": 0.6414, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -0.0444740429520607, + "rewards/margins": 0.13252779841423035, + "rewards/rejected": -0.17700186371803284, + "step": 330 + }, + { + "epoch": 0.05005152362726336, + "grad_norm": 37.97909859126673, + "learning_rate": 2.5e-07, + "logits/chosen": -2.865265369415283, + "logits/rejected": -2.8193905353546143, + "logps/chosen": -311.99932861328125, + "logps/rejected": -360.31268310546875, + "loss": 0.6563, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -0.16395288705825806, + "rewards/margins": 0.07938871532678604, + "rewards/rejected": -0.2433416098356247, + "step": 340 + }, + { + "epoch": 0.051523627263359344, + "grad_norm": 44.58971712270715, + "learning_rate": 2.5735294117647057e-07, + "logits/chosen": -2.754761219024658, + "logits/rejected": -2.730095386505127, + "logps/chosen": -322.5736999511719, + "logps/rejected": -275.8636779785156, + "loss": 0.6365, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -0.178586944937706, + "rewards/margins": 0.13300378620624542, + "rewards/rejected": -0.3115907311439514, + "step": 350 + }, + { + "epoch": 0.05299573089945532, + "grad_norm": 43.44889549369163, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -2.8176536560058594, + "logits/rejected": -2.813448667526245, + "logps/chosen": -255.3389129638672, + "logps/rejected": -252.19088745117188, + "loss": 0.6531, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -0.1946530044078827, + "rewards/margins": 0.13056738674640656, + "rewards/rejected": -0.32522040605545044, + "step": 360 + }, + { + "epoch": 0.054467834535551304, + "grad_norm": 34.07038018214428, + "learning_rate": 2.720588235294117e-07, + "logits/chosen": -2.8375706672668457, + "logits/rejected": -2.8827130794525146, + "logps/chosen": -271.68414306640625, + "logps/rejected": -286.15509033203125, + "loss": 0.6096, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -0.09479375928640366, + "rewards/margins": 0.1842346489429474, + "rewards/rejected": -0.27902838587760925, + "step": 370 + }, + { + "epoch": 0.05593993817164728, + "grad_norm": 40.352174351843715, + "learning_rate": 2.7941176470588235e-07, + "logits/chosen": -2.9171364307403564, + "logits/rejected": -2.86572527885437, + "logps/chosen": -278.642578125, + "logps/rejected": -238.94302368164062, + "loss": 0.6311, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -0.10989081859588623, + "rewards/margins": 0.12142989784479141, + "rewards/rejected": -0.23132070899009705, + "step": 380 + }, + { + "epoch": 0.057412041807743265, + "grad_norm": 38.19349443279793, + "learning_rate": 2.8676470588235293e-07, + "logits/chosen": -2.8646185398101807, + "logits/rejected": -2.8530282974243164, + "logps/chosen": -278.7576599121094, + "logps/rejected": -273.5655517578125, + "loss": 0.6585, + "rewards/accuracies": 0.46666669845581055, + "rewards/chosen": -0.1812421828508377, + "rewards/margins": -0.004091627895832062, + "rewards/rejected": -0.17715056240558624, + "step": 390 + }, + { + "epoch": 0.05888414544383925, + "grad_norm": 55.49251637502761, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -2.85107421875, + "logits/rejected": -2.792712688446045, + "logps/chosen": -224.25357055664062, + "logps/rejected": -254.12350463867188, + "loss": 0.5882, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -0.12137115001678467, + "rewards/margins": 0.24730543792247772, + "rewards/rejected": -0.3686766028404236, + "step": 400 + }, + { + "epoch": 0.060356249079935226, + "grad_norm": 53.40204577177976, + "learning_rate": 3.014705882352941e-07, + "logits/chosen": -2.776435613632202, + "logits/rejected": -2.7220234870910645, + "logps/chosen": -324.9377136230469, + "logps/rejected": -336.4537658691406, + "loss": 0.5907, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -0.19358155131340027, + "rewards/margins": 0.27373096346855164, + "rewards/rejected": -0.4673125743865967, + "step": 410 + }, + { + "epoch": 0.06182835271603121, + "grad_norm": 57.94249210646548, + "learning_rate": 3.088235294117647e-07, + "logits/chosen": -2.9334912300109863, + "logits/rejected": -2.8618178367614746, + "logps/chosen": -354.8674011230469, + "logps/rejected": -338.1689453125, + "loss": 0.6561, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3509146571159363, + "rewards/margins": 0.13155964016914368, + "rewards/rejected": -0.4824741780757904, + "step": 420 + }, + { + "epoch": 0.0633004563521272, + "grad_norm": 50.97937479738409, + "learning_rate": 3.161764705882353e-07, + "logits/chosen": -2.7859561443328857, + "logits/rejected": -2.774780750274658, + "logps/chosen": -219.48977661132812, + "logps/rejected": -254.21353149414062, + "loss": 0.5956, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -0.47212687134742737, + "rewards/margins": 0.2280077189207077, + "rewards/rejected": -0.7001345753669739, + "step": 430 + }, + { + "epoch": 0.06477255998822318, + "grad_norm": 37.73542209696585, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -2.8173322677612305, + "logits/rejected": -2.8037285804748535, + "logps/chosen": -259.81146240234375, + "logps/rejected": -267.8140563964844, + "loss": 0.6043, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -0.4589969217777252, + "rewards/margins": 0.3434470593929291, + "rewards/rejected": -0.8024439811706543, + "step": 440 + }, + { + "epoch": 0.06624466362431915, + "grad_norm": 45.717134175997984, + "learning_rate": 3.3088235294117644e-07, + "logits/chosen": -2.850761890411377, + "logits/rejected": -2.7941343784332275, + "logps/chosen": -394.3548889160156, + "logps/rejected": -367.2265319824219, + "loss": 0.6141, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -0.624030590057373, + "rewards/margins": 0.08572717756032944, + "rewards/rejected": -0.7097578048706055, + "step": 450 + }, + { + "epoch": 0.06771676726041513, + "grad_norm": 66.88873265427459, + "learning_rate": 3.3823529411764707e-07, + "logits/chosen": -2.9509973526000977, + "logits/rejected": -2.8714380264282227, + "logps/chosen": -313.90362548828125, + "logps/rejected": -293.2352600097656, + "loss": 0.6426, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.3316972851753235, + "rewards/margins": 0.4603908658027649, + "rewards/rejected": -0.7920882105827332, + "step": 460 + }, + { + "epoch": 0.06918887089651111, + "grad_norm": 49.66859112663255, + "learning_rate": 3.4558823529411765e-07, + "logits/chosen": -2.8744213581085205, + "logits/rejected": -2.865004062652588, + "logps/chosen": -252.8170928955078, + "logps/rejected": -266.478515625, + "loss": 0.5867, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4000850319862366, + "rewards/margins": 0.28942054510116577, + "rewards/rejected": -0.6895055770874023, + "step": 470 + }, + { + "epoch": 0.0706609745326071, + "grad_norm": 35.40814598418653, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -2.7865090370178223, + "logits/rejected": -2.744558334350586, + "logps/chosen": -327.3811950683594, + "logps/rejected": -348.24542236328125, + "loss": 0.5468, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -0.4678328037261963, + "rewards/margins": 0.6113702058792114, + "rewards/rejected": -1.0792028903961182, + "step": 480 + }, + { + "epoch": 0.07213307816870308, + "grad_norm": 38.842645124543296, + "learning_rate": 3.602941176470588e-07, + "logits/chosen": -2.890005588531494, + "logits/rejected": -2.83540678024292, + "logps/chosen": -376.8315734863281, + "logps/rejected": -359.41131591796875, + "loss": 0.6261, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -0.6100105047225952, + "rewards/margins": 0.21190539002418518, + "rewards/rejected": -0.8219158053398132, + "step": 490 + }, + { + "epoch": 0.07360518180479905, + "grad_norm": 53.03066409743708, + "learning_rate": 3.6764705882352943e-07, + "logits/chosen": -2.8465638160705566, + "logits/rejected": -2.8696696758270264, + "logps/chosen": -259.658935546875, + "logps/rejected": -320.8695983886719, + "loss": 0.5486, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -0.5616716742515564, + "rewards/margins": 0.3908033072948456, + "rewards/rejected": -0.9524749517440796, + "step": 500 + }, + { + "epoch": 0.07507728544089504, + "grad_norm": 28.51717128961485, + "learning_rate": 3.75e-07, + "logits/chosen": -2.8019585609436035, + "logits/rejected": -2.800567388534546, + "logps/chosen": -341.00482177734375, + "logps/rejected": -403.8449401855469, + "loss": 0.5788, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -0.3110610246658325, + "rewards/margins": 0.7678789496421814, + "rewards/rejected": -1.0789399147033691, + "step": 510 + }, + { + "epoch": 0.07654938907699102, + "grad_norm": 66.40624997481838, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -2.685945987701416, + "logits/rejected": -2.588822364807129, + "logps/chosen": -283.71807861328125, + "logps/rejected": -307.35369873046875, + "loss": 0.6279, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -0.6492300629615784, + "rewards/margins": 0.21936936676502228, + "rewards/rejected": -0.868599534034729, + "step": 520 + }, + { + "epoch": 0.078021492713087, + "grad_norm": 59.37053553658249, + "learning_rate": 3.8970588235294116e-07, + "logits/chosen": -2.8552117347717285, + "logits/rejected": -2.7941927909851074, + "logps/chosen": -331.2178039550781, + "logps/rejected": -345.9244384765625, + "loss": 0.6604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6246446371078491, + "rewards/margins": 0.28604191541671753, + "rewards/rejected": -0.9106866121292114, + "step": 530 + }, + { + "epoch": 0.07949359634918299, + "grad_norm": 75.40056747252186, + "learning_rate": 3.9705882352941174e-07, + "logits/chosen": -3.0070300102233887, + "logits/rejected": -2.897230863571167, + "logps/chosen": -359.48663330078125, + "logps/rejected": -325.5563659667969, + "loss": 0.6098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5045651793479919, + "rewards/margins": 0.19508329033851624, + "rewards/rejected": -0.6996484398841858, + "step": 540 + }, + { + "epoch": 0.08096569998527896, + "grad_norm": 45.85727789376239, + "learning_rate": 4.044117647058823e-07, + "logits/chosen": -2.806318759918213, + "logits/rejected": -2.810732364654541, + "logps/chosen": -395.5749816894531, + "logps/rejected": -424.22698974609375, + "loss": 0.5103, + "rewards/accuracies": 0.7999998927116394, + "rewards/chosen": -0.6453887224197388, + "rewards/margins": 0.6867039799690247, + "rewards/rejected": -1.332092523574829, + "step": 550 + }, + { + "epoch": 0.08243780362137494, + "grad_norm": 60.31085938854977, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -2.9013686180114746, + "logits/rejected": -2.8248424530029297, + "logps/chosen": -331.2259826660156, + "logps/rejected": -354.3016357421875, + "loss": 0.6053, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -0.912601113319397, + "rewards/margins": 0.31690922379493713, + "rewards/rejected": -1.2295103073120117, + "step": 560 + }, + { + "epoch": 0.08390990725747093, + "grad_norm": 74.18464595651365, + "learning_rate": 4.191176470588235e-07, + "logits/chosen": -2.8501298427581787, + "logits/rejected": -2.775217056274414, + "logps/chosen": -376.1417236328125, + "logps/rejected": -371.5596008300781, + "loss": 0.6254, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -0.8113685846328735, + "rewards/margins": 0.6388000249862671, + "rewards/rejected": -1.4501686096191406, + "step": 570 + }, + { + "epoch": 0.08538201089356691, + "grad_norm": 57.83442423316419, + "learning_rate": 4.264705882352941e-07, + "logits/chosen": -2.7482221126556396, + "logits/rejected": -2.7348978519439697, + "logps/chosen": -405.81488037109375, + "logps/rejected": -341.3351745605469, + "loss": 0.6336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1173675060272217, + "rewards/margins": 0.07470262050628662, + "rewards/rejected": -1.1920702457427979, + "step": 580 + }, + { + "epoch": 0.08685411452966289, + "grad_norm": 104.9224309149161, + "learning_rate": 4.338235294117647e-07, + "logits/chosen": -2.735396146774292, + "logits/rejected": -2.7264251708984375, + "logps/chosen": -333.8023986816406, + "logps/rejected": -370.3941345214844, + "loss": 0.573, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -0.5461455583572388, + "rewards/margins": 0.5720874071121216, + "rewards/rejected": -1.1182329654693604, + "step": 590 + }, + { + "epoch": 0.08832621816575886, + "grad_norm": 101.62792056907598, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -2.533859968185425, + "logits/rejected": -2.6341917514801025, + "logps/chosen": -335.98773193359375, + "logps/rejected": -407.0130310058594, + "loss": 0.5524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7082921266555786, + "rewards/margins": 0.6046443581581116, + "rewards/rejected": -1.312936544418335, + "step": 600 + }, + { + "epoch": 0.08979832180185485, + "grad_norm": 95.77256850037652, + "learning_rate": 4.485294117647059e-07, + "logits/chosen": -2.712843418121338, + "logits/rejected": -2.697476863861084, + "logps/chosen": -320.53265380859375, + "logps/rejected": -382.7992858886719, + "loss": 0.5509, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -0.6174439191818237, + "rewards/margins": 0.7160069346427917, + "rewards/rejected": -1.3334507942199707, + "step": 610 + }, + { + "epoch": 0.09127042543795083, + "grad_norm": 45.24675123246574, + "learning_rate": 4.5588235294117646e-07, + "logits/chosen": -2.88244891166687, + "logits/rejected": -2.819511890411377, + "logps/chosen": -301.88128662109375, + "logps/rejected": -287.34857177734375, + "loss": 0.5916, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -0.6080493927001953, + "rewards/margins": 0.29972031712532043, + "rewards/rejected": -0.9077697992324829, + "step": 620 + }, + { + "epoch": 0.09274252907404681, + "grad_norm": 67.4924873260308, + "learning_rate": 4.6323529411764704e-07, + "logits/chosen": -2.8483669757843018, + "logits/rejected": -2.8223581314086914, + "logps/chosen": -364.1528015136719, + "logps/rejected": -364.72869873046875, + "loss": 0.484, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -0.507249653339386, + "rewards/margins": 0.6113043427467346, + "rewards/rejected": -1.1185541152954102, + "step": 630 + }, + { + "epoch": 0.0942146327101428, + "grad_norm": 172.47932960727135, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -2.763209104537964, + "logits/rejected": -2.7188196182250977, + "logps/chosen": -379.62298583984375, + "logps/rejected": -405.97967529296875, + "loss": 0.6563, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.0018929243087769, + "rewards/margins": 0.42409810423851013, + "rewards/rejected": -1.4259908199310303, + "step": 640 + }, + { + "epoch": 0.09568673634623878, + "grad_norm": 63.290053689853615, + "learning_rate": 4.779411764705882e-07, + "logits/chosen": -2.741708755493164, + "logits/rejected": -2.665816307067871, + "logps/chosen": -354.8901062011719, + "logps/rejected": -398.566162109375, + "loss": 0.5226, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.5825831294059753, + "rewards/margins": 0.5649443864822388, + "rewards/rejected": -1.1475274562835693, + "step": 650 + }, + { + "epoch": 0.09715883998233475, + "grad_norm": 62.401178333930524, + "learning_rate": 4.852941176470588e-07, + "logits/chosen": -2.742733955383301, + "logits/rejected": -2.6645336151123047, + "logps/chosen": -318.71783447265625, + "logps/rejected": -334.59613037109375, + "loss": 0.5567, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6267253756523132, + "rewards/margins": 0.4700597822666168, + "rewards/rejected": -1.0967851877212524, + "step": 660 + }, + { + "epoch": 0.09863094361843074, + "grad_norm": 104.41768376380126, + "learning_rate": 4.926470588235295e-07, + "logits/chosen": -2.6591248512268066, + "logits/rejected": -2.6875529289245605, + "logps/chosen": -360.9054260253906, + "logps/rejected": -344.1827697753906, + "loss": 0.56, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.772811770439148, + "rewards/margins": 0.3371545076370239, + "rewards/rejected": -1.1099662780761719, + "step": 670 + }, + { + "epoch": 0.10010304725452672, + "grad_norm": 40.52256107522987, + "learning_rate": 5e-07, + "logits/chosen": -2.811023235321045, + "logits/rejected": -2.6898598670959473, + "logps/chosen": -315.8854675292969, + "logps/rejected": -290.6334533691406, + "loss": 0.6372, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -0.7123109698295593, + "rewards/margins": 0.3000167906284332, + "rewards/rejected": -1.012327790260315, + "step": 680 + }, + { + "epoch": 0.1015751508906227, + "grad_norm": 95.03091495495917, + "learning_rate": 4.999966985858302e-07, + "logits/chosen": -2.770061492919922, + "logits/rejected": -2.735762119293213, + "logps/chosen": -311.56182861328125, + "logps/rejected": -359.50677490234375, + "loss": 0.5696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6472934484481812, + "rewards/margins": 0.3652809262275696, + "rewards/rejected": -1.0125744342803955, + "step": 690 + }, + { + "epoch": 0.10304725452671869, + "grad_norm": 108.84659624524005, + "learning_rate": 4.999867944305156e-07, + "logits/chosen": -2.6839098930358887, + "logits/rejected": -2.679866075515747, + "logps/chosen": -235.9103546142578, + "logps/rejected": -313.3943176269531, + "loss": 0.5321, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -0.4037684500217438, + "rewards/margins": 0.7230291962623596, + "rewards/rejected": -1.1267975568771362, + "step": 700 + }, + { + "epoch": 0.10451935816281466, + "grad_norm": 100.01580621458803, + "learning_rate": 4.99970287795638e-07, + "logits/chosen": -2.69480562210083, + "logits/rejected": -2.638415813446045, + "logps/chosen": -355.7855224609375, + "logps/rejected": -471.83062744140625, + "loss": 0.6123, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.0282886028289795, + "rewards/margins": 0.6514967679977417, + "rewards/rejected": -1.6797853708267212, + "step": 710 + }, + { + "epoch": 0.10599146179891064, + "grad_norm": 62.50621856895838, + "learning_rate": 4.999471791171592e-07, + "logits/chosen": -2.6188976764678955, + "logits/rejected": -2.6344656944274902, + "logps/chosen": -361.3241882324219, + "logps/rejected": -374.80609130859375, + "loss": 0.5184, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.037204623222351, + "rewards/margins": 0.6255699396133423, + "rewards/rejected": -1.6627744436264038, + "step": 720 + }, + { + "epoch": 0.10746356543500662, + "grad_norm": 93.08889028228289, + "learning_rate": 4.999174690054098e-07, + "logits/chosen": -2.594902515411377, + "logits/rejected": -2.593909978866577, + "logps/chosen": -395.16607666015625, + "logps/rejected": -527.0733032226562, + "loss": 0.6142, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.692164659500122, + "rewards/margins": 0.5620343685150146, + "rewards/rejected": -2.2541990280151367, + "step": 730 + }, + { + "epoch": 0.10893566907110261, + "grad_norm": 116.21402533808507, + "learning_rate": 4.998811582450728e-07, + "logits/chosen": -2.636129140853882, + "logits/rejected": -2.653109550476074, + "logps/chosen": -442.2521057128906, + "logps/rejected": -513.395263671875, + "loss": 0.5386, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.378122091293335, + "rewards/margins": 0.8112869262695312, + "rewards/rejected": -2.189409017562866, + "step": 740 + }, + { + "epoch": 0.11040777270719859, + "grad_norm": 134.77618648710364, + "learning_rate": 4.998382477951632e-07, + "logits/chosen": -2.6785359382629395, + "logits/rejected": -2.6858668327331543, + "logps/chosen": -350.38995361328125, + "logps/rejected": -416.52117919921875, + "loss": 0.672, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -1.3163363933563232, + "rewards/margins": 0.38548019528388977, + "rewards/rejected": -1.7018165588378906, + "step": 750 + }, + { + "epoch": 0.11187987634329456, + "grad_norm": 83.97647655782397, + "learning_rate": 4.997887387890022e-07, + "logits/chosen": -2.622954845428467, + "logits/rejected": -2.6646180152893066, + "logps/chosen": -447.6546936035156, + "logps/rejected": -496.49761962890625, + "loss": 0.4816, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -0.9576289057731628, + "rewards/margins": 0.6856824159622192, + "rewards/rejected": -1.6433115005493164, + "step": 760 + }, + { + "epoch": 0.11335197997939055, + "grad_norm": 69.85942092175857, + "learning_rate": 4.997326325341876e-07, + "logits/chosen": -2.801757335662842, + "logits/rejected": -2.7150521278381348, + "logps/chosen": -380.6666259765625, + "logps/rejected": -414.01507568359375, + "loss": 0.7034, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -1.4363845586776733, + "rewards/margins": 0.2651670575141907, + "rewards/rejected": -1.7015516757965088, + "step": 770 + }, + { + "epoch": 0.11482408361548653, + "grad_norm": 70.978175725363, + "learning_rate": 4.996699305125597e-07, + "logits/chosen": -2.5956618785858154, + "logits/rejected": -2.581505537033081, + "logps/chosen": -319.22564697265625, + "logps/rejected": -365.5293273925781, + "loss": 0.6194, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -0.925090491771698, + "rewards/margins": 0.63614422082901, + "rewards/rejected": -1.561234712600708, + "step": 780 + }, + { + "epoch": 0.11629618725158251, + "grad_norm": 73.44166048942573, + "learning_rate": 4.996006343801608e-07, + "logits/chosen": -2.5580391883850098, + "logits/rejected": -2.5126895904541016, + "logps/chosen": -305.4091491699219, + "logps/rejected": -358.71392822265625, + "loss": 0.5683, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -0.7413787245750427, + "rewards/margins": 0.5537192225456238, + "rewards/rejected": -1.295098066329956, + "step": 790 + }, + { + "epoch": 0.1177682908876785, + "grad_norm": 88.97203057276246, + "learning_rate": 4.99524745967193e-07, + "logits/chosen": -2.505258083343506, + "logits/rejected": -2.4751739501953125, + "logps/chosen": -460.5057678222656, + "logps/rejected": -452.73663330078125, + "loss": 0.5791, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.0643593072891235, + "rewards/margins": 0.6612726449966431, + "rewards/rejected": -1.7256319522857666, + "step": 800 + }, + { + "epoch": 0.11924039452377447, + "grad_norm": 68.9966633055363, + "learning_rate": 4.994422672779687e-07, + "logits/chosen": -2.4634079933166504, + "logits/rejected": -2.4504261016845703, + "logps/chosen": -338.05999755859375, + "logps/rejected": -434.5115661621094, + "loss": 0.5667, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.146486759185791, + "rewards/margins": 0.6650794744491577, + "rewards/rejected": -1.8115661144256592, + "step": 810 + }, + { + "epoch": 0.12071249815987045, + "grad_norm": 78.77666641191637, + "learning_rate": 4.993532004908588e-07, + "logits/chosen": -2.47865891456604, + "logits/rejected": -2.350578546524048, + "logps/chosen": -351.12554931640625, + "logps/rejected": -390.0417785644531, + "loss": 0.5925, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.165611982345581, + "rewards/margins": 0.41317757964134216, + "rewards/rejected": -1.578789472579956, + "step": 820 + }, + { + "epoch": 0.12218460179596644, + "grad_norm": 67.04654282705918, + "learning_rate": 4.992575479582337e-07, + "logits/chosen": -2.431173801422119, + "logits/rejected": -2.4054408073425293, + "logps/chosen": -484.1844787597656, + "logps/rejected": -520.760009765625, + "loss": 0.5382, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.0996158123016357, + "rewards/margins": 0.6050158739089966, + "rewards/rejected": -1.704632043838501, + "step": 830 + }, + { + "epoch": 0.12365670543206242, + "grad_norm": 89.06826708739868, + "learning_rate": 4.991553122064028e-07, + "logits/chosen": -2.398393154144287, + "logits/rejected": -2.2988085746765137, + "logps/chosen": -447.3907775878906, + "logps/rejected": -414.0386657714844, + "loss": 0.6512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4857062101364136, + "rewards/margins": 0.35851725935935974, + "rewards/rejected": -1.8442237377166748, + "step": 840 + }, + { + "epoch": 0.1251288090681584, + "grad_norm": 96.92246338080771, + "learning_rate": 4.990464959355464e-07, + "logits/chosen": -2.527771472930908, + "logits/rejected": -2.5881149768829346, + "logps/chosen": -335.7137756347656, + "logps/rejected": -409.61480712890625, + "loss": 0.5856, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.012934923171997, + "rewards/margins": 0.5380204916000366, + "rewards/rejected": -1.5509555339813232, + "step": 850 + }, + { + "epoch": 0.1266009127042544, + "grad_norm": 60.49630040434806, + "learning_rate": 4.98931102019645e-07, + "logits/chosen": -2.4254355430603027, + "logits/rejected": -2.3948912620544434, + "logps/chosen": -327.5542907714844, + "logps/rejected": -379.8779296875, + "loss": 0.5655, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.0056989192962646, + "rewards/margins": 0.849550724029541, + "rewards/rejected": -1.8552496433258057, + "step": 860 + }, + { + "epoch": 0.12807301634035037, + "grad_norm": 86.58920263838296, + "learning_rate": 4.988091335064037e-07, + "logits/chosen": -2.335714101791382, + "logits/rejected": -2.2596073150634766, + "logps/chosen": -419.8048400878906, + "logps/rejected": -480.4607849121094, + "loss": 0.5917, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.8286892175674438, + "rewards/margins": 0.7305442094802856, + "rewards/rejected": -2.5592331886291504, + "step": 870 + }, + { + "epoch": 0.12954511997644635, + "grad_norm": 95.05553111652358, + "learning_rate": 4.98680593617171e-07, + "logits/chosen": -2.4978554248809814, + "logits/rejected": -2.468535900115967, + "logps/chosen": -435.4942321777344, + "logps/rejected": -370.5655822753906, + "loss": 0.5876, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -1.453181505203247, + "rewards/margins": 0.18785279989242554, + "rewards/rejected": -1.6410341262817383, + "step": 880 + }, + { + "epoch": 0.1310172236125423, + "grad_norm": 54.557252591775615, + "learning_rate": 4.985454857468542e-07, + "logits/chosen": -2.320059299468994, + "logits/rejected": -2.328876495361328, + "logps/chosen": -502.5394592285156, + "logps/rejected": -511.71820068359375, + "loss": 0.5819, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9230819940567017, + "rewards/margins": 0.4761742949485779, + "rewards/rejected": -2.3992562294006348, + "step": 890 + }, + { + "epoch": 0.1324893272486383, + "grad_norm": 67.4351785062198, + "learning_rate": 4.984038134638297e-07, + "logits/chosen": -2.411795139312744, + "logits/rejected": -2.394960641860962, + "logps/chosen": -399.6461486816406, + "logps/rejected": -482.01202392578125, + "loss": 0.623, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.4777005910873413, + "rewards/margins": 0.8508377075195312, + "rewards/rejected": -2.328538417816162, + "step": 900 + }, + { + "epoch": 0.13396143088473428, + "grad_norm": 127.62931312194155, + "learning_rate": 4.982555805098483e-07, + "logits/chosen": -2.3161873817443848, + "logits/rejected": -2.2395336627960205, + "logps/chosen": -401.06756591796875, + "logps/rejected": -369.6282653808594, + "loss": 0.552, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -1.5588366985321045, + "rewards/margins": 0.45878124237060547, + "rewards/rejected": -2.017617702484131, + "step": 910 + }, + { + "epoch": 0.13543353452083026, + "grad_norm": 95.43063269026796, + "learning_rate": 4.981007907999372e-07, + "logits/chosen": -2.4623847007751465, + "logits/rejected": -2.489656686782837, + "logps/chosen": -316.79522705078125, + "logps/rejected": -443.9957580566406, + "loss": 0.6454, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.1876418590545654, + "rewards/margins": 0.4539087414741516, + "rewards/rejected": -1.6415506601333618, + "step": 920 + }, + { + "epoch": 0.13690563815692625, + "grad_norm": 78.16125915716293, + "learning_rate": 4.979394484222961e-07, + "logits/chosen": -2.3671488761901855, + "logits/rejected": -2.349256753921509, + "logps/chosen": -312.8221740722656, + "logps/rejected": -465.28643798828125, + "loss": 0.6373, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.2663686275482178, + "rewards/margins": 0.7071529626846313, + "rewards/rejected": -1.9735218286514282, + "step": 930 + }, + { + "epoch": 0.13837774179302223, + "grad_norm": 78.73592725477026, + "learning_rate": 4.977715576381888e-07, + "logits/chosen": -2.285475492477417, + "logits/rejected": -2.3155641555786133, + "logps/chosen": -402.8815612792969, + "logps/rejected": -458.4427795410156, + "loss": 0.5654, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.2322379350662231, + "rewards/margins": 0.605678915977478, + "rewards/rejected": -1.8379167318344116, + "step": 940 + }, + { + "epoch": 0.1398498454291182, + "grad_norm": 126.06698134553926, + "learning_rate": 4.975971228818315e-07, + "logits/chosen": -2.427016019821167, + "logits/rejected": -2.4252357482910156, + "logps/chosen": -356.84918212890625, + "logps/rejected": -452.0550842285156, + "loss": 0.634, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.346930742263794, + "rewards/margins": 0.9196730852127075, + "rewards/rejected": -2.266603946685791, + "step": 950 + }, + { + "epoch": 0.1413219490652142, + "grad_norm": 95.87499066256314, + "learning_rate": 4.974161487602753e-07, + "logits/chosen": -2.4625601768493652, + "logits/rejected": -2.4442732334136963, + "logps/chosen": -360.7414855957031, + "logps/rejected": -414.05267333984375, + "loss": 0.5792, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.1852446794509888, + "rewards/margins": 0.47427836060523987, + "rewards/rejected": -1.6595230102539062, + "step": 960 + }, + { + "epoch": 0.14279405270131018, + "grad_norm": 99.7834356964817, + "learning_rate": 4.972286400532842e-07, + "logits/chosen": -2.531005620956421, + "logits/rejected": -2.5069010257720947, + "logps/chosen": -328.6793518066406, + "logps/rejected": -429.47271728515625, + "loss": 0.5304, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.0170937776565552, + "rewards/margins": 0.7277637720108032, + "rewards/rejected": -1.7448575496673584, + "step": 970 + }, + { + "epoch": 0.14426615633740617, + "grad_norm": 66.83473385537611, + "learning_rate": 4.970346017132097e-07, + "logits/chosen": -2.5137009620666504, + "logits/rejected": -2.421959161758423, + "logps/chosen": -399.0072937011719, + "logps/rejected": -415.3330993652344, + "loss": 0.5231, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.1631089448928833, + "rewards/margins": 0.7736371755599976, + "rewards/rejected": -1.93674635887146, + "step": 980 + }, + { + "epoch": 0.14573825997350212, + "grad_norm": 70.25825428829938, + "learning_rate": 4.96834038864859e-07, + "logits/chosen": -2.465134859085083, + "logits/rejected": -2.477628469467163, + "logps/chosen": -433.21630859375, + "logps/rejected": -439.9414978027344, + "loss": 0.5283, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.304337739944458, + "rewards/margins": 0.6011897325515747, + "rewards/rejected": -1.9055274724960327, + "step": 990 + }, + { + "epoch": 0.1472103636095981, + "grad_norm": 61.53723039377392, + "learning_rate": 4.966269568053605e-07, + "logits/chosen": -2.529338836669922, + "logits/rejected": -2.51577091217041, + "logps/chosen": -417.2015686035156, + "logps/rejected": -490.9686584472656, + "loss": 0.5917, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.104016900062561, + "rewards/margins": 0.8418322801589966, + "rewards/rejected": -1.945849061012268, + "step": 1000 + }, + { + "epoch": 0.1486824672456941, + "grad_norm": 66.9941190207627, + "learning_rate": 4.964133610040232e-07, + "logits/chosen": -2.3653724193573, + "logits/rejected": -2.2819418907165527, + "logps/chosen": -391.8813171386719, + "logps/rejected": -463.3155822753906, + "loss": 0.5616, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": -1.428182601928711, + "rewards/margins": 0.6143081784248352, + "rewards/rejected": -2.0424907207489014, + "step": 1010 + }, + { + "epoch": 0.15015457088179007, + "grad_norm": 55.85459115087366, + "learning_rate": 4.961932571021928e-07, + "logits/chosen": -2.5773696899414062, + "logits/rejected": -2.484923839569092, + "logps/chosen": -398.73480224609375, + "logps/rejected": -427.26312255859375, + "loss": 0.4772, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.3710881471633911, + "rewards/margins": 0.5744053721427917, + "rewards/rejected": -1.945493459701538, + "step": 1020 + }, + { + "epoch": 0.15162667451788606, + "grad_norm": 69.7217277390317, + "learning_rate": 4.959666509131025e-07, + "logits/chosen": -2.449031352996826, + "logits/rejected": -2.369258403778076, + "logps/chosen": -398.9854431152344, + "logps/rejected": -441.555419921875, + "loss": 0.5054, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -1.3591684103012085, + "rewards/margins": 0.8082882761955261, + "rewards/rejected": -2.16745662689209, + "step": 1030 + }, + { + "epoch": 0.15309877815398204, + "grad_norm": 96.5843839306406, + "learning_rate": 4.957335484217193e-07, + "logits/chosen": -2.4228036403656006, + "logits/rejected": -2.439424753189087, + "logps/chosen": -412.070556640625, + "logps/rejected": -451.8875427246094, + "loss": 0.5968, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.2536553144454956, + "rewards/margins": 0.3511051535606384, + "rewards/rejected": -1.6047604084014893, + "step": 1040 + }, + { + "epoch": 0.15457088179007802, + "grad_norm": 60.77639727539831, + "learning_rate": 4.954939557845862e-07, + "logits/chosen": -2.546800136566162, + "logits/rejected": -2.4587836265563965, + "logps/chosen": -449.12017822265625, + "logps/rejected": -493.80035400390625, + "loss": 0.569, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.230821132659912, + "rewards/margins": 0.4044415354728699, + "rewards/rejected": -1.6352627277374268, + "step": 1050 + }, + { + "epoch": 0.156042985426174, + "grad_norm": 66.8896375730587, + "learning_rate": 4.952478793296594e-07, + "logits/chosen": -2.309026002883911, + "logits/rejected": -2.297542095184326, + "logps/chosen": -302.10589599609375, + "logps/rejected": -440.9278869628906, + "loss": 0.509, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.225482702255249, + "rewards/margins": 1.140465497970581, + "rewards/rejected": -2.36594820022583, + "step": 1060 + }, + { + "epoch": 0.15751508906227, + "grad_norm": 50.36831191681053, + "learning_rate": 4.949953255561411e-07, + "logits/chosen": -2.403249979019165, + "logits/rejected": -2.401543140411377, + "logps/chosen": -406.5440368652344, + "logps/rejected": -472.4857482910156, + "loss": 0.4851, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0370168685913086, + "rewards/margins": 0.9498859643936157, + "rewards/rejected": -1.9869028329849243, + "step": 1070 + }, + { + "epoch": 0.15898719269836598, + "grad_norm": 70.64753814162017, + "learning_rate": 4.947363011343083e-07, + "logits/chosen": -2.3614659309387207, + "logits/rejected": -2.341747283935547, + "logps/chosen": -421.7314453125, + "logps/rejected": -433.2002868652344, + "loss": 0.5136, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.4008315801620483, + "rewards/margins": 0.5527233481407166, + "rewards/rejected": -1.9535548686981201, + "step": 1080 + }, + { + "epoch": 0.16045929633446196, + "grad_norm": 60.76916266461304, + "learning_rate": 4.944708129053362e-07, + "logits/chosen": -2.246417999267578, + "logits/rejected": -2.3208072185516357, + "logps/chosen": -413.4805603027344, + "logps/rejected": -391.5755615234375, + "loss": 0.6021, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3196223974227905, + "rewards/margins": 0.5485795140266418, + "rewards/rejected": -1.8682018518447876, + "step": 1090 + }, + { + "epoch": 0.16193139997055792, + "grad_norm": 58.62991201711942, + "learning_rate": 4.941988678811176e-07, + "logits/chosen": -2.309478759765625, + "logits/rejected": -2.2181437015533447, + "logps/chosen": -379.4544372558594, + "logps/rejected": -432.42486572265625, + "loss": 0.5402, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.1119909286499023, + "rewards/margins": 0.9873258471488953, + "rewards/rejected": -2.0993170738220215, + "step": 1100 + }, + { + "epoch": 0.1634035036066539, + "grad_norm": 70.8786638476901, + "learning_rate": 4.939204732440777e-07, + "logits/chosen": -2.423079013824463, + "logits/rejected": -2.395711660385132, + "logps/chosen": -341.75653076171875, + "logps/rejected": -325.0847473144531, + "loss": 0.5211, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.1260013580322266, + "rewards/margins": 0.4885813593864441, + "rewards/rejected": -1.6145827770233154, + "step": 1110 + }, + { + "epoch": 0.16487560724274988, + "grad_norm": 113.6657671649179, + "learning_rate": 4.936356363469845e-07, + "logits/chosen": -2.2684836387634277, + "logits/rejected": -2.242072582244873, + "logps/chosen": -428.93206787109375, + "logps/rejected": -521.1017456054688, + "loss": 0.6197, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.6195265054702759, + "rewards/margins": 0.7122483253479004, + "rewards/rejected": -2.3317747116088867, + "step": 1120 + }, + { + "epoch": 0.16634771087884587, + "grad_norm": 100.56324603713574, + "learning_rate": 4.933443647127546e-07, + "logits/chosen": -2.4205946922302246, + "logits/rejected": -2.4164557456970215, + "logps/chosen": -356.76214599609375, + "logps/rejected": -373.3155212402344, + "loss": 0.6013, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.1072218418121338, + "rewards/margins": 0.4996451735496521, + "rewards/rejected": -1.6068670749664307, + "step": 1130 + }, + { + "epoch": 0.16781981451494185, + "grad_norm": 89.28665863178678, + "learning_rate": 4.930466660342543e-07, + "logits/chosen": -2.3306469917297363, + "logits/rejected": -2.3265433311462402, + "logps/chosen": -330.6070861816406, + "logps/rejected": -334.3078308105469, + "loss": 0.6393, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.0782711505889893, + "rewards/margins": 0.4569946825504303, + "rewards/rejected": -1.5352656841278076, + "step": 1140 + }, + { + "epoch": 0.16929191815103783, + "grad_norm": 150.25891160044847, + "learning_rate": 4.927425481740968e-07, + "logits/chosen": -2.3292593955993652, + "logits/rejected": -2.3203587532043457, + "logps/chosen": -350.44000244140625, + "logps/rejected": -389.2339172363281, + "loss": 0.567, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.2169609069824219, + "rewards/margins": 0.5923596620559692, + "rewards/rejected": -1.8093206882476807, + "step": 1150 + }, + { + "epoch": 0.17076402178713382, + "grad_norm": 48.06844794253284, + "learning_rate": 4.924320191644341e-07, + "logits/chosen": -2.422712802886963, + "logits/rejected": -2.394336223602295, + "logps/chosen": -440.1647033691406, + "logps/rejected": -496.31793212890625, + "loss": 0.5335, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.4149316549301147, + "rewards/margins": 0.7855010032653809, + "rewards/rejected": -2.200432300567627, + "step": 1160 + }, + { + "epoch": 0.1722361254232298, + "grad_norm": 83.77130935317149, + "learning_rate": 4.921150872067452e-07, + "logits/chosen": -2.4535346031188965, + "logits/rejected": -2.3652877807617188, + "logps/chosen": -418.82220458984375, + "logps/rejected": -425.19488525390625, + "loss": 0.5664, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -1.1962155103683472, + "rewards/margins": 0.6466376781463623, + "rewards/rejected": -1.8428531885147095, + "step": 1170 + }, + { + "epoch": 0.17370822905932579, + "grad_norm": 91.59752905193137, + "learning_rate": 4.917917606716194e-07, + "logits/chosen": -2.4181668758392334, + "logits/rejected": -2.3900115489959717, + "logps/chosen": -325.76593017578125, + "logps/rejected": -416.0220642089844, + "loss": 0.5374, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.461897373199463, + "rewards/margins": 0.36997920274734497, + "rewards/rejected": -1.8318767547607422, + "step": 1180 + }, + { + "epoch": 0.17518033269542177, + "grad_norm": 76.16351095883016, + "learning_rate": 4.914620480985352e-07, + "logits/chosen": -2.428467273712158, + "logits/rejected": -2.451366901397705, + "logps/chosen": -474.949951171875, + "logps/rejected": -492.2447204589844, + "loss": 0.5585, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.3370530605316162, + "rewards/margins": 0.565496563911438, + "rewards/rejected": -1.9025497436523438, + "step": 1190 + }, + { + "epoch": 0.17665243633151773, + "grad_norm": 107.45413198647944, + "learning_rate": 4.911259581956345e-07, + "logits/chosen": -2.353527307510376, + "logits/rejected": -2.3184163570404053, + "logps/chosen": -392.1676940917969, + "logps/rejected": -554.6839599609375, + "loss": 0.5939, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.1728734970092773, + "rewards/margins": 0.6887558698654175, + "rewards/rejected": -1.8616292476654053, + "step": 1200 + }, + { + "epoch": 0.1781245399676137, + "grad_norm": 77.99912218156554, + "learning_rate": 4.907834998394932e-07, + "logits/chosen": -2.337217330932617, + "logits/rejected": -2.3447673320770264, + "logps/chosen": -319.287841796875, + "logps/rejected": -442.96649169921875, + "loss": 0.526, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.176101803779602, + "rewards/margins": 0.8924981355667114, + "rewards/rejected": -2.0685999393463135, + "step": 1210 + }, + { + "epoch": 0.1795966436037097, + "grad_norm": 84.59812296988449, + "learning_rate": 4.904346820748862e-07, + "logits/chosen": -2.3390042781829834, + "logits/rejected": -2.287078619003296, + "logps/chosen": -444.9212951660156, + "logps/rejected": -448.62445068359375, + "loss": 0.5983, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.581557035446167, + "rewards/margins": 0.8236225247383118, + "rewards/rejected": -2.405179262161255, + "step": 1220 + }, + { + "epoch": 0.18106874723980568, + "grad_norm": 75.28672432165742, + "learning_rate": 4.900795141145487e-07, + "logits/chosen": -2.42710542678833, + "logits/rejected": -2.348849058151245, + "logps/chosen": -309.20269775390625, + "logps/rejected": -447.49273681640625, + "loss": 0.4723, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.2348849773406982, + "rewards/margins": 0.9994825124740601, + "rewards/rejected": -2.2343673706054688, + "step": 1230 + }, + { + "epoch": 0.18254085087590166, + "grad_norm": 116.22003934278138, + "learning_rate": 4.897180053389332e-07, + "logits/chosen": -2.44488263130188, + "logits/rejected": -2.380082607269287, + "logps/chosen": -376.8838806152344, + "logps/rejected": -449.9505310058594, + "loss": 0.5498, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.3597933053970337, + "rewards/margins": 0.9663193821907043, + "rewards/rejected": -2.326112747192383, + "step": 1240 + }, + { + "epoch": 0.18401295451199765, + "grad_norm": 113.26949845765985, + "learning_rate": 4.89350165295961e-07, + "logits/chosen": -2.4226882457733154, + "logits/rejected": -2.443052053451538, + "logps/chosen": -406.7887268066406, + "logps/rejected": -512.9963989257812, + "loss": 0.549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5804365873336792, + "rewards/margins": 0.7972809672355652, + "rewards/rejected": -2.3777174949645996, + "step": 1250 + }, + { + "epoch": 0.18548505814809363, + "grad_norm": 77.5018473320193, + "learning_rate": 4.88976003700771e-07, + "logits/chosen": -2.444471836090088, + "logits/rejected": -2.384782552719116, + "logps/chosen": -494.62823486328125, + "logps/rejected": -513.8779296875, + "loss": 0.5738, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.4464528560638428, + "rewards/margins": 0.7910875678062439, + "rewards/rejected": -2.2375404834747314, + "step": 1260 + }, + { + "epoch": 0.1869571617841896, + "grad_norm": 112.42034982707989, + "learning_rate": 4.885955304354622e-07, + "logits/chosen": -2.271170139312744, + "logits/rejected": -2.236534595489502, + "logps/chosen": -363.8572692871094, + "logps/rejected": -452.79547119140625, + "loss": 0.5434, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.1983444690704346, + "rewards/margins": 0.9610024690628052, + "rewards/rejected": -2.1593470573425293, + "step": 1270 + }, + { + "epoch": 0.1884292654202856, + "grad_norm": 64.16600190033466, + "learning_rate": 4.882087555488331e-07, + "logits/chosen": -2.4348254203796387, + "logits/rejected": -2.407278060913086, + "logps/chosen": -429.31591796875, + "logps/rejected": -401.4253845214844, + "loss": 0.5399, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.171771764755249, + "rewards/margins": 0.6124464869499207, + "rewards/rejected": -1.784218192100525, + "step": 1280 + }, + { + "epoch": 0.18990136905638158, + "grad_norm": 82.37540342444584, + "learning_rate": 4.878156892561167e-07, + "logits/chosen": -2.32810640335083, + "logits/rejected": -2.2988626956939697, + "logps/chosen": -373.263671875, + "logps/rejected": -435.224609375, + "loss": 0.6002, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.7349411249160767, + "rewards/margins": 0.24868178367614746, + "rewards/rejected": -1.9836229085922241, + "step": 1290 + }, + { + "epoch": 0.19137347269247756, + "grad_norm": 59.088407052635176, + "learning_rate": 4.874163419387099e-07, + "logits/chosen": -2.341831684112549, + "logits/rejected": -2.2889797687530518, + "logps/chosen": -348.39495849609375, + "logps/rejected": -461.44024658203125, + "loss": 0.5314, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.6071557998657227, + "rewards/margins": 0.8054006695747375, + "rewards/rejected": -2.4125566482543945, + "step": 1300 + }, + { + "epoch": 0.19284557632857352, + "grad_norm": 85.83186215874153, + "learning_rate": 4.870107241438999e-07, + "logits/chosen": -2.2909867763519287, + "logits/rejected": -2.2923741340637207, + "logps/chosen": -432.17498779296875, + "logps/rejected": -511.97369384765625, + "loss": 0.6081, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -1.7900841236114502, + "rewards/margins": 0.7998554110527039, + "rewards/rejected": -2.589939594268799, + "step": 1310 + }, + { + "epoch": 0.1943176799646695, + "grad_norm": 115.58856946614992, + "learning_rate": 4.865988465845852e-07, + "logits/chosen": -2.2347519397735596, + "logits/rejected": -2.245803117752075, + "logps/chosen": -451.7115173339844, + "logps/rejected": -477.14190673828125, + "loss": 0.6592, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9571558237075806, + "rewards/margins": 0.4260638356208801, + "rewards/rejected": -2.3832201957702637, + "step": 1320 + }, + { + "epoch": 0.1957897836007655, + "grad_norm": 56.247187641756526, + "learning_rate": 4.861807201389933e-07, + "logits/chosen": -2.298593282699585, + "logits/rejected": -2.2836527824401855, + "logps/chosen": -395.5080261230469, + "logps/rejected": -422.0174255371094, + "loss": 0.5134, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3166104555130005, + "rewards/margins": 0.7878126502037048, + "rewards/rejected": -2.1044230461120605, + "step": 1330 + }, + { + "epoch": 0.19726188723686147, + "grad_norm": 72.2572401175464, + "learning_rate": 4.857563558503925e-07, + "logits/chosen": -2.2002978324890137, + "logits/rejected": -2.1842079162597656, + "logps/chosen": -389.43194580078125, + "logps/rejected": -487.50537109375, + "loss": 0.5763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2549211978912354, + "rewards/margins": 0.7385715842247009, + "rewards/rejected": -1.9934924840927124, + "step": 1340 + }, + { + "epoch": 0.19873399087295746, + "grad_norm": 87.83123217315685, + "learning_rate": 4.853257649268014e-07, + "logits/chosen": -2.0522632598876953, + "logits/rejected": -2.0326969623565674, + "logps/chosen": -346.51702880859375, + "logps/rejected": -352.9090270996094, + "loss": 0.564, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.210363745689392, + "rewards/margins": 0.5963667035102844, + "rewards/rejected": -1.8067305088043213, + "step": 1350 + }, + { + "epoch": 0.20020609450905344, + "grad_norm": 74.50589492499915, + "learning_rate": 4.848889587406915e-07, + "logits/chosen": -2.100217342376709, + "logits/rejected": -2.1636486053466797, + "logps/chosen": -400.50286865234375, + "logps/rejected": -456.6929626464844, + "loss": 0.5952, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.4444917440414429, + "rewards/margins": 0.5497073531150818, + "rewards/rejected": -1.9941990375518799, + "step": 1360 + }, + { + "epoch": 0.20167819814514942, + "grad_norm": 68.28747998248026, + "learning_rate": 4.84445948828688e-07, + "logits/chosen": -1.9630744457244873, + "logits/rejected": -1.928708791732788, + "logps/chosen": -423.45819091796875, + "logps/rejected": -327.8357849121094, + "loss": 0.661, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": -1.4783233404159546, + "rewards/margins": 0.27565279603004456, + "rewards/rejected": -1.7539761066436768, + "step": 1370 + }, + { + "epoch": 0.2031503017812454, + "grad_norm": 40.11470821536808, + "learning_rate": 4.839967468912645e-07, + "logits/chosen": -2.274794816970825, + "logits/rejected": -2.239185333251953, + "logps/chosen": -424.9100646972656, + "logps/rejected": -447.09332275390625, + "loss": 0.5817, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.20197331905365, + "rewards/margins": 0.4895460605621338, + "rewards/rejected": -1.6915193796157837, + "step": 1380 + }, + { + "epoch": 0.2046224054173414, + "grad_norm": 55.070489889352885, + "learning_rate": 4.83541364792434e-07, + "logits/chosen": -2.164649248123169, + "logits/rejected": -2.199143409729004, + "logps/chosen": -343.9012145996094, + "logps/rejected": -413.23809814453125, + "loss": 0.5492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2343758344650269, + "rewards/margins": 0.5638534426689148, + "rewards/rejected": -1.7982292175292969, + "step": 1390 + }, + { + "epoch": 0.20609450905343737, + "grad_norm": 90.68011547957582, + "learning_rate": 4.83079814559436e-07, + "logits/chosen": -2.2209150791168213, + "logits/rejected": -2.1901187896728516, + "logps/chosen": -431.10748291015625, + "logps/rejected": -434.15301513671875, + "loss": 0.5758, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -1.4397367238998413, + "rewards/margins": 0.46521681547164917, + "rewards/rejected": -1.9049535989761353, + "step": 1400 + }, + { + "epoch": 0.20756661268953333, + "grad_norm": 52.91551921147721, + "learning_rate": 4.826121083824181e-07, + "logits/chosen": -2.191051483154297, + "logits/rejected": -2.200676202774048, + "logps/chosen": -454.6627502441406, + "logps/rejected": -480.51434326171875, + "loss": 0.5124, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6879589557647705, + "rewards/margins": 0.661774754524231, + "rewards/rejected": -2.349733829498291, + "step": 1410 + }, + { + "epoch": 0.20903871632562931, + "grad_norm": 70.2569211705282, + "learning_rate": 4.82138258614115e-07, + "logits/chosen": -2.286085844039917, + "logits/rejected": -2.2468771934509277, + "logps/chosen": -427.1380310058594, + "logps/rejected": -478.649169921875, + "loss": 0.5381, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4441181421279907, + "rewards/margins": 0.5933027863502502, + "rewards/rejected": -2.0374207496643066, + "step": 1420 + }, + { + "epoch": 0.2105108199617253, + "grad_norm": 72.56375081335844, + "learning_rate": 4.816582777695212e-07, + "logits/chosen": -2.3731799125671387, + "logits/rejected": -2.311305522918701, + "logps/chosen": -331.2760925292969, + "logps/rejected": -454.18902587890625, + "loss": 0.5115, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.076615333557129, + "rewards/margins": 1.04938542842865, + "rewards/rejected": -2.1260008811950684, + "step": 1430 + }, + { + "epoch": 0.21198292359782128, + "grad_norm": 54.411801029759864, + "learning_rate": 4.811721785255612e-07, + "logits/chosen": -2.124269962310791, + "logits/rejected": -2.222337007522583, + "logps/chosen": -352.1149597167969, + "logps/rejected": -428.43023681640625, + "loss": 0.5652, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.6614176034927368, + "rewards/margins": 0.7122272849082947, + "rewards/rejected": -2.3736445903778076, + "step": 1440 + }, + { + "epoch": 0.21345502723391727, + "grad_norm": 82.06576829852317, + "learning_rate": 4.806799737207546e-07, + "logits/chosen": -2.1645843982696533, + "logits/rejected": -2.1630325317382812, + "logps/chosen": -374.63409423828125, + "logps/rejected": -454.5563049316406, + "loss": 0.5515, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.6300185918807983, + "rewards/margins": 0.6133270859718323, + "rewards/rejected": -2.2433457374572754, + "step": 1450 + }, + { + "epoch": 0.21492713087001325, + "grad_norm": 67.80352638960416, + "learning_rate": 4.801816763548766e-07, + "logits/chosen": -2.3570351600646973, + "logits/rejected": -2.316193103790283, + "logps/chosen": -411.8035583496094, + "logps/rejected": -493.68988037109375, + "loss": 0.5316, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.2624685764312744, + "rewards/margins": 0.9030007123947144, + "rewards/rejected": -2.165469169616699, + "step": 1460 + }, + { + "epoch": 0.21639923450610923, + "grad_norm": 95.05707227159996, + "learning_rate": 4.796772995886151e-07, + "logits/chosen": -2.283470630645752, + "logits/rejected": -2.3086066246032715, + "logps/chosen": -391.48797607421875, + "logps/rejected": -432.56005859375, + "loss": 0.5508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2750332355499268, + "rewards/margins": 0.5988991260528564, + "rewards/rejected": -1.8739322423934937, + "step": 1470 + }, + { + "epoch": 0.21787133814220522, + "grad_norm": 140.9461818698126, + "learning_rate": 4.791668567432229e-07, + "logits/chosen": -2.206038236618042, + "logits/rejected": -2.099503755569458, + "logps/chosen": -459.97857666015625, + "logps/rejected": -490.9842224121094, + "loss": 0.598, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.6516094207763672, + "rewards/margins": 0.7941322922706604, + "rewards/rejected": -2.445741891860962, + "step": 1480 + }, + { + "epoch": 0.2193434417783012, + "grad_norm": 183.6512991772917, + "learning_rate": 4.78650361300166e-07, + "logits/chosen": -2.14744234085083, + "logits/rejected": -2.1521944999694824, + "logps/chosen": -491.5018615722656, + "logps/rejected": -593.4058227539062, + "loss": 0.485, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.5822572708129883, + "rewards/margins": 1.180605173110962, + "rewards/rejected": -2.76286244392395, + "step": 1490 + }, + { + "epoch": 0.22081554541439719, + "grad_norm": 152.6586509125849, + "learning_rate": 4.781278269007675e-07, + "logits/chosen": -2.243333339691162, + "logits/rejected": -2.1558566093444824, + "logps/chosen": -508.1153259277344, + "logps/rejected": -600.8419799804688, + "loss": 0.4947, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.7356001138687134, + "rewards/margins": 1.3437955379486084, + "rewards/rejected": -3.0793955326080322, + "step": 1500 + }, + { + "epoch": 0.22228764905049314, + "grad_norm": 58.841664867929794, + "learning_rate": 4.775992673458469e-07, + "logits/chosen": -2.221709728240967, + "logits/rejected": -2.2236690521240234, + "logps/chosen": -436.99310302734375, + "logps/rejected": -475.7952575683594, + "loss": 0.4813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.649553656578064, + "rewards/margins": 0.7525339722633362, + "rewards/rejected": -2.402087688446045, + "step": 1510 + }, + { + "epoch": 0.22375975268658913, + "grad_norm": 44.88281655360537, + "learning_rate": 4.770646965953564e-07, + "logits/chosen": -2.152620792388916, + "logits/rejected": -2.201392650604248, + "logps/chosen": -407.99212646484375, + "logps/rejected": -485.4007873535156, + "loss": 0.5774, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1216094493865967, + "rewards/margins": 1.0040279626846313, + "rewards/rejected": -2.1256375312805176, + "step": 1520 + }, + { + "epoch": 0.2252318563226851, + "grad_norm": 73.33166185776642, + "learning_rate": 4.765241287680116e-07, + "logits/chosen": -2.3910231590270996, + "logits/rejected": -2.31839656829834, + "logps/chosen": -430.1438903808594, + "logps/rejected": -368.10040283203125, + "loss": 0.5163, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -0.9393638372421265, + "rewards/margins": 0.6706619262695312, + "rewards/rejected": -1.6100257635116577, + "step": 1530 + }, + { + "epoch": 0.2267039599587811, + "grad_norm": 170.76401188845102, + "learning_rate": 4.759775781409187e-07, + "logits/chosen": -2.2840447425842285, + "logits/rejected": -2.1959846019744873, + "logps/chosen": -427.909912109375, + "logps/rejected": -435.9754943847656, + "loss": 0.5666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.518591284751892, + "rewards/margins": 0.5713554620742798, + "rewards/rejected": -2.089946746826172, + "step": 1540 + }, + { + "epoch": 0.22817606359487708, + "grad_norm": 103.893414667703, + "learning_rate": 4.7542505914919775e-07, + "logits/chosen": -2.2440171241760254, + "logits/rejected": -2.2995383739471436, + "logps/chosen": -413.8138122558594, + "logps/rejected": -487.21539306640625, + "loss": 0.4865, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.2422773838043213, + "rewards/margins": 1.0913513898849487, + "rewards/rejected": -2.3336286544799805, + "step": 1550 + }, + { + "epoch": 0.22964816723097306, + "grad_norm": 167.15525782807174, + "learning_rate": 4.7486658638560076e-07, + "logits/chosen": -2.269134044647217, + "logits/rejected": -2.3225743770599365, + "logps/chosen": -395.209228515625, + "logps/rejected": -459.2503967285156, + "loss": 0.6068, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.4696258306503296, + "rewards/margins": 0.6033800840377808, + "rewards/rejected": -2.0730061531066895, + "step": 1560 + }, + { + "epoch": 0.23112027086706904, + "grad_norm": 79.45116160460039, + "learning_rate": 4.7430217460012703e-07, + "logits/chosen": -2.1645662784576416, + "logits/rejected": -2.171447277069092, + "logps/chosen": -404.60809326171875, + "logps/rejected": -458.00848388671875, + "loss": 0.5817, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3078653812408447, + "rewards/margins": 1.0730340480804443, + "rewards/rejected": -2.380899429321289, + "step": 1570 + }, + { + "epoch": 0.23259237450316503, + "grad_norm": 78.56393349628003, + "learning_rate": 4.7373183869963295e-07, + "logits/chosen": -2.3662962913513184, + "logits/rejected": -2.352410316467285, + "logps/chosen": -426.9559020996094, + "logps/rejected": -451.57373046875, + "loss": 0.5443, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -0.8947224617004395, + "rewards/margins": 0.622563362121582, + "rewards/rejected": -1.517285943031311, + "step": 1580 + }, + { + "epoch": 0.234064478139261, + "grad_norm": 131.7459369942946, + "learning_rate": 4.7315559374743896e-07, + "logits/chosen": -2.3659346103668213, + "logits/rejected": -2.362769603729248, + "logps/chosen": -349.2052917480469, + "logps/rejected": -436.92669677734375, + "loss": 0.5979, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -0.9476059675216675, + "rewards/margins": 0.6078127026557922, + "rewards/rejected": -1.5554184913635254, + "step": 1590 + }, + { + "epoch": 0.235536581775357, + "grad_norm": 44.82723629927153, + "learning_rate": 4.725734549629308e-07, + "logits/chosen": -2.1057968139648438, + "logits/rejected": -2.058248519897461, + "logps/chosen": -324.7589111328125, + "logps/rejected": -433.03125, + "loss": 0.5026, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.8902209401130676, + "rewards/margins": 0.9870883822441101, + "rewards/rejected": -1.8773094415664673, + "step": 1600 + }, + { + "epoch": 0.23700868541145298, + "grad_norm": 73.68505558811076, + "learning_rate": 4.719854377211585e-07, + "logits/chosen": -2.272258996963501, + "logits/rejected": -2.2817509174346924, + "logps/chosen": -373.8901062011719, + "logps/rejected": -369.22357177734375, + "loss": 0.5808, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.2494170665740967, + "rewards/margins": 0.49917951226234436, + "rewards/rejected": -1.7485965490341187, + "step": 1610 + }, + { + "epoch": 0.23848078904754894, + "grad_norm": 62.41926411543295, + "learning_rate": 4.713915575524296e-07, + "logits/chosen": -2.321585178375244, + "logits/rejected": -2.312068223953247, + "logps/chosen": -345.80181884765625, + "logps/rejected": -392.3072204589844, + "loss": 0.6452, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.5046050548553467, + "rewards/margins": 0.4555700719356537, + "rewards/rejected": -1.9601751565933228, + "step": 1620 + }, + { + "epoch": 0.23995289268364492, + "grad_norm": 64.68804339227235, + "learning_rate": 4.7079183014189937e-07, + "logits/chosen": -2.219395399093628, + "logits/rejected": -2.190106153488159, + "logps/chosen": -357.221435546875, + "logps/rejected": -403.87225341796875, + "loss": 0.6257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3035352230072021, + "rewards/margins": 0.6760867834091187, + "rewards/rejected": -1.9796218872070312, + "step": 1630 + }, + { + "epoch": 0.2414249963197409, + "grad_norm": 51.89929034951486, + "learning_rate": 4.7018627132915634e-07, + "logits/chosen": -2.233522891998291, + "logits/rejected": -2.192657470703125, + "logps/chosen": -401.9055480957031, + "logps/rejected": -394.88177490234375, + "loss": 0.5365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3327224254608154, + "rewards/margins": 0.48236504197120667, + "rewards/rejected": -1.8150875568389893, + "step": 1640 + }, + { + "epoch": 0.2428970999558369, + "grad_norm": 85.93473965536052, + "learning_rate": 4.695748971078042e-07, + "logits/chosen": -2.240841865539551, + "logits/rejected": -2.1687302589416504, + "logps/chosen": -410.93267822265625, + "logps/rejected": -384.59490966796875, + "loss": 0.5257, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.2426220178604126, + "rewards/margins": 0.5909266471862793, + "rewards/rejected": -1.8335487842559814, + "step": 1650 + }, + { + "epoch": 0.24436920359193287, + "grad_norm": 111.08286570809123, + "learning_rate": 4.689577236250389e-07, + "logits/chosen": -2.044264078140259, + "logits/rejected": -1.9523632526397705, + "logps/chosen": -398.46551513671875, + "logps/rejected": -446.1504821777344, + "loss": 0.5188, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.3809359073638916, + "rewards/margins": 0.5848661661148071, + "rewards/rejected": -1.9658019542694092, + "step": 1660 + }, + { + "epoch": 0.24584130722802885, + "grad_norm": 87.97749117842795, + "learning_rate": 4.683347671812228e-07, + "logits/chosen": -2.1749801635742188, + "logits/rejected": -2.1189217567443848, + "logps/chosen": -417.2113342285156, + "logps/rejected": -407.57562255859375, + "loss": 0.4981, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5862901210784912, + "rewards/margins": 0.8106204867362976, + "rewards/rejected": -2.3969106674194336, + "step": 1670 + }, + { + "epoch": 0.24731341086412484, + "grad_norm": 201.70017216773684, + "learning_rate": 4.677060442294537e-07, + "logits/chosen": -2.0802016258239746, + "logits/rejected": -2.126875400543213, + "logps/chosen": -501.54412841796875, + "logps/rejected": -533.7816162109375, + "loss": 0.587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.8800122737884521, + "rewards/margins": 0.5721237063407898, + "rewards/rejected": -2.452136278152466, + "step": 1680 + }, + { + "epoch": 0.24878551450022082, + "grad_norm": 77.57429466738041, + "learning_rate": 4.6707157137513056e-07, + "logits/chosen": -2.139401912689209, + "logits/rejected": -2.201511859893799, + "logps/chosen": -422.985107421875, + "logps/rejected": -426.33868408203125, + "loss": 0.6606, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4644190073013306, + "rewards/margins": 0.5641460418701172, + "rewards/rejected": -2.028564929962158, + "step": 1690 + }, + { + "epoch": 0.2502576181363168, + "grad_norm": 68.74654542987247, + "learning_rate": 4.664313653755147e-07, + "logits/chosen": -2.253908634185791, + "logits/rejected": -2.2548208236694336, + "logps/chosen": -506.58056640625, + "logps/rejected": -461.81756591796875, + "loss": 0.5103, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.1101315021514893, + "rewards/margins": 0.5425457954406738, + "rewards/rejected": -1.652677297592163, + "step": 1700 + }, + { + "epoch": 0.25172972177241276, + "grad_norm": 109.92068591445454, + "learning_rate": 4.6578544313928735e-07, + "logits/chosen": -1.9825680255889893, + "logits/rejected": -1.967922568321228, + "logps/chosen": -354.4578857421875, + "logps/rejected": -455.46185302734375, + "loss": 0.5181, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.0310165882110596, + "rewards/margins": 0.9791271090507507, + "rewards/rejected": -2.010143756866455, + "step": 1710 + }, + { + "epoch": 0.2532018254085088, + "grad_norm": 49.56357703874075, + "learning_rate": 4.6513382172610324e-07, + "logits/chosen": -2.0617716312408447, + "logits/rejected": -1.990020513534546, + "logps/chosen": -358.312744140625, + "logps/rejected": -415.950927734375, + "loss": 0.526, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.2179559469223022, + "rewards/margins": 0.7157061100006104, + "rewards/rejected": -1.9336620569229126, + "step": 1720 + }, + { + "epoch": 0.25467392904460473, + "grad_norm": 78.51049458621162, + "learning_rate": 4.6447651834613955e-07, + "logits/chosen": -2.0542900562286377, + "logits/rejected": -2.0733561515808105, + "logps/chosen": -419.802001953125, + "logps/rejected": -477.8985290527344, + "loss": 0.6094, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.6641391515731812, + "rewards/margins": 0.49551528692245483, + "rewards/rejected": -2.1596546173095703, + "step": 1730 + }, + { + "epoch": 0.25614603268070074, + "grad_norm": 86.37919713558429, + "learning_rate": 4.638135503596419e-07, + "logits/chosen": -1.9813175201416016, + "logits/rejected": -1.9807865619659424, + "logps/chosen": -395.8018493652344, + "logps/rejected": -491.34844970703125, + "loss": 0.6035, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3744761943817139, + "rewards/margins": 0.50504469871521, + "rewards/rejected": -1.8795210123062134, + "step": 1740 + }, + { + "epoch": 0.2576181363167967, + "grad_norm": 122.24007902351458, + "learning_rate": 4.6314493527646553e-07, + "logits/chosen": -1.908752202987671, + "logits/rejected": -1.8275295495986938, + "logps/chosen": -544.9287719726562, + "logps/rejected": -529.8731689453125, + "loss": 0.5514, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -2.193781852722168, + "rewards/margins": 0.4797934591770172, + "rewards/rejected": -2.6735751628875732, + "step": 1750 + }, + { + "epoch": 0.2590902399528927, + "grad_norm": 111.38509377434121, + "learning_rate": 4.624706907556129e-07, + "logits/chosen": -1.9328529834747314, + "logits/rejected": -1.859193205833435, + "logps/chosen": -411.56549072265625, + "logps/rejected": -442.96697998046875, + "loss": 0.5919, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.7481504678726196, + "rewards/margins": 0.781250536441803, + "rewards/rejected": -2.5294013023376465, + "step": 1760 + }, + { + "epoch": 0.26056234358898867, + "grad_norm": 99.90570617559119, + "learning_rate": 4.617908346047673e-07, + "logits/chosen": -1.8282277584075928, + "logits/rejected": -1.754119873046875, + "logps/chosen": -366.02471923828125, + "logps/rejected": -478.244140625, + "loss": 0.6386, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.5268586874008179, + "rewards/margins": 1.007537841796875, + "rewards/rejected": -2.5343966484069824, + "step": 1770 + }, + { + "epoch": 0.2620344472250846, + "grad_norm": 45.900325465875554, + "learning_rate": 4.6110538477982265e-07, + "logits/chosen": -2.1062495708465576, + "logits/rejected": -2.0097177028656006, + "logps/chosen": -326.62396240234375, + "logps/rejected": -463.8211364746094, + "loss": 0.5441, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -0.9709870219230652, + "rewards/margins": 1.244681477546692, + "rewards/rejected": -2.215668201446533, + "step": 1780 + }, + { + "epoch": 0.26350655086118063, + "grad_norm": 95.98792923375153, + "learning_rate": 4.6041435938440887e-07, + "logits/chosen": -2.246121883392334, + "logits/rejected": -2.160053014755249, + "logps/chosen": -329.06573486328125, + "logps/rejected": -397.63031005859375, + "loss": 0.5317, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -0.8664793968200684, + "rewards/margins": 0.6470896005630493, + "rewards/rejected": -1.5135688781738281, + "step": 1790 + }, + { + "epoch": 0.2649786544972766, + "grad_norm": 63.177129128161916, + "learning_rate": 4.5971777666941445e-07, + "logits/chosen": -2.286999225616455, + "logits/rejected": -2.284747362136841, + "logps/chosen": -354.82171630859375, + "logps/rejected": -430.58416748046875, + "loss": 0.5906, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -0.7478771209716797, + "rewards/margins": 0.969946026802063, + "rewards/rejected": -1.7178232669830322, + "step": 1800 + }, + { + "epoch": 0.2664507581333726, + "grad_norm": 59.41703379019638, + "learning_rate": 4.5901565503250373e-07, + "logits/chosen": -2.211920738220215, + "logits/rejected": -2.1894142627716064, + "logps/chosen": -383.20623779296875, + "logps/rejected": -458.1167907714844, + "loss": 0.4183, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.8634785413742065, + "rewards/margins": 0.9898441433906555, + "rewards/rejected": -1.8533223867416382, + "step": 1810 + }, + { + "epoch": 0.26792286176946856, + "grad_norm": 87.00567530091242, + "learning_rate": 4.583080130176312e-07, + "logits/chosen": -2.072298049926758, + "logits/rejected": -1.9915415048599243, + "logps/chosen": -385.45037841796875, + "logps/rejected": -418.5541076660156, + "loss": 0.4998, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.291184663772583, + "rewards/margins": 0.8152947425842285, + "rewards/rejected": -2.1064791679382324, + "step": 1820 + }, + { + "epoch": 0.26939496540556457, + "grad_norm": 65.87317647516328, + "learning_rate": 4.575948693145518e-07, + "logits/chosen": -1.8900423049926758, + "logits/rejected": -1.8983854055404663, + "logps/chosen": -469.83843994140625, + "logps/rejected": -537.6843872070312, + "loss": 0.4759, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.6715986728668213, + "rewards/margins": 0.9330682754516602, + "rewards/rejected": -2.6046664714813232, + "step": 1830 + }, + { + "epoch": 0.2708670690416605, + "grad_norm": 168.28452498319217, + "learning_rate": 4.568762427583275e-07, + "logits/chosen": -1.899664282798767, + "logits/rejected": -1.9039865732192993, + "logps/chosen": -461.4872131347656, + "logps/rejected": -569.5077514648438, + "loss": 0.4832, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.0172457695007324, + "rewards/margins": 0.9426043629646301, + "rewards/rejected": -2.9598498344421387, + "step": 1840 + }, + { + "epoch": 0.27233917267775654, + "grad_norm": 84.25152471625829, + "learning_rate": 4.561521523288293e-07, + "logits/chosen": -1.7609468698501587, + "logits/rejected": -1.7570053339004517, + "logps/chosen": -531.4232788085938, + "logps/rejected": -610.440185546875, + "loss": 0.599, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.2724013328552246, + "rewards/margins": 0.78504878282547, + "rewards/rejected": -3.05745005607605, + "step": 1850 + }, + { + "epoch": 0.2738112763138525, + "grad_norm": 64.60205031600752, + "learning_rate": 4.554226171502365e-07, + "logits/chosen": -2.054368495941162, + "logits/rejected": -1.8862028121948242, + "logps/chosen": -485.37646484375, + "logps/rejected": -515.5431518554688, + "loss": 0.5419, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.8449373245239258, + "rewards/margins": 0.4809306561946869, + "rewards/rejected": -2.3258678913116455, + "step": 1860 + }, + { + "epoch": 0.2752833799499485, + "grad_norm": 43.35161446092641, + "learning_rate": 4.546876564905313e-07, + "logits/chosen": -1.995887041091919, + "logits/rejected": -2.111243486404419, + "logps/chosen": -367.51226806640625, + "logps/rejected": -585.813232421875, + "loss": 0.4878, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.4822397232055664, + "rewards/margins": 0.9520265460014343, + "rewards/rejected": -2.4342660903930664, + "step": 1870 + }, + { + "epoch": 0.27675548358604446, + "grad_norm": 343.2178726866082, + "learning_rate": 4.5394728976099015e-07, + "logits/chosen": -1.803593635559082, + "logits/rejected": -1.6266990900039673, + "logps/chosen": -480.0843811035156, + "logps/rejected": -665.6341552734375, + "loss": 0.4613, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -2.212033748626709, + "rewards/margins": 2.2006149291992188, + "rewards/rejected": -4.4126482009887695, + "step": 1880 + }, + { + "epoch": 0.2782275872221404, + "grad_norm": 286.25799396124734, + "learning_rate": 4.532015365156705e-07, + "logits/chosen": -1.7095129489898682, + "logits/rejected": -1.7458789348602295, + "logps/chosen": -545.7160034179688, + "logps/rejected": -588.18701171875, + "loss": 0.68, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.5791265964508057, + "rewards/margins": 0.8755763173103333, + "rewards/rejected": -3.454702854156494, + "step": 1890 + }, + { + "epoch": 0.2796996908582364, + "grad_norm": 48.51006392169838, + "learning_rate": 4.524504164508951e-07, + "logits/chosen": -1.9694645404815674, + "logits/rejected": -1.910211205482483, + "logps/chosen": -500.1825256347656, + "logps/rejected": -554.2801513671875, + "loss": 0.5479, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.062370777130127, + "rewards/margins": 0.7691951990127563, + "rewards/rejected": -2.8315658569335938, + "step": 1900 + }, + { + "epoch": 0.2811717944943324, + "grad_norm": 51.21679486961547, + "learning_rate": 4.5169394940473137e-07, + "logits/chosen": -2.143329381942749, + "logits/rejected": -2.0423364639282227, + "logps/chosen": -460.9620056152344, + "logps/rejected": -592.3289794921875, + "loss": 0.5591, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.7631984949111938, + "rewards/margins": 1.3187248706817627, + "rewards/rejected": -3.081923723220825, + "step": 1910 + }, + { + "epoch": 0.2826438981304284, + "grad_norm": 129.64171052449032, + "learning_rate": 4.509321553564676e-07, + "logits/chosen": -1.980489730834961, + "logits/rejected": -1.9732120037078857, + "logps/chosen": -474.60821533203125, + "logps/rejected": -529.0524291992188, + "loss": 0.6056, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.968618631362915, + "rewards/margins": 0.6428844332695007, + "rewards/rejected": -2.6115028858184814, + "step": 1920 + }, + { + "epoch": 0.28411600176652435, + "grad_norm": 91.61272469300111, + "learning_rate": 4.501650544260848e-07, + "logits/chosen": -1.7855335474014282, + "logits/rejected": -1.8899421691894531, + "logps/chosen": -396.77557373046875, + "logps/rejected": -440.35113525390625, + "loss": 0.5294, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.661126732826233, + "rewards/margins": 0.4826766848564148, + "rewards/rejected": -2.143803358078003, + "step": 1930 + }, + { + "epoch": 0.28558810540262036, + "grad_norm": 64.9062432082929, + "learning_rate": 4.4939266687372636e-07, + "logits/chosen": -1.8499739170074463, + "logits/rejected": -1.8070611953735352, + "logps/chosen": -545.0595703125, + "logps/rejected": -537.5134887695312, + "loss": 0.6371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.003462791442871, + "rewards/margins": 0.822962760925293, + "rewards/rejected": -2.826425552368164, + "step": 1940 + }, + { + "epoch": 0.2870602090387163, + "grad_norm": 72.46060572465097, + "learning_rate": 4.4861501309916185e-07, + "logits/chosen": -2.132291555404663, + "logits/rejected": -1.9888403415679932, + "logps/chosen": -392.7615051269531, + "logps/rejected": -412.9408264160156, + "loss": 0.4532, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3353325128555298, + "rewards/margins": 0.9555924534797668, + "rewards/rejected": -2.2909250259399414, + "step": 1950 + }, + { + "epoch": 0.28853231267481233, + "grad_norm": 81.77773885072362, + "learning_rate": 4.478321136412487e-07, + "logits/chosen": -1.8498504161834717, + "logits/rejected": -1.88349187374115, + "logps/chosen": -442.2222595214844, + "logps/rejected": -571.5682373046875, + "loss": 0.5914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7767490148544312, + "rewards/margins": 0.9378013610839844, + "rewards/rejected": -2.714550018310547, + "step": 1960 + }, + { + "epoch": 0.2900044163109083, + "grad_norm": 44.5536895996614, + "learning_rate": 4.4704398917738996e-07, + "logits/chosen": -2.1403515338897705, + "logits/rejected": -2.1420319080352783, + "logps/chosen": -430.294189453125, + "logps/rejected": -562.9959716796875, + "loss": 0.6216, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -1.5881879329681396, + "rewards/margins": 1.093450665473938, + "rewards/rejected": -2.681638717651367, + "step": 1970 + }, + { + "epoch": 0.29147651994700424, + "grad_norm": 80.33533767680922, + "learning_rate": 4.4625066052298766e-07, + "logits/chosen": -2.1666476726531982, + "logits/rejected": -2.2133193016052246, + "logps/chosen": -341.072509765625, + "logps/rejected": -459.281494140625, + "loss": 0.6206, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.0330458879470825, + "rewards/margins": 0.5029893517494202, + "rewards/rejected": -1.536035180091858, + "step": 1980 + }, + { + "epoch": 0.29294862358310025, + "grad_norm": 46.40009387079778, + "learning_rate": 4.454521486308936e-07, + "logits/chosen": -2.2967538833618164, + "logits/rejected": -2.2735185623168945, + "logps/chosen": -434.9996032714844, + "logps/rejected": -454.46466064453125, + "loss": 0.5252, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.235095739364624, + "rewards/margins": 0.620541512966156, + "rewards/rejected": -1.8556371927261353, + "step": 1990 + }, + { + "epoch": 0.2944207272191962, + "grad_norm": 83.58968277404044, + "learning_rate": 4.4464847459085553e-07, + "logits/chosen": -2.194180727005005, + "logits/rejected": -2.152617931365967, + "logps/chosen": -369.58526611328125, + "logps/rejected": -453.7283630371094, + "loss": 0.4891, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -0.9464923739433289, + "rewards/margins": 0.9927309155464172, + "rewards/rejected": -1.939223289489746, + "step": 2000 + }, + { + "epoch": 0.2958928308552922, + "grad_norm": 67.70289222429692, + "learning_rate": 4.438396596289604e-07, + "logits/chosen": -2.0201032161712646, + "logits/rejected": -1.9795821905136108, + "logps/chosen": -378.4675598144531, + "logps/rejected": -463.73358154296875, + "loss": 0.6122, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.0994731187820435, + "rewards/margins": 1.120152473449707, + "rewards/rejected": -2.21962571144104, + "step": 2010 + }, + { + "epoch": 0.2973649344913882, + "grad_norm": 52.41322536545237, + "learning_rate": 4.430257251070736e-07, + "logits/chosen": -2.0778236389160156, + "logits/rejected": -2.0799641609191895, + "logps/chosen": -350.8024597167969, + "logps/rejected": -394.7392272949219, + "loss": 0.4984, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.162217378616333, + "rewards/margins": 0.7542338371276855, + "rewards/rejected": -1.916451096534729, + "step": 2020 + }, + { + "epoch": 0.2988370381274842, + "grad_norm": 67.73673556669453, + "learning_rate": 4.422066925222748e-07, + "logits/chosen": -1.8490890264511108, + "logits/rejected": -1.8243553638458252, + "logps/chosen": -382.03857421875, + "logps/rejected": -442.67694091796875, + "loss": 0.518, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.456194281578064, + "rewards/margins": 0.7980313301086426, + "rewards/rejected": -2.254225730895996, + "step": 2030 + }, + { + "epoch": 0.30030914176358015, + "grad_norm": 71.13802966188344, + "learning_rate": 4.4138258350629033e-07, + "logits/chosen": -2.1250674724578857, + "logits/rejected": -1.9244062900543213, + "logps/chosen": -453.32830810546875, + "logps/rejected": -496.22442626953125, + "loss": 0.5195, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.2166922092437744, + "rewards/margins": 0.9763118624687195, + "rewards/rejected": -2.1930041313171387, + "step": 2040 + }, + { + "epoch": 0.30178124539967616, + "grad_norm": 57.46256978501969, + "learning_rate": 4.405534198249216e-07, + "logits/chosen": -2.053389072418213, + "logits/rejected": -1.8966805934906006, + "logps/chosen": -420.7420959472656, + "logps/rejected": -443.77301025390625, + "loss": 0.573, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.753434419631958, + "rewards/margins": 0.6384382247924805, + "rewards/rejected": -2.3918726444244385, + "step": 2050 + }, + { + "epoch": 0.3032533490357721, + "grad_norm": 136.57527003638705, + "learning_rate": 4.3971922337747045e-07, + "logits/chosen": -2.0888214111328125, + "logits/rejected": -2.0872230529785156, + "logps/chosen": -389.699462890625, + "logps/rejected": -414.8658142089844, + "loss": 0.5046, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1965277194976807, + "rewards/margins": 0.6797729730606079, + "rewards/rejected": -1.8763010501861572, + "step": 2060 + }, + { + "epoch": 0.3047254526718681, + "grad_norm": 94.69647216195305, + "learning_rate": 4.388800161961606e-07, + "logits/chosen": -2.1853480339050293, + "logits/rejected": -2.00262188911438, + "logps/chosen": -457.60504150390625, + "logps/rejected": -500.3125, + "loss": 0.5225, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.6782252788543701, + "rewards/margins": 0.9120586514472961, + "rewards/rejected": -2.5902838706970215, + "step": 2070 + }, + { + "epoch": 0.3061975563079641, + "grad_norm": 80.7135984674759, + "learning_rate": 4.380358204455559e-07, + "logits/chosen": -1.9161068201065063, + "logits/rejected": -1.828960657119751, + "logps/chosen": -420.57763671875, + "logps/rejected": -515.581787109375, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.095799207687378, + "rewards/margins": 0.6575914621353149, + "rewards/rejected": -2.7533905506134033, + "step": 2080 + }, + { + "epoch": 0.30766965994406004, + "grad_norm": 140.86744922722565, + "learning_rate": 4.3718665842197494e-07, + "logits/chosen": -1.846639633178711, + "logits/rejected": -1.63237726688385, + "logps/chosen": -408.6860656738281, + "logps/rejected": -446.44696044921875, + "loss": 0.5416, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.84528386592865, + "rewards/margins": 0.801751971244812, + "rewards/rejected": -2.647035837173462, + "step": 2090 + }, + { + "epoch": 0.30914176358015605, + "grad_norm": 86.2351492332159, + "learning_rate": 4.363325525529019e-07, + "logits/chosen": -1.990800142288208, + "logits/rejected": -1.9234727621078491, + "logps/chosen": -474.1063537597656, + "logps/rejected": -505.68536376953125, + "loss": 0.5774, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.0050294399261475, + "rewards/margins": 0.7985720038414001, + "rewards/rejected": -2.8036017417907715, + "step": 2100 + }, + { + "epoch": 0.310613867216252, + "grad_norm": 82.43498691912119, + "learning_rate": 4.354735253963947e-07, + "logits/chosen": -1.9951881170272827, + "logits/rejected": -1.949826955795288, + "logps/chosen": -400.01849365234375, + "logps/rejected": -472.091552734375, + "loss": 0.5263, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.5853168964385986, + "rewards/margins": 0.906389594078064, + "rewards/rejected": -2.491706609725952, + "step": 2110 + }, + { + "epoch": 0.312085970852348, + "grad_norm": 122.96141085088001, + "learning_rate": 4.3460959964048854e-07, + "logits/chosen": -2.0313777923583984, + "logits/rejected": -1.9553247690200806, + "logps/chosen": -511.46917724609375, + "logps/rejected": -529.93359375, + "loss": 0.5944, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.088846206665039, + "rewards/margins": 0.41115322709083557, + "rewards/rejected": -2.499999523162842, + "step": 2120 + }, + { + "epoch": 0.31355807448844397, + "grad_norm": 79.32529715353333, + "learning_rate": 4.337407981025974e-07, + "logits/chosen": -2.134321928024292, + "logits/rejected": -1.9682848453521729, + "logps/chosen": -527.14013671875, + "logps/rejected": -666.1718139648438, + "loss": 0.477, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -2.1259849071502686, + "rewards/margins": 1.2098978757858276, + "rewards/rejected": -3.3358826637268066, + "step": 2130 + }, + { + "epoch": 0.31503017812454, + "grad_norm": 110.95917452289056, + "learning_rate": 4.3286714372891086e-07, + "logits/chosen": -2.1190648078918457, + "logits/rejected": -1.9981591701507568, + "logps/chosen": -400.00421142578125, + "logps/rejected": -496.0963439941406, + "loss": 0.5239, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.6231648921966553, + "rewards/margins": 0.8570715188980103, + "rewards/rejected": -2.480236530303955, + "step": 2140 + }, + { + "epoch": 0.31650228176063594, + "grad_norm": 59.800882417419665, + "learning_rate": 4.319886595937885e-07, + "logits/chosen": -2.0465383529663086, + "logits/rejected": -1.972486138343811, + "logps/chosen": -494.1653747558594, + "logps/rejected": -648.9085693359375, + "loss": 0.3614, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6825342178344727, + "rewards/margins": 1.6859804391860962, + "rewards/rejected": -3.3685145378112793, + "step": 2150 + }, + { + "epoch": 0.31797438539673195, + "grad_norm": 95.0575343159912, + "learning_rate": 4.3110536889914996e-07, + "logits/chosen": -2.133918285369873, + "logits/rejected": -1.9676601886749268, + "logps/chosen": -450.9820861816406, + "logps/rejected": -508.0374450683594, + "loss": 0.4724, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.5981152057647705, + "rewards/margins": 0.8304576873779297, + "rewards/rejected": -2.4285728931427, + "step": 2160 + }, + { + "epoch": 0.3194464890328279, + "grad_norm": 78.50260156221027, + "learning_rate": 4.302172949738626e-07, + "logits/chosen": -1.779275894165039, + "logits/rejected": -1.5978610515594482, + "logps/chosen": -404.69146728515625, + "logps/rejected": -677.0027465820312, + "loss": 0.3005, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.7400023937225342, + "rewards/margins": 2.4424076080322266, + "rewards/rejected": -4.182409763336182, + "step": 2170 + }, + { + "epoch": 0.3209185926689239, + "grad_norm": 120.80261336430219, + "learning_rate": 4.2932446127312516e-07, + "logits/chosen": -1.6290452480316162, + "logits/rejected": -1.6696780920028687, + "logps/chosen": -479.7286682128906, + "logps/rejected": -704.2164916992188, + "loss": 0.516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.393439769744873, + "rewards/margins": 1.6574440002441406, + "rewards/rejected": -4.0508832931518555, + "step": 2180 + }, + { + "epoch": 0.3223906963050199, + "grad_norm": 116.84340711567805, + "learning_rate": 4.2842689137784825e-07, + "logits/chosen": -1.956629991531372, + "logits/rejected": -1.7823489904403687, + "logps/chosen": -556.0930786132812, + "logps/rejected": -571.1639404296875, + "loss": 0.5605, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.419422149658203, + "rewards/margins": 1.0306999683380127, + "rewards/rejected": -3.450122117996216, + "step": 2190 + }, + { + "epoch": 0.32386279994111583, + "grad_norm": 94.0357593498927, + "learning_rate": 4.2752460899403175e-07, + "logits/chosen": -1.935051679611206, + "logits/rejected": -1.7816410064697266, + "logps/chosen": -361.193359375, + "logps/rejected": -450.76214599609375, + "loss": 0.5667, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.496861219406128, + "rewards/margins": 0.6518845558166504, + "rewards/rejected": -2.1487457752227783, + "step": 2200 + }, + { + "epoch": 0.32533490357721184, + "grad_norm": 81.78314191404822, + "learning_rate": 4.2661763795213824e-07, + "logits/chosen": -1.7283738851547241, + "logits/rejected": -1.6573574542999268, + "logps/chosen": -367.89044189453125, + "logps/rejected": -504.7132873535156, + "loss": 0.5184, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.3765082359313965, + "rewards/margins": 1.175343632698059, + "rewards/rejected": -2.551851749420166, + "step": 2210 + }, + { + "epoch": 0.3268070072133078, + "grad_norm": 63.12952877069577, + "learning_rate": 4.2570600220646425e-07, + "logits/chosen": -1.95919668674469, + "logits/rejected": -1.9487415552139282, + "logps/chosen": -453.505615234375, + "logps/rejected": -488.37469482421875, + "loss": 0.5233, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.8469117879867554, + "rewards/margins": 0.49366050958633423, + "rewards/rejected": -2.3405721187591553, + "step": 2220 + }, + { + "epoch": 0.3282791108494038, + "grad_norm": 84.55781493827709, + "learning_rate": 4.247897258345071e-07, + "logits/chosen": -2.0467917919158936, + "logits/rejected": -1.8139393329620361, + "logps/chosen": -516.1986083984375, + "logps/rejected": -502.95904541015625, + "loss": 0.4837, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9642683267593384, + "rewards/margins": 0.6500707864761353, + "rewards/rejected": -2.6143391132354736, + "step": 2230 + }, + { + "epoch": 0.32975121448549977, + "grad_norm": 185.01694583448307, + "learning_rate": 4.238688330363292e-07, + "logits/chosen": -1.7943252325057983, + "logits/rejected": -1.6955482959747314, + "logps/chosen": -409.77911376953125, + "logps/rejected": -505.22064208984375, + "loss": 0.5043, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.974449872970581, + "rewards/margins": 1.0962729454040527, + "rewards/rejected": -3.070723056793213, + "step": 2240 + }, + { + "epoch": 0.3312233181215958, + "grad_norm": 85.63241176222752, + "learning_rate": 4.2294334813391874e-07, + "logits/chosen": -1.723963975906372, + "logits/rejected": -1.7017498016357422, + "logps/chosen": -457.54083251953125, + "logps/rejected": -585.0357666015625, + "loss": 0.4626, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.6555570363998413, + "rewards/margins": 0.8884385228157043, + "rewards/rejected": -2.5439953804016113, + "step": 2250 + }, + { + "epoch": 0.33269542175769173, + "grad_norm": 167.7923840239941, + "learning_rate": 4.220132955705476e-07, + "logits/chosen": -1.657631516456604, + "logits/rejected": -1.4443316459655762, + "logps/chosen": -415.800537109375, + "logps/rejected": -459.9042053222656, + "loss": 0.6202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6468541622161865, + "rewards/margins": 0.7804270386695862, + "rewards/rejected": -2.427281141281128, + "step": 2260 + }, + { + "epoch": 0.33416752539378775, + "grad_norm": 75.71838930892913, + "learning_rate": 4.2107869991012536e-07, + "logits/chosen": -1.6908735036849976, + "logits/rejected": -1.1443507671356201, + "logps/chosen": -490.3043518066406, + "logps/rejected": -563.1408081054688, + "loss": 0.4309, + "rewards/accuracies": 0.9333332777023315, + "rewards/chosen": -1.7285573482513428, + "rewards/margins": 1.4261088371276855, + "rewards/rejected": -3.15466570854187, + "step": 2270 + }, + { + "epoch": 0.3356396290298837, + "grad_norm": 100.20055445743147, + "learning_rate": 4.201395858365509e-07, + "logits/chosen": -1.2715295553207397, + "logits/rejected": -1.2886875867843628, + "logps/chosen": -443.1541442871094, + "logps/rejected": -566.6259155273438, + "loss": 0.5255, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.1264452934265137, + "rewards/margins": 0.73969966173172, + "rewards/rejected": -2.8661446571350098, + "step": 2280 + }, + { + "epoch": 0.33711173266597966, + "grad_norm": 117.91056032716772, + "learning_rate": 4.191959781530603e-07, + "logits/chosen": -1.5496199131011963, + "logits/rejected": -1.3153395652770996, + "logps/chosen": -432.66241455078125, + "logps/rejected": -533.2408447265625, + "loss": 0.5201, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7369009256362915, + "rewards/margins": 1.3840081691741943, + "rewards/rejected": -3.1209089756011963, + "step": 2290 + }, + { + "epoch": 0.33858383630207567, + "grad_norm": 214.10184300907184, + "learning_rate": 4.1824790178157184e-07, + "logits/chosen": -1.5339057445526123, + "logits/rejected": -1.2114434242248535, + "logps/chosen": -522.705322265625, + "logps/rejected": -590.180419921875, + "loss": 0.5918, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.344374179840088, + "rewards/margins": 1.2204809188842773, + "rewards/rejected": -3.5648550987243652, + "step": 2300 + }, + { + "epoch": 0.3400559399381716, + "grad_norm": 253.24056160111485, + "learning_rate": 4.172953817620275e-07, + "logits/chosen": -1.602736473083496, + "logits/rejected": -1.5537532567977905, + "logps/chosen": -441.55194091796875, + "logps/rejected": -509.0533752441406, + "loss": 0.6189, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.169576644897461, + "rewards/margins": 0.7569717168807983, + "rewards/rejected": -2.926548480987549, + "step": 2310 + }, + { + "epoch": 0.34152804357426764, + "grad_norm": 55.83571691484689, + "learning_rate": 4.1633844325173215e-07, + "logits/chosen": -2.034428358078003, + "logits/rejected": -1.8150005340576172, + "logps/chosen": -473.466796875, + "logps/rejected": -548.31396484375, + "loss": 0.4598, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.817256212234497, + "rewards/margins": 0.8173943758010864, + "rewards/rejected": -2.634650230407715, + "step": 2320 + }, + { + "epoch": 0.3430001472103636, + "grad_norm": 86.04473482331228, + "learning_rate": 4.153771115246886e-07, + "logits/chosen": -1.687479019165039, + "logits/rejected": -1.6358970403671265, + "logps/chosen": -449.14404296875, + "logps/rejected": -494.1427307128906, + "loss": 0.5747, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8904402256011963, + "rewards/margins": 0.8686310052871704, + "rewards/rejected": -2.759071111679077, + "step": 2330 + }, + { + "epoch": 0.3444722508464596, + "grad_norm": 72.76035002761718, + "learning_rate": 4.144114119709303e-07, + "logits/chosen": -1.5500452518463135, + "logits/rejected": -1.3492635488510132, + "logps/chosen": -449.443359375, + "logps/rejected": -517.4268798828125, + "loss": 0.4756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5190051794052124, + "rewards/margins": 0.9872074127197266, + "rewards/rejected": -2.5062127113342285, + "step": 2340 + }, + { + "epoch": 0.34594435448255556, + "grad_norm": 108.41474329963664, + "learning_rate": 4.134413700958509e-07, + "logits/chosen": -1.5014220476150513, + "logits/rejected": -1.4534940719604492, + "logps/chosen": -458.2308654785156, + "logps/rejected": -543.97021484375, + "loss": 0.5152, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.8431669473648071, + "rewards/margins": 1.0037271976470947, + "rewards/rejected": -2.8468940258026123, + "step": 2350 + }, + { + "epoch": 0.34741645811865157, + "grad_norm": 78.54538308859094, + "learning_rate": 4.1246701151953014e-07, + "logits/chosen": -1.2571971416473389, + "logits/rejected": -1.0684103965759277, + "logps/chosen": -441.16046142578125, + "logps/rejected": -474.87542724609375, + "loss": 0.5396, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.0949184894561768, + "rewards/margins": 0.6241711974143982, + "rewards/rejected": -2.7190892696380615, + "step": 2360 + }, + { + "epoch": 0.34888856175474753, + "grad_norm": 76.37689872757143, + "learning_rate": 4.1148836197605774e-07, + "logits/chosen": -1.5744102001190186, + "logits/rejected": -1.3901218175888062, + "logps/chosen": -432.1310119628906, + "logps/rejected": -531.0370483398438, + "loss": 0.4773, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7929332256317139, + "rewards/margins": 1.1230725049972534, + "rewards/rejected": -2.9160056114196777, + "step": 2370 + }, + { + "epoch": 0.35036066539084354, + "grad_norm": 97.36409282958522, + "learning_rate": 4.105054473128536e-07, + "logits/chosen": -1.1355375051498413, + "logits/rejected": -1.2072243690490723, + "logps/chosen": -364.8439025878906, + "logps/rejected": -446.48388671875, + "loss": 0.5207, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.6437803506851196, + "rewards/margins": 0.8946324586868286, + "rewards/rejected": -2.538412570953369, + "step": 2380 + }, + { + "epoch": 0.3518327690269395, + "grad_norm": 122.9088139231662, + "learning_rate": 4.0951829348998477e-07, + "logits/chosen": -1.556002140045166, + "logits/rejected": -1.3016414642333984, + "logps/chosen": -513.4646606445312, + "logps/rejected": -580.1838989257812, + "loss": 0.5559, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1793551445007324, + "rewards/margins": 1.1707295179367065, + "rewards/rejected": -3.3500847816467285, + "step": 2390 + }, + { + "epoch": 0.35330487266303545, + "grad_norm": 81.73250263304992, + "learning_rate": 4.0852692657948027e-07, + "logits/chosen": -1.3853505849838257, + "logits/rejected": -1.0933678150177002, + "logps/chosen": -488.0162048339844, + "logps/rejected": -584.0532836914062, + "loss": 0.5017, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.599107027053833, + "rewards/margins": 1.1225740909576416, + "rewards/rejected": -3.7216808795928955, + "step": 2400 + }, + { + "epoch": 0.35477697629913146, + "grad_norm": 239.56056985131121, + "learning_rate": 4.075313727646422e-07, + "logits/chosen": -1.6437149047851562, + "logits/rejected": -1.4680321216583252, + "logps/chosen": -474.12750244140625, + "logps/rejected": -686.6546020507812, + "loss": 0.438, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -2.000917911529541, + "rewards/margins": 2.0760135650634766, + "rewards/rejected": -4.076931476593018, + "step": 2410 + }, + { + "epoch": 0.3562490799352274, + "grad_norm": 69.6078744773945, + "learning_rate": 4.0653165833935433e-07, + "logits/chosen": -1.8435087203979492, + "logits/rejected": -1.7100938558578491, + "logps/chosen": -424.34722900390625, + "logps/rejected": -437.58843994140625, + "loss": 0.4709, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.5596811771392822, + "rewards/margins": 0.8523275256156921, + "rewards/rejected": -2.412008762359619, + "step": 2420 + }, + { + "epoch": 0.35772118357132343, + "grad_norm": 61.9639939989687, + "learning_rate": 4.0552780970738755e-07, + "logits/chosen": -1.8579962253570557, + "logits/rejected": -1.8543269634246826, + "logps/chosen": -392.9651184082031, + "logps/rejected": -570.3347778320312, + "loss": 0.4588, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.5522739887237549, + "rewards/margins": 1.1792339086532593, + "rewards/rejected": -2.7315075397491455, + "step": 2430 + }, + { + "epoch": 0.3591932872074194, + "grad_norm": 55.46326733513349, + "learning_rate": 4.045198533817028e-07, + "logits/chosen": -1.7987346649169922, + "logits/rejected": -1.6985905170440674, + "logps/chosen": -443.2705078125, + "logps/rejected": -466.10064697265625, + "loss": 0.5339, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.9710447788238525, + "rewards/margins": 0.5192912817001343, + "rewards/rejected": -2.4903359413146973, + "step": 2440 + }, + { + "epoch": 0.3606653908435154, + "grad_norm": 112.81413437101291, + "learning_rate": 4.0350781598375027e-07, + "logits/chosen": -1.9687469005584717, + "logits/rejected": -1.6060094833374023, + "logps/chosen": -549.8952026367188, + "logps/rejected": -518.181396484375, + "loss": 0.5757, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7485243082046509, + "rewards/margins": 0.8799106478691101, + "rewards/rejected": -2.628434896469116, + "step": 2450 + }, + { + "epoch": 0.36213749447961135, + "grad_norm": 44.21488723290479, + "learning_rate": 4.024917242427669e-07, + "logits/chosen": -1.781081199645996, + "logits/rejected": -1.5755531787872314, + "logps/chosen": -512.1905517578125, + "logps/rejected": -607.9486083984375, + "loss": 0.5292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.281831741333008, + "rewards/margins": 1.170804500579834, + "rewards/rejected": -3.452636241912842, + "step": 2460 + }, + { + "epoch": 0.36360959811570737, + "grad_norm": 83.23515227439339, + "learning_rate": 4.0147160499507006e-07, + "logits/chosen": -1.7541526556015015, + "logits/rejected": -1.5497632026672363, + "logps/chosen": -557.5874633789062, + "logps/rejected": -586.5172729492188, + "loss": 0.5035, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.248645544052124, + "rewards/margins": 0.9582318067550659, + "rewards/rejected": -3.2068774700164795, + "step": 2470 + }, + { + "epoch": 0.3650817017518033, + "grad_norm": 130.9035535859141, + "learning_rate": 4.004474851833488e-07, + "logits/chosen": -1.936694860458374, + "logits/rejected": -1.7358448505401611, + "logps/chosen": -574.6199951171875, + "logps/rejected": -587.8235473632812, + "loss": 0.4882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7007898092269897, + "rewards/margins": 1.3249223232269287, + "rewards/rejected": -3.025712251663208, + "step": 2480 + }, + { + "epoch": 0.36655380538789933, + "grad_norm": 100.21586857400584, + "learning_rate": 3.9941939185595255e-07, + "logits/chosen": -1.4397448301315308, + "logits/rejected": -1.3709533214569092, + "logps/chosen": -438.868408203125, + "logps/rejected": -590.3887939453125, + "loss": 0.47, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.9680229425430298, + "rewards/margins": 1.3686507940292358, + "rewards/rejected": -3.3366737365722656, + "step": 2490 + }, + { + "epoch": 0.3680259090239953, + "grad_norm": 59.2336937947295, + "learning_rate": 3.9838735216617615e-07, + "logits/chosen": -1.8292615413665771, + "logits/rejected": -1.586116075515747, + "logps/chosen": -429.8548278808594, + "logps/rejected": -454.844482421875, + "loss": 0.6133, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.8058971166610718, + "rewards/margins": 0.6426948308944702, + "rewards/rejected": -2.448591947555542, + "step": 2500 + }, + { + "epoch": 0.36949801266009125, + "grad_norm": 98.86128391117239, + "learning_rate": 3.9735139337154334e-07, + "logits/chosen": -1.7934539318084717, + "logits/rejected": -1.5515906810760498, + "logps/chosen": -379.17108154296875, + "logps/rejected": -437.5555114746094, + "loss": 0.6632, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.8329546451568604, + "rewards/margins": 0.759555995464325, + "rewards/rejected": -2.59251070022583, + "step": 2510 + }, + { + "epoch": 0.37097011629618726, + "grad_norm": 47.04610290174856, + "learning_rate": 3.963115428330864e-07, + "logits/chosen": -2.13936448097229, + "logits/rejected": -2.10416841506958, + "logps/chosen": -476.16766357421875, + "logps/rejected": -518.3255615234375, + "loss": 0.5222, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.3312400579452515, + "rewards/margins": 0.5248512029647827, + "rewards/rejected": -1.8560912609100342, + "step": 2520 + }, + { + "epoch": 0.3724422199322832, + "grad_norm": 77.55954173516001, + "learning_rate": 3.9526782801462384e-07, + "logits/chosen": -2.2373321056365967, + "logits/rejected": -1.9484474658966064, + "logps/chosen": -444.98663330078125, + "logps/rejected": -433.94781494140625, + "loss": 0.4787, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.3387541770935059, + "rewards/margins": 1.1053967475891113, + "rewards/rejected": -2.4441511631011963, + "step": 2530 + }, + { + "epoch": 0.3739143235683792, + "grad_norm": 96.31670464059874, + "learning_rate": 3.9422027648203474e-07, + "logits/chosen": -1.6263630390167236, + "logits/rejected": -1.6051050424575806, + "logps/chosen": -363.9243469238281, + "logps/rejected": -438.3628845214844, + "loss": 0.484, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.6988608837127686, + "rewards/margins": 0.7454334497451782, + "rewards/rejected": -2.4442944526672363, + "step": 2540 + }, + { + "epoch": 0.3753864272044752, + "grad_norm": 64.88104735810006, + "learning_rate": 3.9316891590253086e-07, + "logits/chosen": -2.0933780670166016, + "logits/rejected": -1.8305991888046265, + "logps/chosen": -478.14569091796875, + "logps/rejected": -542.096923828125, + "loss": 0.5203, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.9514715671539307, + "rewards/margins": 1.069432020187378, + "rewards/rejected": -3.0209033489227295, + "step": 2550 + }, + { + "epoch": 0.3768585308405712, + "grad_norm": 95.34597534948138, + "learning_rate": 3.921137740439261e-07, + "logits/chosen": -2.0900235176086426, + "logits/rejected": -2.0113472938537598, + "logps/chosen": -395.46380615234375, + "logps/rejected": -435.18548583984375, + "loss": 0.4261, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.478169322013855, + "rewards/margins": 0.7347229719161987, + "rewards/rejected": -2.2128922939300537, + "step": 2560 + }, + { + "epoch": 0.37833063447666715, + "grad_norm": 78.43912895519057, + "learning_rate": 3.910548787739024e-07, + "logits/chosen": -2.1403000354766846, + "logits/rejected": -1.960837721824646, + "logps/chosen": -493.2071838378906, + "logps/rejected": -586.7843017578125, + "loss": 0.5076, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.12876033782959, + "rewards/margins": 1.023504376411438, + "rewards/rejected": -3.1522648334503174, + "step": 2570 + }, + { + "epoch": 0.37980273811276316, + "grad_norm": 72.70993845407499, + "learning_rate": 3.8999225805927483e-07, + "logits/chosen": -2.005814552307129, + "logits/rejected": -1.6591650247573853, + "logps/chosen": -455.30328369140625, + "logps/rejected": -541.7001953125, + "loss": 0.5842, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -1.9915201663970947, + "rewards/margins": 1.0560119152069092, + "rewards/rejected": -3.047531843185425, + "step": 2580 + }, + { + "epoch": 0.3812748417488591, + "grad_norm": 62.91922669436355, + "learning_rate": 3.889259399652519e-07, + "logits/chosen": -2.0703909397125244, + "logits/rejected": -1.8510854244232178, + "logps/chosen": -564.1527709960938, + "logps/rejected": -611.5841064453125, + "loss": 0.4638, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.516140937805176, + "rewards/margins": 1.0101659297943115, + "rewards/rejected": -3.5263068675994873, + "step": 2590 + }, + { + "epoch": 0.38274694538495513, + "grad_norm": 169.01131469837011, + "learning_rate": 3.8785595265469497e-07, + "logits/chosen": -1.961424469947815, + "logits/rejected": -1.9792935848236084, + "logps/chosen": -525.9747924804688, + "logps/rejected": -577.0545043945312, + "loss": 0.5418, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6867942810058594, + "rewards/margins": 0.8112853169441223, + "rewards/rejected": -3.498079776763916, + "step": 2600 + }, + { + "epoch": 0.3842190490210511, + "grad_norm": 73.90378018615306, + "learning_rate": 3.867823243873743e-07, + "logits/chosen": -2.102569103240967, + "logits/rejected": -1.9672361612319946, + "logps/chosen": -424.940185546875, + "logps/rejected": -519.3734130859375, + "loss": 0.4096, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.077277183532715, + "rewards/margins": 1.1491105556488037, + "rewards/rejected": -3.2263877391815186, + "step": 2610 + }, + { + "epoch": 0.38569115265714704, + "grad_norm": 78.45736581695806, + "learning_rate": 3.8570508351922234e-07, + "logits/chosen": -2.030846357345581, + "logits/rejected": -1.8978300094604492, + "logps/chosen": -553.8700561523438, + "logps/rejected": -656.2894287109375, + "loss": 0.5293, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.677258253097534, + "rewards/margins": 1.3484762907028198, + "rewards/rejected": -4.025734901428223, + "step": 2620 + }, + { + "epoch": 0.38716325629324305, + "grad_norm": 124.70621091661863, + "learning_rate": 3.8462425850158533e-07, + "logits/chosen": -1.9009478092193604, + "logits/rejected": -1.9104833602905273, + "logps/chosen": -489.1398010253906, + "logps/rejected": -651.7132568359375, + "loss": 0.5581, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -2.644368886947632, + "rewards/margins": 1.326848030090332, + "rewards/rejected": -3.971216917037964, + "step": 2630 + }, + { + "epoch": 0.388635359929339, + "grad_norm": 57.51562361361137, + "learning_rate": 3.8353987788047136e-07, + "logits/chosen": -2.180210828781128, + "logits/rejected": -2.0472755432128906, + "logps/chosen": -510.35302734375, + "logps/rejected": -559.6215209960938, + "loss": 0.4784, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.0872254371643066, + "rewards/margins": 0.9153797030448914, + "rewards/rejected": -3.0026049613952637, + "step": 2640 + }, + { + "epoch": 0.390107463565435, + "grad_norm": 83.16271716741163, + "learning_rate": 3.824519702957969e-07, + "logits/chosen": -2.1296792030334473, + "logits/rejected": -1.9682823419570923, + "logps/chosen": -569.1190795898438, + "logps/rejected": -541.9053955078125, + "loss": 0.633, + "rewards/accuracies": 0.5666666030883789, + "rewards/chosen": -2.212919235229492, + "rewards/margins": 0.719650149345398, + "rewards/rejected": -2.9325695037841797, + "step": 2650 + }, + { + "epoch": 0.391579567201531, + "grad_norm": 74.67378373781077, + "learning_rate": 3.8136056448063016e-07, + "logits/chosen": -2.210920810699463, + "logits/rejected": -2.0824389457702637, + "logps/chosen": -425.52880859375, + "logps/rejected": -545.993408203125, + "loss": 0.5263, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.547957181930542, + "rewards/margins": 1.3698958158493042, + "rewards/rejected": -2.9178528785705566, + "step": 2660 + }, + { + "epoch": 0.393051670837627, + "grad_norm": 46.24625662702139, + "learning_rate": 3.802656892604319e-07, + "logits/chosen": -1.937159776687622, + "logits/rejected": -1.7159931659698486, + "logps/chosen": -382.53009033203125, + "logps/rejected": -478.021484375, + "loss": 0.4826, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.587925910949707, + "rewards/margins": 1.2533605098724365, + "rewards/rejected": -2.8412866592407227, + "step": 2670 + }, + { + "epoch": 0.39452377447372294, + "grad_norm": 87.53274846684974, + "learning_rate": 3.791673735522949e-07, + "logits/chosen": -1.765702486038208, + "logits/rejected": -1.5696810483932495, + "logps/chosen": -458.1748962402344, + "logps/rejected": -629.7850341796875, + "loss": 0.5143, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.04221773147583, + "rewards/margins": 1.3660646677017212, + "rewards/rejected": -3.408282518386841, + "step": 2680 + }, + { + "epoch": 0.39599587810981896, + "grad_norm": 200.15413558137533, + "learning_rate": 3.7806564636417936e-07, + "logits/chosen": -1.8686439990997314, + "logits/rejected": -1.6479641199111938, + "logps/chosen": -416.6416931152344, + "logps/rejected": -515.533447265625, + "loss": 0.56, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.5594228506088257, + "rewards/margins": 1.1629923582077026, + "rewards/rejected": -2.7224154472351074, + "step": 2690 + }, + { + "epoch": 0.3974679817459149, + "grad_norm": 81.20552891795218, + "learning_rate": 3.769605367941472e-07, + "logits/chosen": -1.686522126197815, + "logits/rejected": -1.5740878582000732, + "logps/chosen": -437.0101623535156, + "logps/rejected": -497.9092712402344, + "loss": 0.4115, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.8423454761505127, + "rewards/margins": 0.7787291407585144, + "rewards/rejected": -2.621074914932251, + "step": 2700 + }, + { + "epoch": 0.39894008538201087, + "grad_norm": 73.68884413112927, + "learning_rate": 3.7585207402959377e-07, + "logits/chosen": -1.6373815536499023, + "logits/rejected": -1.3962100744247437, + "logps/chosen": -407.1414794921875, + "logps/rejected": -485.02197265625, + "loss": 0.5006, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7172142267227173, + "rewards/margins": 0.8074946403503418, + "rewards/rejected": -2.5247085094451904, + "step": 2710 + }, + { + "epoch": 0.4004121890181069, + "grad_norm": 54.50965956630463, + "learning_rate": 3.747402873464764e-07, + "logits/chosen": -1.5445497035980225, + "logits/rejected": -0.9880827069282532, + "logps/chosen": -458.2935485839844, + "logps/rejected": -542.5947875976562, + "loss": 0.4642, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.8875930309295654, + "rewards/margins": 1.2322635650634766, + "rewards/rejected": -3.119856834411621, + "step": 2720 + }, + { + "epoch": 0.40188429265420283, + "grad_norm": 196.0975594745305, + "learning_rate": 3.7362520610854147e-07, + "logits/chosen": -1.4485681056976318, + "logits/rejected": -1.2357394695281982, + "logps/chosen": -372.23126220703125, + "logps/rejected": -513.0504150390625, + "loss": 0.522, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.6937439441680908, + "rewards/margins": 1.329360008239746, + "rewards/rejected": -3.023104190826416, + "step": 2730 + }, + { + "epoch": 0.40335639629029885, + "grad_norm": 295.9338362264524, + "learning_rate": 3.725068597665491e-07, + "logits/chosen": -1.313795804977417, + "logits/rejected": -1.1979414224624634, + "logps/chosen": -428.66778564453125, + "logps/rejected": -589.0925903320312, + "loss": 0.627, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.3330397605895996, + "rewards/margins": 0.8843925595283508, + "rewards/rejected": -3.2174324989318848, + "step": 2740 + }, + { + "epoch": 0.4048284999263948, + "grad_norm": 54.27979036747751, + "learning_rate": 3.71385277857495e-07, + "logits/chosen": -1.4289398193359375, + "logits/rejected": -1.287122130393982, + "logps/chosen": -532.2777709960938, + "logps/rejected": -656.8150634765625, + "loss": 0.6018, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.7441351413726807, + "rewards/margins": 1.0424460172653198, + "rewards/rejected": -3.786581516265869, + "step": 2750 + }, + { + "epoch": 0.4063006035624908, + "grad_norm": 119.80872816783915, + "learning_rate": 3.702604900038302e-07, + "logits/chosen": -1.800419569015503, + "logits/rejected": -1.5214576721191406, + "logps/chosen": -438.43695068359375, + "logps/rejected": -515.1673583984375, + "loss": 0.5185, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -2.0457539558410645, + "rewards/margins": 1.1193088293075562, + "rewards/rejected": -3.16506290435791, + "step": 2760 + }, + { + "epoch": 0.40777270719858677, + "grad_norm": 237.63404715943577, + "learning_rate": 3.691325259126794e-07, + "logits/chosen": -1.5565061569213867, + "logits/rejected": -1.455678939819336, + "logps/chosen": -480.0604553222656, + "logps/rejected": -588.1300048828125, + "loss": 0.5994, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.537501335144043, + "rewards/margins": 0.7854641675949097, + "rewards/rejected": -3.322965145111084, + "step": 2770 + }, + { + "epoch": 0.4092448108346828, + "grad_norm": 102.91561167279252, + "learning_rate": 3.6800141537505556e-07, + "logits/chosen": -1.76127028465271, + "logits/rejected": -1.544158935546875, + "logps/chosen": -418.3402404785156, + "logps/rejected": -603.7567138671875, + "loss": 0.4582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8573930263519287, + "rewards/margins": 1.370255947113037, + "rewards/rejected": -3.227648973464966, + "step": 2780 + }, + { + "epoch": 0.41071691447077874, + "grad_norm": 139.07007690087863, + "learning_rate": 3.6686718826507363e-07, + "logits/chosen": -1.8670623302459717, + "logits/rejected": -1.6136724948883057, + "logps/chosen": -521.4725952148438, + "logps/rejected": -544.9457397460938, + "loss": 0.6254, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.211174964904785, + "rewards/margins": 0.55262690782547, + "rewards/rejected": -2.7638020515441895, + "step": 2790 + }, + { + "epoch": 0.41218901810687475, + "grad_norm": 49.89174324192258, + "learning_rate": 3.6572987453916114e-07, + "logits/chosen": -1.5626221895217896, + "logits/rejected": -1.2662019729614258, + "logps/chosen": -437.171630859375, + "logps/rejected": -503.14166259765625, + "loss": 0.4083, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.0375397205352783, + "rewards/margins": 1.0458136796951294, + "rewards/rejected": -3.083353281021118, + "step": 2800 + }, + { + "epoch": 0.4136611217429707, + "grad_norm": 90.94094428939768, + "learning_rate": 3.645895042352672e-07, + "logits/chosen": -1.6565611362457275, + "logits/rejected": -1.1093449592590332, + "logps/chosen": -495.419677734375, + "logps/rejected": -671.6307983398438, + "loss": 0.4292, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.191828727722168, + "rewards/margins": 1.9161417484283447, + "rewards/rejected": -4.107970237731934, + "step": 2810 + }, + { + "epoch": 0.41513322537906666, + "grad_norm": 211.81112793229224, + "learning_rate": 3.634461074720695e-07, + "logits/chosen": -1.1938692331314087, + "logits/rejected": -1.0484073162078857, + "logps/chosen": -502.84564208984375, + "logps/rejected": -644.0247192382812, + "loss": 0.6078, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.64770245552063, + "rewards/margins": 1.4506587982177734, + "rewards/rejected": -4.098361492156982, + "step": 2820 + }, + { + "epoch": 0.4166053290151627, + "grad_norm": 58.38429865008052, + "learning_rate": 3.622997144481781e-07, + "logits/chosen": -1.488816499710083, + "logits/rejected": -0.9468615651130676, + "logps/chosen": -598.2494506835938, + "logps/rejected": -602.2901000976562, + "loss": 0.5417, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -3.132472038269043, + "rewards/margins": 0.8369997143745422, + "rewards/rejected": -3.9694721698760986, + "step": 2830 + }, + { + "epoch": 0.41807743265125863, + "grad_norm": 62.95912189808289, + "learning_rate": 3.611503554413383e-07, + "logits/chosen": -1.8090894222259521, + "logits/rejected": -1.451247215270996, + "logps/chosen": -495.7832946777344, + "logps/rejected": -583.2427978515625, + "loss": 0.6016, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0796589851379395, + "rewards/margins": 1.063936471939087, + "rewards/rejected": -3.1435952186584473, + "step": 2840 + }, + { + "epoch": 0.41954953628735464, + "grad_norm": 87.42038976692645, + "learning_rate": 3.599980608076312e-07, + "logits/chosen": -1.7922834157943726, + "logits/rejected": -1.585123062133789, + "logps/chosen": -446.61285400390625, + "logps/rejected": -481.375244140625, + "loss": 0.525, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.575880765914917, + "rewards/margins": 1.0226125717163086, + "rewards/rejected": -2.5984935760498047, + "step": 2850 + }, + { + "epoch": 0.4210216399234506, + "grad_norm": 86.14833044925632, + "learning_rate": 3.5884286098067124e-07, + "logits/chosen": -1.8769581317901611, + "logits/rejected": -1.6354297399520874, + "logps/chosen": -531.2254638671875, + "logps/rejected": -580.4683227539062, + "loss": 0.4799, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.2853713035583496, + "rewards/margins": 0.8154304623603821, + "rewards/rejected": -3.1008009910583496, + "step": 2860 + }, + { + "epoch": 0.4224937435595466, + "grad_norm": 123.69716860725718, + "learning_rate": 3.5768478647080315e-07, + "logits/chosen": -1.484922170639038, + "logits/rejected": -1.5234549045562744, + "logps/chosen": -514.9072875976562, + "logps/rejected": -570.5345458984375, + "loss": 0.526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2644734382629395, + "rewards/margins": 0.9515140652656555, + "rewards/rejected": -3.21598744392395, + "step": 2870 + }, + { + "epoch": 0.42396584719564256, + "grad_norm": 112.14727851913004, + "learning_rate": 3.565238678642957e-07, + "logits/chosen": -1.3305034637451172, + "logits/rejected": -0.8070168495178223, + "logps/chosen": -568.4759521484375, + "logps/rejected": -623.7313842773438, + "loss": 0.5131, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.5929512977600098, + "rewards/margins": 1.525914192199707, + "rewards/rejected": -4.118865966796875, + "step": 2880 + }, + { + "epoch": 0.4254379508317386, + "grad_norm": 74.10908173340124, + "learning_rate": 3.55360135822534e-07, + "logits/chosen": -1.519352674484253, + "logits/rejected": -1.223768949508667, + "logps/chosen": -547.48095703125, + "logps/rejected": -674.0955200195312, + "loss": 0.4907, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.6193230152130127, + "rewards/margins": 1.473724603652954, + "rewards/rejected": -4.093048095703125, + "step": 2890 + }, + { + "epoch": 0.42691005446783453, + "grad_norm": 72.52863306080229, + "learning_rate": 3.541936210812095e-07, + "logits/chosen": -1.5181057453155518, + "logits/rejected": -1.10567307472229, + "logps/chosen": -515.7329711914062, + "logps/rejected": -631.7528686523438, + "loss": 0.3838, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.479140520095825, + "rewards/margins": 1.6800016164779663, + "rewards/rejected": -4.15914249420166, + "step": 2900 + }, + { + "epoch": 0.42838215810393054, + "grad_norm": 49.627712806376785, + "learning_rate": 3.5302435444950894e-07, + "logits/chosen": -1.7588831186294556, + "logits/rejected": -1.3383047580718994, + "logps/chosen": -472.28656005859375, + "logps/rejected": -623.2119140625, + "loss": 0.6826, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8840452432632446, + "rewards/margins": 2.0654664039611816, + "rewards/rejected": -3.949511766433716, + "step": 2910 + }, + { + "epoch": 0.4298542617400265, + "grad_norm": 69.02208137075738, + "learning_rate": 3.518523668092994e-07, + "logits/chosen": -1.5955554246902466, + "logits/rejected": -1.3054542541503906, + "logps/chosen": -498.1922302246094, + "logps/rejected": -553.93310546875, + "loss": 0.5607, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.228318691253662, + "rewards/margins": 1.007958173751831, + "rewards/rejected": -3.236276626586914, + "step": 2920 + }, + { + "epoch": 0.43132636537612246, + "grad_norm": 67.20246185818063, + "learning_rate": 3.506776891143138e-07, + "logits/chosen": -1.737927794456482, + "logits/rejected": -1.773044228553772, + "logps/chosen": -392.6949157714844, + "logps/rejected": -551.0619506835938, + "loss": 0.5183, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.410337209701538, + "rewards/margins": 0.9181815981864929, + "rewards/rejected": -2.3285186290740967, + "step": 2930 + }, + { + "epoch": 0.43279846901221847, + "grad_norm": 72.05701817187511, + "learning_rate": 3.495003523893329e-07, + "logits/chosen": -1.5914640426635742, + "logits/rejected": -1.643734335899353, + "logps/chosen": -390.11895751953125, + "logps/rejected": -439.25213623046875, + "loss": 0.5905, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.526058554649353, + "rewards/margins": 0.5697023272514343, + "rewards/rejected": -2.0957608222961426, + "step": 2940 + }, + { + "epoch": 0.4342705726483144, + "grad_norm": 65.75433921960477, + "learning_rate": 3.4832038772936574e-07, + "logits/chosen": -1.932199478149414, + "logits/rejected": -1.797698974609375, + "logps/chosen": -406.03515625, + "logps/rejected": -465.648681640625, + "loss": 0.5076, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.5398509502410889, + "rewards/margins": 0.5732296705245972, + "rewards/rejected": -2.1130805015563965, + "step": 2950 + }, + { + "epoch": 0.43574267628441044, + "grad_norm": 87.55330917826268, + "learning_rate": 3.471378262988288e-07, + "logits/chosen": -1.737178087234497, + "logits/rejected": -1.5921614170074463, + "logps/chosen": -487.3334045410156, + "logps/rejected": -531.8839111328125, + "loss": 0.5489, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.0652036666870117, + "rewards/margins": 0.9940094947814941, + "rewards/rejected": -3.059213161468506, + "step": 2960 + }, + { + "epoch": 0.4372147799205064, + "grad_norm": 142.23018768485719, + "learning_rate": 3.459526993307225e-07, + "logits/chosen": -1.7247488498687744, + "logits/rejected": -1.447429895401001, + "logps/chosen": -547.8975219726562, + "logps/rejected": -553.40234375, + "loss": 0.6484, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.2105212211608887, + "rewards/margins": 0.7438166737556458, + "rewards/rejected": -2.9543375968933105, + "step": 2970 + }, + { + "epoch": 0.4386868835566024, + "grad_norm": 101.09664810030793, + "learning_rate": 3.4476503812580677e-07, + "logits/chosen": -1.8655691146850586, + "logits/rejected": -1.5461689233779907, + "logps/chosen": -482.3690490722656, + "logps/rejected": -560.6808471679688, + "loss": 0.5351, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.89851975440979, + "rewards/margins": 0.9472289085388184, + "rewards/rejected": -2.8457486629486084, + "step": 2980 + }, + { + "epoch": 0.44015898719269836, + "grad_norm": 70.59637093478896, + "learning_rate": 3.4357487405177367e-07, + "logits/chosen": -1.7736320495605469, + "logits/rejected": -1.7680776119232178, + "logps/chosen": -389.9610900878906, + "logps/rejected": -493.32379150390625, + "loss": 0.4904, + "rewards/accuracies": 0.73333340883255, + "rewards/chosen": -1.611891746520996, + "rewards/margins": 0.892806351184845, + "rewards/rejected": -2.5046982765197754, + "step": 2990 + }, + { + "epoch": 0.44163109082879437, + "grad_norm": 151.5442601307048, + "learning_rate": 3.423822385424195e-07, + "logits/chosen": -1.827993392944336, + "logits/rejected": -1.775909423828125, + "logps/chosen": -438.62225341796875, + "logps/rejected": -544.998779296875, + "loss": 0.4331, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.6295242309570312, + "rewards/margins": 1.149675965309143, + "rewards/rejected": -2.7792000770568848, + "step": 3000 + }, + { + "epoch": 0.4431031944648903, + "grad_norm": 60.659098930168575, + "learning_rate": 3.411871630968145e-07, + "logits/chosen": -1.5524766445159912, + "logits/rejected": -1.1825859546661377, + "logps/chosen": -528.868896484375, + "logps/rejected": -706.8850708007812, + "loss": 0.5274, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.2250075340270996, + "rewards/margins": 1.7779312133789062, + "rewards/rejected": -4.002938270568848, + "step": 3010 + }, + { + "epoch": 0.4445752981009863, + "grad_norm": 154.46708278157683, + "learning_rate": 3.3998967927847067e-07, + "logits/chosen": -1.6661765575408936, + "logits/rejected": -1.5166027545928955, + "logps/chosen": -551.692626953125, + "logps/rejected": -647.8118896484375, + "loss": 0.5445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.135056495666504, + "rewards/margins": 1.152606725692749, + "rewards/rejected": -3.287663221359253, + "step": 3020 + }, + { + "epoch": 0.4460474017370823, + "grad_norm": 119.13849085965208, + "learning_rate": 3.3878981871450846e-07, + "logits/chosen": -1.7531712055206299, + "logits/rejected": -1.5738329887390137, + "logps/chosen": -513.7649536132812, + "logps/rejected": -595.7522583007812, + "loss": 0.5144, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.9461761713027954, + "rewards/margins": 0.7522245645523071, + "rewards/rejected": -2.6984009742736816, + "step": 3030 + }, + { + "epoch": 0.44751950537317825, + "grad_norm": 140.4071339053982, + "learning_rate": 3.375876130948211e-07, + "logits/chosen": -1.5250799655914307, + "logits/rejected": -1.0465915203094482, + "logps/chosen": -496.75140380859375, + "logps/rejected": -623.2408447265625, + "loss": 0.4446, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.276080846786499, + "rewards/margins": 1.414711833000183, + "rewards/rejected": -3.6907927989959717, + "step": 3040 + }, + { + "epoch": 0.44899160900927426, + "grad_norm": 135.7715441436105, + "learning_rate": 3.3638309417123824e-07, + "logits/chosen": -1.4496361017227173, + "logits/rejected": -0.9799026250839233, + "logps/chosen": -511.5415954589844, + "logps/rejected": -523.0415649414062, + "loss": 0.5433, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.2215921878814697, + "rewards/margins": 0.7228263020515442, + "rewards/rejected": -2.944418430328369, + "step": 3050 + }, + { + "epoch": 0.4504637126453702, + "grad_norm": 56.023764718637864, + "learning_rate": 3.3517629375668615e-07, + "logits/chosen": -1.5622495412826538, + "logits/rejected": -1.116483449935913, + "logps/chosen": -512.7913208007812, + "logps/rejected": -615.9537963867188, + "loss": 0.4915, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -2.2164382934570312, + "rewards/margins": 1.2098634243011475, + "rewards/rejected": -3.4263014793395996, + "step": 3060 + }, + { + "epoch": 0.45193581628146623, + "grad_norm": 90.24170161644254, + "learning_rate": 3.3396724372434914e-07, + "logits/chosen": -1.747844934463501, + "logits/rejected": -1.278145432472229, + "logps/chosen": -507.3158264160156, + "logps/rejected": -506.06982421875, + "loss": 0.4962, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.565063714981079, + "rewards/margins": 1.2676362991333008, + "rewards/rejected": -2.832700252532959, + "step": 3070 + }, + { + "epoch": 0.4534079199175622, + "grad_norm": 60.22811685538905, + "learning_rate": 3.327559760068263e-07, + "logits/chosen": -1.1775696277618408, + "logits/rejected": -1.0890233516693115, + "logps/chosen": -364.0589294433594, + "logps/rejected": -567.9036865234375, + "loss": 0.5557, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.7436749935150146, + "rewards/margins": 1.4570614099502563, + "rewards/rejected": -3.2007362842559814, + "step": 3080 + }, + { + "epoch": 0.4548800235536582, + "grad_norm": 118.66669319731676, + "learning_rate": 3.3154252259528883e-07, + "logits/chosen": -1.7384620904922485, + "logits/rejected": -1.1229941844940186, + "logps/chosen": -505.9580993652344, + "logps/rejected": -577.2669067382812, + "loss": 0.4273, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.698209524154663, + "rewards/margins": 1.3837387561798096, + "rewards/rejected": -3.0819482803344727, + "step": 3090 + }, + { + "epoch": 0.45635212718975415, + "grad_norm": 156.13451787505036, + "learning_rate": 3.303269155386351e-07, + "logits/chosen": -1.5324653387069702, + "logits/rejected": -1.4273545742034912, + "logps/chosen": -447.69000244140625, + "logps/rejected": -613.1942138671875, + "loss": 0.5226, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.686049222946167, + "rewards/margins": 1.6342623233795166, + "rewards/rejected": -3.3203117847442627, + "step": 3100 + }, + { + "epoch": 0.45782423082585016, + "grad_norm": 159.94425113132564, + "learning_rate": 3.291091869426439e-07, + "logits/chosen": -1.512087345123291, + "logits/rejected": -1.2060292959213257, + "logps/chosen": -457.0021057128906, + "logps/rejected": -590.585205078125, + "loss": 0.5458, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.6273653507232666, + "rewards/margins": 1.8371286392211914, + "rewards/rejected": -3.464493989944458, + "step": 3110 + }, + { + "epoch": 0.4592963344619461, + "grad_norm": 71.62202116138619, + "learning_rate": 3.27889368969127e-07, + "logits/chosen": -1.33388352394104, + "logits/rejected": -1.0087616443634033, + "logps/chosen": -471.14788818359375, + "logps/rejected": -535.3895263671875, + "loss": 0.4593, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.886225938796997, + "rewards/margins": 1.4341530799865723, + "rewards/rejected": -3.3203787803649902, + "step": 3120 + }, + { + "epoch": 0.4607684380980421, + "grad_norm": 115.49752201205501, + "learning_rate": 3.266674938350789e-07, + "logits/chosen": -1.5824775695800781, + "logits/rejected": -1.180802345275879, + "logps/chosen": -501.3780212402344, + "logps/rejected": -558.4815673828125, + "loss": 0.5024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.978123664855957, + "rewards/margins": 1.0970547199249268, + "rewards/rejected": -3.075178384780884, + "step": 3130 + }, + { + "epoch": 0.4622405417341381, + "grad_norm": 62.914893801051676, + "learning_rate": 3.254435938118269e-07, + "logits/chosen": -1.2504870891571045, + "logits/rejected": -1.497119426727295, + "logps/chosen": -491.69195556640625, + "logps/rejected": -576.496337890625, + "loss": 0.5683, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -1.9048473834991455, + "rewards/margins": 0.7879148721694946, + "rewards/rejected": -2.6927618980407715, + "step": 3140 + }, + { + "epoch": 0.46371264537023404, + "grad_norm": 78.1707782607172, + "learning_rate": 3.242177012241778e-07, + "logits/chosen": -1.6074014902114868, + "logits/rejected": -1.0992376804351807, + "logps/chosen": -471.624755859375, + "logps/rejected": -518.9395141601562, + "loss": 0.4731, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.0670464038848877, + "rewards/margins": 1.0713918209075928, + "rewards/rejected": -3.1384377479553223, + "step": 3150 + }, + { + "epoch": 0.46518474900633006, + "grad_norm": 156.05958550009998, + "learning_rate": 3.229898484495649e-07, + "logits/chosen": -1.3677150011062622, + "logits/rejected": -1.2939870357513428, + "logps/chosen": -380.930908203125, + "logps/rejected": -485.2793884277344, + "loss": 0.5496, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.9171888828277588, + "rewards/margins": 0.9369261860847473, + "rewards/rejected": -2.8541150093078613, + "step": 3160 + }, + { + "epoch": 0.466656852642426, + "grad_norm": 97.56014654460785, + "learning_rate": 3.2176006791719266e-07, + "logits/chosen": -1.3693625926971436, + "logits/rejected": -1.182793378829956, + "logps/chosen": -430.64459228515625, + "logps/rejected": -586.9564208984375, + "loss": 0.4636, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.0552849769592285, + "rewards/margins": 1.5252145528793335, + "rewards/rejected": -3.5804991722106934, + "step": 3170 + }, + { + "epoch": 0.468128956278522, + "grad_norm": 234.9041999362437, + "learning_rate": 3.2052839210718007e-07, + "logits/chosen": -1.42060387134552, + "logits/rejected": -1.5373618602752686, + "logps/chosen": -445.12908935546875, + "logps/rejected": -590.1561279296875, + "loss": 0.445, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.1267595291137695, + "rewards/margins": 1.0089360475540161, + "rewards/rejected": -3.135695457458496, + "step": 3180 + }, + { + "epoch": 0.469601059914618, + "grad_norm": 71.5681442261575, + "learning_rate": 3.1929485354970297e-07, + "logits/chosen": -1.5528169870376587, + "logits/rejected": -1.1293237209320068, + "logps/chosen": -500.97662353515625, + "logps/rejected": -688.411376953125, + "loss": 0.3742, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1596217155456543, + "rewards/margins": 1.8486528396606445, + "rewards/rejected": -4.008275032043457, + "step": 3190 + }, + { + "epoch": 0.471073163550714, + "grad_norm": 81.63225233188635, + "learning_rate": 3.180594848241346e-07, + "logits/chosen": -1.0608268976211548, + "logits/rejected": -0.9076783061027527, + "logps/chosen": -436.4244689941406, + "logps/rejected": -717.12939453125, + "loss": 0.5504, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.1242620944976807, + "rewards/margins": 1.451608657836914, + "rewards/rejected": -3.5758705139160156, + "step": 3200 + }, + { + "epoch": 0.47254526718680995, + "grad_norm": 97.73506742652809, + "learning_rate": 3.1682231855818565e-07, + "logits/chosen": -1.0978786945343018, + "logits/rejected": -0.9731031656265259, + "logps/chosen": -404.39874267578125, + "logps/rejected": -591.658203125, + "loss": 0.4612, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -2.088728427886963, + "rewards/margins": 1.6628957986831665, + "rewards/rejected": -3.751624584197998, + "step": 3210 + }, + { + "epoch": 0.47401737082290596, + "grad_norm": 85.0010513822755, + "learning_rate": 3.1558338742704175e-07, + "logits/chosen": -1.2816003561019897, + "logits/rejected": -1.0689582824707031, + "logps/chosen": -560.6173095703125, + "logps/rejected": -580.8114013671875, + "loss": 0.6525, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4841196537017822, + "rewards/margins": 0.893811047077179, + "rewards/rejected": -3.3779304027557373, + "step": 3220 + }, + { + "epoch": 0.4754894744590019, + "grad_norm": 100.11505506269383, + "learning_rate": 3.1434272415250164e-07, + "logits/chosen": -1.4650182723999023, + "logits/rejected": -1.2752690315246582, + "logps/chosen": -484.4212341308594, + "logps/rejected": -591.5261840820312, + "loss": 0.4575, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.8464012145996094, + "rewards/margins": 1.2857813835144043, + "rewards/rejected": -3.1321825981140137, + "step": 3230 + }, + { + "epoch": 0.47696157809509787, + "grad_norm": 46.39861976638429, + "learning_rate": 3.1310036150211155e-07, + "logits/chosen": -1.473344326019287, + "logits/rejected": -1.3880174160003662, + "logps/chosen": -435.3146057128906, + "logps/rejected": -552.3840942382812, + "loss": 0.4707, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.4952512979507446, + "rewards/margins": 1.3589332103729248, + "rewards/rejected": -2.85418438911438, + "step": 3240 + }, + { + "epoch": 0.4784336817311939, + "grad_norm": 73.99342350101426, + "learning_rate": 3.1185633228830076e-07, + "logits/chosen": -1.588384985923767, + "logits/rejected": -1.0761555433273315, + "logps/chosen": -467.46746826171875, + "logps/rejected": -601.6129150390625, + "loss": 0.419, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.6622146368026733, + "rewards/margins": 1.834486961364746, + "rewards/rejected": -3.496701717376709, + "step": 3250 + }, + { + "epoch": 0.47990578536728984, + "grad_norm": 98.74165950215125, + "learning_rate": 3.1061066936751483e-07, + "logits/chosen": -0.9826352000236511, + "logits/rejected": -0.9047662615776062, + "logps/chosen": -465.37451171875, + "logps/rejected": -630.9509887695312, + "loss": 0.4116, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -2.0668835639953613, + "rewards/margins": 1.3577176332473755, + "rewards/rejected": -3.4246013164520264, + "step": 3260 + }, + { + "epoch": 0.48137788900338585, + "grad_norm": 175.8876074522777, + "learning_rate": 3.0936340563934733e-07, + "logits/chosen": -0.8058466911315918, + "logits/rejected": -0.3506219983100891, + "logps/chosen": -459.06658935546875, + "logps/rejected": -625.363037109375, + "loss": 0.5226, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.0341808795928955, + "rewards/margins": 1.7463868856430054, + "rewards/rejected": -3.7805678844451904, + "step": 3270 + }, + { + "epoch": 0.4828499926394818, + "grad_norm": 84.58835458542602, + "learning_rate": 3.0811457404567153e-07, + "logits/chosen": -1.1082096099853516, + "logits/rejected": -0.5211424827575684, + "logps/chosen": -529.4493408203125, + "logps/rejected": -570.3908081054688, + "loss": 0.4447, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.1187548637390137, + "rewards/margins": 1.142906904220581, + "rewards/rejected": -3.261662244796753, + "step": 3280 + }, + { + "epoch": 0.4843220962755778, + "grad_norm": 226.10441491280082, + "learning_rate": 3.068642075697699e-07, + "logits/chosen": -0.8802739977836609, + "logits/rejected": -0.38997170329093933, + "logps/chosen": -498.98760986328125, + "logps/rejected": -615.8001098632812, + "loss": 0.6234, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.4553208351135254, + "rewards/margins": 1.3117055892944336, + "rewards/rejected": -3.767026424407959, + "step": 3290 + }, + { + "epoch": 0.4857941999116738, + "grad_norm": 229.47066023436236, + "learning_rate": 3.0561233923546336e-07, + "logits/chosen": -0.770553469657898, + "logits/rejected": -0.4320153594017029, + "logps/chosen": -443.873291015625, + "logps/rejected": -608.1763916015625, + "loss": 0.5024, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.0744788646698, + "rewards/margins": 1.6497430801391602, + "rewards/rejected": -3.724221706390381, + "step": 3300 + }, + { + "epoch": 0.4872663035477698, + "grad_norm": 74.78417504907533, + "learning_rate": 3.0435900210623867e-07, + "logits/chosen": -1.1979401111602783, + "logits/rejected": -0.43465495109558105, + "logps/chosen": -464.83758544921875, + "logps/rejected": -641.0127563476562, + "loss": 0.39, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.107029676437378, + "rewards/margins": 1.6612132787704468, + "rewards/rejected": -3.7682433128356934, + "step": 3310 + }, + { + "epoch": 0.48873840718386574, + "grad_norm": 105.78958686678781, + "learning_rate": 3.031042292843755e-07, + "logits/chosen": -0.8636314272880554, + "logits/rejected": -0.3642720878124237, + "logps/chosen": -512.8819580078125, + "logps/rejected": -628.2528076171875, + "loss": 0.5605, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.3306820392608643, + "rewards/margins": 1.2208974361419678, + "rewards/rejected": -3.551579713821411, + "step": 3320 + }, + { + "epoch": 0.4902105108199617, + "grad_norm": 76.48318967741196, + "learning_rate": 3.0184805391007205e-07, + "logits/chosen": -0.8498581051826477, + "logits/rejected": -0.871806263923645, + "logps/chosen": -517.5167846679688, + "logps/rejected": -663.9696044921875, + "loss": 0.4523, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.3173983097076416, + "rewards/margins": 1.2108620405197144, + "rewards/rejected": -3.5282604694366455, + "step": 3330 + }, + { + "epoch": 0.4916826144560577, + "grad_norm": 145.98501824524817, + "learning_rate": 3.0059050916056977e-07, + "logits/chosen": -1.2941370010375977, + "logits/rejected": -0.7344252467155457, + "logps/chosen": -490.34075927734375, + "logps/rejected": -504.689208984375, + "loss": 0.5227, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.319364547729492, + "rewards/margins": 0.9229898452758789, + "rewards/rejected": -3.24235463142395, + "step": 3340 + }, + { + "epoch": 0.49315471809215367, + "grad_norm": 150.1456276906277, + "learning_rate": 2.9933162824927713e-07, + "logits/chosen": -1.3360679149627686, + "logits/rejected": -1.0388100147247314, + "logps/chosen": -406.2869873046875, + "logps/rejected": -524.8819580078125, + "loss": 0.5839, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.806243658065796, + "rewards/margins": 1.2452871799468994, + "rewards/rejected": -3.0515308380126953, + "step": 3350 + }, + { + "epoch": 0.4946268217282497, + "grad_norm": 123.36328674749015, + "learning_rate": 2.9807144442489234e-07, + "logits/chosen": -1.5217986106872559, + "logits/rejected": -1.367989420890808, + "logps/chosen": -494.9296875, + "logps/rejected": -578.8140258789062, + "loss": 0.4493, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.9644739627838135, + "rewards/margins": 1.0848608016967773, + "rewards/rejected": -3.04933500289917, + "step": 3360 + }, + { + "epoch": 0.49609892536434563, + "grad_norm": 127.26890614977778, + "learning_rate": 2.9680999097052536e-07, + "logits/chosen": -1.045332908630371, + "logits/rejected": -0.6473127603530884, + "logps/chosen": -380.8509826660156, + "logps/rejected": -549.314208984375, + "loss": 0.4999, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.8362480401992798, + "rewards/margins": 1.5513255596160889, + "rewards/rejected": -3.387573719024658, + "step": 3370 + }, + { + "epoch": 0.49757102900044164, + "grad_norm": 121.64872188970894, + "learning_rate": 2.9554730120281847e-07, + "logits/chosen": -1.0802496671676636, + "logits/rejected": -0.7545103430747986, + "logps/chosen": -504.43951416015625, + "logps/rejected": -551.0089721679688, + "loss": 0.5083, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.4239296913146973, + "rewards/margins": 0.9063888788223267, + "rewards/rejected": -3.3303189277648926, + "step": 3380 + }, + { + "epoch": 0.4990431326365376, + "grad_norm": 217.77926948158648, + "learning_rate": 2.942834084710668e-07, + "logits/chosen": -1.2604620456695557, + "logits/rejected": -0.8426252603530884, + "logps/chosen": -576.6885986328125, + "logps/rejected": -585.8273315429688, + "loss": 0.5694, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.9954752922058105, + "rewards/margins": 0.6722756624221802, + "rewards/rejected": -3.6677513122558594, + "step": 3390 + }, + { + "epoch": 0.5005152362726336, + "grad_norm": 59.55497001661116, + "learning_rate": 2.9301834615633766e-07, + "logits/chosen": -1.2834960222244263, + "logits/rejected": -1.2142704725265503, + "logps/chosen": -373.6388244628906, + "logps/rejected": -521.4171752929688, + "loss": 0.5225, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.6534874439239502, + "rewards/margins": 1.4567590951919556, + "rewards/rejected": -3.110246419906616, + "step": 3400 + }, + { + "epoch": 0.5019873399087296, + "grad_norm": 71.00093048463245, + "learning_rate": 2.917521476705879e-07, + "logits/chosen": -1.7101757526397705, + "logits/rejected": -1.3449281454086304, + "logps/chosen": -515.8695068359375, + "logps/rejected": -611.7098388671875, + "loss": 0.4086, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0750255584716797, + "rewards/margins": 1.4023538827896118, + "rewards/rejected": -3.477379322052002, + "step": 3410 + }, + { + "epoch": 0.5034594435448255, + "grad_norm": 49.5839683579514, + "learning_rate": 2.904848464557827e-07, + "logits/chosen": -1.4406013488769531, + "logits/rejected": -1.1085827350616455, + "logps/chosen": -524.0177001953125, + "logps/rejected": -585.9599609375, + "loss": 0.5667, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.40733003616333, + "rewards/margins": 0.967034637928009, + "rewards/rejected": -3.374364137649536, + "step": 3420 + }, + { + "epoch": 0.5049315471809216, + "grad_norm": 85.61397724979567, + "learning_rate": 2.892164759830114e-07, + "logits/chosen": -1.5079066753387451, + "logits/rejected": -1.257784128189087, + "logps/chosen": -453.931640625, + "logps/rejected": -542.47119140625, + "loss": 0.5128, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.9137006998062134, + "rewards/margins": 0.9580184817314148, + "rewards/rejected": -2.8717193603515625, + "step": 3430 + }, + { + "epoch": 0.5064036508170175, + "grad_norm": 82.0959950524895, + "learning_rate": 2.8794706975160425e-07, + "logits/chosen": -1.355088233947754, + "logits/rejected": -0.8838594555854797, + "logps/chosen": -458.64288330078125, + "logps/rejected": -550.1956176757812, + "loss": 0.5778, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.168499231338501, + "rewards/margins": 0.8988539576530457, + "rewards/rejected": -3.0673534870147705, + "step": 3440 + }, + { + "epoch": 0.5078757544531135, + "grad_norm": 253.74382162669954, + "learning_rate": 2.866766612882468e-07, + "logits/chosen": -1.590222716331482, + "logits/rejected": -1.3019611835479736, + "logps/chosen": -463.160400390625, + "logps/rejected": -584.1785888671875, + "loss": 0.4692, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.9804470539093018, + "rewards/margins": 1.0360428094863892, + "rewards/rejected": -3.0164899826049805, + "step": 3450 + }, + { + "epoch": 0.5093478580892095, + "grad_norm": 145.59951370429994, + "learning_rate": 2.8540528414609514e-07, + "logits/chosen": -1.4311822652816772, + "logits/rejected": -1.291873574256897, + "logps/chosen": -428.0205993652344, + "logps/rejected": -620.2302856445312, + "loss": 0.4905, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.0075182914733887, + "rewards/margins": 1.4721126556396484, + "rewards/rejected": -3.479630708694458, + "step": 3460 + }, + { + "epoch": 0.5108199617253054, + "grad_norm": 210.65952881229376, + "learning_rate": 2.8413297190388947e-07, + "logits/chosen": -1.275827407836914, + "logits/rejected": -1.1379631757736206, + "logps/chosen": -485.94110107421875, + "logps/rejected": -591.2982177734375, + "loss": 0.422, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3439741134643555, + "rewards/margins": 1.3426644802093506, + "rewards/rejected": -3.686638593673706, + "step": 3470 + }, + { + "epoch": 0.5122920653614015, + "grad_norm": 154.5414180365665, + "learning_rate": 2.8285975816506687e-07, + "logits/chosen": -1.6475321054458618, + "logits/rejected": -1.55992591381073, + "logps/chosen": -572.9066772460938, + "logps/rejected": -577.265380859375, + "loss": 0.4724, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.9798471927642822, + "rewards/margins": 0.8711148500442505, + "rewards/rejected": -2.8509621620178223, + "step": 3480 + }, + { + "epoch": 0.5137641689974974, + "grad_norm": 201.29864770526098, + "learning_rate": 2.815856765568746e-07, + "logits/chosen": -1.5275428295135498, + "logits/rejected": -1.2967138290405273, + "logps/chosen": -494.72705078125, + "logps/rejected": -501.51422119140625, + "loss": 0.4639, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.013699769973755, + "rewards/margins": 0.7832995653152466, + "rewards/rejected": -2.796999216079712, + "step": 3490 + }, + { + "epoch": 0.5152362726335934, + "grad_norm": 129.85296618665402, + "learning_rate": 2.803107607294811e-07, + "logits/chosen": -1.5083149671554565, + "logits/rejected": -1.1571937799453735, + "logps/chosen": -491.9873046875, + "logps/rejected": -605.0579833984375, + "loss": 0.6084, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.3382468223571777, + "rewards/margins": 1.2085020542144775, + "rewards/rejected": -3.546748638153076, + "step": 3500 + }, + { + "epoch": 0.5167083762696894, + "grad_norm": 70.19907136055035, + "learning_rate": 2.7903504435508787e-07, + "logits/chosen": -1.5205142498016357, + "logits/rejected": -1.459729552268982, + "logps/chosen": -538.9444580078125, + "logps/rejected": -628.7930908203125, + "loss": 0.5761, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.288799285888672, + "rewards/margins": 1.0626940727233887, + "rewards/rejected": -3.3514931201934814, + "step": 3510 + }, + { + "epoch": 0.5181804799057854, + "grad_norm": 68.81567047419881, + "learning_rate": 2.777585611270398e-07, + "logits/chosen": -1.4742847681045532, + "logits/rejected": -1.2906100749969482, + "logps/chosen": -481.75030517578125, + "logps/rejected": -547.8358764648438, + "loss": 0.513, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.153231143951416, + "rewards/margins": 1.1176798343658447, + "rewards/rejected": -3.2709107398986816, + "step": 3520 + }, + { + "epoch": 0.5196525835418814, + "grad_norm": 90.0257973668148, + "learning_rate": 2.7648134475893544e-07, + "logits/chosen": -1.5419515371322632, + "logits/rejected": -1.3836100101470947, + "logps/chosen": -430.83465576171875, + "logps/rejected": -483.0149841308594, + "loss": 0.5873, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.88266921043396, + "rewards/margins": 0.6606709957122803, + "rewards/rejected": -2.5433402061462402, + "step": 3530 + }, + { + "epoch": 0.5211246871779773, + "grad_norm": 81.63921611121803, + "learning_rate": 2.7520342898373657e-07, + "logits/chosen": -1.2974321842193604, + "logits/rejected": -1.248305082321167, + "logps/chosen": -407.1553649902344, + "logps/rejected": -456.27496337890625, + "loss": 0.6178, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -1.9946372509002686, + "rewards/margins": 0.4798363149166107, + "rewards/rejected": -2.474473714828491, + "step": 3540 + }, + { + "epoch": 0.5225967908140733, + "grad_norm": 137.5158281849911, + "learning_rate": 2.73924847552877e-07, + "logits/chosen": -1.6245288848876953, + "logits/rejected": -1.3985730409622192, + "logps/chosen": -377.77703857421875, + "logps/rejected": -440.16143798828125, + "loss": 0.4942, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.1989734172821045, + "rewards/margins": 1.011608600616455, + "rewards/rejected": -2.2105817794799805, + "step": 3550 + }, + { + "epoch": 0.5240688944501692, + "grad_norm": 39.0545127974598, + "learning_rate": 2.726456342353718e-07, + "logits/chosen": -1.861161231994629, + "logits/rejected": -1.7452576160430908, + "logps/chosen": -453.5718688964844, + "logps/rejected": -524.6138305664062, + "loss": 0.4472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4303337335586548, + "rewards/margins": 0.8840058445930481, + "rewards/rejected": -2.3143393993377686, + "step": 3560 + }, + { + "epoch": 0.5255409980862653, + "grad_norm": 76.62538965875889, + "learning_rate": 2.7136582281692443e-07, + "logits/chosen": -1.4425104856491089, + "logits/rejected": -1.296722412109375, + "logps/chosen": -419.91998291015625, + "logps/rejected": -487.6356506347656, + "loss": 0.4761, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.4998046159744263, + "rewards/margins": 1.3667972087860107, + "rewards/rejected": -2.8666019439697266, + "step": 3570 + }, + { + "epoch": 0.5270131017223613, + "grad_norm": 69.43079769817764, + "learning_rate": 2.700854470990357e-07, + "logits/chosen": -1.267761468887329, + "logits/rejected": -1.1068871021270752, + "logps/chosen": -458.8053283691406, + "logps/rejected": -559.14892578125, + "loss": 0.5634, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.9432264566421509, + "rewards/margins": 1.1178710460662842, + "rewards/rejected": -3.0610976219177246, + "step": 3580 + }, + { + "epoch": 0.5284852053584572, + "grad_norm": 78.38906813712495, + "learning_rate": 2.6880454089810954e-07, + "logits/chosen": -1.6390260457992554, + "logits/rejected": -1.1603784561157227, + "logps/chosen": -463.13739013671875, + "logps/rejected": -532.7391357421875, + "loss": 0.5553, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6987426280975342, + "rewards/margins": 0.8926237225532532, + "rewards/rejected": -2.5913662910461426, + "step": 3590 + }, + { + "epoch": 0.5299573089945532, + "grad_norm": 81.10946669612669, + "learning_rate": 2.6752313804456124e-07, + "logits/chosen": -1.5291790962219238, + "logits/rejected": -1.1126550436019897, + "logps/chosen": -368.992919921875, + "logps/rejected": -401.21661376953125, + "loss": 0.5227, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.483386754989624, + "rewards/margins": 0.8745701909065247, + "rewards/rejected": -2.357957124710083, + "step": 3600 + }, + { + "epoch": 0.5314294126306492, + "grad_norm": 56.88285650557294, + "learning_rate": 2.66241272381923e-07, + "logits/chosen": -1.7307682037353516, + "logits/rejected": -1.5255147218704224, + "logps/chosen": -440.72613525390625, + "logps/rejected": -431.8831481933594, + "loss": 0.498, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": -1.9333336353302002, + "rewards/margins": 0.45823031663894653, + "rewards/rejected": -2.391564130783081, + "step": 3610 + }, + { + "epoch": 0.5329015162667452, + "grad_norm": 56.40287434849917, + "learning_rate": 2.649589777659506e-07, + "logits/chosen": -1.3099838495254517, + "logits/rejected": -1.3023030757904053, + "logps/chosen": -367.8974304199219, + "logps/rejected": -578.2512817382812, + "loss": 0.5057, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.64794921875, + "rewards/margins": 1.2504719495773315, + "rewards/rejected": -2.898421287536621, + "step": 3620 + }, + { + "epoch": 0.5343736199028412, + "grad_norm": 119.65891081239167, + "learning_rate": 2.6367628806372893e-07, + "logits/chosen": -1.6773631572723389, + "logits/rejected": -1.149111270904541, + "logps/chosen": -538.9449462890625, + "logps/rejected": -559.1810913085938, + "loss": 0.5035, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.901005744934082, + "rewards/margins": 1.1809548139572144, + "rewards/rejected": -3.081960678100586, + "step": 3630 + }, + { + "epoch": 0.5358457235389371, + "grad_norm": 95.84017516557691, + "learning_rate": 2.623932371527776e-07, + "logits/chosen": -1.6771188974380493, + "logits/rejected": -1.0108649730682373, + "logps/chosen": -476.6202697753906, + "logps/rejected": -512.6324462890625, + "loss": 0.6603, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.636016845703125, + "rewards/margins": 1.2511885166168213, + "rewards/rejected": -2.8872056007385254, + "step": 3640 + }, + { + "epoch": 0.5373178271750331, + "grad_norm": 111.01952880779704, + "learning_rate": 2.611098589201563e-07, + "logits/chosen": -1.1505934000015259, + "logits/rejected": -0.9485780596733093, + "logps/chosen": -430.86834716796875, + "logps/rejected": -550.8515625, + "loss": 0.4283, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.161510705947876, + "rewards/margins": 1.2933380603790283, + "rewards/rejected": -3.454848527908325, + "step": 3650 + }, + { + "epoch": 0.5387899308111291, + "grad_norm": 92.29434350934869, + "learning_rate": 2.5982618726156965e-07, + "logits/chosen": -1.3886348009109497, + "logits/rejected": -0.9874445796012878, + "logps/chosen": -394.1221618652344, + "logps/rejected": -506.32232666015625, + "loss": 0.4472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8757234811782837, + "rewards/margins": 1.1317042112350464, + "rewards/rejected": -3.00742769241333, + "step": 3660 + }, + { + "epoch": 0.5402620344472251, + "grad_norm": 53.97893446781452, + "learning_rate": 2.5854225608047217e-07, + "logits/chosen": -1.1530479192733765, + "logits/rejected": -1.0273164510726929, + "logps/chosen": -473.6204528808594, + "logps/rejected": -578.148193359375, + "loss": 0.452, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.1500978469848633, + "rewards/margins": 1.1406571865081787, + "rewards/rejected": -3.2907555103302, + "step": 3670 + }, + { + "epoch": 0.541734138083321, + "grad_norm": 120.84042969388693, + "learning_rate": 2.572580992871725e-07, + "logits/chosen": -1.6167587041854858, + "logits/rejected": -1.355512261390686, + "logps/chosen": -581.9547729492188, + "logps/rejected": -619.6044311523438, + "loss": 0.4742, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1702680587768555, + "rewards/margins": 0.9465597867965698, + "rewards/rejected": -3.1168277263641357, + "step": 3680 + }, + { + "epoch": 0.543206241719417, + "grad_norm": 181.60185616995793, + "learning_rate": 2.5597375079793826e-07, + "logits/chosen": -1.279551386833191, + "logits/rejected": -0.9392638206481934, + "logps/chosen": -427.4566345214844, + "logps/rejected": -514.9344482421875, + "loss": 0.5333, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9153506755828857, + "rewards/margins": 0.7623427510261536, + "rewards/rejected": -2.6776933670043945, + "step": 3690 + }, + { + "epoch": 0.5446783453555131, + "grad_norm": 153.6606530436007, + "learning_rate": 2.5468924453409977e-07, + "logits/chosen": -1.1252529621124268, + "logits/rejected": -0.8129235506057739, + "logps/chosen": -485.22930908203125, + "logps/rejected": -564.3529663085938, + "loss": 0.6252, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.4217991828918457, + "rewards/margins": 0.7627391219139099, + "rewards/rejected": -3.1845381259918213, + "step": 3700 + }, + { + "epoch": 0.546150448991609, + "grad_norm": 178.63083169442316, + "learning_rate": 2.534046144211544e-07, + "logits/chosen": -1.3765456676483154, + "logits/rejected": -1.306849718093872, + "logps/chosen": -547.9827880859375, + "logps/rejected": -626.2952880859375, + "loss": 0.5789, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.6243114471435547, + "rewards/margins": 1.0017272233963013, + "rewards/rejected": -3.6260387897491455, + "step": 3710 + }, + { + "epoch": 0.547622552627705, + "grad_norm": 75.45013979772031, + "learning_rate": 2.521198943878708e-07, + "logits/chosen": -1.379810094833374, + "logits/rejected": -1.7108083963394165, + "logps/chosen": -469.18499755859375, + "logps/rejected": -589.1279907226562, + "loss": 0.5737, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.1698946952819824, + "rewards/margins": 0.5262952446937561, + "rewards/rejected": -2.6961898803710938, + "step": 3720 + }, + { + "epoch": 0.5490946562638009, + "grad_norm": 74.44509269827175, + "learning_rate": 2.5083511836539213e-07, + "logits/chosen": -1.649370551109314, + "logits/rejected": -1.5051219463348389, + "logps/chosen": -479.7125549316406, + "logps/rejected": -517.9224853515625, + "loss": 0.442, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -2.0585999488830566, + "rewards/margins": 0.6317602396011353, + "rewards/rejected": -2.6903603076934814, + "step": 3730 + }, + { + "epoch": 0.550566759899897, + "grad_norm": 136.43389324678625, + "learning_rate": 2.495503202863407e-07, + "logits/chosen": -1.4302846193313599, + "logits/rejected": -1.2281324863433838, + "logps/chosen": -445.82135009765625, + "logps/rejected": -501.482666015625, + "loss": 0.5566, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.1197800636291504, + "rewards/margins": 0.9216873049736023, + "rewards/rejected": -3.0414676666259766, + "step": 3740 + }, + { + "epoch": 0.552038863535993, + "grad_norm": 62.630186363293845, + "learning_rate": 2.4826553408392104e-07, + "logits/chosen": -1.5661261081695557, + "logits/rejected": -1.1826081275939941, + "logps/chosen": -407.4442138671875, + "logps/rejected": -538.7042236328125, + "loss": 0.4662, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.6647484302520752, + "rewards/margins": 1.5092394351959229, + "rewards/rejected": -3.173987627029419, + "step": 3750 + }, + { + "epoch": 0.5535109671720889, + "grad_norm": 86.02800482082752, + "learning_rate": 2.469807936910242e-07, + "logits/chosen": -1.6797685623168945, + "logits/rejected": -1.370241403579712, + "logps/chosen": -465.075439453125, + "logps/rejected": -546.945556640625, + "loss": 0.4697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.67606520652771, + "rewards/margins": 1.0977511405944824, + "rewards/rejected": -2.7738165855407715, + "step": 3760 + }, + { + "epoch": 0.5549830708081849, + "grad_norm": 54.59576761212067, + "learning_rate": 2.456961330393313e-07, + "logits/chosen": -1.639600396156311, + "logits/rejected": -1.202811598777771, + "logps/chosen": -469.275390625, + "logps/rejected": -479.2810974121094, + "loss": 0.527, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7847795486450195, + "rewards/margins": 0.8321501016616821, + "rewards/rejected": -2.616929292678833, + "step": 3770 + }, + { + "epoch": 0.5564551744442808, + "grad_norm": 73.3938590081862, + "learning_rate": 2.444115860584174e-07, + "logits/chosen": -1.453405499458313, + "logits/rejected": -1.0574461221694946, + "logps/chosen": -485.96990966796875, + "logps/rejected": -598.1585693359375, + "loss": 0.4781, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.9251506328582764, + "rewards/margins": 1.3404624462127686, + "rewards/rejected": -3.265613079071045, + "step": 3780 + }, + { + "epoch": 0.5579272780803769, + "grad_norm": 72.71998088547606, + "learning_rate": 2.4312718667485523e-07, + "logits/chosen": -1.5285999774932861, + "logits/rejected": -1.2470290660858154, + "logps/chosen": -406.674560546875, + "logps/rejected": -481.251953125, + "loss": 0.4074, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.5569398403167725, + "rewards/margins": 1.0535308122634888, + "rewards/rejected": -2.6104705333709717, + "step": 3790 + }, + { + "epoch": 0.5593993817164729, + "grad_norm": 134.5178522146344, + "learning_rate": 2.418429688113194e-07, + "logits/chosen": -1.3199043273925781, + "logits/rejected": -0.9480411410331726, + "logps/chosen": -598.58984375, + "logps/rejected": -566.9969482421875, + "loss": 0.4879, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.618852376937866, + "rewards/margins": 0.8649944067001343, + "rewards/rejected": -3.4838461875915527, + "step": 3800 + }, + { + "epoch": 0.5608714853525688, + "grad_norm": 73.71307866502673, + "learning_rate": 2.405589663856904e-07, + "logits/chosen": -0.9569090008735657, + "logits/rejected": -0.6438056826591492, + "logps/chosen": -474.856201171875, + "logps/rejected": -612.9947509765625, + "loss": 0.4447, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.1610846519470215, + "rewards/margins": 1.5707834959030151, + "rewards/rejected": -3.731868267059326, + "step": 3810 + }, + { + "epoch": 0.5623435889886648, + "grad_norm": 41.11949902083471, + "learning_rate": 2.3927521331015865e-07, + "logits/chosen": -1.1380226612091064, + "logits/rejected": -0.7719400525093079, + "logps/chosen": -381.8536682128906, + "logps/rejected": -537.5950927734375, + "loss": 0.5149, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.7397053241729736, + "rewards/margins": 1.511662244796753, + "rewards/rejected": -3.2513675689697266, + "step": 3820 + }, + { + "epoch": 0.5638156926247608, + "grad_norm": 96.74408396601311, + "learning_rate": 2.379917434903289e-07, + "logits/chosen": -1.1581860780715942, + "logits/rejected": -1.2510985136032104, + "logps/chosen": -532.9301147460938, + "logps/rejected": -654.2242431640625, + "loss": 0.4521, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.6351118087768555, + "rewards/margins": 1.122220754623413, + "rewards/rejected": -3.7573325634002686, + "step": 3830 + }, + { + "epoch": 0.5652877962608568, + "grad_norm": 97.4843952044398, + "learning_rate": 2.3670859082432458e-07, + "logits/chosen": -1.5482327938079834, + "logits/rejected": -1.4216538667678833, + "logps/chosen": -529.13720703125, + "logps/rejected": -642.1002807617188, + "loss": 0.4968, + "rewards/accuracies": 0.9000000953674316, + "rewards/chosen": -2.1928248405456543, + "rewards/margins": 1.7888505458831787, + "rewards/rejected": -3.981675386428833, + "step": 3840 + }, + { + "epoch": 0.5667598998969527, + "grad_norm": 73.47633727104785, + "learning_rate": 2.35425789201893e-07, + "logits/chosen": -1.0061591863632202, + "logits/rejected": -0.7440835237503052, + "logps/chosen": -502.69512939453125, + "logps/rejected": -640.5232543945312, + "loss": 0.4073, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2626755237579346, + "rewards/margins": 1.4708521366119385, + "rewards/rejected": -3.733527660369873, + "step": 3850 + }, + { + "epoch": 0.5682320035330487, + "grad_norm": 64.0358949154304, + "learning_rate": 2.3414337250350982e-07, + "logits/chosen": -1.1740540266036987, + "logits/rejected": -1.1250637769699097, + "logps/chosen": -483.04461669921875, + "logps/rejected": -605.46728515625, + "loss": 0.5574, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.5032196044921875, + "rewards/margins": 1.3022310733795166, + "rewards/rejected": -3.805450916290283, + "step": 3860 + }, + { + "epoch": 0.5697041071691447, + "grad_norm": 152.85416869595431, + "learning_rate": 2.3286137459948428e-07, + "logits/chosen": -1.3287904262542725, + "logits/rejected": -1.1025748252868652, + "logps/chosen": -359.2784118652344, + "logps/rejected": -531.5712890625, + "loss": 0.5738, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6601765155792236, + "rewards/margins": 1.5283925533294678, + "rewards/rejected": -3.1885690689086914, + "step": 3870 + }, + { + "epoch": 0.5711762108052407, + "grad_norm": 73.18056431773438, + "learning_rate": 2.3157982934906463e-07, + "logits/chosen": -1.6040395498275757, + "logits/rejected": -1.5567848682403564, + "logps/chosen": -456.64154052734375, + "logps/rejected": -551.6588745117188, + "loss": 0.4884, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.4352662563323975, + "rewards/margins": 1.1799513101577759, + "rewards/rejected": -2.615217685699463, + "step": 3880 + }, + { + "epoch": 0.5726483144413367, + "grad_norm": 65.32805588008291, + "learning_rate": 2.3029877059954414e-07, + "logits/chosen": -1.7298914194107056, + "logits/rejected": -1.5228397846221924, + "logps/chosen": -378.32366943359375, + "logps/rejected": -520.7960205078125, + "loss": 0.4564, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2672197818756104, + "rewards/margins": 1.5151159763336182, + "rewards/rejected": -2.7823357582092285, + "step": 3890 + }, + { + "epoch": 0.5741204180774326, + "grad_norm": 118.18428418990575, + "learning_rate": 2.2901823218536693e-07, + "logits/chosen": -1.4429901838302612, + "logits/rejected": -1.1411277055740356, + "logps/chosen": -416.7167053222656, + "logps/rejected": -420.8246154785156, + "loss": 0.4889, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -1.7115978002548218, + "rewards/margins": 0.5646583437919617, + "rewards/rejected": -2.2762560844421387, + "step": 3900 + }, + { + "epoch": 0.5755925217135286, + "grad_norm": 71.59673653818123, + "learning_rate": 2.2773824792723428e-07, + "logits/chosen": -1.0435127019882202, + "logits/rejected": -0.9207866787910461, + "logps/chosen": -394.86614990234375, + "logps/rejected": -560.9642333984375, + "loss": 0.3894, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -1.6596311330795288, + "rewards/margins": 1.6902406215667725, + "rewards/rejected": -3.349871873855591, + "step": 3910 + }, + { + "epoch": 0.5770646253496247, + "grad_norm": 112.69302397287926, + "learning_rate": 2.2645885163121156e-07, + "logits/chosen": -1.3206192255020142, + "logits/rejected": -1.1502015590667725, + "logps/chosen": -469.08734130859375, + "logps/rejected": -630.0872192382812, + "loss": 0.4004, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.9029033184051514, + "rewards/margins": 1.7898085117340088, + "rewards/rejected": -3.6927120685577393, + "step": 3920 + }, + { + "epoch": 0.5785367289857206, + "grad_norm": 110.66446249593685, + "learning_rate": 2.2518007708783512e-07, + "logits/chosen": -1.1579114198684692, + "logits/rejected": -0.9727389216423035, + "logps/chosen": -485.3668518066406, + "logps/rejected": -710.7271118164062, + "loss": 0.5876, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.2650344371795654, + "rewards/margins": 1.7066015005111694, + "rewards/rejected": -3.9716358184814453, + "step": 3930 + }, + { + "epoch": 0.5800088326218166, + "grad_norm": 161.55313652260037, + "learning_rate": 2.2390195807122027e-07, + "logits/chosen": -1.1768076419830322, + "logits/rejected": -1.2023046016693115, + "logps/chosen": -406.9942932128906, + "logps/rejected": -469.81280517578125, + "loss": 0.5523, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.4834096431732178, + "rewards/margins": 0.9628269076347351, + "rewards/rejected": -2.4462366104125977, + "step": 3940 + }, + { + "epoch": 0.5814809362579125, + "grad_norm": 94.94998619858231, + "learning_rate": 2.2262452833816886e-07, + "logits/chosen": -1.3384846448898315, + "logits/rejected": -0.9222444295883179, + "logps/chosen": -430.72412109375, + "logps/rejected": -487.2897033691406, + "loss": 0.5538, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.771453857421875, + "rewards/margins": 1.0263020992279053, + "rewards/rejected": -2.7977561950683594, + "step": 3950 + }, + { + "epoch": 0.5829530398940085, + "grad_norm": 68.04198161886123, + "learning_rate": 2.2134782162727778e-07, + "logits/chosen": -1.4039433002471924, + "logits/rejected": -1.1637251377105713, + "logps/chosen": -399.25299072265625, + "logps/rejected": -505.22454833984375, + "loss": 0.4924, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5472524166107178, + "rewards/margins": 1.3030979633331299, + "rewards/rejected": -2.8503503799438477, + "step": 3960 + }, + { + "epoch": 0.5844251435301046, + "grad_norm": 66.47851168626673, + "learning_rate": 2.2007187165804822e-07, + "logits/chosen": -1.3414843082427979, + "logits/rejected": -1.1878784894943237, + "logps/chosen": -462.64471435546875, + "logps/rejected": -652.8829345703125, + "loss": 0.5388, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.770440697669983, + "rewards/margins": 1.5921293497085571, + "rewards/rejected": -3.362570285797119, + "step": 3970 + }, + { + "epoch": 0.5858972471662005, + "grad_norm": 117.28853818914095, + "learning_rate": 2.1879671212999437e-07, + "logits/chosen": -1.3968640565872192, + "logits/rejected": -1.2348015308380127, + "logps/chosen": -422.02142333984375, + "logps/rejected": -453.6162109375, + "loss": 0.4861, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.5324064493179321, + "rewards/margins": 0.8223631978034973, + "rewards/rejected": -2.354769468307495, + "step": 3980 + }, + { + "epoch": 0.5873693508022965, + "grad_norm": 74.37807716917598, + "learning_rate": 2.1752237672175433e-07, + "logits/chosen": -1.6141881942749023, + "logits/rejected": -1.5041468143463135, + "logps/chosen": -433.03814697265625, + "logps/rejected": -501.9605407714844, + "loss": 0.4872, + "rewards/accuracies": 0.6333333849906921, + "rewards/chosen": -1.6484006643295288, + "rewards/margins": 0.7100468873977661, + "rewards/rejected": -2.358447551727295, + "step": 3990 + }, + { + "epoch": 0.5888414544383924, + "grad_norm": 109.70695382656255, + "learning_rate": 2.162488990901998e-07, + "logits/chosen": -1.3594119548797607, + "logits/rejected": -1.4310438632965088, + "logps/chosen": -469.493896484375, + "logps/rejected": -575.752197265625, + "loss": 0.4965, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.108708620071411, + "rewards/margins": 1.2015089988708496, + "rewards/rejected": -3.3102176189422607, + "step": 4000 + }, + { + "epoch": 0.5903135580744885, + "grad_norm": 104.3600016671151, + "learning_rate": 2.1497631286954764e-07, + "logits/chosen": -1.666786551475525, + "logits/rejected": -1.3860242366790771, + "logps/chosen": -494.908935546875, + "logps/rejected": -528.1370849609375, + "loss": 0.47, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.6129837036132812, + "rewards/margins": 1.143545389175415, + "rewards/rejected": -2.7565293312072754, + "step": 4010 + }, + { + "epoch": 0.5917856617105844, + "grad_norm": 71.6693667345026, + "learning_rate": 2.1370465167047118e-07, + "logits/chosen": -1.484675645828247, + "logits/rejected": -1.2214198112487793, + "logps/chosen": -425.3543395996094, + "logps/rejected": -488.04974365234375, + "loss": 0.4548, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.856046438217163, + "rewards/margins": 1.1506905555725098, + "rewards/rejected": -3.006736993789673, + "step": 4020 + }, + { + "epoch": 0.5932577653466804, + "grad_norm": 146.84292284931118, + "learning_rate": 2.124339490792128e-07, + "logits/chosen": -1.3530104160308838, + "logits/rejected": -1.2188152074813843, + "logps/chosen": -473.00537109375, + "logps/rejected": -619.8487548828125, + "loss": 0.4377, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.0449280738830566, + "rewards/margins": 1.3577020168304443, + "rewards/rejected": -3.402630567550659, + "step": 4030 + }, + { + "epoch": 0.5947298689827764, + "grad_norm": 122.25387107848064, + "learning_rate": 2.1116423865669703e-07, + "logits/chosen": -1.5488669872283936, + "logits/rejected": -0.9515336751937866, + "logps/chosen": -461.306884765625, + "logps/rejected": -568.2154541015625, + "loss": 0.4654, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.9645694494247437, + "rewards/margins": 1.2693474292755127, + "rewards/rejected": -3.233916759490967, + "step": 4040 + }, + { + "epoch": 0.5962019726188724, + "grad_norm": 235.13506422428213, + "learning_rate": 2.0989555393764354e-07, + "logits/chosen": -1.2473701238632202, + "logits/rejected": -1.2329696416854858, + "logps/chosen": -602.1939697265625, + "logps/rejected": -580.3751220703125, + "loss": 0.6393, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": -2.310791015625, + "rewards/margins": 0.33805155754089355, + "rewards/rejected": -2.6488428115844727, + "step": 4050 + }, + { + "epoch": 0.5976740762549684, + "grad_norm": 563.1435619922773, + "learning_rate": 2.0862792842968214e-07, + "logits/chosen": -1.3019721508026123, + "logits/rejected": -0.7194503545761108, + "logps/chosen": -480.62744140625, + "logps/rejected": -496.35162353515625, + "loss": 0.4969, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.706891655921936, + "rewards/margins": 1.077613353729248, + "rewards/rejected": -2.7845048904418945, + "step": 4060 + }, + { + "epoch": 0.5991461798910643, + "grad_norm": 58.0731471020333, + "learning_rate": 2.0736139561246713e-07, + "logits/chosen": -1.298715591430664, + "logits/rejected": -0.9374839067459106, + "logps/chosen": -446.7394104003906, + "logps/rejected": -576.3798828125, + "loss": 0.4956, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.9432166814804077, + "rewards/margins": 1.155698537826538, + "rewards/rejected": -3.0989153385162354, + "step": 4070 + }, + { + "epoch": 0.6006182835271603, + "grad_norm": 160.0838271344589, + "learning_rate": 2.060959889367938e-07, + "logits/chosen": -1.3573552370071411, + "logits/rejected": -1.2131727933883667, + "logps/chosen": -444.8660583496094, + "logps/rejected": -627.1134033203125, + "loss": 0.5652, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.6022586822509766, + "rewards/margins": 1.9062550067901611, + "rewards/rejected": -3.508513927459717, + "step": 4080 + }, + { + "epoch": 0.6020903871632562, + "grad_norm": 56.19264279279395, + "learning_rate": 2.0483174182371435e-07, + "logits/chosen": -1.2950035333633423, + "logits/rejected": -1.2583367824554443, + "logps/chosen": -447.3335876464844, + "logps/rejected": -489.1827697753906, + "loss": 0.5014, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.8214184045791626, + "rewards/margins": 0.38246357440948486, + "rewards/rejected": -2.2038822174072266, + "step": 4090 + }, + { + "epoch": 0.6035624907993523, + "grad_norm": 87.53544565757808, + "learning_rate": 2.0356868766365536e-07, + "logits/chosen": -1.4937045574188232, + "logits/rejected": -1.3296773433685303, + "logps/chosen": -480.1390686035156, + "logps/rejected": -570.79736328125, + "loss": 0.5801, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -1.896793007850647, + "rewards/margins": 0.5548346638679504, + "rewards/rejected": -2.451627492904663, + "step": 4100 + }, + { + "epoch": 0.6050345944354483, + "grad_norm": 44.10408418958209, + "learning_rate": 2.023068598155363e-07, + "logits/chosen": -1.6244083642959595, + "logits/rejected": -1.396142601966858, + "logps/chosen": -494.4314880371094, + "logps/rejected": -482.0111389160156, + "loss": 0.5163, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.9353554248809814, + "rewards/margins": 0.5971390008926392, + "rewards/rejected": -2.532494306564331, + "step": 4110 + }, + { + "epoch": 0.6065066980715442, + "grad_norm": 163.74425668870725, + "learning_rate": 2.010462916058875e-07, + "logits/chosen": -1.4992249011993408, + "logits/rejected": -1.3636547327041626, + "logps/chosen": -497.752685546875, + "logps/rejected": -636.1647338867188, + "loss": 0.4682, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7190614938735962, + "rewards/margins": 1.5106425285339355, + "rewards/rejected": -3.229703903198242, + "step": 4120 + }, + { + "epoch": 0.6079788017076402, + "grad_norm": 86.19422535899103, + "learning_rate": 1.9978701632797118e-07, + "logits/chosen": -1.3390390872955322, + "logits/rejected": -1.3868645429611206, + "logps/chosen": -399.7951354980469, + "logps/rejected": -478.0362243652344, + "loss": 0.5323, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.708728551864624, + "rewards/margins": 0.6808555126190186, + "rewards/rejected": -2.3895840644836426, + "step": 4130 + }, + { + "epoch": 0.6094509053437362, + "grad_norm": 45.662869145997696, + "learning_rate": 1.9852906724090127e-07, + "logits/chosen": -1.4514620304107666, + "logits/rejected": -1.27532958984375, + "logps/chosen": -360.64459228515625, + "logps/rejected": -502.5489196777344, + "loss": 0.4232, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.448917269706726, + "rewards/margins": 1.2002012729644775, + "rewards/rejected": -2.649118423461914, + "step": 4140 + }, + { + "epoch": 0.6109230089798322, + "grad_norm": 72.18189801407802, + "learning_rate": 1.9727247756876534e-07, + "logits/chosen": -1.2466822862625122, + "logits/rejected": -1.274192452430725, + "logps/chosen": -389.36590576171875, + "logps/rejected": -438.1494140625, + "loss": 0.5727, + "rewards/accuracies": 0.5333333611488342, + "rewards/chosen": -1.5773522853851318, + "rewards/margins": 0.251679390668869, + "rewards/rejected": -1.8290317058563232, + "step": 4150 + }, + { + "epoch": 0.6123951126159282, + "grad_norm": 169.00599370987914, + "learning_rate": 1.9601728049974683e-07, + "logits/chosen": -0.8179019093513489, + "logits/rejected": -0.8521889448165894, + "logps/chosen": -385.68951416015625, + "logps/rejected": -588.3092041015625, + "loss": 0.5126, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.6947778463363647, + "rewards/margins": 1.571314811706543, + "rewards/rejected": -3.2660927772521973, + "step": 4160 + }, + { + "epoch": 0.6138672162520241, + "grad_norm": 136.92680207190415, + "learning_rate": 1.94763509185249e-07, + "logits/chosen": -1.1400117874145508, + "logits/rejected": -0.8559826016426086, + "logps/chosen": -437.7047424316406, + "logps/rejected": -570.7591552734375, + "loss": 0.4396, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8259289264678955, + "rewards/margins": 1.2261183261871338, + "rewards/rejected": -3.05204701423645, + "step": 4170 + }, + { + "epoch": 0.6153393198881201, + "grad_norm": 84.51813363754414, + "learning_rate": 1.935111967390189e-07, + "logits/chosen": -1.3487924337387085, + "logits/rejected": -0.7945448756217957, + "logps/chosen": -437.0166015625, + "logps/rejected": -504.2601623535156, + "loss": 0.4342, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.9184494018554688, + "rewards/margins": 1.235744595527649, + "rewards/rejected": -3.1541943550109863, + "step": 4180 + }, + { + "epoch": 0.6168114235242161, + "grad_norm": 166.47500894891064, + "learning_rate": 1.922603762362729e-07, + "logits/chosen": -1.1917392015457153, + "logits/rejected": -0.632662296295166, + "logps/chosen": -451.96435546875, + "logps/rejected": -550.7449340820312, + "loss": 0.5066, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.263746738433838, + "rewards/margins": 0.9583805203437805, + "rewards/rejected": -3.2221274375915527, + "step": 4190 + }, + { + "epoch": 0.6182835271603121, + "grad_norm": 142.3877280811928, + "learning_rate": 1.9101108071282342e-07, + "logits/chosen": -1.324268102645874, + "logits/rejected": -0.8658564686775208, + "logps/chosen": -487.85040283203125, + "logps/rejected": -607.1859130859375, + "loss": 0.4358, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.1432671546936035, + "rewards/margins": 1.656567931175232, + "rewards/rejected": -3.799835205078125, + "step": 4200 + }, + { + "epoch": 0.619755630796408, + "grad_norm": 339.00407260300676, + "learning_rate": 1.8976334316420576e-07, + "logits/chosen": -1.6026979684829712, + "logits/rejected": -1.2258638143539429, + "logps/chosen": -554.0025634765625, + "logps/rejected": -596.1912841796875, + "loss": 0.5022, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.1759510040283203, + "rewards/margins": 1.0144656896591187, + "rewards/rejected": -3.1904165744781494, + "step": 4210 + }, + { + "epoch": 0.621227734432504, + "grad_norm": 122.82760928039936, + "learning_rate": 1.8851719654480748e-07, + "logits/chosen": -1.4198747873306274, + "logits/rejected": -1.0640865564346313, + "logps/chosen": -454.615966796875, + "logps/rejected": -558.69384765625, + "loss": 0.4799, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.9796329736709595, + "rewards/margins": 1.1223433017730713, + "rewards/rejected": -3.101975917816162, + "step": 4220 + }, + { + "epoch": 0.6226998380686001, + "grad_norm": 161.7773940705314, + "learning_rate": 1.8727267376699735e-07, + "logits/chosen": -1.353320837020874, + "logits/rejected": -1.0286481380462646, + "logps/chosen": -517.1593017578125, + "logps/rejected": -537.186279296875, + "loss": 0.5511, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.943701148033142, + "rewards/margins": 0.9693961143493652, + "rewards/rejected": -2.913097858428955, + "step": 4230 + }, + { + "epoch": 0.624171941704696, + "grad_norm": 78.51428756603991, + "learning_rate": 1.8602980770025645e-07, + "logits/chosen": -1.1248716115951538, + "logits/rejected": -0.7332050800323486, + "logps/chosen": -475.0301208496094, + "logps/rejected": -551.0636596679688, + "loss": 0.4834, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.750465750694275, + "rewards/margins": 1.583151936531067, + "rewards/rejected": -3.333617687225342, + "step": 4240 + }, + { + "epoch": 0.625644045340792, + "grad_norm": 227.2006708825864, + "learning_rate": 1.8478863117031007e-07, + "logits/chosen": -0.8710759282112122, + "logits/rejected": -0.4334639608860016, + "logps/chosen": -477.48602294921875, + "logps/rejected": -682.909912109375, + "loss": 0.4993, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -2.5163652896881104, + "rewards/margins": 1.8585186004638672, + "rewards/rejected": -4.374884128570557, + "step": 4250 + }, + { + "epoch": 0.6271161489768879, + "grad_norm": 302.5431374009507, + "learning_rate": 1.8354917695826026e-07, + "logits/chosen": -1.0217506885528564, + "logits/rejected": -0.8784247636795044, + "logps/chosen": -594.0630493164062, + "logps/rejected": -712.32470703125, + "loss": 0.4769, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.5547966957092285, + "rewards/margins": 1.508219838142395, + "rewards/rejected": -4.06301736831665, + "step": 4260 + }, + { + "epoch": 0.6285882526129839, + "grad_norm": 63.18588484122831, + "learning_rate": 1.8231147779972074e-07, + "logits/chosen": -0.7983840703964233, + "logits/rejected": -0.6720956563949585, + "logps/chosen": -431.10662841796875, + "logps/rejected": -594.2606201171875, + "loss": 0.6117, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.266784191131592, + "rewards/margins": 1.3137670755386353, + "rewards/rejected": -3.5805516242980957, + "step": 4270 + }, + { + "epoch": 0.63006035624908, + "grad_norm": 44.37720620127915, + "learning_rate": 1.8107556638395168e-07, + "logits/chosen": -1.5759875774383545, + "logits/rejected": -1.0857738256454468, + "logps/chosen": -430.1385192871094, + "logps/rejected": -603.598388671875, + "loss": 0.489, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4764127731323242, + "rewards/margins": 1.7837111949920654, + "rewards/rejected": -3.2601242065429688, + "step": 4280 + }, + { + "epoch": 0.6315324598851759, + "grad_norm": 167.3905707093181, + "learning_rate": 1.798414753529971e-07, + "logits/chosen": -1.4073432683944702, + "logits/rejected": -1.2732982635498047, + "logps/chosen": -479.56634521484375, + "logps/rejected": -554.1336669921875, + "loss": 0.4638, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.8155193328857422, + "rewards/margins": 1.1145744323730469, + "rewards/rejected": -2.930094003677368, + "step": 4290 + }, + { + "epoch": 0.6330045635212719, + "grad_norm": 129.29854211295714, + "learning_rate": 1.7860923730082152e-07, + "logits/chosen": -1.5550177097320557, + "logits/rejected": -1.3968069553375244, + "logps/chosen": -499.90191650390625, + "logps/rejected": -540.1566772460938, + "loss": 0.5812, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.8159093856811523, + "rewards/margins": 1.0279780626296997, + "rewards/rejected": -2.8438870906829834, + "step": 4300 + }, + { + "epoch": 0.6344766671573678, + "grad_norm": 79.24778077716044, + "learning_rate": 1.7737888477245052e-07, + "logits/chosen": -1.3963193893432617, + "logits/rejected": -1.3133435249328613, + "logps/chosen": -519.7384643554688, + "logps/rejected": -594.7615356445312, + "loss": 0.4701, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2885804176330566, + "rewards/margins": 0.7993799448013306, + "rewards/rejected": -3.0879604816436768, + "step": 4310 + }, + { + "epoch": 0.6359487707934639, + "grad_norm": 114.48838295700725, + "learning_rate": 1.761504502631102e-07, + "logits/chosen": -1.5096849203109741, + "logits/rejected": -1.320414423942566, + "logps/chosen": -491.1665954589844, + "logps/rejected": -676.3113403320312, + "loss": 0.5378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1326701641082764, + "rewards/margins": 1.8542041778564453, + "rewards/rejected": -3.9868741035461426, + "step": 4320 + }, + { + "epoch": 0.6374208744295599, + "grad_norm": 163.50997835759824, + "learning_rate": 1.749239662173693e-07, + "logits/chosen": -1.050625205039978, + "logits/rejected": -0.9444645643234253, + "logps/chosen": -394.00372314453125, + "logps/rejected": -534.69775390625, + "loss": 0.4701, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.090010166168213, + "rewards/margins": 1.2359343767166138, + "rewards/rejected": -3.3259453773498535, + "step": 4330 + }, + { + "epoch": 0.6388929780656558, + "grad_norm": 190.47149570315298, + "learning_rate": 1.7369946502828245e-07, + "logits/chosen": -1.4550175666809082, + "logits/rejected": -1.129786729812622, + "logps/chosen": -420.1614685058594, + "logps/rejected": -462.8443908691406, + "loss": 0.4673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4609946012496948, + "rewards/margins": 1.1368591785430908, + "rewards/rejected": -2.597853660583496, + "step": 4340 + }, + { + "epoch": 0.6403650817017518, + "grad_norm": 99.5559276911712, + "learning_rate": 1.7247697903653395e-07, + "logits/chosen": -1.433441400527954, + "logits/rejected": -1.221961259841919, + "logps/chosen": -492.36279296875, + "logps/rejected": -606.5283203125, + "loss": 0.5153, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.7743669748306274, + "rewards/margins": 1.4679977893829346, + "rewards/rejected": -3.2423648834228516, + "step": 4350 + }, + { + "epoch": 0.6418371853378478, + "grad_norm": 248.97625154100606, + "learning_rate": 1.7125654052958465e-07, + "logits/chosen": -1.164658784866333, + "logits/rejected": -0.7563608884811401, + "logps/chosen": -465.689208984375, + "logps/rejected": -579.2647705078125, + "loss": 0.5439, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.1886813640594482, + "rewards/margins": 1.329219102859497, + "rewards/rejected": -3.5179009437561035, + "step": 4360 + }, + { + "epoch": 0.6433092889739438, + "grad_norm": 129.99182067921348, + "learning_rate": 1.7003818174081832e-07, + "logits/chosen": -1.3270455598831177, + "logits/rejected": -0.5682891607284546, + "logps/chosen": -464.4905700683594, + "logps/rejected": -562.85986328125, + "loss": 0.4398, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -2.226857900619507, + "rewards/margins": 1.4428317546844482, + "rewards/rejected": -3.669689893722534, + "step": 4370 + }, + { + "epoch": 0.6447813926100397, + "grad_norm": 103.23094567396204, + "learning_rate": 1.6882193484869067e-07, + "logits/chosen": -1.0623207092285156, + "logits/rejected": -0.6714349985122681, + "logps/chosen": -449.782470703125, + "logps/rejected": -530.7210693359375, + "loss": 0.6072, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.9685523509979248, + "rewards/margins": 0.9595939517021179, + "rewards/rejected": -2.9281461238861084, + "step": 4380 + }, + { + "epoch": 0.6462534962461357, + "grad_norm": 75.58064540802451, + "learning_rate": 1.676078319758796e-07, + "logits/chosen": -1.3233578205108643, + "logits/rejected": -0.8019570112228394, + "logps/chosen": -417.3827209472656, + "logps/rejected": -484.7991638183594, + "loss": 0.502, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.56229567527771, + "rewards/margins": 1.234908938407898, + "rewards/rejected": -2.7972047328948975, + "step": 4390 + }, + { + "epoch": 0.6477255998822317, + "grad_norm": 91.14333187991943, + "learning_rate": 1.6639590518843643e-07, + "logits/chosen": -1.3182179927825928, + "logits/rejected": -0.9390050768852234, + "logps/chosen": -492.20556640625, + "logps/rejected": -690.0474243164062, + "loss": 0.4192, + "rewards/accuracies": 0.9000000953674316, + "rewards/chosen": -1.693629503250122, + "rewards/margins": 1.931666374206543, + "rewards/rejected": -3.625296115875244, + "step": 4400 + }, + { + "epoch": 0.6491977035183277, + "grad_norm": 103.96904160967095, + "learning_rate": 1.6518618649493932e-07, + "logits/chosen": -0.8079828023910522, + "logits/rejected": -0.6551457643508911, + "logps/chosen": -466.8829040527344, + "logps/rejected": -660.3395385742188, + "loss": 0.3714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.098818302154541, + "rewards/margins": 1.8155527114868164, + "rewards/rejected": -3.9143710136413574, + "step": 4410 + }, + { + "epoch": 0.6506698071544237, + "grad_norm": 96.29224878558416, + "learning_rate": 1.6397870784564777e-07, + "logits/chosen": -0.7224613428115845, + "logits/rejected": -0.5728973150253296, + "logps/chosen": -382.28131103515625, + "logps/rejected": -667.2921142578125, + "loss": 0.5461, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.6757816076278687, + "rewards/margins": 2.2099719047546387, + "rewards/rejected": -3.885753631591797, + "step": 4420 + }, + { + "epoch": 0.6521419107905196, + "grad_norm": 117.01571082456245, + "learning_rate": 1.6277350113165887e-07, + "logits/chosen": -0.7781810760498047, + "logits/rejected": -0.28642207384109497, + "logps/chosen": -481.189208984375, + "logps/rejected": -567.7074584960938, + "loss": 0.541, + "rewards/accuracies": 0.7333332300186157, + "rewards/chosen": -2.172182559967041, + "rewards/margins": 1.3926115036010742, + "rewards/rejected": -3.5647940635681152, + "step": 4430 + }, + { + "epoch": 0.6536140144266156, + "grad_norm": 121.71211239342821, + "learning_rate": 1.615705981840646e-07, + "logits/chosen": -1.1096986532211304, + "logits/rejected": -0.8436762690544128, + "logps/chosen": -545.0184326171875, + "logps/rejected": -645.73974609375, + "loss": 0.5431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.231415271759033, + "rewards/margins": 1.2301454544067383, + "rewards/rejected": -3.4615604877471924, + "step": 4440 + }, + { + "epoch": 0.6550861180627117, + "grad_norm": 90.9746267725247, + "learning_rate": 1.6037003077311178e-07, + "logits/chosen": -1.2080752849578857, + "logits/rejected": -0.9661863446235657, + "logps/chosen": -425.611328125, + "logps/rejected": -622.8643188476562, + "loss": 0.5348, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.8019592761993408, + "rewards/margins": 1.7126600742340088, + "rewards/rejected": -3.5146193504333496, + "step": 4450 + }, + { + "epoch": 0.6565582216988076, + "grad_norm": 51.72339156461159, + "learning_rate": 1.591718306073625e-07, + "logits/chosen": -1.6418259143829346, + "logits/rejected": -0.8949257731437683, + "logps/chosen": -667.9796142578125, + "logps/rejected": -655.2195434570312, + "loss": 0.4994, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.303271770477295, + "rewards/margins": 1.214694619178772, + "rewards/rejected": -3.5179665088653564, + "step": 4460 + }, + { + "epoch": 0.6580303253349036, + "grad_norm": 92.752762425613, + "learning_rate": 1.5797602933285672e-07, + "logits/chosen": -1.2960970401763916, + "logits/rejected": -0.6208903193473816, + "logps/chosen": -487.95458984375, + "logps/rejected": -595.2272338867188, + "loss": 0.5112, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.0796303749084473, + "rewards/margins": 1.442471981048584, + "rewards/rejected": -3.5221023559570312, + "step": 4470 + }, + { + "epoch": 0.6595024289709995, + "grad_norm": 107.21381148475398, + "learning_rate": 1.5678265853227679e-07, + "logits/chosen": -1.1505801677703857, + "logits/rejected": -1.1555681228637695, + "logps/chosen": -496.9366149902344, + "logps/rejected": -691.8958129882812, + "loss": 0.4413, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -2.0033645629882812, + "rewards/margins": 1.6010324954986572, + "rewards/rejected": -3.6043968200683594, + "step": 4480 + }, + { + "epoch": 0.6609745326070955, + "grad_norm": 141.42706000917778, + "learning_rate": 1.5559174972411271e-07, + "logits/chosen": -1.458112359046936, + "logits/rejected": -0.9190812110900879, + "logps/chosen": -532.1338500976562, + "logps/rejected": -662.2808837890625, + "loss": 0.4329, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.9828624725341797, + "rewards/margins": 1.6626027822494507, + "rewards/rejected": -3.6454646587371826, + "step": 4490 + }, + { + "epoch": 0.6624466362431916, + "grad_norm": 215.2111128292361, + "learning_rate": 1.5440333436183022e-07, + "logits/chosen": -0.9033737182617188, + "logits/rejected": -0.6734446287155151, + "logps/chosen": -407.6439514160156, + "logps/rejected": -588.3001708984375, + "loss": 0.5357, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.8382084369659424, + "rewards/margins": 2.0106863975524902, + "rewards/rejected": -3.8488948345184326, + "step": 4500 + }, + { + "epoch": 0.6639187398792875, + "grad_norm": 86.54737385899564, + "learning_rate": 1.532174438330399e-07, + "logits/chosen": -1.2031002044677734, + "logits/rejected": -0.8040673136711121, + "logps/chosen": -544.1952514648438, + "logps/rejected": -569.1370849609375, + "loss": 0.4201, + "rewards/accuracies": 0.9000000953674316, + "rewards/chosen": -2.124861478805542, + "rewards/margins": 1.1427748203277588, + "rewards/rejected": -3.2676360607147217, + "step": 4510 + }, + { + "epoch": 0.6653908435153835, + "grad_norm": 54.129479609989204, + "learning_rate": 1.5203410945866807e-07, + "logits/chosen": -1.31126070022583, + "logits/rejected": -0.9798968434333801, + "logps/chosen": -466.87725830078125, + "logps/rejected": -614.9652099609375, + "loss": 0.418, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.1668317317962646, + "rewards/margins": 1.2651424407958984, + "rewards/rejected": -3.431974411010742, + "step": 4520 + }, + { + "epoch": 0.6668629471514794, + "grad_norm": 114.33620646710379, + "learning_rate": 1.5085336249212982e-07, + "logits/chosen": -0.9281169176101685, + "logits/rejected": -0.6057072281837463, + "logps/chosen": -511.455078125, + "logps/rejected": -658.9219360351562, + "loss": 0.491, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.412461996078491, + "rewards/margins": 1.834416389465332, + "rewards/rejected": -4.246878147125244, + "step": 4530 + }, + { + "epoch": 0.6683350507875755, + "grad_norm": 79.42523215491616, + "learning_rate": 1.4967523411850314e-07, + "logits/chosen": -1.1892764568328857, + "logits/rejected": -1.1653081178665161, + "logps/chosen": -429.823486328125, + "logps/rejected": -586.5986328125, + "loss": 0.4514, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.00909161567688, + "rewards/margins": 1.306792974472046, + "rewards/rejected": -3.315884828567505, + "step": 4540 + }, + { + "epoch": 0.6698071544236714, + "grad_norm": 71.52321846746271, + "learning_rate": 1.484997554537057e-07, + "logits/chosen": -1.2200419902801514, + "logits/rejected": -1.0729676485061646, + "logps/chosen": -469.2467346191406, + "logps/rejected": -617.2420654296875, + "loss": 0.5469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8788034915924072, + "rewards/margins": 1.1826003789901733, + "rewards/rejected": -3.061404228210449, + "step": 4550 + }, + { + "epoch": 0.6712792580597674, + "grad_norm": 191.2325204474366, + "learning_rate": 1.4732695754367287e-07, + "logits/chosen": -1.338135004043579, + "logits/rejected": -0.9593397378921509, + "logps/chosen": -474.0179138183594, + "logps/rejected": -605.5560913085938, + "loss": 0.5865, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.8360450267791748, + "rewards/margins": 1.369950532913208, + "rewards/rejected": -3.205996036529541, + "step": 4560 + }, + { + "epoch": 0.6727513616958634, + "grad_norm": 103.62820847801295, + "learning_rate": 1.4615687136353787e-07, + "logits/chosen": -1.3806118965148926, + "logits/rejected": -0.9853204488754272, + "logps/chosen": -479.7335510253906, + "logps/rejected": -516.6441040039062, + "loss": 0.5687, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.8853724002838135, + "rewards/margins": 1.0776479244232178, + "rewards/rejected": -2.9630208015441895, + "step": 4570 + }, + { + "epoch": 0.6742234653319593, + "grad_norm": 116.26950210507823, + "learning_rate": 1.4498952781681328e-07, + "logits/chosen": -1.3017466068267822, + "logits/rejected": -1.197304129600525, + "logps/chosen": -418.6080627441406, + "logps/rejected": -496.27734375, + "loss": 0.4458, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.840444564819336, + "rewards/margins": 1.01614511013031, + "rewards/rejected": -2.8565890789031982, + "step": 4580 + }, + { + "epoch": 0.6756955689680554, + "grad_norm": 68.58980235885531, + "learning_rate": 1.4382495773457544e-07, + "logits/chosen": -1.3158092498779297, + "logits/rejected": -1.3214632272720337, + "logps/chosen": -421.3922424316406, + "logps/rejected": -436.61212158203125, + "loss": 0.5711, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.5681204795837402, + "rewards/margins": 0.6551679372787476, + "rewards/rejected": -2.2232887744903564, + "step": 4590 + }, + { + "epoch": 0.6771676726041513, + "grad_norm": 98.70861733760921, + "learning_rate": 1.4266319187464965e-07, + "logits/chosen": -1.419651746749878, + "logits/rejected": -1.1971409320831299, + "logps/chosen": -361.8268737792969, + "logps/rejected": -456.25030517578125, + "loss": 0.4314, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5300724506378174, + "rewards/margins": 0.9448378682136536, + "rewards/rejected": -2.474910259246826, + "step": 4600 + }, + { + "epoch": 0.6786397762402473, + "grad_norm": 55.51035289215918, + "learning_rate": 1.415042609207981e-07, + "logits/chosen": -1.4711560010910034, + "logits/rejected": -1.386880874633789, + "logps/chosen": -365.0799255371094, + "logps/rejected": -440.0135803222656, + "loss": 0.4237, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.4096601009368896, + "rewards/margins": 1.103913426399231, + "rewards/rejected": -2.5135738849639893, + "step": 4610 + }, + { + "epoch": 0.6801118798763433, + "grad_norm": 209.63059800351263, + "learning_rate": 1.4034819548190936e-07, + "logits/chosen": -1.234034776687622, + "logits/rejected": -0.9818994402885437, + "logps/chosen": -454.1690368652344, + "logps/rejected": -545.9376220703125, + "loss": 0.4792, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.118584632873535, + "rewards/margins": 1.1898410320281982, + "rewards/rejected": -3.3084259033203125, + "step": 4620 + }, + { + "epoch": 0.6815839835124393, + "grad_norm": 100.9259147767831, + "learning_rate": 1.3919502609119004e-07, + "logits/chosen": -1.4547697305679321, + "logits/rejected": -1.3069839477539062, + "logps/chosen": -511.80126953125, + "logps/rejected": -628.25830078125, + "loss": 0.4671, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1555685997009277, + "rewards/margins": 1.2655562162399292, + "rewards/rejected": -3.4211249351501465, + "step": 4630 + }, + { + "epoch": 0.6830560871485353, + "grad_norm": 211.9949362206441, + "learning_rate": 1.380447832053583e-07, + "logits/chosen": -1.3253339529037476, + "logits/rejected": -1.1672375202178955, + "logps/chosen": -433.7339782714844, + "logps/rejected": -600.3421630859375, + "loss": 0.484, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.9534915685653687, + "rewards/margins": 1.6766173839569092, + "rewards/rejected": -3.630108594894409, + "step": 4640 + }, + { + "epoch": 0.6845281907846312, + "grad_norm": 50.02880841553545, + "learning_rate": 1.3689749720383934e-07, + "logits/chosen": -1.2918349504470825, + "logits/rejected": -1.2667691707611084, + "logps/chosen": -441.39129638671875, + "logps/rejected": -476.4908752441406, + "loss": 0.4678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.721518874168396, + "rewards/margins": 0.6427041888237, + "rewards/rejected": -2.364223003387451, + "step": 4650 + }, + { + "epoch": 0.6860002944207272, + "grad_norm": 143.75760413247963, + "learning_rate": 1.357531983879633e-07, + "logits/chosen": -1.1302011013031006, + "logits/rejected": -1.0732858180999756, + "logps/chosen": -433.2850646972656, + "logps/rejected": -533.68896484375, + "loss": 0.5513, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.286162853240967, + "rewards/margins": 1.1638197898864746, + "rewards/rejected": -3.4499828815460205, + "step": 4660 + }, + { + "epoch": 0.6874723980568233, + "grad_norm": 108.29236998779979, + "learning_rate": 1.3461191698016482e-07, + "logits/chosen": -0.981033980846405, + "logits/rejected": -0.8925952911376953, + "logps/chosen": -493.61236572265625, + "logps/rejected": -569.1792602539062, + "loss": 0.4632, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -2.3269200325012207, + "rewards/margins": 1.0681781768798828, + "rewards/rejected": -3.3950984477996826, + "step": 4670 + }, + { + "epoch": 0.6889445016929192, + "grad_norm": 94.18207724123126, + "learning_rate": 1.3347368312318475e-07, + "logits/chosen": -1.4530584812164307, + "logits/rejected": -1.236474633216858, + "logps/chosen": -498.83404541015625, + "logps/rejected": -584.1963500976562, + "loss": 0.5514, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7037700414657593, + "rewards/margins": 1.0627331733703613, + "rewards/rejected": -2.766503095626831, + "step": 4680 + }, + { + "epoch": 0.6904166053290152, + "grad_norm": 143.66884752967144, + "learning_rate": 1.3233852687927415e-07, + "logits/chosen": -1.4111673831939697, + "logits/rejected": -1.213225245475769, + "logps/chosen": -480.89971923828125, + "logps/rejected": -599.5103149414062, + "loss": 0.5369, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.9547340869903564, + "rewards/margins": 1.056686520576477, + "rewards/rejected": -3.011420726776123, + "step": 4690 + }, + { + "epoch": 0.6918887089651111, + "grad_norm": 142.52500874158315, + "learning_rate": 1.3120647822940035e-07, + "logits/chosen": -1.2101199626922607, + "logits/rejected": -1.045078158378601, + "logps/chosen": -433.05816650390625, + "logps/rejected": -544.9658203125, + "loss": 0.482, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.7011553049087524, + "rewards/margins": 1.4601318836212158, + "rewards/rejected": -3.161287307739258, + "step": 4700 + }, + { + "epoch": 0.6933608126012071, + "grad_norm": 123.52516429875446, + "learning_rate": 1.3007756707245488e-07, + "logits/chosen": -1.2354273796081543, + "logits/rejected": -1.3107268810272217, + "logps/chosen": -412.0826110839844, + "logps/rejected": -524.1710205078125, + "loss": 0.5479, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7833210229873657, + "rewards/margins": 1.1784168481826782, + "rewards/rejected": -2.961737632751465, + "step": 4710 + }, + { + "epoch": 0.6948329162373031, + "grad_norm": 82.25740720170047, + "learning_rate": 1.2895182322446415e-07, + "logits/chosen": -1.2244060039520264, + "logits/rejected": -1.035180687904358, + "logps/chosen": -455.625244140625, + "logps/rejected": -508.426025390625, + "loss": 0.4967, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.736236333847046, + "rewards/margins": 1.14691960811615, + "rewards/rejected": -2.8831560611724854, + "step": 4720 + }, + { + "epoch": 0.6963050198733991, + "grad_norm": 160.5338642317761, + "learning_rate": 1.2782927641780167e-07, + "logits/chosen": -1.386671781539917, + "logits/rejected": -1.4357919692993164, + "logps/chosen": -501.72314453125, + "logps/rejected": -617.049560546875, + "loss": 0.4581, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.885978102684021, + "rewards/margins": 1.1497385501861572, + "rewards/rejected": -3.035717010498047, + "step": 4730 + }, + { + "epoch": 0.6977771235094951, + "grad_norm": 219.84205483800653, + "learning_rate": 1.2670995630040288e-07, + "logits/chosen": -1.244444727897644, + "logits/rejected": -1.0640175342559814, + "logps/chosen": -422.1329650878906, + "logps/rejected": -528.4967041015625, + "loss": 0.555, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8891346454620361, + "rewards/margins": 1.2488270998001099, + "rewards/rejected": -3.1379618644714355, + "step": 4740 + }, + { + "epoch": 0.699249227145591, + "grad_norm": 43.512277857387936, + "learning_rate": 1.2559389243498213e-07, + "logits/chosen": -1.0504766702651978, + "logits/rejected": -1.1236060857772827, + "logps/chosen": -487.58447265625, + "logps/rejected": -614.5943603515625, + "loss": 0.4544, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6895822286605835, + "rewards/margins": 1.5867253541946411, + "rewards/rejected": -3.2763073444366455, + "step": 4750 + }, + { + "epoch": 0.7007213307816871, + "grad_norm": 85.09493118715747, + "learning_rate": 1.2448111429825198e-07, + "logits/chosen": -1.225906491279602, + "logits/rejected": -1.0558003187179565, + "logps/chosen": -510.42431640625, + "logps/rejected": -680.6683349609375, + "loss": 0.3979, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0438990592956543, + "rewards/margins": 1.981340765953064, + "rewards/rejected": -4.02523946762085, + "step": 4760 + }, + { + "epoch": 0.702193434417783, + "grad_norm": 140.23547038398513, + "learning_rate": 1.2337165128014443e-07, + "logits/chosen": -1.1954014301300049, + "logits/rejected": -0.9831902384757996, + "logps/chosen": -481.63238525390625, + "logps/rejected": -589.9175415039062, + "loss": 0.5157, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0979185104370117, + "rewards/margins": 1.1501258611679077, + "rewards/rejected": -3.24804425239563, + "step": 4770 + }, + { + "epoch": 0.703665538053879, + "grad_norm": 264.72731011513565, + "learning_rate": 1.2226553268303494e-07, + "logits/chosen": -0.6188865303993225, + "logits/rejected": -0.4021533131599426, + "logps/chosen": -423.24603271484375, + "logps/rejected": -527.1163330078125, + "loss": 0.5671, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.116725206375122, + "rewards/margins": 0.8284025192260742, + "rewards/rejected": -2.9451279640197754, + "step": 4780 + }, + { + "epoch": 0.705137641689975, + "grad_norm": 91.77797281274971, + "learning_rate": 1.2116278772096835e-07, + "logits/chosen": -1.146105408668518, + "logits/rejected": -0.7056888341903687, + "logps/chosen": -528.7418823242188, + "logps/rejected": -605.4951782226562, + "loss": 0.4927, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.0645480155944824, + "rewards/margins": 1.4351472854614258, + "rewards/rejected": -3.49969482421875, + "step": 4790 + }, + { + "epoch": 0.7066097453260709, + "grad_norm": 184.57813046294538, + "learning_rate": 1.2006344551888736e-07, + "logits/chosen": -1.0005053281784058, + "logits/rejected": -1.0541980266571045, + "logps/chosen": -356.3594665527344, + "logps/rejected": -482.3995666503906, + "loss": 0.4353, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7400871515274048, + "rewards/margins": 1.1408098936080933, + "rewards/rejected": -2.880896806716919, + "step": 4800 + }, + { + "epoch": 0.708081848962167, + "grad_norm": 129.03556755949657, + "learning_rate": 1.1896753511186364e-07, + "logits/chosen": -1.5008571147918701, + "logits/rejected": -1.0873032808303833, + "logps/chosen": -475.8860778808594, + "logps/rejected": -451.3075256347656, + "loss": 0.4483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9100555181503296, + "rewards/margins": 0.7982396483421326, + "rewards/rejected": -2.7082953453063965, + "step": 4810 + }, + { + "epoch": 0.7095539525982629, + "grad_norm": 109.33908203859355, + "learning_rate": 1.1787508544433002e-07, + "logits/chosen": -1.136890172958374, + "logits/rejected": -0.9680862426757812, + "logps/chosen": -449.29541015625, + "logps/rejected": -589.8648071289062, + "loss": 0.499, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.0743818283081055, + "rewards/margins": 1.3076345920562744, + "rewards/rejected": -3.38201642036438, + "step": 4820 + }, + { + "epoch": 0.7110260562343589, + "grad_norm": 65.40595512821454, + "learning_rate": 1.1678612536931718e-07, + "logits/chosen": -1.265831470489502, + "logits/rejected": -1.0169765949249268, + "logps/chosen": -483.75830078125, + "logps/rejected": -614.5604858398438, + "loss": 0.5145, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7931849956512451, + "rewards/margins": 1.668465256690979, + "rewards/rejected": -3.4616503715515137, + "step": 4830 + }, + { + "epoch": 0.7124981598704548, + "grad_norm": 107.75101661020405, + "learning_rate": 1.1570068364769081e-07, + "logits/chosen": -1.2228275537490845, + "logits/rejected": -0.9565554857254028, + "logps/chosen": -479.07525634765625, + "logps/rejected": -605.0894775390625, + "loss": 0.56, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.240854263305664, + "rewards/margins": 0.9406126737594604, + "rewards/rejected": -3.181466817855835, + "step": 4840 + }, + { + "epoch": 0.7139702635065509, + "grad_norm": 90.33844903048573, + "learning_rate": 1.146187889473924e-07, + "logits/chosen": -1.1427541971206665, + "logits/rejected": -1.102870225906372, + "logps/chosen": -504.01800537109375, + "logps/rejected": -623.41455078125, + "loss": 0.5578, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.2632498741149902, + "rewards/margins": 1.3922555446624756, + "rewards/rejected": -3.655505418777466, + "step": 4850 + }, + { + "epoch": 0.7154423671426469, + "grad_norm": 86.37462311998722, + "learning_rate": 1.135404698426819e-07, + "logits/chosen": -1.3055331707000732, + "logits/rejected": -1.0770326852798462, + "logps/chosen": -459.8268127441406, + "logps/rejected": -584.7457275390625, + "loss": 0.4815, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.1178436279296875, + "rewards/margins": 1.2331651449203491, + "rewards/rejected": -3.351008892059326, + "step": 4860 + }, + { + "epoch": 0.7169144707787428, + "grad_norm": 219.95155352294205, + "learning_rate": 1.1246575481338305e-07, + "logits/chosen": -1.5462088584899902, + "logits/rejected": -1.1840392351150513, + "logps/chosen": -502.8143615722656, + "logps/rejected": -613.36181640625, + "loss": 0.4773, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.9136772155761719, + "rewards/margins": 1.6055662631988525, + "rewards/rejected": -3.5192437171936035, + "step": 4870 + }, + { + "epoch": 0.7183865744148388, + "grad_norm": 134.90641417431038, + "learning_rate": 1.1139467224413132e-07, + "logits/chosen": -1.2078558206558228, + "logits/rejected": -0.8434429168701172, + "logps/chosen": -432.5479431152344, + "logps/rejected": -537.1464233398438, + "loss": 0.5439, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9673494100570679, + "rewards/margins": 1.2908252477645874, + "rewards/rejected": -3.258174419403076, + "step": 4880 + }, + { + "epoch": 0.7198586780509347, + "grad_norm": 68.24633187819909, + "learning_rate": 1.1032725042362393e-07, + "logits/chosen": -1.235530138015747, + "logits/rejected": -0.9020940065383911, + "logps/chosen": -536.8782958984375, + "logps/rejected": -552.8338623046875, + "loss": 0.5013, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.643974781036377, + "rewards/margins": 0.541441798210144, + "rewards/rejected": -3.1854166984558105, + "step": 4890 + }, + { + "epoch": 0.7213307816870308, + "grad_norm": 208.56208619977357, + "learning_rate": 1.0926351754387336e-07, + "logits/chosen": -1.2716879844665527, + "logits/rejected": -1.2387449741363525, + "logps/chosen": -478.672607421875, + "logps/rejected": -568.089599609375, + "loss": 0.4597, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.7221558094024658, + "rewards/margins": 1.278188705444336, + "rewards/rejected": -3.000344753265381, + "step": 4900 + }, + { + "epoch": 0.7228028853231268, + "grad_norm": 61.40340833109135, + "learning_rate": 1.0820350169946174e-07, + "logits/chosen": -1.3892163038253784, + "logits/rejected": -1.3794981241226196, + "logps/chosen": -382.14105224609375, + "logps/rejected": -530.7696533203125, + "loss": 0.4713, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.5014878511428833, + "rewards/margins": 1.3289624452590942, + "rewards/rejected": -2.8304500579833984, + "step": 4910 + }, + { + "epoch": 0.7242749889592227, + "grad_norm": 127.50887792303523, + "learning_rate": 1.0714723088679983e-07, + "logits/chosen": -1.522983193397522, + "logits/rejected": -1.0181244611740112, + "logps/chosen": -511.8858337402344, + "logps/rejected": -541.6467895507812, + "loss": 0.4641, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.082904577255249, + "rewards/margins": 1.1971763372421265, + "rewards/rejected": -3.280080795288086, + "step": 4920 + }, + { + "epoch": 0.7257470925953187, + "grad_norm": 78.70695395767184, + "learning_rate": 1.06094733003387e-07, + "logits/chosen": -1.2051807641983032, + "logits/rejected": -1.1361181735992432, + "logps/chosen": -477.1014709472656, + "logps/rejected": -555.8853149414062, + "loss": 0.4816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5811794996261597, + "rewards/margins": 1.1953643560409546, + "rewards/rejected": -2.7765440940856934, + "step": 4930 + }, + { + "epoch": 0.7272191962314147, + "grad_norm": 240.1764326503965, + "learning_rate": 1.0504603584707463e-07, + "logits/chosen": -1.2330982685089111, + "logits/rejected": -1.023993968963623, + "logps/chosen": -428.2814025878906, + "logps/rejected": -533.4830322265625, + "loss": 0.5244, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.11854887008667, + "rewards/margins": 1.11293625831604, + "rewards/rejected": -3.231484889984131, + "step": 4940 + }, + { + "epoch": 0.7286912998675107, + "grad_norm": 196.26001904227923, + "learning_rate": 1.0400116711533217e-07, + "logits/chosen": -1.3029544353485107, + "logits/rejected": -0.9165960550308228, + "logps/chosen": -529.0264892578125, + "logps/rejected": -552.2896728515625, + "loss": 0.5233, + "rewards/accuracies": 0.73333340883255, + "rewards/chosen": -1.8545910120010376, + "rewards/margins": 0.8611334562301636, + "rewards/rejected": -2.715724468231201, + "step": 4950 + }, + { + "epoch": 0.7301634035036066, + "grad_norm": 155.94982807610398, + "learning_rate": 1.029601544045148e-07, + "logits/chosen": -1.104768991470337, + "logits/rejected": -0.8252719640731812, + "logps/chosen": -474.57305908203125, + "logps/rejected": -527.7381591796875, + "loss": 0.4161, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9284982681274414, + "rewards/margins": 1.0434739589691162, + "rewards/rejected": -2.9719719886779785, + "step": 4960 + }, + { + "epoch": 0.7316355071397026, + "grad_norm": 51.6727868299786, + "learning_rate": 1.0192302520913563e-07, + "logits/chosen": -0.9299944639205933, + "logits/rejected": -0.6598071455955505, + "logps/chosen": -468.71539306640625, + "logps/rejected": -579.9954223632812, + "loss": 0.4837, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.1794610023498535, + "rewards/margins": 1.0939624309539795, + "rewards/rejected": -3.273423671722412, + "step": 4970 + }, + { + "epoch": 0.7331076107757987, + "grad_norm": 110.55540072362982, + "learning_rate": 1.0088980692113872e-07, + "logits/chosen": -1.024954915046692, + "logits/rejected": -0.7805144190788269, + "logps/chosen": -561.6268920898438, + "logps/rejected": -674.0429077148438, + "loss": 0.4224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.212777614593506, + "rewards/margins": 1.7772693634033203, + "rewards/rejected": -3.990046977996826, + "step": 4980 + }, + { + "epoch": 0.7345797144118946, + "grad_norm": 90.26203016339764, + "learning_rate": 9.986052682917611e-08, + "logits/chosen": -1.4810940027236938, + "logits/rejected": -1.230894923210144, + "logps/chosen": -502.7579040527344, + "logps/rejected": -554.7216186523438, + "loss": 0.613, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -1.9367786645889282, + "rewards/margins": 0.5233142971992493, + "rewards/rejected": -2.4600930213928223, + "step": 4990 + }, + { + "epoch": 0.7360518180479906, + "grad_norm": 81.04926915145059, + "learning_rate": 9.883521211788682e-08, + "logits/chosen": -0.960936427116394, + "logits/rejected": -0.5718650817871094, + "logps/chosen": -493.239013671875, + "logps/rejected": -577.866943359375, + "loss": 0.4684, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.908361792564392, + "rewards/margins": 1.3755227327346802, + "rewards/rejected": -3.2838847637176514, + "step": 5000 + }, + { + "epoch": 0.7375239216840865, + "grad_norm": 74.59843512838292, + "learning_rate": 9.781388986717898e-08, + "logits/chosen": -1.1208409070968628, + "logits/rejected": -0.9476618766784668, + "logps/chosen": -411.9447326660156, + "logps/rejected": -518.728759765625, + "loss": 0.4647, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.009578227996826, + "rewards/margins": 1.4132570028305054, + "rewards/rejected": -3.422835111618042, + "step": 5010 + }, + { + "epoch": 0.7389960253201825, + "grad_norm": 154.7930705726265, + "learning_rate": 9.679658705151461e-08, + "logits/chosen": -1.0803502798080444, + "logits/rejected": -0.8001077771186829, + "logps/chosen": -441.07916259765625, + "logps/rejected": -584.9094848632812, + "loss": 0.5437, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -2.0469777584075928, + "rewards/margins": 1.5888301134109497, + "rewards/rejected": -3.635808229446411, + "step": 5020 + }, + { + "epoch": 0.7404681289562786, + "grad_norm": 140.79156361461415, + "learning_rate": 9.578333053919704e-08, + "logits/chosen": -1.1078836917877197, + "logits/rejected": -1.1201703548431396, + "logps/chosen": -426.13885498046875, + "logps/rejected": -649.7520751953125, + "loss": 0.6181, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.020275115966797, + "rewards/margins": 1.667791724205017, + "rewards/rejected": -3.6880669593811035, + "step": 5030 + }, + { + "epoch": 0.7419402325923745, + "grad_norm": 77.80948000016986, + "learning_rate": 9.477414709166182e-08, + "logits/chosen": -1.4168012142181396, + "logits/rejected": -1.0917749404907227, + "logps/chosen": -503.44268798828125, + "logps/rejected": -614.2481689453125, + "loss": 0.5284, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.988455057144165, + "rewards/margins": 1.3467494249343872, + "rewards/rejected": -3.335204601287842, + "step": 5040 + }, + { + "epoch": 0.7434123362284705, + "grad_norm": 87.99138728768457, + "learning_rate": 9.376906336276894e-08, + "logits/chosen": -1.034148931503296, + "logits/rejected": -1.0049606561660767, + "logps/chosen": -422.7621154785156, + "logps/rejected": -563.6510009765625, + "loss": 0.4997, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6347382068634033, + "rewards/margins": 1.159616231918335, + "rewards/rejected": -2.7943549156188965, + "step": 5050 + }, + { + "epoch": 0.7448844398645664, + "grad_norm": 111.81114941982995, + "learning_rate": 9.276810589809978e-08, + "logits/chosen": -1.1015427112579346, + "logits/rejected": -1.2049635648727417, + "logps/chosen": -353.7889709472656, + "logps/rejected": -491.4095153808594, + "loss": 0.4642, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.6308670043945312, + "rewards/margins": 1.2243486642837524, + "rewards/rejected": -2.855215549468994, + "step": 5060 + }, + { + "epoch": 0.7463565435006625, + "grad_norm": 242.49546203208843, + "learning_rate": 9.177130113425562e-08, + "logits/chosen": -1.185302972793579, + "logits/rejected": -0.9789964556694031, + "logps/chosen": -422.8511657714844, + "logps/rejected": -507.0664978027344, + "loss": 0.6004, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.359539270401001, + "rewards/margins": 0.921037495136261, + "rewards/rejected": -3.2805771827697754, + "step": 5070 + }, + { + "epoch": 0.7478286471367585, + "grad_norm": 101.11339953647807, + "learning_rate": 9.077867539815948e-08, + "logits/chosen": -1.6359272003173828, + "logits/rejected": -1.3089901208877563, + "logps/chosen": -425.45819091796875, + "logps/rejected": -454.26043701171875, + "loss": 0.4915, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.5834044218063354, + "rewards/margins": 0.8697870373725891, + "rewards/rejected": -2.4531915187835693, + "step": 5080 + }, + { + "epoch": 0.7493007507728544, + "grad_norm": 47.639643565896755, + "learning_rate": 8.979025490636064e-08, + "logits/chosen": -1.3621609210968018, + "logits/rejected": -1.256654143333435, + "logps/chosen": -446.0369567871094, + "logps/rejected": -507.7056579589844, + "loss": 0.4722, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.8531553745269775, + "rewards/margins": 0.7603796720504761, + "rewards/rejected": -2.613534450531006, + "step": 5090 + }, + { + "epoch": 0.7507728544089504, + "grad_norm": 118.8779220514645, + "learning_rate": 8.880606576434249e-08, + "logits/chosen": -1.488552451133728, + "logits/rejected": -1.2982892990112305, + "logps/chosen": -405.803955078125, + "logps/rejected": -508.36431884765625, + "loss": 0.4721, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.5724586248397827, + "rewards/margins": 1.145263671875, + "rewards/rejected": -2.7177224159240723, + "step": 5100 + }, + { + "epoch": 0.7522449580450463, + "grad_norm": 68.61340044861234, + "learning_rate": 8.782613396583285e-08, + "logits/chosen": -1.3668371438980103, + "logits/rejected": -1.1793230772018433, + "logps/chosen": -451.9384765625, + "logps/rejected": -477.31085205078125, + "loss": 0.4848, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.6828315258026123, + "rewards/margins": 0.7365844249725342, + "rewards/rejected": -2.4194159507751465, + "step": 5110 + }, + { + "epoch": 0.7537170616811424, + "grad_norm": 139.7376959255474, + "learning_rate": 8.685048539211745e-08, + "logits/chosen": -1.1131430864334106, + "logits/rejected": -0.9064427614212036, + "logps/chosen": -508.733154296875, + "logps/rejected": -587.9703979492188, + "loss": 0.4359, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.178009033203125, + "rewards/margins": 1.0938993692398071, + "rewards/rejected": -3.2719085216522217, + "step": 5120 + }, + { + "epoch": 0.7551891653172383, + "grad_norm": 135.509992272648, + "learning_rate": 8.587914581135672e-08, + "logits/chosen": -0.9653269648551941, + "logits/rejected": -0.8376359939575195, + "logps/chosen": -408.57305908203125, + "logps/rejected": -519.3517456054688, + "loss": 0.486, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.551114797592163, + "rewards/margins": 1.113257646560669, + "rewards/rejected": -2.664372682571411, + "step": 5130 + }, + { + "epoch": 0.7566612689533343, + "grad_norm": 417.05549959702216, + "learning_rate": 8.491214087790447e-08, + "logits/chosen": -1.4374322891235352, + "logits/rejected": -1.3166886568069458, + "logps/chosen": -521.9385375976562, + "logps/rejected": -651.793212890625, + "loss": 0.5184, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.9301999807357788, + "rewards/margins": 1.2636077404022217, + "rewards/rejected": -3.193808078765869, + "step": 5140 + }, + { + "epoch": 0.7581333725894303, + "grad_norm": 204.55940551843685, + "learning_rate": 8.394949613163111e-08, + "logits/chosen": -1.0238239765167236, + "logits/rejected": -0.9224424362182617, + "logps/chosen": -386.3232727050781, + "logps/rejected": -460.06463623046875, + "loss": 0.4518, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.6176748275756836, + "rewards/margins": 0.9928226470947266, + "rewards/rejected": -2.61049747467041, + "step": 5150 + }, + { + "epoch": 0.7596054762255263, + "grad_norm": 34.407521574259064, + "learning_rate": 8.299123699724864e-08, + "logits/chosen": -1.1135423183441162, + "logits/rejected": -0.7028575539588928, + "logps/chosen": -530.0160522460938, + "logps/rejected": -581.4072875976562, + "loss": 0.4823, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8791427612304688, + "rewards/margins": 1.3711122274398804, + "rewards/rejected": -3.2502551078796387, + "step": 5160 + }, + { + "epoch": 0.7610775798616223, + "grad_norm": 101.6517993320236, + "learning_rate": 8.203738878363933e-08, + "logits/chosen": -0.7041149139404297, + "logits/rejected": -0.17306460440158844, + "logps/chosen": -437.92120361328125, + "logps/rejected": -510.3734436035156, + "loss": 0.5355, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.0662975311279297, + "rewards/margins": 1.1094260215759277, + "rewards/rejected": -3.1757235527038574, + "step": 5170 + }, + { + "epoch": 0.7625496834977182, + "grad_norm": 163.90139851649843, + "learning_rate": 8.108797668318743e-08, + "logits/chosen": -0.9095066785812378, + "logits/rejected": -0.8623138666152954, + "logps/chosen": -445.17529296875, + "logps/rejected": -568.5921020507812, + "loss": 0.5455, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.2778639793395996, + "rewards/margins": 1.1191171407699585, + "rewards/rejected": -3.3969810009002686, + "step": 5180 + }, + { + "epoch": 0.7640217871338142, + "grad_norm": 73.88233739116606, + "learning_rate": 8.01430257711132e-08, + "logits/chosen": -0.9905455708503723, + "logits/rejected": -0.7103983163833618, + "logps/chosen": -458.93902587890625, + "logps/rejected": -593.7363891601562, + "loss": 0.4265, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.1253790855407715, + "rewards/margins": 1.6912176609039307, + "rewards/rejected": -3.8165974617004395, + "step": 5190 + }, + { + "epoch": 0.7654938907699103, + "grad_norm": 196.9430917658201, + "learning_rate": 7.92025610048114e-08, + "logits/chosen": -1.013805627822876, + "logits/rejected": -0.6459366083145142, + "logps/chosen": -409.70068359375, + "logps/rejected": -502.82110595703125, + "loss": 0.4609, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7644859552383423, + "rewards/margins": 1.2906858921051025, + "rewards/rejected": -3.0551717281341553, + "step": 5200 + }, + { + "epoch": 0.7669659944060062, + "grad_norm": 74.651730931631, + "learning_rate": 7.826660722319165e-08, + "logits/chosen": -1.122715950012207, + "logits/rejected": -0.6192690134048462, + "logps/chosen": -541.5331420898438, + "logps/rejected": -591.73779296875, + "loss": 0.4167, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.8046505451202393, + "rewards/margins": 1.2774540185928345, + "rewards/rejected": -3.0821046829223633, + "step": 5210 + }, + { + "epoch": 0.7684380980421022, + "grad_norm": 73.39352904258119, + "learning_rate": 7.733518914602252e-08, + "logits/chosen": -1.3215038776397705, + "logits/rejected": -0.8376846313476562, + "logps/chosen": -507.8236389160156, + "logps/rejected": -609.2210693359375, + "loss": 0.4754, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.7433942556381226, + "rewards/margins": 1.6118046045303345, + "rewards/rejected": -3.355198621749878, + "step": 5220 + }, + { + "epoch": 0.7699102016781981, + "grad_norm": 75.28094293842216, + "learning_rate": 7.640833137327882e-08, + "logits/chosen": -0.9051146507263184, + "logits/rejected": -0.7753626108169556, + "logps/chosen": -438.79345703125, + "logps/rejected": -540.4361572265625, + "loss": 0.4078, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7430813312530518, + "rewards/margins": 1.3570858240127563, + "rewards/rejected": -3.1001667976379395, + "step": 5230 + }, + { + "epoch": 0.7713823053142941, + "grad_norm": 128.82503440964186, + "learning_rate": 7.548605838449151e-08, + "logits/chosen": -0.6902369260787964, + "logits/rejected": -0.8114057779312134, + "logps/chosen": -402.22052001953125, + "logps/rejected": -536.6979370117188, + "loss": 0.413, + "rewards/accuracies": 0.9000000953674316, + "rewards/chosen": -1.646235704421997, + "rewards/margins": 1.00114107131958, + "rewards/rejected": -2.6473770141601562, + "step": 5240 + }, + { + "epoch": 0.7728544089503901, + "grad_norm": 80.97571513219803, + "learning_rate": 7.456839453810157e-08, + "logits/chosen": -0.9411908388137817, + "logits/rejected": -0.9172288775444031, + "logps/chosen": -474.6258850097656, + "logps/rejected": -630.0208740234375, + "loss": 0.4246, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.9182020425796509, + "rewards/margins": 1.4652349948883057, + "rewards/rejected": -3.383436918258667, + "step": 5250 + }, + { + "epoch": 0.7743265125864861, + "grad_norm": 122.10571052023454, + "learning_rate": 7.365536407081633e-08, + "logits/chosen": -1.0238875150680542, + "logits/rejected": -1.099306344985962, + "logps/chosen": -414.7713928222656, + "logps/rejected": -576.614990234375, + "loss": 0.3961, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5568387508392334, + "rewards/margins": 1.4501093626022339, + "rewards/rejected": -3.0069479942321777, + "step": 5260 + }, + { + "epoch": 0.7757986162225821, + "grad_norm": 85.50550045383129, + "learning_rate": 7.274699109696975e-08, + "logits/chosen": -0.6875780820846558, + "logits/rejected": -0.4641219675540924, + "logps/chosen": -390.39202880859375, + "logps/rejected": -429.81597900390625, + "loss": 0.63, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.5352225303649902, + "rewards/margins": 0.7711163759231567, + "rewards/rejected": -2.3063387870788574, + "step": 5270 + }, + { + "epoch": 0.777270719858678, + "grad_norm": 91.4723023139376, + "learning_rate": 7.184329960788491e-08, + "logits/chosen": -0.732958197593689, + "logits/rejected": -0.5895234942436218, + "logps/chosen": -372.40020751953125, + "logps/rejected": -540.822265625, + "loss": 0.4537, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.6646983623504639, + "rewards/margins": 1.4132230281829834, + "rewards/rejected": -3.0779216289520264, + "step": 5280 + }, + { + "epoch": 0.7787428234947741, + "grad_norm": 278.3368464566536, + "learning_rate": 7.094431347124092e-08, + "logits/chosen": -0.7445374727249146, + "logits/rejected": -0.5801293253898621, + "logps/chosen": -404.41229248046875, + "logps/rejected": -530.0057373046875, + "loss": 0.3971, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.5595781803131104, + "rewards/margins": 1.558570146560669, + "rewards/rejected": -3.1181483268737793, + "step": 5290 + }, + { + "epoch": 0.78021492713087, + "grad_norm": 74.82596662365242, + "learning_rate": 7.00500564304424e-08, + "logits/chosen": -1.0966825485229492, + "logits/rejected": -0.7031265497207642, + "logps/chosen": -554.8265380859375, + "logps/rejected": -602.3065795898438, + "loss": 0.482, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.9159107208251953, + "rewards/margins": 0.9733444452285767, + "rewards/rejected": -2.8892550468444824, + "step": 5300 + }, + { + "epoch": 0.781687030766966, + "grad_norm": 120.62747714463966, + "learning_rate": 6.916055210399219e-08, + "logits/chosen": -1.3223588466644287, + "logits/rejected": -1.294297695159912, + "logps/chosen": -470.7421875, + "logps/rejected": -630.7884521484375, + "loss": 0.5395, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6789801120758057, + "rewards/margins": 1.175195336341858, + "rewards/rejected": -2.854175329208374, + "step": 5310 + }, + { + "epoch": 0.783159134403062, + "grad_norm": 167.9875759220198, + "learning_rate": 6.827582398486797e-08, + "logits/chosen": -1.1235952377319336, + "logits/rejected": -0.8662859201431274, + "logps/chosen": -563.8563842773438, + "logps/rejected": -630.38525390625, + "loss": 0.5601, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7638486623764038, + "rewards/margins": 1.6777280569076538, + "rewards/rejected": -3.4415767192840576, + "step": 5320 + }, + { + "epoch": 0.7846312380391579, + "grad_norm": 95.30009107178319, + "learning_rate": 6.739589543990118e-08, + "logits/chosen": -0.8049508929252625, + "logits/rejected": -0.3748716711997986, + "logps/chosen": -458.09881591796875, + "logps/rejected": -523.8165283203125, + "loss": 0.5408, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7180614471435547, + "rewards/margins": 1.321679949760437, + "rewards/rejected": -3.039741039276123, + "step": 5330 + }, + { + "epoch": 0.786103341675254, + "grad_norm": 83.28037746239194, + "learning_rate": 6.652078970916037e-08, + "logits/chosen": -1.2533515691757202, + "logits/rejected": -0.9195235967636108, + "logps/chosen": -450.052734375, + "logps/rejected": -632.3322143554688, + "loss": 0.4399, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -1.8251291513442993, + "rewards/margins": 1.7994213104248047, + "rewards/rejected": -3.6245503425598145, + "step": 5340 + }, + { + "epoch": 0.7875754453113499, + "grad_norm": 71.37025968394839, + "learning_rate": 6.565052990533715e-08, + "logits/chosen": -1.2955372333526611, + "logits/rejected": -1.0631654262542725, + "logps/chosen": -458.4562072753906, + "logps/rejected": -558.8685302734375, + "loss": 0.4946, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.0641376972198486, + "rewards/margins": 1.1009408235549927, + "rewards/rejected": -3.165078639984131, + "step": 5350 + }, + { + "epoch": 0.7890475489474459, + "grad_norm": 169.63257262619535, + "learning_rate": 6.478513901313615e-08, + "logits/chosen": -0.7250382900238037, + "logits/rejected": -0.8027860522270203, + "logps/chosen": -411.46270751953125, + "logps/rejected": -578.429443359375, + "loss": 0.5967, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6648842096328735, + "rewards/margins": 1.2077052593231201, + "rewards/rejected": -2.872589588165283, + "step": 5360 + }, + { + "epoch": 0.7905196525835418, + "grad_norm": 132.49128507271854, + "learning_rate": 6.392463988866714e-08, + "logits/chosen": -0.8881329298019409, + "logits/rejected": -0.8352672457695007, + "logps/chosen": -463.90423583984375, + "logps/rejected": -611.5502319335938, + "loss": 0.6029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8873636722564697, + "rewards/margins": 1.2025063037872314, + "rewards/rejected": -3.089869737625122, + "step": 5370 + }, + { + "epoch": 0.7919917562196379, + "grad_norm": 61.542050881116836, + "learning_rate": 6.306905525884227e-08, + "logits/chosen": -1.3942468166351318, + "logits/rejected": -1.2076038122177124, + "logps/chosen": -462.5706481933594, + "logps/rejected": -575.2427368164062, + "loss": 0.4698, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.9121758937835693, + "rewards/margins": 1.0809472799301147, + "rewards/rejected": -2.9931235313415527, + "step": 5380 + }, + { + "epoch": 0.7934638598557339, + "grad_norm": 103.19763062346142, + "learning_rate": 6.221840772077525e-08, + "logits/chosen": -0.9392485618591309, + "logits/rejected": -0.785864531993866, + "logps/chosen": -442.7115783691406, + "logps/rejected": -534.4571533203125, + "loss": 0.4657, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.0529017448425293, + "rewards/margins": 1.207236886024475, + "rewards/rejected": -3.260138750076294, + "step": 5390 + }, + { + "epoch": 0.7949359634918298, + "grad_norm": 122.32476765811322, + "learning_rate": 6.137271974118468e-08, + "logits/chosen": -1.2774121761322021, + "logits/rejected": -0.9235979914665222, + "logps/chosen": -475.64581298828125, + "logps/rejected": -539.6106567382812, + "loss": 0.5263, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.122734546661377, + "rewards/margins": 0.862054705619812, + "rewards/rejected": -2.9847893714904785, + "step": 5400 + }, + { + "epoch": 0.7964080671279258, + "grad_norm": 64.95411965382428, + "learning_rate": 6.05320136558011e-08, + "logits/chosen": -1.3055362701416016, + "logits/rejected": -1.145372986793518, + "logps/chosen": -454.47137451171875, + "logps/rejected": -581.1241455078125, + "loss": 0.5245, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.04017972946167, + "rewards/margins": 1.0378855466842651, + "rewards/rejected": -3.0780653953552246, + "step": 5410 + }, + { + "epoch": 0.7978801707640217, + "grad_norm": 116.48014811484558, + "learning_rate": 5.969631166877607e-08, + "logits/chosen": -1.1434481143951416, + "logits/rejected": -0.8270803689956665, + "logps/chosen": -515.3753051757812, + "logps/rejected": -593.0107421875, + "loss": 0.4484, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.8972396850585938, + "rewards/margins": 1.433844804763794, + "rewards/rejected": -3.3310844898223877, + "step": 5420 + }, + { + "epoch": 0.7993522744001178, + "grad_norm": 117.519692767702, + "learning_rate": 5.8865635852096754e-08, + "logits/chosen": -1.0619937181472778, + "logits/rejected": -0.7892602682113647, + "logps/chosen": -413.7740173339844, + "logps/rejected": -494.00909423828125, + "loss": 0.4651, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.7297977209091187, + "rewards/margins": 1.0969278812408447, + "rewards/rejected": -2.826725959777832, + "step": 5430 + }, + { + "epoch": 0.8008243780362138, + "grad_norm": 100.66979647975546, + "learning_rate": 5.8040008145002344e-08, + "logits/chosen": -1.5371156930923462, + "logits/rejected": -0.8197881579399109, + "logps/chosen": -532.117919921875, + "logps/rejected": -614.5211791992188, + "loss": 0.402, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.0510811805725098, + "rewards/margins": 1.4343019723892212, + "rewards/rejected": -3.4853832721710205, + "step": 5440 + }, + { + "epoch": 0.8022964816723097, + "grad_norm": 101.8884243594703, + "learning_rate": 5.721945035340511e-08, + "logits/chosen": -0.8520714640617371, + "logits/rejected": -0.6139043569564819, + "logps/chosen": -428.1153869628906, + "logps/rejected": -618.9529418945312, + "loss": 0.4072, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.7851970195770264, + "rewards/margins": 2.267371416091919, + "rewards/rejected": -4.052568435668945, + "step": 5450 + }, + { + "epoch": 0.8037685853084057, + "grad_norm": 100.25980805909032, + "learning_rate": 5.640398414931399e-08, + "logits/chosen": -0.8231403231620789, + "logits/rejected": -0.8821843862533569, + "logps/chosen": -411.2177734375, + "logps/rejected": -550.2454833984375, + "loss": 0.5865, + "rewards/accuracies": 0.5666666626930237, + "rewards/chosen": -2.009340763092041, + "rewards/margins": 0.9562188386917114, + "rewards/rejected": -2.965559482574463, + "step": 5460 + }, + { + "epoch": 0.8052406889445017, + "grad_norm": 142.23657214742704, + "learning_rate": 5.5593631070262296e-08, + "logits/chosen": -1.092525839805603, + "logits/rejected": -0.8739947080612183, + "logps/chosen": -511.62603759765625, + "logps/rejected": -679.1864624023438, + "loss": 0.4404, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -2.1543993949890137, + "rewards/margins": 1.8308734893798828, + "rewards/rejected": -3.9852726459503174, + "step": 5470 + }, + { + "epoch": 0.8067127925805977, + "grad_norm": 36.22180305726395, + "learning_rate": 5.478841251873922e-08, + "logits/chosen": -1.0160709619522095, + "logits/rejected": -0.8282931447029114, + "logps/chosen": -498.01593017578125, + "logps/rejected": -597.4322509765625, + "loss": 0.4351, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.6116422414779663, + "rewards/margins": 1.7601864337921143, + "rewards/rejected": -3.371828556060791, + "step": 5480 + }, + { + "epoch": 0.8081848962166936, + "grad_norm": 49.50775250090131, + "learning_rate": 5.398834976162414e-08, + "logits/chosen": -1.0119783878326416, + "logits/rejected": -0.84825199842453, + "logps/chosen": -447.3890686035156, + "logps/rejected": -616.212158203125, + "loss": 0.4133, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -1.6231396198272705, + "rewards/margins": 1.9679412841796875, + "rewards/rejected": -3.591080904006958, + "step": 5490 + }, + { + "epoch": 0.8096569998527896, + "grad_norm": 94.12103804659836, + "learning_rate": 5.319346392962545e-08, + "logits/chosen": -0.9219223260879517, + "logits/rejected": -1.1575325727462769, + "logps/chosen": -483.120361328125, + "logps/rejected": -614.8145141601562, + "loss": 0.4967, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -2.1298470497131348, + "rewards/margins": 1.2720041275024414, + "rewards/rejected": -3.401851177215576, + "step": 5500 + }, + { + "epoch": 0.8111291034888857, + "grad_norm": 81.43480149716352, + "learning_rate": 5.24037760167218e-08, + "logits/chosen": -1.2106125354766846, + "logits/rejected": -0.9000335931777954, + "logps/chosen": -507.7727966308594, + "logps/rejected": -634.916259765625, + "loss": 0.4055, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.8326189517974854, + "rewards/margins": 1.7793636322021484, + "rewards/rejected": -3.6119818687438965, + "step": 5510 + }, + { + "epoch": 0.8126012071249816, + "grad_norm": 127.47498322992408, + "learning_rate": 5.161930687960808e-08, + "logits/chosen": -1.1342895030975342, + "logits/rejected": -0.4972988963127136, + "logps/chosen": -537.1297607421875, + "logps/rejected": -529.9140014648438, + "loss": 0.6542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.323653221130371, + "rewards/margins": 0.920685887336731, + "rewards/rejected": -3.2443389892578125, + "step": 5520 + }, + { + "epoch": 0.8140733107610776, + "grad_norm": 154.2724258209214, + "learning_rate": 5.0840077237144594e-08, + "logits/chosen": -1.1352190971374512, + "logits/rejected": -0.48211875557899475, + "logps/chosen": -487.15673828125, + "logps/rejected": -586.2088012695312, + "loss": 0.5186, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.9213392734527588, + "rewards/margins": 1.605242133140564, + "rewards/rejected": -3.5265815258026123, + "step": 5530 + }, + { + "epoch": 0.8155454143971735, + "grad_norm": 56.92923315566267, + "learning_rate": 5.006610766980945e-08, + "logits/chosen": -1.3120381832122803, + "logits/rejected": -0.9684025049209595, + "logps/chosen": -483.5899353027344, + "logps/rejected": -585.5111083984375, + "loss": 0.4709, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.9091869592666626, + "rewards/margins": 1.3269858360290527, + "rewards/rejected": -3.236172914505005, + "step": 5540 + }, + { + "epoch": 0.8170175180332695, + "grad_norm": 159.00410823623452, + "learning_rate": 4.929741861915571e-08, + "logits/chosen": -1.103283405303955, + "logits/rejected": -0.8487194776535034, + "logps/chosen": -449.9546813964844, + "logps/rejected": -545.6394653320312, + "loss": 0.4793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9547722339630127, + "rewards/margins": 1.2869764566421509, + "rewards/rejected": -3.241748332977295, + "step": 5550 + }, + { + "epoch": 0.8184896216693656, + "grad_norm": 174.72996470987636, + "learning_rate": 4.853403038727047e-08, + "logits/chosen": -0.8006267547607422, + "logits/rejected": -0.8652332425117493, + "logps/chosen": -446.9790954589844, + "logps/rejected": -649.7698974609375, + "loss": 0.5078, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.230410099029541, + "rewards/margins": 1.1523979902267456, + "rewards/rejected": -3.382808208465576, + "step": 5560 + }, + { + "epoch": 0.8199617253054615, + "grad_norm": 160.17335513149266, + "learning_rate": 4.777596313623966e-08, + "logits/chosen": -0.8895164728164673, + "logits/rejected": -0.713270366191864, + "logps/chosen": -526.5275268554688, + "logps/rejected": -661.9764404296875, + "loss": 0.4825, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.8631007671356201, + "rewards/margins": 1.9552085399627686, + "rewards/rejected": -3.8183093070983887, + "step": 5570 + }, + { + "epoch": 0.8214338289415575, + "grad_norm": 86.23011243438715, + "learning_rate": 4.702323688761492e-08, + "logits/chosen": -1.1419235467910767, + "logits/rejected": -0.6052602529525757, + "logps/chosen": -504.4242248535156, + "logps/rejected": -643.70556640625, + "loss": 0.461, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.5124993324279785, + "rewards/margins": 1.4872256517410278, + "rewards/rejected": -3.9997246265411377, + "step": 5580 + }, + { + "epoch": 0.8229059325776534, + "grad_norm": 130.81186723130014, + "learning_rate": 4.627587152188522e-08, + "logits/chosen": -1.2554781436920166, + "logits/rejected": -0.850979208946228, + "logps/chosen": -460.42193603515625, + "logps/rejected": -534.8896484375, + "loss": 0.4905, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.7845499515533447, + "rewards/margins": 1.4234528541564941, + "rewards/rejected": -3.208002805709839, + "step": 5590 + }, + { + "epoch": 0.8243780362137495, + "grad_norm": 53.48766106246754, + "learning_rate": 4.5533886777951395e-08, + "logits/chosen": -1.2386672496795654, + "logits/rejected": -1.0995800495147705, + "logps/chosen": -488.24609375, + "logps/rejected": -717.6665649414062, + "loss": 0.466, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.591486930847168, + "rewards/margins": 1.6269986629486084, + "rewards/rejected": -3.2184855937957764, + "step": 5600 + }, + { + "epoch": 0.8258501398498455, + "grad_norm": 179.02658737472754, + "learning_rate": 4.4797302252604916e-08, + "logits/chosen": -0.8327213525772095, + "logits/rejected": -0.48737287521362305, + "logps/chosen": -420.37127685546875, + "logps/rejected": -418.37469482421875, + "loss": 0.5046, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9189449548721313, + "rewards/margins": 0.5347118377685547, + "rewards/rejected": -2.4536566734313965, + "step": 5610 + }, + { + "epoch": 0.8273222434859414, + "grad_norm": 119.17117268247245, + "learning_rate": 4.4066137400010555e-08, + "logits/chosen": -1.0616779327392578, + "logits/rejected": -0.677248477935791, + "logps/chosen": -466.2523498535156, + "logps/rejected": -678.2501831054688, + "loss": 0.4532, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9914319515228271, + "rewards/margins": 2.348839282989502, + "rewards/rejected": -4.340271472930908, + "step": 5620 + }, + { + "epoch": 0.8287943471220374, + "grad_norm": 154.605434088848, + "learning_rate": 4.3340411531192306e-08, + "logits/chosen": -1.2709392309188843, + "logits/rejected": -1.0618146657943726, + "logps/chosen": -456.10687255859375, + "logps/rejected": -474.19207763671875, + "loss": 0.4917, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.4876735210418701, + "rewards/margins": 0.9246397018432617, + "rewards/rejected": -2.4123129844665527, + "step": 5630 + }, + { + "epoch": 0.8302664507581333, + "grad_norm": 76.55175223996622, + "learning_rate": 4.262014381352363e-08, + "logits/chosen": -1.1449110507965088, + "logits/rejected": -0.7669967412948608, + "logps/chosen": -472.25311279296875, + "logps/rejected": -571.5916748046875, + "loss": 0.5048, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.970712423324585, + "rewards/margins": 1.2301080226898193, + "rewards/rejected": -3.2008204460144043, + "step": 5640 + }, + { + "epoch": 0.8317385543942294, + "grad_norm": 85.02930703515193, + "learning_rate": 4.1905353270220795e-08, + "logits/chosen": -0.9813809394836426, + "logits/rejected": -0.7250092029571533, + "logps/chosen": -424.24029541015625, + "logps/rejected": -592.8509521484375, + "loss": 0.5054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6724621057510376, + "rewards/margins": 1.3414427042007446, + "rewards/rejected": -3.0139048099517822, + "step": 5650 + }, + { + "epoch": 0.8332106580303253, + "grad_norm": 157.6285759953968, + "learning_rate": 4.119605877984089e-08, + "logits/chosen": -1.104259729385376, + "logits/rejected": -1.0037262439727783, + "logps/chosen": -414.4137268066406, + "logps/rejected": -492.80072021484375, + "loss": 0.7174, + "rewards/accuracies": 0.6333332657814026, + "rewards/chosen": -2.0089263916015625, + "rewards/margins": 0.5909749269485474, + "rewards/rejected": -2.5999014377593994, + "step": 5660 + }, + { + "epoch": 0.8346827616664213, + "grad_norm": 96.77748205867135, + "learning_rate": 4.049227907578284e-08, + "logits/chosen": -1.0676041841506958, + "logits/rejected": -1.1607863903045654, + "logps/chosen": -451.34478759765625, + "logps/rejected": -709.9849853515625, + "loss": 0.4862, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.9473644495010376, + "rewards/margins": 2.1161437034606934, + "rewards/rejected": -4.063508033752441, + "step": 5670 + }, + { + "epoch": 0.8361548653025173, + "grad_norm": 149.98334291225578, + "learning_rate": 3.979403274579313e-08, + "logits/chosen": -1.1041946411132812, + "logits/rejected": -0.8294920921325684, + "logps/chosen": -480.2505798339844, + "logps/rejected": -617.8287353515625, + "loss": 0.4211, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8639404773712158, + "rewards/margins": 1.836787223815918, + "rewards/rejected": -3.7007274627685547, + "step": 5680 + }, + { + "epoch": 0.8376269689386133, + "grad_norm": 208.11218973412983, + "learning_rate": 3.9101338231474354e-08, + "logits/chosen": -1.2287366390228271, + "logits/rejected": -1.170518159866333, + "logps/chosen": -497.8878479003906, + "logps/rejected": -603.9132080078125, + "loss": 0.5463, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1765148639678955, + "rewards/margins": 1.1622388362884521, + "rewards/rejected": -3.3387539386749268, + "step": 5690 + }, + { + "epoch": 0.8390990725747093, + "grad_norm": 262.99584228024503, + "learning_rate": 3.841421382779827e-08, + "logits/chosen": -0.9853576421737671, + "logits/rejected": -0.9694561958312988, + "logps/chosen": -434.36968994140625, + "logps/rejected": -623.12353515625, + "loss": 0.5086, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.9018787145614624, + "rewards/margins": 1.5596400499343872, + "rewards/rejected": -3.4615185260772705, + "step": 5700 + }, + { + "epoch": 0.8405711762108052, + "grad_norm": 105.01294458107496, + "learning_rate": 3.773267768262289e-08, + "logits/chosen": -1.087580680847168, + "logits/rejected": -0.925014317035675, + "logps/chosen": -448.4651794433594, + "logps/rejected": -566.5296630859375, + "loss": 0.5454, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.8132915496826172, + "rewards/margins": 1.158361792564392, + "rewards/rejected": -2.9716532230377197, + "step": 5710 + }, + { + "epoch": 0.8420432798469012, + "grad_norm": 73.87553599269656, + "learning_rate": 3.7056747796212844e-08, + "logits/chosen": -1.0383179187774658, + "logits/rejected": -0.7845174670219421, + "logps/chosen": -416.5511169433594, + "logps/rejected": -576.56298828125, + "loss": 0.4035, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.6777441501617432, + "rewards/margins": 1.9753530025482178, + "rewards/rejected": -3.653097152709961, + "step": 5720 + }, + { + "epoch": 0.8435153834829971, + "grad_norm": 100.82851322025643, + "learning_rate": 3.63864420207643e-08, + "logits/chosen": -1.1529678106307983, + "logits/rejected": -1.1272436380386353, + "logps/chosen": -488.6665954589844, + "logps/rejected": -651.0697021484375, + "loss": 0.3804, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.9270823001861572, + "rewards/margins": 1.856080412864685, + "rewards/rejected": -3.7831625938415527, + "step": 5730 + }, + { + "epoch": 0.8449874871190932, + "grad_norm": 111.12084223372518, + "learning_rate": 3.5721778059933096e-08, + "logits/chosen": -1.1405971050262451, + "logits/rejected": -1.0132901668548584, + "logps/chosen": -491.6520080566406, + "logps/rejected": -472.123779296875, + "loss": 0.5681, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.2600319385528564, + "rewards/margins": 0.36489155888557434, + "rewards/rejected": -2.6249232292175293, + "step": 5740 + }, + { + "epoch": 0.8464595907551892, + "grad_norm": 53.54603995220204, + "learning_rate": 3.506277346836731e-08, + "logits/chosen": -0.9994527101516724, + "logits/rejected": -1.1588630676269531, + "logps/chosen": -467.54095458984375, + "logps/rejected": -668.3226318359375, + "loss": 0.547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.395822048187256, + "rewards/margins": 1.0861228704452515, + "rewards/rejected": -3.4819445610046387, + "step": 5750 + }, + { + "epoch": 0.8479316943912851, + "grad_norm": 254.240027819125, + "learning_rate": 3.440944565124371e-08, + "logits/chosen": -0.8595970273017883, + "logits/rejected": -0.6523178815841675, + "logps/chosen": -398.448974609375, + "logps/rejected": -523.7808837890625, + "loss": 0.4023, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.7556339502334595, + "rewards/margins": 1.1756236553192139, + "rewards/rejected": -2.931257724761963, + "step": 5760 + }, + { + "epoch": 0.8494037980273811, + "grad_norm": 223.22737317671786, + "learning_rate": 3.376181186380808e-08, + "logits/chosen": -1.2577579021453857, + "logits/rejected": -0.9970169067382812, + "logps/chosen": -472.1746520996094, + "logps/rejected": -514.9297485351562, + "loss": 0.5821, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.2786521911621094, + "rewards/margins": 0.6689008474349976, + "rewards/rejected": -2.9475531578063965, + "step": 5770 + }, + { + "epoch": 0.8508759016634772, + "grad_norm": 197.44604806145549, + "learning_rate": 3.311988921091935e-08, + "logits/chosen": -1.3751404285430908, + "logits/rejected": -0.8442422151565552, + "logps/chosen": -464.4234313964844, + "logps/rejected": -507.7842712402344, + "loss": 0.4788, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.6880584955215454, + "rewards/margins": 1.2394956350326538, + "rewards/rejected": -2.9275546073913574, + "step": 5780 + }, + { + "epoch": 0.8523480052995731, + "grad_norm": 102.29696075400405, + "learning_rate": 3.248369464659775e-08, + "logits/chosen": -1.027417778968811, + "logits/rejected": -0.942034900188446, + "logps/chosen": -374.5035400390625, + "logps/rejected": -518.6979370117188, + "loss": 0.445, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.6253198385238647, + "rewards/margins": 1.6298513412475586, + "rewards/rejected": -3.255171537399292, + "step": 5790 + }, + { + "epoch": 0.8538201089356691, + "grad_norm": 92.03487099959763, + "learning_rate": 3.1853244973577306e-08, + "logits/chosen": -1.2639530897140503, + "logits/rejected": -0.739490807056427, + "logps/chosen": -470.2442932128906, + "logps/rejected": -612.7994995117188, + "loss": 0.5182, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.8614349365234375, + "rewards/margins": 1.4685957431793213, + "rewards/rejected": -3.330030918121338, + "step": 5800 + }, + { + "epoch": 0.855292212571765, + "grad_norm": 142.11913033968304, + "learning_rate": 3.122855684286185e-08, + "logits/chosen": -0.9426442384719849, + "logits/rejected": -0.8573685884475708, + "logps/chosen": -509.3414001464844, + "logps/rejected": -676.3045654296875, + "loss": 0.4528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1690833568573, + "rewards/margins": 1.5321247577667236, + "rewards/rejected": -3.7012085914611816, + "step": 5810 + }, + { + "epoch": 0.8567643162078611, + "grad_norm": 63.691459163921024, + "learning_rate": 3.060964675328545e-08, + "logits/chosen": -0.9643454551696777, + "logits/rejected": -0.8281852006912231, + "logps/chosen": -487.9972229003906, + "logps/rejected": -614.6697998046875, + "loss": 0.4987, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.344829559326172, + "rewards/margins": 1.1859668493270874, + "rewards/rejected": -3.530796766281128, + "step": 5820 + }, + { + "epoch": 0.858236419843957, + "grad_norm": 90.70629094172945, + "learning_rate": 2.999653105107644e-08, + "logits/chosen": -0.971082329750061, + "logits/rejected": -0.880352795124054, + "logps/chosen": -414.6600646972656, + "logps/rejected": -575.28076171875, + "loss": 0.5205, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.7170751094818115, + "rewards/margins": 1.6688358783721924, + "rewards/rejected": -3.385911226272583, + "step": 5830 + }, + { + "epoch": 0.859708523480053, + "grad_norm": 95.93040768662246, + "learning_rate": 2.9389225929425637e-08, + "logits/chosen": -1.0628657341003418, + "logits/rejected": -0.8438869714736938, + "logps/chosen": -411.8072204589844, + "logps/rejected": -538.6499633789062, + "loss": 0.4095, + "rewards/accuracies": 0.9000000953674316, + "rewards/chosen": -1.662790298461914, + "rewards/margins": 1.56641685962677, + "rewards/rejected": -3.2292072772979736, + "step": 5840 + }, + { + "epoch": 0.861180627116149, + "grad_norm": 119.05362322287101, + "learning_rate": 2.8787747428058977e-08, + "logits/chosen": -1.1255278587341309, + "logits/rejected": -1.2217938899993896, + "logps/chosen": -497.3191833496094, + "logps/rejected": -604.4038696289062, + "loss": 0.3855, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.7328379154205322, + "rewards/margins": 1.214876413345337, + "rewards/rejected": -2.947714328765869, + "step": 5850 + }, + { + "epoch": 0.8626527307522449, + "grad_norm": 152.68991525890866, + "learning_rate": 2.8192111432813554e-08, + "logits/chosen": -1.560386300086975, + "logits/rejected": -1.0124329328536987, + "logps/chosen": -463.9375, + "logps/rejected": -585.4347534179688, + "loss": 0.5953, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.9509546756744385, + "rewards/margins": 1.1762186288833618, + "rewards/rejected": -3.1271729469299316, + "step": 5860 + }, + { + "epoch": 0.864124834388341, + "grad_norm": 127.23924804045015, + "learning_rate": 2.7602333675218443e-08, + "logits/chosen": -1.2014895677566528, + "logits/rejected": -0.9078919291496277, + "logps/chosen": -433.54364013671875, + "logps/rejected": -464.6463317871094, + "loss": 0.5079, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.8237797021865845, + "rewards/margins": 0.9135274887084961, + "rewards/rejected": -2.737307071685791, + "step": 5870 + }, + { + "epoch": 0.8655969380244369, + "grad_norm": 76.70428432092903, + "learning_rate": 2.701842973207874e-08, + "logits/chosen": -0.9394499063491821, + "logits/rejected": -0.7848396897315979, + "logps/chosen": -439.12432861328125, + "logps/rejected": -620.3496704101562, + "loss": 0.4827, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.6949279308319092, + "rewards/margins": 1.8139915466308594, + "rewards/rejected": -3.5089192390441895, + "step": 5880 + }, + { + "epoch": 0.8670690416605329, + "grad_norm": 71.93267040248142, + "learning_rate": 2.6440415025064417e-08, + "logits/chosen": -1.2235647439956665, + "logits/rejected": -1.0570650100708008, + "logps/chosen": -482.0106506347656, + "logps/rejected": -619.4939575195312, + "loss": 0.4735, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6319239139556885, + "rewards/margins": 1.7923908233642578, + "rewards/rejected": -3.424314498901367, + "step": 5890 + }, + { + "epoch": 0.8685411452966288, + "grad_norm": 98.10158170971337, + "learning_rate": 2.5868304820303027e-08, + "logits/chosen": -1.1498442888259888, + "logits/rejected": -0.9952654838562012, + "logps/chosen": -497.09588623046875, + "logps/rejected": -573.7760009765625, + "loss": 0.4794, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -2.0244147777557373, + "rewards/margins": 1.2200558185577393, + "rewards/rejected": -3.2444705963134766, + "step": 5900 + }, + { + "epoch": 0.8700132489327249, + "grad_norm": 72.61468827010874, + "learning_rate": 2.5302114227976513e-08, + "logits/chosen": -1.2181391716003418, + "logits/rejected": -1.087153673171997, + "logps/chosen": -447.2759704589844, + "logps/rejected": -553.8470458984375, + "loss": 0.4193, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.5535515546798706, + "rewards/margins": 1.3484240770339966, + "rewards/rejected": -2.901975631713867, + "step": 5910 + }, + { + "epoch": 0.8714853525688209, + "grad_norm": 137.7867364223512, + "learning_rate": 2.4741858201922065e-08, + "logits/chosen": -1.1270124912261963, + "logits/rejected": -0.8954577445983887, + "logps/chosen": -467.2159729003906, + "logps/rejected": -643.2802124023438, + "loss": 0.4876, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.9270009994506836, + "rewards/margins": 1.7104324102401733, + "rewards/rejected": -3.6374335289001465, + "step": 5920 + }, + { + "epoch": 0.8729574562049168, + "grad_norm": 45.79532439368993, + "learning_rate": 2.4187551539237073e-08, + "logits/chosen": -1.2035057544708252, + "logits/rejected": -0.7668858766555786, + "logps/chosen": -451.9247131347656, + "logps/rejected": -554.51953125, + "loss": 0.4662, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6970252990722656, + "rewards/margins": 1.5563323497772217, + "rewards/rejected": -3.2533576488494873, + "step": 5930 + }, + { + "epoch": 0.8744295598410128, + "grad_norm": 57.65829417544029, + "learning_rate": 2.363920887988849e-08, + "logits/chosen": -1.0363441705703735, + "logits/rejected": -0.8559466600418091, + "logps/chosen": -458.0599060058594, + "logps/rejected": -675.2276611328125, + "loss": 0.4659, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -2.34497332572937, + "rewards/margins": 1.9114878177642822, + "rewards/rejected": -4.256461143493652, + "step": 5940 + }, + { + "epoch": 0.8759016634771087, + "grad_norm": 143.93465632362677, + "learning_rate": 2.3096844706326112e-08, + "logits/chosen": -1.090350866317749, + "logits/rejected": -0.6928873658180237, + "logps/chosen": -496.72564697265625, + "logps/rejected": -630.3134765625, + "loss": 0.4589, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.0421767234802246, + "rewards/margins": 1.9863598346710205, + "rewards/rejected": -4.028536796569824, + "step": 5950 + }, + { + "epoch": 0.8773737671132048, + "grad_norm": 112.07739438060192, + "learning_rate": 2.2560473343100155e-08, + "logits/chosen": -1.1264533996582031, + "logits/rejected": -0.8227478265762329, + "logps/chosen": -421.42547607421875, + "logps/rejected": -528.3717041015625, + "loss": 0.5382, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.9015655517578125, + "rewards/margins": 1.4118037223815918, + "rewards/rejected": -3.313369035720825, + "step": 5960 + }, + { + "epoch": 0.8788458707493008, + "grad_norm": 116.4748272060503, + "learning_rate": 2.20301089564828e-08, + "logits/chosen": -1.0290887355804443, + "logits/rejected": -0.39749449491500854, + "logps/chosen": -472.896240234375, + "logps/rejected": -572.2553100585938, + "loss": 0.4707, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.9639848470687866, + "rewards/margins": 1.3849437236785889, + "rewards/rejected": -3.3489291667938232, + "step": 5970 + }, + { + "epoch": 0.8803179743853967, + "grad_norm": 124.02803087870763, + "learning_rate": 2.150576555409392e-08, + "logits/chosen": -1.0157904624938965, + "logits/rejected": -0.6371365785598755, + "logps/chosen": -364.58868408203125, + "logps/rejected": -477.22076416015625, + "loss": 0.5358, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.7165544033050537, + "rewards/margins": 1.3299009799957275, + "rewards/rejected": -3.046455144882202, + "step": 5980 + }, + { + "epoch": 0.8817900780214927, + "grad_norm": 109.4277135113522, + "learning_rate": 2.0987456984531472e-08, + "logits/chosen": -1.455991506576538, + "logits/rejected": -1.1371604204177856, + "logps/chosen": -449.2311096191406, + "logps/rejected": -541.2726440429688, + "loss": 0.5156, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -1.9608980417251587, + "rewards/margins": 1.1026145219802856, + "rewards/rejected": -3.0635123252868652, + "step": 5990 + }, + { + "epoch": 0.8832621816575887, + "grad_norm": 82.68907077772235, + "learning_rate": 2.0475196937005557e-08, + "logits/chosen": -1.0126214027404785, + "logits/rejected": -0.9786561727523804, + "logps/chosen": -367.5531005859375, + "logps/rejected": -586.8067626953125, + "loss": 0.4608, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.5657813549041748, + "rewards/margins": 1.5962798595428467, + "rewards/rejected": -3.1620612144470215, + "step": 6000 + }, + { + "epoch": 0.8847342852936847, + "grad_norm": 205.49619592613445, + "learning_rate": 1.9968998940976784e-08, + "logits/chosen": -1.3245373964309692, + "logits/rejected": -0.9575395584106445, + "logps/chosen": -436.8778381347656, + "logps/rejected": -620.2337036132812, + "loss": 0.4562, + "rewards/accuracies": 0.8666666150093079, + "rewards/chosen": -1.7500731945037842, + "rewards/margins": 2.0945775508880615, + "rewards/rejected": -3.844650983810425, + "step": 6010 + }, + { + "epoch": 0.8862063889297807, + "grad_norm": 128.62523784183892, + "learning_rate": 1.9468876365799054e-08, + "logits/chosen": -0.9967330098152161, + "logits/rejected": -0.6000679135322571, + "logps/chosen": -482.8002014160156, + "logps/rejected": -575.7860107421875, + "loss": 0.3783, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.136063814163208, + "rewards/margins": 1.5794326066970825, + "rewards/rejected": -3.71549654006958, + "step": 6020 + }, + { + "epoch": 0.8876784925658766, + "grad_norm": 55.25526754372452, + "learning_rate": 1.89748424203664e-08, + "logits/chosen": -1.1315118074417114, + "logits/rejected": -0.7708331346511841, + "logps/chosen": -488.0318298339844, + "logps/rejected": -543.4078369140625, + "loss": 0.4223, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.7892534732818604, + "rewards/margins": 1.1177319288253784, + "rewards/rejected": -2.90698504447937, + "step": 6030 + }, + { + "epoch": 0.8891505962019726, + "grad_norm": 76.68621314264306, + "learning_rate": 1.8486910152764102e-08, + "logits/chosen": -1.0846840143203735, + "logits/rejected": -0.6855759620666504, + "logps/chosen": -427.6155700683594, + "logps/rejected": -574.6207275390625, + "loss": 0.4117, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.7044055461883545, + "rewards/margins": 1.4993574619293213, + "rewards/rejected": -3.2037627696990967, + "step": 6040 + }, + { + "epoch": 0.8906226998380686, + "grad_norm": 74.03394201480887, + "learning_rate": 1.8005092449924292e-08, + "logits/chosen": -1.0500560998916626, + "logits/rejected": -1.0109421014785767, + "logps/chosen": -417.21588134765625, + "logps/rejected": -489.804931640625, + "loss": 0.5801, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.044023036956787, + "rewards/margins": 0.6984446048736572, + "rewards/rejected": -2.7424676418304443, + "step": 6050 + }, + { + "epoch": 0.8920948034741646, + "grad_norm": 132.86923242747673, + "learning_rate": 1.7529402037285312e-08, + "logits/chosen": -1.1678305864334106, + "logits/rejected": -1.0054666996002197, + "logps/chosen": -462.7268981933594, + "logps/rejected": -646.675537109375, + "loss": 0.452, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.8796907663345337, + "rewards/margins": 1.9571386575698853, + "rewards/rejected": -3.836829662322998, + "step": 6060 + }, + { + "epoch": 0.8935669071102605, + "grad_norm": 58.115963415199815, + "learning_rate": 1.7059851478455594e-08, + "logits/chosen": -1.0361781120300293, + "logits/rejected": -0.7650968432426453, + "logps/chosen": -472.3480529785156, + "logps/rejected": -594.3772583007812, + "loss": 0.3842, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.1054859161376953, + "rewards/margins": 1.515193223953247, + "rewards/rejected": -3.6206798553466797, + "step": 6070 + }, + { + "epoch": 0.8950390107463565, + "grad_norm": 55.546504269962334, + "learning_rate": 1.6596453174882145e-08, + "logits/chosen": -0.8260339498519897, + "logits/rejected": -0.8860335350036621, + "logps/chosen": -425.872314453125, + "logps/rejected": -601.16015625, + "loss": 0.4087, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.35821533203125, + "rewards/margins": 1.0598571300506592, + "rewards/rejected": -3.418072462081909, + "step": 6080 + }, + { + "epoch": 0.8965111143824526, + "grad_norm": 58.67715479865092, + "learning_rate": 1.6139219365522732e-08, + "logits/chosen": -1.2001938819885254, + "logits/rejected": -0.8947411775588989, + "logps/chosen": -536.5673217773438, + "logps/rejected": -654.0597534179688, + "loss": 0.4242, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -2.0023317337036133, + "rewards/margins": 1.3319785594940186, + "rewards/rejected": -3.334310531616211, + "step": 6090 + }, + { + "epoch": 0.8979832180185485, + "grad_norm": 104.52385286627003, + "learning_rate": 1.5688162126522853e-08, + "logits/chosen": -1.22173011302948, + "logits/rejected": -0.6969095468521118, + "logps/chosen": -479.08148193359375, + "logps/rejected": -558.4459228515625, + "loss": 0.4425, + "rewards/accuracies": 0.73333340883255, + "rewards/chosen": -2.0066559314727783, + "rewards/margins": 1.5528547763824463, + "rewards/rejected": -3.5595107078552246, + "step": 6100 + }, + { + "epoch": 0.8994553216546445, + "grad_norm": 56.22576967455009, + "learning_rate": 1.5243293370896554e-08, + "logits/chosen": -1.1908855438232422, + "logits/rejected": -0.8657861948013306, + "logps/chosen": -470.0201110839844, + "logps/rejected": -540.4694213867188, + "loss": 0.5103, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.9704595804214478, + "rewards/margins": 0.9596738815307617, + "rewards/rejected": -2.93013334274292, + "step": 6110 + }, + { + "epoch": 0.9009274252907404, + "grad_norm": 78.84335626771664, + "learning_rate": 1.4804624848211933e-08, + "logits/chosen": -0.9483867883682251, + "logits/rejected": -0.694191575050354, + "logps/chosen": -443.8580017089844, + "logps/rejected": -550.030029296875, + "loss": 0.5239, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.0560388565063477, + "rewards/margins": 0.8804718852043152, + "rewards/rejected": -2.9365105628967285, + "step": 6120 + }, + { + "epoch": 0.9023995289268365, + "grad_norm": 108.55464657718488, + "learning_rate": 1.4372168144280744e-08, + "logits/chosen": -0.9750598669052124, + "logits/rejected": -0.5964799523353577, + "logps/chosen": -464.1393127441406, + "logps/rejected": -559.6013793945312, + "loss": 0.4942, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.0285205841064453, + "rewards/margins": 1.3152391910552979, + "rewards/rejected": -3.343759536743164, + "step": 6130 + }, + { + "epoch": 0.9038716325629325, + "grad_norm": 133.943725791223, + "learning_rate": 1.3945934680852617e-08, + "logits/chosen": -1.1605137586593628, + "logits/rejected": -0.6728196740150452, + "logps/chosen": -473.07672119140625, + "logps/rejected": -631.1534423828125, + "loss": 0.4379, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.7821353673934937, + "rewards/margins": 2.1389527320861816, + "rewards/rejected": -3.9210879802703857, + "step": 6140 + }, + { + "epoch": 0.9053437361990284, + "grad_norm": 51.57869668531544, + "learning_rate": 1.3525935715313074e-08, + "logits/chosen": -0.963829517364502, + "logits/rejected": -0.6825015544891357, + "logps/chosen": -402.5187072753906, + "logps/rejected": -547.8536376953125, + "loss": 0.4045, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.1833713054656982, + "rewards/margins": 1.1140834093093872, + "rewards/rejected": -3.297454833984375, + "step": 6150 + }, + { + "epoch": 0.9068158398351244, + "grad_norm": 66.05268488641035, + "learning_rate": 1.3112182340386412e-08, + "logits/chosen": -0.8111869692802429, + "logits/rejected": -0.7929922938346863, + "logps/chosen": -409.51611328125, + "logps/rejected": -542.3306274414062, + "loss": 0.4791, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.024592399597168, + "rewards/margins": 1.5310230255126953, + "rewards/rejected": -3.5556156635284424, + "step": 6160 + }, + { + "epoch": 0.9082879434712203, + "grad_norm": 114.52674341060448, + "learning_rate": 1.2704685483842626e-08, + "logits/chosen": -0.9519659876823425, + "logits/rejected": -0.5504119992256165, + "logps/chosen": -374.53839111328125, + "logps/rejected": -454.9689025878906, + "loss": 0.5334, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.816737174987793, + "rewards/margins": 1.2262861728668213, + "rewards/rejected": -3.0430235862731934, + "step": 6170 + }, + { + "epoch": 0.9097600471073164, + "grad_norm": 77.48187962321082, + "learning_rate": 1.2303455908208915e-08, + "logits/chosen": -1.1340410709381104, + "logits/rejected": -0.9948369860649109, + "logps/chosen": -433.7491149902344, + "logps/rejected": -493.46563720703125, + "loss": 0.4553, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.0793144702911377, + "rewards/margins": 1.07127845287323, + "rewards/rejected": -3.1505930423736572, + "step": 6180 + }, + { + "epoch": 0.9112321507434124, + "grad_norm": 82.07364183991278, + "learning_rate": 1.1908504210485337e-08, + "logits/chosen": -1.0878283977508545, + "logits/rejected": -0.5000115633010864, + "logps/chosen": -500.4331970214844, + "logps/rejected": -550.1397705078125, + "loss": 0.4731, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8979928493499756, + "rewards/margins": 1.2087833881378174, + "rewards/rejected": -3.106776237487793, + "step": 6190 + }, + { + "epoch": 0.9127042543795083, + "grad_norm": 47.59430648468817, + "learning_rate": 1.1519840821864968e-08, + "logits/chosen": -1.2188410758972168, + "logits/rejected": -0.8747907876968384, + "logps/chosen": -498.33526611328125, + "logps/rejected": -611.33544921875, + "loss": 0.4315, + "rewards/accuracies": 0.6666667461395264, + "rewards/chosen": -1.9243104457855225, + "rewards/margins": 1.0853172540664673, + "rewards/rejected": -3.0096278190612793, + "step": 6200 + }, + { + "epoch": 0.9141763580156043, + "grad_norm": 374.367432511081, + "learning_rate": 1.113747600745829e-08, + "logits/chosen": -1.0106618404388428, + "logits/rejected": -0.9849384427070618, + "logps/chosen": -435.19476318359375, + "logps/rejected": -569.897216796875, + "loss": 0.5233, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -1.8131921291351318, + "rewards/margins": 1.2545406818389893, + "rewards/rejected": -3.067732810974121, + "step": 6210 + }, + { + "epoch": 0.9156484616517003, + "grad_norm": 223.04598552152095, + "learning_rate": 1.0761419866022247e-08, + "logits/chosen": -1.302537441253662, + "logits/rejected": -1.143919587135315, + "logps/chosen": -408.35308837890625, + "logps/rejected": -521.6826782226562, + "loss": 0.5591, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.5771526098251343, + "rewards/margins": 1.4312165975570679, + "rewards/rejected": -3.0083696842193604, + "step": 6220 + }, + { + "epoch": 0.9171205652877963, + "grad_norm": 109.83107388272123, + "learning_rate": 1.039168232969348e-08, + "logits/chosen": -1.0693390369415283, + "logits/rejected": -0.8511130213737488, + "logps/chosen": -462.4171447753906, + "logps/rejected": -543.8641357421875, + "loss": 0.5307, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.021796941757202, + "rewards/margins": 1.014459252357483, + "rewards/rejected": -3.0362563133239746, + "step": 6230 + }, + { + "epoch": 0.9185926689238922, + "grad_norm": 220.7733947686544, + "learning_rate": 1.0028273163725898e-08, + "logits/chosen": -1.2089552879333496, + "logits/rejected": -1.0650651454925537, + "logps/chosen": -561.923828125, + "logps/rejected": -613.2421264648438, + "loss": 0.4803, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.3091793060302734, + "rewards/margins": 1.1226726770401, + "rewards/rejected": -3.431851863861084, + "step": 6240 + }, + { + "epoch": 0.9200647725599882, + "grad_norm": 77.48951853990002, + "learning_rate": 9.671201966232889e-09, + "logits/chosen": -0.8748568296432495, + "logits/rejected": -0.8215582966804504, + "logps/chosen": -445.2130432128906, + "logps/rejected": -563.3865356445312, + "loss": 0.4223, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -2.226011276245117, + "rewards/margins": 1.1341211795806885, + "rewards/rejected": -3.3601322174072266, + "step": 6250 + }, + { + "epoch": 0.9215368761960842, + "grad_norm": 83.13525475425418, + "learning_rate": 9.320478167933659e-09, + "logits/chosen": -1.1657829284667969, + "logits/rejected": -1.0267692804336548, + "logps/chosen": -465.1402893066406, + "logps/rejected": -614.5159912109375, + "loss": 0.4565, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.878274917602539, + "rewards/margins": 1.4825143814086914, + "rewards/rejected": -3.3607895374298096, + "step": 6260 + }, + { + "epoch": 0.9230089798321802, + "grad_norm": 124.34046003087606, + "learning_rate": 8.976111031904377e-09, + "logits/chosen": -1.5715224742889404, + "logits/rejected": -1.0920354127883911, + "logps/chosen": -490.23919677734375, + "logps/rejected": -550.3733520507812, + "loss": 0.4351, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.9004207849502563, + "rewards/margins": 1.3330665826797485, + "rewards/rejected": -3.233487367630005, + "step": 6270 + }, + { + "epoch": 0.9244810834682762, + "grad_norm": 110.78770180571546, + "learning_rate": 8.638109653333419e-09, + "logits/chosen": -1.502655267715454, + "logits/rejected": -0.9701645970344543, + "logps/chosen": -563.0130004882812, + "logps/rejected": -620.131103515625, + "loss": 0.4696, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.1406774520874023, + "rewards/margins": 1.4651085138320923, + "rewards/rejected": -3.605785846710205, + "step": 6280 + }, + { + "epoch": 0.9259531871043721, + "grad_norm": 88.95499558524205, + "learning_rate": 8.306482959281103e-09, + "logits/chosen": -0.8414198160171509, + "logits/rejected": -0.6477373242378235, + "logps/chosen": -436.2674255371094, + "logps/rejected": -523.2989501953125, + "loss": 0.5516, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.353652238845825, + "rewards/margins": 1.1429803371429443, + "rewards/rejected": -3.4966323375701904, + "step": 6290 + }, + { + "epoch": 0.9274252907404681, + "grad_norm": 63.55674079565389, + "learning_rate": 7.981239708443971e-09, + "logits/chosen": -1.1431224346160889, + "logits/rejected": -0.7958438396453857, + "logps/chosen": -482.1263732910156, + "logps/rejected": -610.4625854492188, + "loss": 0.4622, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -1.8927704095840454, + "rewards/margins": 1.7283560037612915, + "rewards/rejected": -3.621126651763916, + "step": 6300 + }, + { + "epoch": 0.9288973943765642, + "grad_norm": 83.68214539376939, + "learning_rate": 7.662388490923378e-09, + "logits/chosen": -0.9260037541389465, + "logits/rejected": -0.7859050035476685, + "logps/chosen": -568.2583618164062, + "logps/rejected": -735.2937622070312, + "loss": 0.4456, + "rewards/accuracies": 0.6666666269302368, + "rewards/chosen": -2.9225242137908936, + "rewards/margins": 1.501201868057251, + "rewards/rejected": -4.4237260818481445, + "step": 6310 + }, + { + "epoch": 0.9303694980126601, + "grad_norm": 76.1498696254405, + "learning_rate": 7.3499377279988e-09, + "logits/chosen": -1.1692345142364502, + "logits/rejected": -0.7514751553535461, + "logps/chosen": -493.26055908203125, + "logps/rejected": -542.8193969726562, + "loss": 0.6095, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.403029203414917, + "rewards/margins": 0.9948217272758484, + "rewards/rejected": -3.397850751876831, + "step": 6320 + }, + { + "epoch": 0.9318416016487561, + "grad_norm": 76.39111960554915, + "learning_rate": 7.043895671905248e-09, + "logits/chosen": -1.087283968925476, + "logits/rejected": -1.15114426612854, + "logps/chosen": -432.1312561035156, + "logps/rejected": -595.9295043945312, + "loss": 0.4472, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.951411247253418, + "rewards/margins": 1.359915018081665, + "rewards/rejected": -3.311326265335083, + "step": 6330 + }, + { + "epoch": 0.933313705284852, + "grad_norm": 125.96388152280082, + "learning_rate": 6.744270405615371e-09, + "logits/chosen": -1.1190003156661987, + "logits/rejected": -0.8621671795845032, + "logps/chosen": -459.2276916503906, + "logps/rejected": -640.4942626953125, + "loss": 0.4387, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.7050561904907227, + "rewards/margins": 2.3613228797912598, + "rewards/rejected": -4.066379070281982, + "step": 6340 + }, + { + "epoch": 0.934785808920948, + "grad_norm": 82.64190295202974, + "learning_rate": 6.4510698426259165e-09, + "logits/chosen": -1.0808823108673096, + "logits/rejected": -0.8061186671257019, + "logps/chosen": -454.32550048828125, + "logps/rejected": -562.7786865234375, + "loss": 0.3587, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.911500334739685, + "rewards/margins": 1.4210889339447021, + "rewards/rejected": -3.3325889110565186, + "step": 6350 + }, + { + "epoch": 0.936257912557044, + "grad_norm": 60.9474210453319, + "learning_rate": 6.164301726748838e-09, + "logits/chosen": -1.1044061183929443, + "logits/rejected": -0.6380583047866821, + "logps/chosen": -435.17108154296875, + "logps/rejected": -544.3577880859375, + "loss": 0.4034, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -1.6812435388565063, + "rewards/margins": 1.6647593975067139, + "rewards/rejected": -3.3460030555725098, + "step": 6360 + }, + { + "epoch": 0.93773001619314, + "grad_norm": 193.57731333604957, + "learning_rate": 5.883973631906736e-09, + "logits/chosen": -1.0638282299041748, + "logits/rejected": -0.8393194079399109, + "logps/chosen": -497.1612243652344, + "logps/rejected": -657.2183837890625, + "loss": 0.486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.517075300216675, + "rewards/margins": 1.5914770364761353, + "rewards/rejected": -4.1085524559021, + "step": 6370 + }, + { + "epoch": 0.939202119829236, + "grad_norm": 108.94371171332827, + "learning_rate": 5.610092961932766e-09, + "logits/chosen": -1.0650379657745361, + "logits/rejected": -1.0115690231323242, + "logps/chosen": -485.01177978515625, + "logps/rejected": -645.3680419921875, + "loss": 0.546, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.1590046882629395, + "rewards/margins": 1.8166015148162842, + "rewards/rejected": -3.9756064414978027, + "step": 6380 + }, + { + "epoch": 0.9406742234653319, + "grad_norm": 129.99729538000426, + "learning_rate": 5.3426669503751355e-09, + "logits/chosen": -1.3481450080871582, + "logits/rejected": -0.661180853843689, + "logps/chosen": -549.8556518554688, + "logps/rejected": -561.4913330078125, + "loss": 0.4474, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.3059380054473877, + "rewards/margins": 0.9488359689712524, + "rewards/rejected": -3.2547736167907715, + "step": 6390 + }, + { + "epoch": 0.942146327101428, + "grad_norm": 146.380243349574, + "learning_rate": 5.081702660306025e-09, + "logits/chosen": -1.2535059452056885, + "logits/rejected": -0.7133938670158386, + "logps/chosen": -502.32635498046875, + "logps/rejected": -682.0352783203125, + "loss": 0.5171, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.412825345993042, + "rewards/margins": 1.7324565649032593, + "rewards/rejected": -4.145281791687012, + "step": 6400 + }, + { + "epoch": 0.9436184307375239, + "grad_norm": 81.94967851941736, + "learning_rate": 4.827206984135024e-09, + "logits/chosen": -0.8781982660293579, + "logits/rejected": -0.7172101736068726, + "logps/chosen": -429.4791564941406, + "logps/rejected": -579.2913818359375, + "loss": 0.5307, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -2.239814281463623, + "rewards/margins": 1.498968482017517, + "rewards/rejected": -3.7387824058532715, + "step": 6410 + }, + { + "epoch": 0.9450905343736199, + "grad_norm": 138.26152953388132, + "learning_rate": 4.579186643427241e-09, + "logits/chosen": -1.214597463607788, + "logits/rejected": -0.9208871126174927, + "logps/chosen": -532.3272705078125, + "logps/rejected": -577.6005249023438, + "loss": 0.489, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -1.9566433429718018, + "rewards/margins": 1.2619284391403198, + "rewards/rejected": -3.218571424484253, + "step": 6420 + }, + { + "epoch": 0.9465626380097159, + "grad_norm": 86.30520111983465, + "learning_rate": 4.337648188725618e-09, + "logits/chosen": -0.9052795171737671, + "logits/rejected": -0.7033430933952332, + "logps/chosen": -447.89837646484375, + "logps/rejected": -559.1326904296875, + "loss": 0.5596, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.6097948551177979, + "rewards/margins": 1.6141388416290283, + "rewards/rejected": -3.223933458328247, + "step": 6430 + }, + { + "epoch": 0.9480347416458119, + "grad_norm": 152.58013998937017, + "learning_rate": 4.102597999377927e-09, + "logits/chosen": -1.1753242015838623, + "logits/rejected": -0.9160283207893372, + "logps/chosen": -574.9229736328125, + "logps/rejected": -731.6678466796875, + "loss": 0.473, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.5810494422912598, + "rewards/margins": 1.3761615753173828, + "rewards/rejected": -3.9572112560272217, + "step": 6440 + }, + { + "epoch": 0.9495068452819079, + "grad_norm": 163.27660701986466, + "learning_rate": 3.874042283368406e-09, + "logits/chosen": -1.0492947101593018, + "logits/rejected": -0.7721529006958008, + "logps/chosen": -483.2264709472656, + "logps/rejected": -625.9090576171875, + "loss": 0.4999, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.5114665031433105, + "rewards/margins": 1.2074260711669922, + "rewards/rejected": -3.7188925743103027, + "step": 6450 + }, + { + "epoch": 0.9509789489180038, + "grad_norm": 99.3662653340575, + "learning_rate": 3.6519870771536952e-09, + "logits/chosen": -1.1349852085113525, + "logits/rejected": -1.2059035301208496, + "logps/chosen": -489.3065490722656, + "logps/rejected": -616.331298828125, + "loss": 0.4708, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.455379009246826, + "rewards/margins": 0.5930863618850708, + "rewards/rejected": -3.0484652519226074, + "step": 6460 + }, + { + "epoch": 0.9524510525540998, + "grad_norm": 82.77468525430078, + "learning_rate": 3.436438245503409e-09, + "logits/chosen": -0.8502506017684937, + "logits/rejected": -0.889320969581604, + "logps/chosen": -482.10333251953125, + "logps/rejected": -593.2684936523438, + "loss": 0.5548, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -1.8977534770965576, + "rewards/margins": 1.2781405448913574, + "rewards/rejected": -3.175893783569336, + "step": 6470 + }, + { + "epoch": 0.9539231561901957, + "grad_norm": 167.0158777074398, + "learning_rate": 3.2274014813452875e-09, + "logits/chosen": -0.8480228185653687, + "logits/rejected": -0.5235060453414917, + "logps/chosen": -460.0982971191406, + "logps/rejected": -723.8695068359375, + "loss": 0.4238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4261162281036377, + "rewards/margins": 2.0561296939849854, + "rewards/rejected": -4.482245445251465, + "step": 6480 + }, + { + "epoch": 0.9553952598262918, + "grad_norm": 110.54140763899403, + "learning_rate": 3.0248823056148175e-09, + "logits/chosen": -0.8810976147651672, + "logits/rejected": -1.1996238231658936, + "logps/chosen": -466.66021728515625, + "logps/rejected": -640.5092163085938, + "loss": 0.4564, + "rewards/accuracies": 0.6666666865348816, + "rewards/chosen": -2.160752773284912, + "rewards/margins": 1.6792404651641846, + "rewards/rejected": -3.8399930000305176, + "step": 6490 + }, + { + "epoch": 0.9568673634623878, + "grad_norm": 178.20887101590088, + "learning_rate": 2.828886067109404e-09, + "logits/chosen": -1.2081787586212158, + "logits/rejected": -0.9349848031997681, + "logps/chosen": -535.8568115234375, + "logps/rejected": -677.1038208007812, + "loss": 0.4802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3741135597229004, + "rewards/margins": 1.6998382806777954, + "rewards/rejected": -4.0739521980285645, + "step": 6500 + }, + { + "epoch": 0.9583394670984837, + "grad_norm": 193.20684771579718, + "learning_rate": 2.639417942347122e-09, + "logits/chosen": -1.1314600706100464, + "logits/rejected": -0.8010835647583008, + "logps/chosen": -393.9501037597656, + "logps/rejected": -628.40673828125, + "loss": 0.4901, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7911121845245361, + "rewards/margins": 2.1184442043304443, + "rewards/rejected": -3.9095566272735596, + "step": 6510 + }, + { + "epoch": 0.9598115707345797, + "grad_norm": 165.29491013006896, + "learning_rate": 2.456482935429993e-09, + "logits/chosen": -1.1866682767868042, + "logits/rejected": -0.7399438619613647, + "logps/chosen": -496.7454528808594, + "logps/rejected": -597.7395629882812, + "loss": 0.6551, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -2.2442588806152344, + "rewards/margins": 1.2275264263153076, + "rewards/rejected": -3.471785306930542, + "step": 6520 + }, + { + "epoch": 0.9612836743706757, + "grad_norm": 99.58153689375328, + "learning_rate": 2.2800858779118417e-09, + "logits/chosen": -1.0078599452972412, + "logits/rejected": -0.7700494527816772, + "logps/chosen": -473.7530822753906, + "logps/rejected": -612.4725952148438, + "loss": 0.6633, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.1779778003692627, + "rewards/margins": 1.5644938945770264, + "rewards/rejected": -3.742471694946289, + "step": 6530 + }, + { + "epoch": 0.9627557780067717, + "grad_norm": 188.52558940572416, + "learning_rate": 2.1102314286705334e-09, + "logits/chosen": -1.1187241077423096, + "logits/rejected": -1.0723965167999268, + "logps/chosen": -545.0596923828125, + "logps/rejected": -545.8067016601562, + "loss": 0.485, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.1726815700531006, + "rewards/margins": 0.48218727111816406, + "rewards/rejected": -2.6548686027526855, + "step": 6540 + }, + { + "epoch": 0.9642278816428677, + "grad_norm": 106.92872480734137, + "learning_rate": 1.9469240737852445e-09, + "logits/chosen": -0.9942170977592468, + "logits/rejected": -0.9531810879707336, + "logps/chosen": -371.33502197265625, + "logps/rejected": -550.0565185546875, + "loss": 0.4782, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8037058115005493, + "rewards/margins": 1.6463518142700195, + "rewards/rejected": -3.4500575065612793, + "step": 6550 + }, + { + "epoch": 0.9656999852789636, + "grad_norm": 106.36449742857832, + "learning_rate": 1.790168126417635e-09, + "logits/chosen": -1.0383527278900146, + "logits/rejected": -0.5622974038124084, + "logps/chosen": -368.7752990722656, + "logps/rejected": -527.6478271484375, + "loss": 0.6526, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.5008089542388916, + "rewards/margins": 1.8395076990127563, + "rewards/rejected": -3.3403167724609375, + "step": 6560 + }, + { + "epoch": 0.9671720889150596, + "grad_norm": 78.96697374992503, + "learning_rate": 1.6399677266982214e-09, + "logits/chosen": -1.2151182889938354, + "logits/rejected": -0.919575572013855, + "logps/chosen": -520.7152099609375, + "logps/rejected": -604.151611328125, + "loss": 0.4721, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -2.2255795001983643, + "rewards/margins": 1.1835567951202393, + "rewards/rejected": -3.4091362953186035, + "step": 6570 + }, + { + "epoch": 0.9686441925511556, + "grad_norm": 246.65851849936138, + "learning_rate": 1.4963268416167685e-09, + "logits/chosen": -0.9127308130264282, + "logits/rejected": -0.5409738421440125, + "logps/chosen": -447.56182861328125, + "logps/rejected": -559.7342529296875, + "loss": 0.5321, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.769230842590332, + "rewards/margins": 1.4840223789215088, + "rewards/rejected": -3.253253221511841, + "step": 6580 + }, + { + "epoch": 0.9701162961872516, + "grad_norm": 84.18152836940874, + "learning_rate": 1.3592492649177056e-09, + "logits/chosen": -0.7209831476211548, + "logits/rejected": -0.6905950307846069, + "logps/chosen": -458.73699951171875, + "logps/rejected": -554.8598022460938, + "loss": 0.5196, + "rewards/accuracies": 0.8333333134651184, + "rewards/chosen": -2.030134677886963, + "rewards/margins": 1.450232744216919, + "rewards/rejected": -3.4803671836853027, + "step": 6590 + }, + { + "epoch": 0.9715883998233475, + "grad_norm": 126.73736366351699, + "learning_rate": 1.2287386169998747e-09, + "logits/chosen": -1.3722821474075317, + "logits/rejected": -0.7988008856773376, + "logps/chosen": -462.2713928222656, + "logps/rejected": -598.27099609375, + "loss": 0.4373, + "rewards/accuracies": 0.7666667103767395, + "rewards/chosen": -2.1100926399230957, + "rewards/margins": 1.6891053915023804, + "rewards/rejected": -3.7991981506347656, + "step": 6600 + }, + { + "epoch": 0.9730605034594435, + "grad_norm": 80.16063512408078, + "learning_rate": 1.1047983448209397e-09, + "logits/chosen": -1.0949660539627075, + "logits/rejected": -0.8592510223388672, + "logps/chosen": -459.66644287109375, + "logps/rejected": -604.8404541015625, + "loss": 0.4133, + "rewards/accuracies": 0.9333333969116211, + "rewards/chosen": -1.758650541305542, + "rewards/margins": 1.7326768636703491, + "rewards/rejected": -3.4913272857666016, + "step": 6610 + }, + { + "epoch": 0.9745326070955396, + "grad_norm": 76.00502026421299, + "learning_rate": 9.87431721806209e-10, + "logits/chosen": -1.2628120183944702, + "logits/rejected": -1.0344897508621216, + "logps/chosen": -449.8192443847656, + "logps/rejected": -556.6915283203125, + "loss": 0.3874, + "rewards/accuracies": 0.7333332896232605, + "rewards/chosen": -1.8382011651992798, + "rewards/margins": 1.5226261615753174, + "rewards/rejected": -3.3608272075653076, + "step": 6620 + }, + { + "epoch": 0.9760047107316355, + "grad_norm": 64.87092914865472, + "learning_rate": 8.766418477623716e-10, + "logits/chosen": -1.058807373046875, + "logits/rejected": -0.6764766573905945, + "logps/chosen": -434.329345703125, + "logps/rejected": -542.9319458007812, + "loss": 0.3819, + "rewards/accuracies": 0.8666666746139526, + "rewards/chosen": -1.6859748363494873, + "rewards/margins": 1.561607003211975, + "rewards/rejected": -3.247581958770752, + "step": 6630 + }, + { + "epoch": 0.9774768143677315, + "grad_norm": 122.93794135817826, + "learning_rate": 7.724316487954796e-10, + "logits/chosen": -1.04678213596344, + "logits/rejected": -0.8110519647598267, + "logps/chosen": -431.2456970214844, + "logps/rejected": -600.9210815429688, + "loss": 0.6105, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -1.9062204360961914, + "rewards/margins": 1.6373655796051025, + "rewards/rejected": -3.543586015701294, + "step": 6640 + }, + { + "epoch": 0.9789489180038274, + "grad_norm": 148.803050947404, + "learning_rate": 6.748038772337595e-10, + "logits/chosen": -1.1234979629516602, + "logits/rejected": -0.7777267694473267, + "logps/chosen": -476.57891845703125, + "logps/rejected": -585.7210693359375, + "loss": 0.5091, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1657650470733643, + "rewards/margins": 1.3379297256469727, + "rewards/rejected": -3.503694534301758, + "step": 6650 + }, + { + "epoch": 0.9804210216399234, + "grad_norm": 122.94696558468053, + "learning_rate": 5.837611115549201e-10, + "logits/chosen": -1.40057373046875, + "logits/rejected": -0.7224053144454956, + "logps/chosen": -483.59857177734375, + "logps/rejected": -555.85400390625, + "loss": 0.5587, + "rewards/accuracies": 0.76666659116745, + "rewards/chosen": -1.665423035621643, + "rewards/margins": 1.4769444465637207, + "rewards/rejected": -3.1423678398132324, + "step": 6660 + }, + { + "epoch": 0.9818931252760195, + "grad_norm": 132.08034833370462, + "learning_rate": 4.993057563179303e-10, + "logits/chosen": -1.3287341594696045, + "logits/rejected": -0.8660466074943542, + "logps/chosen": -524.1640625, + "logps/rejected": -584.7789306640625, + "loss": 0.364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9598894119262695, + "rewards/margins": 1.5529369115829468, + "rewards/rejected": -3.5128262042999268, + "step": 6670 + }, + { + "epoch": 0.9833652289121154, + "grad_norm": 163.63501242009892, + "learning_rate": 4.2144004209962445e-10, + "logits/chosen": -1.0573452711105347, + "logits/rejected": -1.115231990814209, + "logps/chosen": -489.37957763671875, + "logps/rejected": -534.5518798828125, + "loss": 0.4854, + "rewards/accuracies": 0.5666667222976685, + "rewards/chosen": -1.828873634338379, + "rewards/margins": 0.7341600656509399, + "rewards/rejected": -2.5630335807800293, + "step": 6680 + }, + { + "epoch": 0.9848373325482114, + "grad_norm": 67.54902903837528, + "learning_rate": 3.501660254358607e-10, + "logits/chosen": -0.9518839716911316, + "logits/rejected": -0.5863426327705383, + "logps/chosen": -458.6871032714844, + "logps/rejected": -574.5501708984375, + "loss": 0.4845, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.404907703399658, + "rewards/margins": 1.390256643295288, + "rewards/rejected": -3.7951648235321045, + "step": 6690 + }, + { + "epoch": 0.9863094361843073, + "grad_norm": 89.64282871222667, + "learning_rate": 2.854855887669538e-10, + "logits/chosen": -1.0713691711425781, + "logits/rejected": -1.1653220653533936, + "logps/chosen": -527.0560913085938, + "logps/rejected": -659.8582763671875, + "loss": 0.4668, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2019684314727783, + "rewards/margins": 1.088714361190796, + "rewards/rejected": -3.2906830310821533, + "step": 6700 + }, + { + "epoch": 0.9877815398204034, + "grad_norm": 89.66056636668796, + "learning_rate": 2.274004403882146e-10, + "logits/chosen": -1.168025016784668, + "logits/rejected": -0.8619769811630249, + "logps/chosen": -444.8487243652344, + "logps/rejected": -631.8175048828125, + "loss": 0.3937, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.8021628856658936, + "rewards/margins": 1.7806934118270874, + "rewards/rejected": -3.5828566551208496, + "step": 6710 + }, + { + "epoch": 0.9892536434564994, + "grad_norm": 179.45903756800232, + "learning_rate": 1.7591211440468047e-10, + "logits/chosen": -1.033997654914856, + "logits/rejected": -0.7377156019210815, + "logps/chosen": -445.30401611328125, + "logps/rejected": -596.3392333984375, + "loss": 0.5579, + "rewards/accuracies": 0.7999999523162842, + "rewards/chosen": -1.8336889743804932, + "rewards/margins": 1.8341083526611328, + "rewards/rejected": -3.667797565460205, + "step": 6720 + }, + { + "epoch": 0.9907257470925953, + "grad_norm": 95.1448439914369, + "learning_rate": 1.3102197069067567e-10, + "logits/chosen": -1.089720368385315, + "logits/rejected": -0.31129345297813416, + "logps/chosen": -491.3125915527344, + "logps/rejected": -630.314208984375, + "loss": 0.4488, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.3085856437683105, + "rewards/margins": 1.567988395690918, + "rewards/rejected": -3.8765742778778076, + "step": 6730 + }, + { + "epoch": 0.9921978507286913, + "grad_norm": 136.00797944590792, + "learning_rate": 9.273119485386783e-11, + "logits/chosen": -1.1903355121612549, + "logits/rejected": -0.9880250096321106, + "logps/chosen": -528.3790283203125, + "logps/rejected": -664.5582275390625, + "loss": 0.6222, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -2.509011745452881, + "rewards/margins": 1.1903620958328247, + "rewards/rejected": -3.6993744373321533, + "step": 6740 + }, + { + "epoch": 0.9936699543647873, + "grad_norm": 101.7869581097204, + "learning_rate": 6.104079820390407e-11, + "logits/chosen": -1.1776105165481567, + "logits/rejected": -1.037445068359375, + "logps/chosen": -539.5609130859375, + "logps/rejected": -589.29248046875, + "loss": 0.5251, + "rewards/accuracies": 0.6333333253860474, + "rewards/chosen": -2.3894200325012207, + "rewards/margins": 0.5667029619216919, + "rewards/rejected": -2.956122875213623, + "step": 6750 + }, + { + "epoch": 0.9951420580008833, + "grad_norm": 95.83703523474757, + "learning_rate": 3.595161772582123e-11, + "logits/chosen": -1.046011209487915, + "logits/rejected": -0.7565838098526001, + "logps/chosen": -389.10504150390625, + "logps/rejected": -486.26116943359375, + "loss": 0.5416, + "rewards/accuracies": 0.7666666507720947, + "rewards/chosen": -1.673667550086975, + "rewards/margins": 1.3734490871429443, + "rewards/rejected": -3.04711651802063, + "step": 6760 + }, + { + "epoch": 0.9966141616369792, + "grad_norm": 159.90882275994576, + "learning_rate": 1.7464316057785866e-11, + "logits/chosen": -1.2152490615844727, + "logits/rejected": -0.7487831115722656, + "logps/chosen": -496.7948303222656, + "logps/rejected": -582.3119506835938, + "loss": 0.4963, + "rewards/accuracies": 0.6999999284744263, + "rewards/chosen": -2.1264190673828125, + "rewards/margins": 0.9500812292098999, + "rewards/rejected": -3.076500415802002, + "step": 6770 + }, + { + "epoch": 0.9980862652730752, + "grad_norm": 117.38374462687855, + "learning_rate": 5.5793814737192805e-12, + "logits/chosen": -1.0559961795806885, + "logits/rejected": -0.9266453981399536, + "logps/chosen": -526.2113647460938, + "logps/rejected": -558.4434814453125, + "loss": 0.5379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4753551483154297, + "rewards/margins": 0.7254849672317505, + "rewards/rejected": -3.2008399963378906, + "step": 6780 + }, + { + "epoch": 0.9995583689091712, + "grad_norm": 110.6028421066786, + "learning_rate": 2.9712787039115617e-13, + "logits/chosen": -0.6587592363357544, + "logits/rejected": -0.27146559953689575, + "logps/chosen": -444.2125549316406, + "logps/rejected": -552.4807739257812, + "loss": 0.5379, + "rewards/accuracies": 0.7333333492279053, + "rewards/chosen": -2.2807974815368652, + "rewards/margins": 1.2155722379684448, + "rewards/rejected": -3.4963698387145996, + "step": 6790 + }, + { + "epoch": 1.0, + "step": 6793, + "total_flos": 0.0, + "train_loss": 0.5255466872807298, + "train_runtime": 32845.2924, + "train_samples_per_second": 1.861, + "train_steps_per_second": 0.207 + } + ], + "logging_steps": 10, + "max_steps": 6793, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}