mistral-7b-v0.3-dpo / trainer_state.json
simonycl's picture
Upload folder using huggingface_hub
481e639 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02093692750588851,
"grad_norm": 85.02439880371094,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -3.096651315689087,
"logits/rejected": -3.0814244747161865,
"logps/chosen": -295.3846130371094,
"logps/rejected": -279.3940124511719,
"loss": 0.692,
"rewards/accuracies": 0.4281249940395355,
"rewards/chosen": 0.002148410538211465,
"rewards/margins": 0.004025185946375132,
"rewards/rejected": -0.0018767757574096322,
"step": 10
},
{
"epoch": 0.04187385501177702,
"grad_norm": 74.03569030761719,
"learning_rate": 4.998555145953054e-07,
"logits/chosen": -3.083890199661255,
"logits/rejected": -3.068505048751831,
"logps/chosen": -278.1134338378906,
"logps/rejected": -266.706298828125,
"loss": 0.6728,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": 0.011356602422893047,
"rewards/margins": 0.07497048377990723,
"rewards/rejected": -0.0636138841509819,
"step": 20
},
{
"epoch": 0.06281078251766553,
"grad_norm": 67.47853088378906,
"learning_rate": 4.98700633214251e-07,
"logits/chosen": -3.0271506309509277,
"logits/rejected": -3.0370867252349854,
"logps/chosen": -246.0901336669922,
"logps/rejected": -250.2740478515625,
"loss": 0.6305,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": 0.018177634105086327,
"rewards/margins": 0.28142982721328735,
"rewards/rejected": -0.2632521986961365,
"step": 30
},
{
"epoch": 0.08374771002355404,
"grad_norm": 75.60296630859375,
"learning_rate": 4.963962085412632e-07,
"logits/chosen": -3.030393123626709,
"logits/rejected": -3.009413242340088,
"logps/chosen": -298.85662841796875,
"logps/rejected": -275.070068359375,
"loss": 0.6267,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.03324083238840103,
"rewards/margins": 0.2483668327331543,
"rewards/rejected": -0.28160765767097473,
"step": 40
},
{
"epoch": 0.10468463752944256,
"grad_norm": 69.39188385009766,
"learning_rate": 4.929528920808854e-07,
"logits/chosen": -3.052746534347534,
"logits/rejected": -3.066401720046997,
"logps/chosen": -281.92706298828125,
"logps/rejected": -246.51901245117188,
"loss": 0.6084,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.020495222881436348,
"rewards/margins": 0.40510186553001404,
"rewards/rejected": -0.42559710144996643,
"step": 50
},
{
"epoch": 0.12562156503533106,
"grad_norm": 83.05278015136719,
"learning_rate": 4.883865995197318e-07,
"logits/chosen": -3.035808563232422,
"logits/rejected": -3.0392653942108154,
"logps/chosen": -290.5362548828125,
"logps/rejected": -272.5738830566406,
"loss": 0.5792,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.049367621541023254,
"rewards/margins": 0.44638770818710327,
"rewards/rejected": -0.49575528502464294,
"step": 60
},
{
"epoch": 0.14655849254121958,
"grad_norm": 68.99510955810547,
"learning_rate": 4.82718437161051e-07,
"logits/chosen": -3.0192034244537354,
"logits/rejected": -3.006897449493408,
"logps/chosen": -265.6653747558594,
"logps/rejected": -260.2899169921875,
"loss": 0.5846,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.11491725593805313,
"rewards/margins": 0.38759148120880127,
"rewards/rejected": -0.5025087594985962,
"step": 70
},
{
"epoch": 0.16749542004710807,
"grad_norm": 63.006248474121094,
"learning_rate": 4.7597460436723613e-07,
"logits/chosen": -3.007894992828369,
"logits/rejected": -2.984534740447998,
"logps/chosen": -291.2572326660156,
"logps/rejected": -261.5260009765625,
"loss": 0.5843,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.1174750104546547,
"rewards/margins": 0.4169933795928955,
"rewards/rejected": -0.5344683527946472,
"step": 80
},
{
"epoch": 0.1884323475529966,
"grad_norm": 69.54000854492188,
"learning_rate": 4.68186272461214e-07,
"logits/chosen": -3.0481808185577393,
"logits/rejected": -3.036348819732666,
"logps/chosen": -273.8735656738281,
"logps/rejected": -258.81866455078125,
"loss": 0.5849,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": -0.09798178821802139,
"rewards/margins": 0.40805816650390625,
"rewards/rejected": -0.5060399770736694,
"step": 90
},
{
"epoch": 0.2093692750588851,
"grad_norm": 75.06998443603516,
"learning_rate": 4.593894406464536e-07,
"logits/chosen": -3.038364887237549,
"logits/rejected": -3.0354368686676025,
"logps/chosen": -296.1470031738281,
"logps/rejected": -286.38592529296875,
"loss": 0.5834,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.08966656774282455,
"rewards/margins": 0.5078560709953308,
"rewards/rejected": -0.5975226759910583,
"step": 100
},
{
"epoch": 0.23030620256477363,
"grad_norm": 137.9207305908203,
"learning_rate": 4.496247696115597e-07,
"logits/chosen": -3.039151191711426,
"logits/rejected": -3.0391647815704346,
"logps/chosen": -303.8061828613281,
"logps/rejected": -295.7118225097656,
"loss": 0.5804,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.07505225390195847,
"rewards/margins": 0.6039966344833374,
"rewards/rejected": -0.6790488958358765,
"step": 110
},
{
"epoch": 0.2512431300706621,
"grad_norm": 77.84745788574219,
"learning_rate": 4.3893739358856455e-07,
"logits/chosen": -3.008737087249756,
"logits/rejected": -2.9903557300567627,
"logps/chosen": -305.4298095703125,
"logps/rejected": -278.39947509765625,
"loss": 0.5582,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.14889295399188995,
"rewards/margins": 0.5994052886962891,
"rewards/rejected": -0.7482982277870178,
"step": 120
},
{
"epoch": 0.2721800575765506,
"grad_norm": 67.5359115600586,
"learning_rate": 4.273767117336217e-07,
"logits/chosen": -3.0301320552825928,
"logits/rejected": -3.012173891067505,
"logps/chosen": -308.94891357421875,
"logps/rejected": -295.3975524902344,
"loss": 0.5478,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.14121344685554504,
"rewards/margins": 0.6831844449043274,
"rewards/rejected": -0.82439786195755,
"step": 130
},
{
"epoch": 0.29311698508243916,
"grad_norm": 70.47966766357422,
"learning_rate": 4.1499615979437983e-07,
"logits/chosen": -2.9864563941955566,
"logits/rejected": -2.9899039268493652,
"logps/chosen": -279.08477783203125,
"logps/rejected": -257.7115173339844,
"loss": 0.5548,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.09717626124620438,
"rewards/margins": 0.624592661857605,
"rewards/rejected": -0.7217689752578735,
"step": 140
},
{
"epoch": 0.31405391258832765,
"grad_norm": 90.4140396118164,
"learning_rate": 4.018529631194369e-07,
"logits/chosen": -2.9848761558532715,
"logits/rejected": -2.9709620475769043,
"logps/chosen": -281.3067932128906,
"logps/rejected": -271.0277099609375,
"loss": 0.5703,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.2502523362636566,
"rewards/margins": 0.6211402416229248,
"rewards/rejected": -0.871392548084259,
"step": 150
},
{
"epoch": 0.33499084009421615,
"grad_norm": 68.7781753540039,
"learning_rate": 3.8800787215151164e-07,
"logits/chosen": -3.032036066055298,
"logits/rejected": -3.009941339492798,
"logps/chosen": -321.748779296875,
"logps/rejected": -281.04107666015625,
"loss": 0.5392,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.12777641415596008,
"rewards/margins": 0.6283574104309082,
"rewards/rejected": -0.7561337947845459,
"step": 160
},
{
"epoch": 0.3559277676001047,
"grad_norm": 66.1634292602539,
"learning_rate": 3.7352488162693715e-07,
"logits/chosen": -3.0462286472320557,
"logits/rejected": -3.030794620513916,
"logps/chosen": -274.5036926269531,
"logps/rejected": -251.90499877929688,
"loss": 0.5505,
"rewards/accuracies": 0.6968749761581421,
"rewards/chosen": -0.12955203652381897,
"rewards/margins": 0.6119082570075989,
"rewards/rejected": -0.7414603233337402,
"step": 170
},
{
"epoch": 0.3768646951059932,
"grad_norm": 75.37867736816406,
"learning_rate": 3.584709347793895e-07,
"logits/chosen": -3.058922052383423,
"logits/rejected": -3.0691912174224854,
"logps/chosen": -301.69635009765625,
"logps/rejected": -248.55593872070312,
"loss": 0.5508,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2145983725786209,
"rewards/margins": 0.5311049222946167,
"rewards/rejected": -0.7457033395767212,
"step": 180
},
{
"epoch": 0.39780162261188173,
"grad_norm": 75.07308959960938,
"learning_rate": 3.4291561391508185e-07,
"logits/chosen": -3.0233283042907715,
"logits/rejected": -3.0086400508880615,
"logps/chosen": -278.5184326171875,
"logps/rejected": -270.7456970214844,
"loss": 0.5632,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28689366579055786,
"rewards/margins": 0.6087759733200073,
"rewards/rejected": -0.8956696391105652,
"step": 190
},
{
"epoch": 0.4187385501177702,
"grad_norm": 71.18640899658203,
"learning_rate": 3.2693081878964544e-07,
"logits/chosen": -3.0013060569763184,
"logits/rejected": -3.005615472793579,
"logps/chosen": -292.04852294921875,
"logps/rejected": -276.50811767578125,
"loss": 0.5475,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.20541390776634216,
"rewards/margins": 0.6916528940200806,
"rewards/rejected": -0.8970667719841003,
"step": 200
},
{
"epoch": 0.4396754776236587,
"grad_norm": 85.28279113769531,
"learning_rate": 3.1059043427330314e-07,
"logits/chosen": -2.9617443084716797,
"logits/rejected": -2.9682388305664062,
"logps/chosen": -261.1861572265625,
"logps/rejected": -263.7696838378906,
"loss": 0.533,
"rewards/accuracies": 0.778124988079071,
"rewards/chosen": -0.204990416765213,
"rewards/margins": 0.7386445999145508,
"rewards/rejected": -0.9436351656913757,
"step": 210
},
{
"epoch": 0.46061240512954726,
"grad_norm": 70.95091247558594,
"learning_rate": 2.9396998884045234e-07,
"logits/chosen": -3.0342681407928467,
"logits/rejected": -3.040320873260498,
"logps/chosen": -300.98077392578125,
"logps/rejected": -272.7954406738281,
"loss": 0.5389,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.25737327337265015,
"rewards/margins": 0.695563018321991,
"rewards/rejected": -0.9529362916946411,
"step": 220
},
{
"epoch": 0.48154933263543576,
"grad_norm": 64.26698303222656,
"learning_rate": 2.7714630546218634e-07,
"logits/chosen": -3.1135382652282715,
"logits/rejected": -3.1126351356506348,
"logps/chosen": -326.8101806640625,
"logps/rejected": -296.044921875,
"loss": 0.5438,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.2218112051486969,
"rewards/margins": 0.7040417790412903,
"rewards/rejected": -0.9258529543876648,
"step": 230
},
{
"epoch": 0.5024862601413242,
"grad_norm": 85.34664154052734,
"learning_rate": 2.6019714651539645e-07,
"logits/chosen": -3.0325405597686768,
"logits/rejected": -3.017796516418457,
"logps/chosen": -297.9241638183594,
"logps/rejected": -286.4637756347656,
"loss": 0.5647,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.274208128452301,
"rewards/margins": 0.7521761655807495,
"rewards/rejected": -1.0263843536376953,
"step": 240
},
{
"epoch": 0.5234231876472127,
"grad_norm": 70.70326232910156,
"learning_rate": 2.4320085434975556e-07,
"logits/chosen": -3.0199804306030273,
"logits/rejected": -3.01350736618042,
"logps/chosen": -284.5586853027344,
"logps/rejected": -259.7466125488281,
"loss": 0.56,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.2730976641178131,
"rewards/margins": 0.7632043957710266,
"rewards/rejected": -1.036302089691162,
"step": 250
},
{
"epoch": 0.5443601151531012,
"grad_norm": 63.27799606323242,
"learning_rate": 2.2623598917395436e-07,
"logits/chosen": -2.9862048625946045,
"logits/rejected": -3.020139217376709,
"logps/chosen": -296.0469665527344,
"logps/rejected": -276.1849365234375,
"loss": 0.5463,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.23287267982959747,
"rewards/margins": 0.7090679407119751,
"rewards/rejected": -0.9419406652450562,
"step": 260
},
{
"epoch": 0.5652970426589898,
"grad_norm": 66.7594223022461,
"learning_rate": 2.0938096593494853e-07,
"logits/chosen": -3.041605234146118,
"logits/rejected": -3.052452325820923,
"logps/chosen": -286.18707275390625,
"logps/rejected": -260.3746032714844,
"loss": 0.5256,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.11122454702854156,
"rewards/margins": 0.802563488483429,
"rewards/rejected": -0.9137881398200989,
"step": 270
},
{
"epoch": 0.5862339701648783,
"grad_norm": 88.30416107177734,
"learning_rate": 1.9271369186863618e-07,
"logits/chosen": -3.0525062084198,
"logits/rejected": -3.0589468479156494,
"logps/chosen": -284.6452941894531,
"logps/rejected": -277.75067138671875,
"loss": 0.5551,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.22388038039207458,
"rewards/margins": 0.6198626756668091,
"rewards/rejected": -0.8437430262565613,
"step": 280
},
{
"epoch": 0.6071708976707668,
"grad_norm": 65.08110809326172,
"learning_rate": 1.763112063972739e-07,
"logits/chosen": -3.044279098510742,
"logits/rejected": -3.0555179119110107,
"logps/chosen": -285.0969543457031,
"logps/rejected": -259.02142333984375,
"loss": 0.5278,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -0.16408179700374603,
"rewards/margins": 0.8104633092880249,
"rewards/rejected": -0.9745450019836426,
"step": 290
},
{
"epoch": 0.6281078251766553,
"grad_norm": 87.96784210205078,
"learning_rate": 1.602493250381003e-07,
"logits/chosen": -3.0667061805725098,
"logits/rejected": -3.064436435699463,
"logps/chosen": -287.88372802734375,
"logps/rejected": -248.08615112304688,
"loss": 0.564,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.27062320709228516,
"rewards/margins": 0.6274420022964478,
"rewards/rejected": -0.8980652093887329,
"step": 300
},
{
"epoch": 0.6490447526825438,
"grad_norm": 67.1192398071289,
"learning_rate": 1.446022889690875e-07,
"logits/chosen": -3.0603392124176025,
"logits/rejected": -3.0506479740142822,
"logps/chosen": -275.33941650390625,
"logps/rejected": -292.2793884277344,
"loss": 0.5304,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.27399036288261414,
"rewards/margins": 0.7174574136734009,
"rewards/rejected": -0.9914478063583374,
"step": 310
},
{
"epoch": 0.6699816801884323,
"grad_norm": 68.73091125488281,
"learning_rate": 1.2944242187160015e-07,
"logits/chosen": -3.0304224491119385,
"logits/rejected": -3.0630006790161133,
"logps/chosen": -265.5944519042969,
"logps/rejected": -270.86041259765625,
"loss": 0.5819,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -0.20603282749652863,
"rewards/margins": 0.8553716540336609,
"rewards/rejected": -1.0614043474197388,
"step": 320
},
{
"epoch": 0.6909186076943209,
"grad_norm": 78.73789978027344,
"learning_rate": 1.1483979563610069e-07,
"logits/chosen": -3.044661045074463,
"logits/rejected": -3.035492181777954,
"logps/chosen": -274.28204345703125,
"logps/rejected": -274.99151611328125,
"loss": 0.5374,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.1646738052368164,
"rewards/margins": 0.8839667439460754,
"rewards/rejected": -1.048640489578247,
"step": 330
},
{
"epoch": 0.7118555352002094,
"grad_norm": 70.24629211425781,
"learning_rate": 1.0086190647607529e-07,
"logits/chosen": -3.0631115436553955,
"logits/rejected": -3.089351177215576,
"logps/chosen": -287.9900817871094,
"logps/rejected": -272.482421875,
"loss": 0.5607,
"rewards/accuracies": 0.753125011920929,
"rewards/chosen": -0.11857350915670395,
"rewards/margins": 0.8544532060623169,
"rewards/rejected": -0.9730268716812134,
"step": 340
},
{
"epoch": 0.7327924627060979,
"grad_norm": 96.91629791259766,
"learning_rate": 8.757336294724687e-08,
"logits/chosen": -3.068084239959717,
"logits/rejected": -3.0875658988952637,
"logps/chosen": -291.7541198730469,
"logps/rejected": -258.79132080078125,
"loss": 0.5348,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.15175102651119232,
"rewards/margins": 0.8772052526473999,
"rewards/rejected": -1.028956413269043,
"step": 350
},
{
"epoch": 0.7537293902119864,
"grad_norm": 69.54812622070312,
"learning_rate": 7.503558731410958e-08,
"logits/chosen": -3.07660174369812,
"logits/rejected": -3.0733513832092285,
"logps/chosen": -252.8855438232422,
"logps/rejected": -264.5438232421875,
"loss": 0.5477,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.31728893518447876,
"rewards/margins": 0.6826174259185791,
"rewards/rejected": -0.9999063611030579,
"step": 360
},
{
"epoch": 0.7746663177178749,
"grad_norm": 68.41463470458984,
"learning_rate": 6.330653164412908e-08,
"logits/chosen": -3.0837528705596924,
"logits/rejected": -3.074859619140625,
"logps/chosen": -292.6845703125,
"logps/rejected": -274.19189453125,
"loss": 0.5639,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.18258486688137054,
"rewards/margins": 0.7360013723373413,
"rewards/rejected": -0.9185863733291626,
"step": 370
},
{
"epoch": 0.7956032452237635,
"grad_norm": 73.8513412475586,
"learning_rate": 5.2440409941877456e-08,
"logits/chosen": -3.080451250076294,
"logits/rejected": -3.1014645099639893,
"logps/chosen": -282.2720642089844,
"logps/rejected": -274.5783996582031,
"loss": 0.5627,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.17349520325660706,
"rewards/margins": 0.7617751359939575,
"rewards/rejected": -0.9352704286575317,
"step": 380
},
{
"epoch": 0.816540172729652,
"grad_norm": 62.425689697265625,
"learning_rate": 4.248744756122985e-08,
"logits/chosen": -3.1146225929260254,
"logits/rejected": -3.1159985065460205,
"logps/chosen": -284.4311828613281,
"logps/rejected": -270.375244140625,
"loss": 0.5397,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.1737302988767624,
"rewards/margins": 0.7495090365409851,
"rewards/rejected": -0.9232394099235535,
"step": 390
},
{
"epoch": 0.8374771002355405,
"grad_norm": 67.75579833984375,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": -3.039133071899414,
"logits/rejected": -3.0417704582214355,
"logps/chosen": -289.43792724609375,
"logps/rejected": -279.08123779296875,
"loss": 0.5557,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.19276252388954163,
"rewards/margins": 0.713485062122345,
"rewards/rejected": -0.906247615814209,
"step": 400
},
{
"epoch": 0.8584140277414289,
"grad_norm": 60.96617126464844,
"learning_rate": 2.550058552729639e-08,
"logits/chosen": -3.0589489936828613,
"logits/rejected": -3.0491528511047363,
"logps/chosen": -298.5786437988281,
"logps/rejected": -275.2989807128906,
"loss": 0.5378,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.22146447002887726,
"rewards/margins": 0.7704640626907349,
"rewards/rejected": -0.9919285774230957,
"step": 410
},
{
"epoch": 0.8793509552473174,
"grad_norm": 57.156639099121094,
"learning_rate": 1.854520249477551e-08,
"logits/chosen": -3.0775399208068848,
"logits/rejected": -3.0917420387268066,
"logps/chosen": -281.49053955078125,
"logps/rejected": -252.451416015625,
"loss": 0.5338,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.14828899502754211,
"rewards/margins": 0.7465869188308716,
"rewards/rejected": -0.8948760032653809,
"step": 420
},
{
"epoch": 0.9002878827532059,
"grad_norm": 80.24808502197266,
"learning_rate": 1.265964910610884e-08,
"logits/chosen": -3.1026782989501953,
"logits/rejected": -3.111166477203369,
"logps/chosen": -285.04193115234375,
"logps/rejected": -284.14410400390625,
"loss": 0.5455,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.1942686289548874,
"rewards/margins": 0.8707529306411743,
"rewards/rejected": -1.0650215148925781,
"step": 430
},
{
"epoch": 0.9212248102590945,
"grad_norm": 61.17852020263672,
"learning_rate": 7.871129547831062e-09,
"logits/chosen": -3.0820913314819336,
"logits/rejected": -3.0653717517852783,
"logps/chosen": -278.7796325683594,
"logps/rejected": -235.0684814453125,
"loss": 0.5408,
"rewards/accuracies": 0.7093750238418579,
"rewards/chosen": -0.23389343917369843,
"rewards/margins": 0.6883670091629028,
"rewards/rejected": -0.9222604632377625,
"step": 440
},
{
"epoch": 0.942161737764983,
"grad_norm": 85.3263168334961,
"learning_rate": 4.201777300124249e-09,
"logits/chosen": -3.0574049949645996,
"logits/rejected": -3.0575528144836426,
"logps/chosen": -273.01531982421875,
"logps/rejected": -243.1544189453125,
"loss": 0.5495,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -0.13399073481559753,
"rewards/margins": 0.6954258680343628,
"rewards/rejected": -0.8294164538383484,
"step": 450
},
{
"epoch": 0.9630986652708715,
"grad_norm": 67.3755874633789,
"learning_rate": 1.6685528315146802e-09,
"logits/chosen": -3.0953588485717773,
"logits/rejected": -3.0970802307128906,
"logps/chosen": -282.9346618652344,
"logps/rejected": -261.16497802734375,
"loss": 0.5443,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.24288193881511688,
"rewards/margins": 0.7198012471199036,
"rewards/rejected": -0.9626832008361816,
"step": 460
},
{
"epoch": 0.98403559277676,
"grad_norm": 61.79122543334961,
"learning_rate": 2.831652042480093e-10,
"logits/chosen": -3.086475372314453,
"logits/rejected": -3.0854830741882324,
"logps/chosen": -301.7154235839844,
"logps/rejected": -291.1816101074219,
"loss": 0.5439,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.232115238904953,
"rewards/margins": 0.7247028350830078,
"rewards/rejected": -0.9568179845809937,
"step": 470
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 5.005717235969294e+18,
"train_loss": 0.5631812908364542,
"train_runtime": 18694.5367,
"train_samples_per_second": 3.27,
"train_steps_per_second": 0.026
}
],
"logging_steps": 10,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 256,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.005717235969294e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}