diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4038 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9994447529150472, + "eval_steps": 100, + "global_step": 2700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0037016472330186935, + "grad_norm": 6.938235277319067, + "learning_rate": 1.8518518518518518e-07, + "loss": 1.5302, + "step": 5 + }, + { + "epoch": 0.007403294466037387, + "grad_norm": 7.268031041449709, + "learning_rate": 3.7037037037037036e-07, + "loss": 1.5099, + "step": 10 + }, + { + "epoch": 0.01110494169905608, + "grad_norm": 6.707361833146337, + "learning_rate": 5.555555555555555e-07, + "loss": 1.5182, + "step": 15 + }, + { + "epoch": 0.014806588932074774, + "grad_norm": 5.451093836504048, + "learning_rate": 7.407407407407407e-07, + "loss": 1.4956, + "step": 20 + }, + { + "epoch": 0.018508236165093468, + "grad_norm": 4.673928540068199, + "learning_rate": 9.259259259259259e-07, + "loss": 1.4582, + "step": 25 + }, + { + "epoch": 0.02220988339811216, + "grad_norm": 4.159325167616714, + "learning_rate": 1.111111111111111e-06, + "loss": 1.4219, + "step": 30 + }, + { + "epoch": 0.025911530631130855, + "grad_norm": 3.3344977569830063, + "learning_rate": 1.2962962962962962e-06, + "loss": 1.4247, + "step": 35 + }, + { + "epoch": 0.029613177864149548, + "grad_norm": 2.855470171195436, + "learning_rate": 1.4814814814814815e-06, + "loss": 1.421, + "step": 40 + }, + { + "epoch": 0.03331482509716824, + "grad_norm": 2.6416618989533287, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.33, + "step": 45 + }, + { + "epoch": 0.037016472330186935, + "grad_norm": 2.263261354668403, + "learning_rate": 1.8518518518518519e-06, + "loss": 1.3312, + "step": 50 + }, + { + "epoch": 0.040718119563205625, + "grad_norm": 2.1013541579712753, + "learning_rate": 2.037037037037037e-06, + "loss": 1.3097, + "step": 55 + }, + { + "epoch": 0.04441976679622432, + "grad_norm": 1.9120786676503165, + "learning_rate": 2.222222222222222e-06, + "loss": 1.3118, + "step": 60 + }, + { + "epoch": 0.04812141402924301, + "grad_norm": 2.050461699793645, + "learning_rate": 2.4074074074074075e-06, + "loss": 1.2272, + "step": 65 + }, + { + "epoch": 0.05182306126226171, + "grad_norm": 1.8323934446281782, + "learning_rate": 2.5925925925925925e-06, + "loss": 1.218, + "step": 70 + }, + { + "epoch": 0.0555247084952804, + "grad_norm": 1.7538859492518115, + "learning_rate": 2.7777777777777783e-06, + "loss": 1.2018, + "step": 75 + }, + { + "epoch": 0.059226355728299096, + "grad_norm": 1.6618831514082206, + "learning_rate": 2.962962962962963e-06, + "loss": 1.1764, + "step": 80 + }, + { + "epoch": 0.06292800296131779, + "grad_norm": 1.574905727360327, + "learning_rate": 3.1481481481481483e-06, + "loss": 1.215, + "step": 85 + }, + { + "epoch": 0.06662965019433648, + "grad_norm": 1.5752273016779739, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.182, + "step": 90 + }, + { + "epoch": 0.07033129742735518, + "grad_norm": 1.5042772619553242, + "learning_rate": 3.5185185185185187e-06, + "loss": 1.1873, + "step": 95 + }, + { + "epoch": 0.07403294466037387, + "grad_norm": 1.5685723198311041, + "learning_rate": 3.7037037037037037e-06, + "loss": 1.1626, + "step": 100 + }, + { + "epoch": 0.07403294466037387, + "eval_loss": 1.1988961696624756, + "eval_runtime": 8.7616, + "eval_samples_per_second": 58.437, + "eval_steps_per_second": 14.609, + "step": 100 + }, + { + "epoch": 0.07773459189339256, + "grad_norm": 1.6895282160559515, + "learning_rate": 3.88888888888889e-06, + "loss": 1.1575, + "step": 105 + }, + { + "epoch": 0.08143623912641125, + "grad_norm": 1.7296810758427557, + "learning_rate": 4.074074074074074e-06, + "loss": 1.1554, + "step": 110 + }, + { + "epoch": 0.08513788635942994, + "grad_norm": 1.6334197275112947, + "learning_rate": 4.2592592592592596e-06, + "loss": 1.1336, + "step": 115 + }, + { + "epoch": 0.08883953359244864, + "grad_norm": 1.6157911900138535, + "learning_rate": 4.444444444444444e-06, + "loss": 1.1568, + "step": 120 + }, + { + "epoch": 0.09254118082546733, + "grad_norm": 1.819386269418747, + "learning_rate": 4.62962962962963e-06, + "loss": 1.1508, + "step": 125 + }, + { + "epoch": 0.09624282805848602, + "grad_norm": 1.5300312016091477, + "learning_rate": 4.814814814814815e-06, + "loss": 1.1636, + "step": 130 + }, + { + "epoch": 0.09994447529150471, + "grad_norm": 1.6687523332204426, + "learning_rate": 5e-06, + "loss": 1.1528, + "step": 135 + }, + { + "epoch": 0.10364612252452342, + "grad_norm": 1.6037586197720166, + "learning_rate": 5.185185185185185e-06, + "loss": 1.1502, + "step": 140 + }, + { + "epoch": 0.10734776975754211, + "grad_norm": 1.6775275762359088, + "learning_rate": 5.370370370370371e-06, + "loss": 1.1469, + "step": 145 + }, + { + "epoch": 0.1110494169905608, + "grad_norm": 1.682996179151718, + "learning_rate": 5.555555555555557e-06, + "loss": 1.1636, + "step": 150 + }, + { + "epoch": 0.11475106422357949, + "grad_norm": 1.6277070415853683, + "learning_rate": 5.740740740740741e-06, + "loss": 1.1258, + "step": 155 + }, + { + "epoch": 0.11845271145659819, + "grad_norm": 1.553623964811703, + "learning_rate": 5.925925925925926e-06, + "loss": 1.1304, + "step": 160 + }, + { + "epoch": 0.12215435868961688, + "grad_norm": 1.588580332839279, + "learning_rate": 6.111111111111112e-06, + "loss": 1.1439, + "step": 165 + }, + { + "epoch": 0.12585600592263557, + "grad_norm": 1.6616684567469457, + "learning_rate": 6.296296296296297e-06, + "loss": 1.1079, + "step": 170 + }, + { + "epoch": 0.12955765315565426, + "grad_norm": 1.6762897748350518, + "learning_rate": 6.481481481481482e-06, + "loss": 1.0921, + "step": 175 + }, + { + "epoch": 0.13325930038867295, + "grad_norm": 1.6596422210586161, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1092, + "step": 180 + }, + { + "epoch": 0.13696094762169164, + "grad_norm": 1.7722037854995745, + "learning_rate": 6.851851851851853e-06, + "loss": 1.1006, + "step": 185 + }, + { + "epoch": 0.14066259485471036, + "grad_norm": 1.7298048533999484, + "learning_rate": 7.0370370370370375e-06, + "loss": 1.1233, + "step": 190 + }, + { + "epoch": 0.14436424208772905, + "grad_norm": 1.98573216584534, + "learning_rate": 7.222222222222223e-06, + "loss": 1.088, + "step": 195 + }, + { + "epoch": 0.14806588932074774, + "grad_norm": 1.8587833627332044, + "learning_rate": 7.4074074074074075e-06, + "loss": 1.1218, + "step": 200 + }, + { + "epoch": 0.14806588932074774, + "eval_loss": 1.1345889568328857, + "eval_runtime": 11.8208, + "eval_samples_per_second": 43.313, + "eval_steps_per_second": 10.828, + "step": 200 + }, + { + "epoch": 0.15176753655376643, + "grad_norm": 1.7130597917326755, + "learning_rate": 7.592592592592594e-06, + "loss": 1.101, + "step": 205 + }, + { + "epoch": 0.15546918378678512, + "grad_norm": 1.663391530176506, + "learning_rate": 7.77777777777778e-06, + "loss": 1.1329, + "step": 210 + }, + { + "epoch": 0.1591708310198038, + "grad_norm": 1.5996505363015205, + "learning_rate": 7.962962962962963e-06, + "loss": 1.1075, + "step": 215 + }, + { + "epoch": 0.1628724782528225, + "grad_norm": 1.7038076686493917, + "learning_rate": 8.148148148148148e-06, + "loss": 1.1177, + "step": 220 + }, + { + "epoch": 0.1665741254858412, + "grad_norm": 1.6542551343222383, + "learning_rate": 8.333333333333334e-06, + "loss": 1.1093, + "step": 225 + }, + { + "epoch": 0.17027577271885988, + "grad_norm": 1.651873246968653, + "learning_rate": 8.518518518518519e-06, + "loss": 1.0843, + "step": 230 + }, + { + "epoch": 0.1739774199518786, + "grad_norm": 1.7198765328480365, + "learning_rate": 8.703703703703705e-06, + "loss": 1.0937, + "step": 235 + }, + { + "epoch": 0.1776790671848973, + "grad_norm": 1.6510428501911072, + "learning_rate": 8.888888888888888e-06, + "loss": 1.1111, + "step": 240 + }, + { + "epoch": 0.18138071441791598, + "grad_norm": 1.6213365368916086, + "learning_rate": 9.074074074074075e-06, + "loss": 1.0756, + "step": 245 + }, + { + "epoch": 0.18508236165093467, + "grad_norm": 1.7093203475906262, + "learning_rate": 9.25925925925926e-06, + "loss": 1.0694, + "step": 250 + }, + { + "epoch": 0.18878400888395336, + "grad_norm": 1.5228912650464108, + "learning_rate": 9.444444444444445e-06, + "loss": 1.1033, + "step": 255 + }, + { + "epoch": 0.19248565611697205, + "grad_norm": 1.673445899569891, + "learning_rate": 9.62962962962963e-06, + "loss": 1.0678, + "step": 260 + }, + { + "epoch": 0.19618730334999074, + "grad_norm": 1.6304796670467894, + "learning_rate": 9.814814814814815e-06, + "loss": 1.1158, + "step": 265 + }, + { + "epoch": 0.19988895058300943, + "grad_norm": 1.8244867074957254, + "learning_rate": 1e-05, + "loss": 1.1145, + "step": 270 + }, + { + "epoch": 0.20359059781602815, + "grad_norm": 1.6240030719245686, + "learning_rate": 9.999895536228031e-06, + "loss": 1.0802, + "step": 275 + }, + { + "epoch": 0.20729224504904684, + "grad_norm": 1.6243177291419117, + "learning_rate": 9.999582149277188e-06, + "loss": 1.0542, + "step": 280 + }, + { + "epoch": 0.21099389228206553, + "grad_norm": 1.629944565116536, + "learning_rate": 9.999059852242508e-06, + "loss": 1.0698, + "step": 285 + }, + { + "epoch": 0.21469553951508422, + "grad_norm": 1.5777459565221383, + "learning_rate": 9.998328666948437e-06, + "loss": 1.0681, + "step": 290 + }, + { + "epoch": 0.2183971867481029, + "grad_norm": 1.7250795726626251, + "learning_rate": 9.997388623947927e-06, + "loss": 1.0762, + "step": 295 + }, + { + "epoch": 0.2220988339811216, + "grad_norm": 1.9272571692059781, + "learning_rate": 9.996239762521152e-06, + "loss": 1.066, + "step": 300 + }, + { + "epoch": 0.2220988339811216, + "eval_loss": 1.1053733825683594, + "eval_runtime": 10.8836, + "eval_samples_per_second": 47.043, + "eval_steps_per_second": 11.761, + "step": 300 + }, + { + "epoch": 0.2258004812141403, + "grad_norm": 1.5120677355850114, + "learning_rate": 9.994882130673869e-06, + "loss": 1.0493, + "step": 305 + }, + { + "epoch": 0.22950212844715898, + "grad_norm": 1.6029522673182492, + "learning_rate": 9.993315785135417e-06, + "loss": 1.0682, + "step": 310 + }, + { + "epoch": 0.23320377568017767, + "grad_norm": 1.7123312053771258, + "learning_rate": 9.991540791356342e-06, + "loss": 1.0763, + "step": 315 + }, + { + "epoch": 0.23690542291319638, + "grad_norm": 1.5724741182176134, + "learning_rate": 9.989557223505661e-06, + "loss": 1.0401, + "step": 320 + }, + { + "epoch": 0.24060707014621507, + "grad_norm": 1.6045932069501792, + "learning_rate": 9.987365164467767e-06, + "loss": 1.0529, + "step": 325 + }, + { + "epoch": 0.24430871737923376, + "grad_norm": 1.8264353953198955, + "learning_rate": 9.98496470583896e-06, + "loss": 1.1018, + "step": 330 + }, + { + "epoch": 0.24801036461225245, + "grad_norm": 1.6022738272462664, + "learning_rate": 9.98235594792363e-06, + "loss": 1.0571, + "step": 335 + }, + { + "epoch": 0.25171201184527114, + "grad_norm": 1.590245570446662, + "learning_rate": 9.979538999730047e-06, + "loss": 1.0606, + "step": 340 + }, + { + "epoch": 0.25541365907828983, + "grad_norm": 1.6383785467577274, + "learning_rate": 9.976513978965829e-06, + "loss": 1.0662, + "step": 345 + }, + { + "epoch": 0.2591153063113085, + "grad_norm": 1.6184390160064126, + "learning_rate": 9.973281012033009e-06, + "loss": 1.0597, + "step": 350 + }, + { + "epoch": 0.2628169535443272, + "grad_norm": 1.6462129640549557, + "learning_rate": 9.96984023402275e-06, + "loss": 1.0455, + "step": 355 + }, + { + "epoch": 0.2665186007773459, + "grad_norm": 1.5771952400129876, + "learning_rate": 9.966191788709716e-06, + "loss": 1.0928, + "step": 360 + }, + { + "epoch": 0.2702202480103646, + "grad_norm": 1.6788673967555205, + "learning_rate": 9.962335828546049e-06, + "loss": 1.0652, + "step": 365 + }, + { + "epoch": 0.2739218952433833, + "grad_norm": 1.7130930928872348, + "learning_rate": 9.958272514655006e-06, + "loss": 1.0429, + "step": 370 + }, + { + "epoch": 0.277623542476402, + "grad_norm": 1.6689391337155577, + "learning_rate": 9.954002016824226e-06, + "loss": 1.0877, + "step": 375 + }, + { + "epoch": 0.2813251897094207, + "grad_norm": 1.690319301390716, + "learning_rate": 9.949524513498636e-06, + "loss": 1.0795, + "step": 380 + }, + { + "epoch": 0.2850268369424394, + "grad_norm": 1.572172606275347, + "learning_rate": 9.944840191772987e-06, + "loss": 1.0663, + "step": 385 + }, + { + "epoch": 0.2887284841754581, + "grad_norm": 1.5263034631804209, + "learning_rate": 9.939949247384046e-06, + "loss": 1.0851, + "step": 390 + }, + { + "epoch": 0.2924301314084768, + "grad_norm": 1.7211710040759145, + "learning_rate": 9.934851884702415e-06, + "loss": 1.0759, + "step": 395 + }, + { + "epoch": 0.2961317786414955, + "grad_norm": 1.6740011312182885, + "learning_rate": 9.929548316723983e-06, + "loss": 1.0549, + "step": 400 + }, + { + "epoch": 0.2961317786414955, + "eval_loss": 1.0882835388183594, + "eval_runtime": 8.7432, + "eval_samples_per_second": 58.56, + "eval_steps_per_second": 14.64, + "step": 400 + }, + { + "epoch": 0.29983342587451417, + "grad_norm": 1.6831994602911653, + "learning_rate": 9.924038765061042e-06, + "loss": 1.0567, + "step": 405 + }, + { + "epoch": 0.30353507310753286, + "grad_norm": 1.8233912351646488, + "learning_rate": 9.918323459933006e-06, + "loss": 1.043, + "step": 410 + }, + { + "epoch": 0.30723672034055155, + "grad_norm": 1.5346197193130573, + "learning_rate": 9.912402640156812e-06, + "loss": 1.067, + "step": 415 + }, + { + "epoch": 0.31093836757357024, + "grad_norm": 1.6970688002458603, + "learning_rate": 9.906276553136924e-06, + "loss": 1.0554, + "step": 420 + }, + { + "epoch": 0.31464001480658893, + "grad_norm": 1.7448280302843093, + "learning_rate": 9.899945454855007e-06, + "loss": 1.0667, + "step": 425 + }, + { + "epoch": 0.3183416620396076, + "grad_norm": 1.7073354168183785, + "learning_rate": 9.893409609859221e-06, + "loss": 1.0479, + "step": 430 + }, + { + "epoch": 0.3220433092726263, + "grad_norm": 1.5924364453552773, + "learning_rate": 9.886669291253178e-06, + "loss": 1.0556, + "step": 435 + }, + { + "epoch": 0.325744956505645, + "grad_norm": 1.6684326207150462, + "learning_rate": 9.879724780684518e-06, + "loss": 1.0508, + "step": 440 + }, + { + "epoch": 0.3294466037386637, + "grad_norm": 1.6967295571568946, + "learning_rate": 9.872576368333152e-06, + "loss": 1.0651, + "step": 445 + }, + { + "epoch": 0.3331482509716824, + "grad_norm": 1.562036282290576, + "learning_rate": 9.86522435289912e-06, + "loss": 1.0743, + "step": 450 + }, + { + "epoch": 0.33684989820470107, + "grad_norm": 1.6107399617591995, + "learning_rate": 9.857669041590135e-06, + "loss": 1.0358, + "step": 455 + }, + { + "epoch": 0.34055154543771976, + "grad_norm": 1.6404525122379863, + "learning_rate": 9.849910750108718e-06, + "loss": 1.0306, + "step": 460 + }, + { + "epoch": 0.3442531926707385, + "grad_norm": 1.5534884160132494, + "learning_rate": 9.841949802639031e-06, + "loss": 1.0229, + "step": 465 + }, + { + "epoch": 0.3479548399037572, + "grad_norm": 1.6683504420781063, + "learning_rate": 9.833786531833311e-06, + "loss": 1.0499, + "step": 470 + }, + { + "epoch": 0.3516564871367759, + "grad_norm": 1.6125669309723765, + "learning_rate": 9.825421278797984e-06, + "loss": 1.055, + "step": 475 + }, + { + "epoch": 0.3553581343697946, + "grad_norm": 1.5428815014911474, + "learning_rate": 9.816854393079402e-06, + "loss": 1.0221, + "step": 480 + }, + { + "epoch": 0.35905978160281327, + "grad_norm": 1.617284949544144, + "learning_rate": 9.808086232649246e-06, + "loss": 1.033, + "step": 485 + }, + { + "epoch": 0.36276142883583196, + "grad_norm": 1.5951767676523292, + "learning_rate": 9.79911716388956e-06, + "loss": 1.0782, + "step": 490 + }, + { + "epoch": 0.36646307606885065, + "grad_norm": 1.6581136429430168, + "learning_rate": 9.789947561577445e-06, + "loss": 1.0635, + "step": 495 + }, + { + "epoch": 0.37016472330186934, + "grad_norm": 1.6065188309294918, + "learning_rate": 9.7805778088694e-06, + "loss": 1.0669, + "step": 500 + }, + { + "epoch": 0.37016472330186934, + "eval_loss": 1.0748300552368164, + "eval_runtime": 8.6678, + "eval_samples_per_second": 59.07, + "eval_steps_per_second": 14.767, + "step": 500 + }, + { + "epoch": 0.373866370534888, + "grad_norm": 1.6616285045031425, + "learning_rate": 9.771008297285307e-06, + "loss": 1.0263, + "step": 505 + }, + { + "epoch": 0.3775680177679067, + "grad_norm": 1.6890553974206772, + "learning_rate": 9.761239426692077e-06, + "loss": 1.0379, + "step": 510 + }, + { + "epoch": 0.3812696650009254, + "grad_norm": 1.642371294327726, + "learning_rate": 9.75127160528694e-06, + "loss": 1.0363, + "step": 515 + }, + { + "epoch": 0.3849713122339441, + "grad_norm": 1.609971736170351, + "learning_rate": 9.741105249580383e-06, + "loss": 1.0148, + "step": 520 + }, + { + "epoch": 0.3886729594669628, + "grad_norm": 1.5388625837247312, + "learning_rate": 9.730740784378755e-06, + "loss": 1.045, + "step": 525 + }, + { + "epoch": 0.3923746066999815, + "grad_norm": 1.5888188137718782, + "learning_rate": 9.7201786427665e-06, + "loss": 1.0341, + "step": 530 + }, + { + "epoch": 0.39607625393300017, + "grad_norm": 1.4974736547619503, + "learning_rate": 9.709419266088086e-06, + "loss": 1.0573, + "step": 535 + }, + { + "epoch": 0.39977790116601886, + "grad_norm": 1.6247908776077464, + "learning_rate": 9.698463103929542e-06, + "loss": 1.0545, + "step": 540 + }, + { + "epoch": 0.40347954839903755, + "grad_norm": 1.6041126674957051, + "learning_rate": 9.687310614099676e-06, + "loss": 1.0513, + "step": 545 + }, + { + "epoch": 0.4071811956320563, + "grad_norm": 1.6658902497560741, + "learning_rate": 9.67596226261095e-06, + "loss": 1.031, + "step": 550 + }, + { + "epoch": 0.410882842865075, + "grad_norm": 1.6137694638926725, + "learning_rate": 9.664418523660004e-06, + "loss": 1.0359, + "step": 555 + }, + { + "epoch": 0.41458449009809367, + "grad_norm": 1.6095305212035607, + "learning_rate": 9.652679879607843e-06, + "loss": 1.0798, + "step": 560 + }, + { + "epoch": 0.41828613733111236, + "grad_norm": 3.8514579839708425, + "learning_rate": 9.640746820959684e-06, + "loss": 1.0199, + "step": 565 + }, + { + "epoch": 0.42198778456413105, + "grad_norm": 1.5890073416005945, + "learning_rate": 9.628619846344453e-06, + "loss": 1.0462, + "step": 570 + }, + { + "epoch": 0.42568943179714974, + "grad_norm": 1.510801346821129, + "learning_rate": 9.616299462493952e-06, + "loss": 1.0209, + "step": 575 + }, + { + "epoch": 0.42939107903016843, + "grad_norm": 1.7546849226440295, + "learning_rate": 9.603786184221693e-06, + "loss": 1.0521, + "step": 580 + }, + { + "epoch": 0.4330927262631871, + "grad_norm": 1.6762272283133195, + "learning_rate": 9.591080534401371e-06, + "loss": 1.0218, + "step": 585 + }, + { + "epoch": 0.4367943734962058, + "grad_norm": 1.6662530229421226, + "learning_rate": 9.578183043945031e-06, + "loss": 1.0593, + "step": 590 + }, + { + "epoch": 0.4404960207292245, + "grad_norm": 1.6167637404172526, + "learning_rate": 9.565094251780872e-06, + "loss": 1.069, + "step": 595 + }, + { + "epoch": 0.4441976679622432, + "grad_norm": 1.5708072893669605, + "learning_rate": 9.551814704830734e-06, + "loss": 1.0228, + "step": 600 + }, + { + "epoch": 0.4441976679622432, + "eval_loss": 1.064416766166687, + "eval_runtime": 8.755, + "eval_samples_per_second": 58.481, + "eval_steps_per_second": 14.62, + "step": 600 + }, + { + "epoch": 0.4478993151952619, + "grad_norm": 1.7242405674867898, + "learning_rate": 9.538344957987245e-06, + "loss": 1.0317, + "step": 605 + }, + { + "epoch": 0.4516009624282806, + "grad_norm": 1.6824094274691348, + "learning_rate": 9.524685574090627e-06, + "loss": 1.0295, + "step": 610 + }, + { + "epoch": 0.45530260966129926, + "grad_norm": 1.839445106245174, + "learning_rate": 9.51083712390519e-06, + "loss": 1.0313, + "step": 615 + }, + { + "epoch": 0.45900425689431795, + "grad_norm": 1.5181275348104526, + "learning_rate": 9.496800186095466e-06, + "loss": 1.0494, + "step": 620 + }, + { + "epoch": 0.46270590412733664, + "grad_norm": 1.6758656485484829, + "learning_rate": 9.482575347202047e-06, + "loss": 1.0054, + "step": 625 + }, + { + "epoch": 0.46640755136035533, + "grad_norm": 1.522464483917177, + "learning_rate": 9.468163201617063e-06, + "loss": 1.0111, + "step": 630 + }, + { + "epoch": 0.4701091985933741, + "grad_norm": 1.5939009948400995, + "learning_rate": 9.453564351559348e-06, + "loss": 1.0108, + "step": 635 + }, + { + "epoch": 0.47381084582639277, + "grad_norm": 1.7148818415747054, + "learning_rate": 9.438779407049282e-06, + "loss": 1.0196, + "step": 640 + }, + { + "epoch": 0.47751249305941146, + "grad_norm": 1.6494094050372314, + "learning_rate": 9.423808985883289e-06, + "loss": 1.0287, + "step": 645 + }, + { + "epoch": 0.48121414029243015, + "grad_norm": 1.6800798770987615, + "learning_rate": 9.40865371360804e-06, + "loss": 1.0222, + "step": 650 + }, + { + "epoch": 0.48491578752544884, + "grad_norm": 1.6487616779870438, + "learning_rate": 9.393314223494297e-06, + "loss": 1.0468, + "step": 655 + }, + { + "epoch": 0.48861743475846753, + "grad_norm": 1.5971477827325937, + "learning_rate": 9.377791156510456e-06, + "loss": 1.0257, + "step": 660 + }, + { + "epoch": 0.4923190819914862, + "grad_norm": 1.6349785933485341, + "learning_rate": 9.362085161295768e-06, + "loss": 1.0143, + "step": 665 + }, + { + "epoch": 0.4960207292245049, + "grad_norm": 1.7836047329450633, + "learning_rate": 9.346196894133239e-06, + "loss": 1.0279, + "step": 670 + }, + { + "epoch": 0.4997223764575236, + "grad_norm": 1.5554656111349356, + "learning_rate": 9.330127018922195e-06, + "loss": 1.025, + "step": 675 + }, + { + "epoch": 0.5034240236905423, + "grad_norm": 1.5873567496879941, + "learning_rate": 9.313876207150544e-06, + "loss": 1.0369, + "step": 680 + }, + { + "epoch": 0.507125670923561, + "grad_norm": 1.4901436339732956, + "learning_rate": 9.297445137866726e-06, + "loss": 1.0138, + "step": 685 + }, + { + "epoch": 0.5108273181565797, + "grad_norm": 1.5964192406854771, + "learning_rate": 9.280834497651334e-06, + "loss": 1.0254, + "step": 690 + }, + { + "epoch": 0.5145289653895984, + "grad_norm": 1.6347585768126325, + "learning_rate": 9.264044980588415e-06, + "loss": 1.0491, + "step": 695 + }, + { + "epoch": 0.518230612622617, + "grad_norm": 1.5001440241949608, + "learning_rate": 9.247077288236488e-06, + "loss": 1.0423, + "step": 700 + }, + { + "epoch": 0.518230612622617, + "eval_loss": 1.0569534301757812, + "eval_runtime": 8.6934, + "eval_samples_per_second": 58.895, + "eval_steps_per_second": 14.724, + "step": 700 + }, + { + "epoch": 0.5219322598556357, + "grad_norm": 1.6942056219401487, + "learning_rate": 9.229932129599206e-06, + "loss": 1.0227, + "step": 705 + }, + { + "epoch": 0.5256339070886544, + "grad_norm": 1.575360988478343, + "learning_rate": 9.212610221095748e-06, + "loss": 1.0232, + "step": 710 + }, + { + "epoch": 0.5293355543216731, + "grad_norm": 1.5434758990559243, + "learning_rate": 9.195112286530874e-06, + "loss": 0.9868, + "step": 715 + }, + { + "epoch": 0.5330372015546918, + "grad_norm": 1.5834059602030823, + "learning_rate": 9.177439057064684e-06, + "loss": 1.0066, + "step": 720 + }, + { + "epoch": 0.5367388487877105, + "grad_norm": 1.5583588394086199, + "learning_rate": 9.159591271182058e-06, + "loss": 1.0504, + "step": 725 + }, + { + "epoch": 0.5404404960207292, + "grad_norm": 1.5241057918795764, + "learning_rate": 9.141569674661816e-06, + "loss": 1.0459, + "step": 730 + }, + { + "epoch": 0.5441421432537479, + "grad_norm": 1.6179959479360446, + "learning_rate": 9.123375020545534e-06, + "loss": 1.0304, + "step": 735 + }, + { + "epoch": 0.5478437904867666, + "grad_norm": 1.7974028021647042, + "learning_rate": 9.105008069106093e-06, + "loss": 1.0442, + "step": 740 + }, + { + "epoch": 0.5515454377197853, + "grad_norm": 1.649050893416802, + "learning_rate": 9.086469587815904e-06, + "loss": 1.0086, + "step": 745 + }, + { + "epoch": 0.555247084952804, + "grad_norm": 1.614894669068509, + "learning_rate": 9.067760351314838e-06, + "loss": 1.0469, + "step": 750 + }, + { + "epoch": 0.5589487321858226, + "grad_norm": 1.520568884970898, + "learning_rate": 9.048881141377863e-06, + "loss": 1.0086, + "step": 755 + }, + { + "epoch": 0.5626503794188414, + "grad_norm": 1.5551225273423361, + "learning_rate": 9.029832746882372e-06, + "loss": 1.0065, + "step": 760 + }, + { + "epoch": 0.5663520266518601, + "grad_norm": 1.4730472068014824, + "learning_rate": 9.01061596377522e-06, + "loss": 1.0281, + "step": 765 + }, + { + "epoch": 0.5700536738848788, + "grad_norm": 1.6014116272335734, + "learning_rate": 8.991231595039464e-06, + "loss": 1.0151, + "step": 770 + }, + { + "epoch": 0.5737553211178975, + "grad_norm": 1.5967294946956374, + "learning_rate": 8.97168045066082e-06, + "loss": 1.0046, + "step": 775 + }, + { + "epoch": 0.5774569683509162, + "grad_norm": 1.506719942875412, + "learning_rate": 8.951963347593797e-06, + "loss": 0.9995, + "step": 780 + }, + { + "epoch": 0.5811586155839349, + "grad_norm": 1.6981243082186395, + "learning_rate": 8.932081109727582e-06, + "loss": 1.0185, + "step": 785 + }, + { + "epoch": 0.5848602628169536, + "grad_norm": 1.5548965922799378, + "learning_rate": 8.9120345678516e-06, + "loss": 1.035, + "step": 790 + }, + { + "epoch": 0.5885619100499723, + "grad_norm": 1.4696827174157245, + "learning_rate": 8.891824559620801e-06, + "loss": 0.9988, + "step": 795 + }, + { + "epoch": 0.592263557282991, + "grad_norm": 1.6048014454677177, + "learning_rate": 8.871451929520662e-06, + "loss": 1.0263, + "step": 800 + }, + { + "epoch": 0.592263557282991, + "eval_loss": 1.0503827333450317, + "eval_runtime": 8.6982, + "eval_samples_per_second": 58.863, + "eval_steps_per_second": 14.716, + "step": 800 + }, + { + "epoch": 0.5959652045160097, + "grad_norm": 1.6988170070957813, + "learning_rate": 8.8509175288319e-06, + "loss": 1.0232, + "step": 805 + }, + { + "epoch": 0.5996668517490283, + "grad_norm": 1.5659115399712737, + "learning_rate": 8.83022221559489e-06, + "loss": 1.0354, + "step": 810 + }, + { + "epoch": 0.603368498982047, + "grad_norm": 1.5221814047695017, + "learning_rate": 8.80936685457383e-06, + "loss": 1.0333, + "step": 815 + }, + { + "epoch": 0.6070701462150657, + "grad_norm": 1.562353756566854, + "learning_rate": 8.78835231722059e-06, + "loss": 1.0382, + "step": 820 + }, + { + "epoch": 0.6107717934480844, + "grad_norm": 1.490360737763961, + "learning_rate": 8.767179481638303e-06, + "loss": 1.0184, + "step": 825 + }, + { + "epoch": 0.6144734406811031, + "grad_norm": 1.5813235550962066, + "learning_rate": 8.74584923254468e-06, + "loss": 1.0046, + "step": 830 + }, + { + "epoch": 0.6181750879141218, + "grad_norm": 1.618903249194125, + "learning_rate": 8.72436246123503e-06, + "loss": 0.9847, + "step": 835 + }, + { + "epoch": 0.6218767351471405, + "grad_norm": 1.6051868684169093, + "learning_rate": 8.702720065545024e-06, + "loss": 1.0241, + "step": 840 + }, + { + "epoch": 0.6255783823801592, + "grad_norm": 1.6080381216679098, + "learning_rate": 8.680922949813177e-06, + "loss": 1.0455, + "step": 845 + }, + { + "epoch": 0.6292800296131779, + "grad_norm": 1.5398170796291812, + "learning_rate": 8.658972024843063e-06, + "loss": 1.0123, + "step": 850 + }, + { + "epoch": 0.6329816768461966, + "grad_norm": 1.5079959968532555, + "learning_rate": 8.636868207865244e-06, + "loss": 1.0012, + "step": 855 + }, + { + "epoch": 0.6366833240792152, + "grad_norm": 1.5108165451129463, + "learning_rate": 8.614612422498965e-06, + "loss": 0.987, + "step": 860 + }, + { + "epoch": 0.6403849713122339, + "grad_norm": 1.5072453921526188, + "learning_rate": 8.592205598713539e-06, + "loss": 1.0063, + "step": 865 + }, + { + "epoch": 0.6440866185452526, + "grad_norm": 1.5236818405854384, + "learning_rate": 8.569648672789496e-06, + "loss": 1.0531, + "step": 870 + }, + { + "epoch": 0.6477882657782713, + "grad_norm": 1.6365347434254418, + "learning_rate": 8.546942587279465e-06, + "loss": 1.0125, + "step": 875 + }, + { + "epoch": 0.65148991301129, + "grad_norm": 1.4635136559877178, + "learning_rate": 8.524088290968781e-06, + "loss": 1.0235, + "step": 880 + }, + { + "epoch": 0.6551915602443087, + "grad_norm": 1.6403726541339527, + "learning_rate": 8.501086738835843e-06, + "loss": 1.017, + "step": 885 + }, + { + "epoch": 0.6588932074773274, + "grad_norm": 1.593396456570902, + "learning_rate": 8.477938892012209e-06, + "loss": 1.0169, + "step": 890 + }, + { + "epoch": 0.6625948547103461, + "grad_norm": 1.5617836951270059, + "learning_rate": 8.45464571774244e-06, + "loss": 1.0362, + "step": 895 + }, + { + "epoch": 0.6662965019433648, + "grad_norm": 1.5971242667528898, + "learning_rate": 8.43120818934367e-06, + "loss": 1.0085, + "step": 900 + }, + { + "epoch": 0.6662965019433648, + "eval_loss": 1.0438976287841797, + "eval_runtime": 8.7306, + "eval_samples_per_second": 58.644, + "eval_steps_per_second": 14.661, + "step": 900 + }, + { + "epoch": 0.6699981491763835, + "grad_norm": 1.515185532674787, + "learning_rate": 8.407627286164948e-06, + "loss": 1.0029, + "step": 905 + }, + { + "epoch": 0.6736997964094021, + "grad_norm": 1.5243850755391344, + "learning_rate": 8.38390399354631e-06, + "loss": 1.0266, + "step": 910 + }, + { + "epoch": 0.6774014436424208, + "grad_norm": 1.580588659304687, + "learning_rate": 8.360039302777614e-06, + "loss": 1.028, + "step": 915 + }, + { + "epoch": 0.6811030908754395, + "grad_norm": 1.5511561732012566, + "learning_rate": 8.336034211057098e-06, + "loss": 1.0203, + "step": 920 + }, + { + "epoch": 0.6848047381084582, + "grad_norm": 1.5317152606248665, + "learning_rate": 8.31188972144974e-06, + "loss": 1.0222, + "step": 925 + }, + { + "epoch": 0.688506385341477, + "grad_norm": 1.640160330004564, + "learning_rate": 8.28760684284532e-06, + "loss": 0.9964, + "step": 930 + }, + { + "epoch": 0.6922080325744957, + "grad_norm": 1.5621396147571478, + "learning_rate": 8.263186589916273e-06, + "loss": 1.0085, + "step": 935 + }, + { + "epoch": 0.6959096798075144, + "grad_norm": 1.4901980514863815, + "learning_rate": 8.238629983075296e-06, + "loss": 1.0224, + "step": 940 + }, + { + "epoch": 0.6996113270405331, + "grad_norm": 1.5553443210091746, + "learning_rate": 8.213938048432697e-06, + "loss": 1.0073, + "step": 945 + }, + { + "epoch": 0.7033129742735518, + "grad_norm": 1.5439376225728318, + "learning_rate": 8.18911181775353e-06, + "loss": 1.0071, + "step": 950 + }, + { + "epoch": 0.7070146215065705, + "grad_norm": 1.5106567899403467, + "learning_rate": 8.164152328414476e-06, + "loss": 1.0113, + "step": 955 + }, + { + "epoch": 0.7107162687395892, + "grad_norm": 1.6943973028544896, + "learning_rate": 8.139060623360494e-06, + "loss": 0.9877, + "step": 960 + }, + { + "epoch": 0.7144179159726078, + "grad_norm": 1.6055362835498677, + "learning_rate": 8.113837751061246e-06, + "loss": 1.003, + "step": 965 + }, + { + "epoch": 0.7181195632056265, + "grad_norm": 1.6279162631191268, + "learning_rate": 8.088484765467286e-06, + "loss": 1.0049, + "step": 970 + }, + { + "epoch": 0.7218212104386452, + "grad_norm": 1.6604430854839634, + "learning_rate": 8.063002725966014e-06, + "loss": 1.0049, + "step": 975 + }, + { + "epoch": 0.7255228576716639, + "grad_norm": 1.5017388079752865, + "learning_rate": 8.037392697337418e-06, + "loss": 1.0115, + "step": 980 + }, + { + "epoch": 0.7292245049046826, + "grad_norm": 1.6030200207735454, + "learning_rate": 8.011655749709575e-06, + "loss": 1.0044, + "step": 985 + }, + { + "epoch": 0.7329261521377013, + "grad_norm": 1.6579368837511645, + "learning_rate": 7.985792958513932e-06, + "loss": 1.0342, + "step": 990 + }, + { + "epoch": 0.73662779937072, + "grad_norm": 1.5292653573625021, + "learning_rate": 7.95980540444038e-06, + "loss": 1.011, + "step": 995 + }, + { + "epoch": 0.7403294466037387, + "grad_norm": 1.681691332550864, + "learning_rate": 7.93369417339209e-06, + "loss": 0.9926, + "step": 1000 + }, + { + "epoch": 0.7403294466037387, + "eval_loss": 1.0382704734802246, + "eval_runtime": 8.6988, + "eval_samples_per_second": 58.859, + "eval_steps_per_second": 14.715, + "step": 1000 + }, + { + "epoch": 0.7440310938367574, + "grad_norm": 1.6752177251862725, + "learning_rate": 7.907460356440133e-06, + "loss": 0.982, + "step": 1005 + }, + { + "epoch": 0.747732741069776, + "grad_norm": 1.6158791648303268, + "learning_rate": 7.881105049777902e-06, + "loss": 1.0065, + "step": 1010 + }, + { + "epoch": 0.7514343883027947, + "grad_norm": 1.5008165210546065, + "learning_rate": 7.854629354675292e-06, + "loss": 0.9998, + "step": 1015 + }, + { + "epoch": 0.7551360355358134, + "grad_norm": 1.5488208344349068, + "learning_rate": 7.828034377432694e-06, + "loss": 1.0172, + "step": 1020 + }, + { + "epoch": 0.7588376827688321, + "grad_norm": 1.469701423604682, + "learning_rate": 7.801321229334764e-06, + "loss": 1.0002, + "step": 1025 + }, + { + "epoch": 0.7625393300018508, + "grad_norm": 1.4531574430918555, + "learning_rate": 7.774491026603985e-06, + "loss": 1.0055, + "step": 1030 + }, + { + "epoch": 0.7662409772348695, + "grad_norm": 1.6900598064531231, + "learning_rate": 7.747544890354031e-06, + "loss": 1.0192, + "step": 1035 + }, + { + "epoch": 0.7699426244678882, + "grad_norm": 1.558014346947998, + "learning_rate": 7.720483946542913e-06, + "loss": 1.0013, + "step": 1040 + }, + { + "epoch": 0.7736442717009069, + "grad_norm": 1.5500371825585382, + "learning_rate": 7.69330932592594e-06, + "loss": 0.9878, + "step": 1045 + }, + { + "epoch": 0.7773459189339256, + "grad_norm": 1.5367469436528767, + "learning_rate": 7.666022164008458e-06, + "loss": 0.9858, + "step": 1050 + }, + { + "epoch": 0.7810475661669443, + "grad_norm": 1.4601423824350541, + "learning_rate": 7.638623600998409e-06, + "loss": 1.0007, + "step": 1055 + }, + { + "epoch": 0.784749213399963, + "grad_norm": 1.648616874802218, + "learning_rate": 7.6111147817586925e-06, + "loss": 0.9931, + "step": 1060 + }, + { + "epoch": 0.7884508606329816, + "grad_norm": 1.5849178244300517, + "learning_rate": 7.5834968557593155e-06, + "loss": 1.0205, + "step": 1065 + }, + { + "epoch": 0.7921525078660003, + "grad_norm": 1.5867119629960735, + "learning_rate": 7.5557709770293664e-06, + "loss": 1.0306, + "step": 1070 + }, + { + "epoch": 0.795854155099019, + "grad_norm": 1.5641079232184893, + "learning_rate": 7.527938304108795e-06, + "loss": 1.0229, + "step": 1075 + }, + { + "epoch": 0.7995558023320377, + "grad_norm": 1.5590165376402654, + "learning_rate": 7.500000000000001e-06, + "loss": 0.9696, + "step": 1080 + }, + { + "epoch": 0.8032574495650564, + "grad_norm": 1.4736677882280342, + "learning_rate": 7.471957232119235e-06, + "loss": 0.9958, + "step": 1085 + }, + { + "epoch": 0.8069590967980751, + "grad_norm": 1.495242221963721, + "learning_rate": 7.443811172247822e-06, + "loss": 0.9797, + "step": 1090 + }, + { + "epoch": 0.8106607440310938, + "grad_norm": 1.5646379206160475, + "learning_rate": 7.415562996483193e-06, + "loss": 1.0331, + "step": 1095 + }, + { + "epoch": 0.8143623912641126, + "grad_norm": 1.4867013854186002, + "learning_rate": 7.387213885189746e-06, + "loss": 0.9972, + "step": 1100 + }, + { + "epoch": 0.8143623912641126, + "eval_loss": 1.0345816612243652, + "eval_runtime": 8.7672, + "eval_samples_per_second": 58.399, + "eval_steps_per_second": 14.6, + "step": 1100 + }, + { + "epoch": 0.8180640384971313, + "grad_norm": 1.6986654279574274, + "learning_rate": 7.358765022949519e-06, + "loss": 1.0015, + "step": 1105 + }, + { + "epoch": 0.82176568573015, + "grad_norm": 1.4979227472262977, + "learning_rate": 7.330217598512696e-06, + "loss": 0.9918, + "step": 1110 + }, + { + "epoch": 0.8254673329631687, + "grad_norm": 1.4409780655786124, + "learning_rate": 7.30157280474793e-06, + "loss": 0.9759, + "step": 1115 + }, + { + "epoch": 0.8291689801961873, + "grad_norm": 1.4674103925105133, + "learning_rate": 7.2728318385925035e-06, + "loss": 0.9865, + "step": 1120 + }, + { + "epoch": 0.832870627429206, + "grad_norm": 1.605252575982056, + "learning_rate": 7.243995901002312e-06, + "loss": 0.9832, + "step": 1125 + }, + { + "epoch": 0.8365722746622247, + "grad_norm": 1.68151860441419, + "learning_rate": 7.215066196901676e-06, + "loss": 1.0105, + "step": 1130 + }, + { + "epoch": 0.8402739218952434, + "grad_norm": 1.5248287910415732, + "learning_rate": 7.186043935133005e-06, + "loss": 1.0174, + "step": 1135 + }, + { + "epoch": 0.8439755691282621, + "grad_norm": 1.5354066430867073, + "learning_rate": 7.156930328406268e-06, + "loss": 0.9881, + "step": 1140 + }, + { + "epoch": 0.8476772163612808, + "grad_norm": 1.5525671237362146, + "learning_rate": 7.127726593248337e-06, + "loss": 0.9895, + "step": 1145 + }, + { + "epoch": 0.8513788635942995, + "grad_norm": 1.5174388784248503, + "learning_rate": 7.098433949952146e-06, + "loss": 0.9887, + "step": 1150 + }, + { + "epoch": 0.8550805108273182, + "grad_norm": 1.5315674808308293, + "learning_rate": 7.069053622525697e-06, + "loss": 0.9502, + "step": 1155 + }, + { + "epoch": 0.8587821580603369, + "grad_norm": 1.4149587100662266, + "learning_rate": 7.039586838640918e-06, + "loss": 1.0154, + "step": 1160 + }, + { + "epoch": 0.8624838052933556, + "grad_norm": 1.5648914937353318, + "learning_rate": 7.0100348295823706e-06, + "loss": 0.988, + "step": 1165 + }, + { + "epoch": 0.8661854525263742, + "grad_norm": 1.5539675815083542, + "learning_rate": 6.980398830195785e-06, + "loss": 1.0112, + "step": 1170 + }, + { + "epoch": 0.8698870997593929, + "grad_norm": 1.421648283396261, + "learning_rate": 6.950680078836475e-06, + "loss": 1.0135, + "step": 1175 + }, + { + "epoch": 0.8735887469924116, + "grad_norm": 1.4464001727685705, + "learning_rate": 6.920879817317588e-06, + "loss": 0.9909, + "step": 1180 + }, + { + "epoch": 0.8772903942254303, + "grad_norm": 1.4937571669427057, + "learning_rate": 6.890999290858213e-06, + "loss": 0.9702, + "step": 1185 + }, + { + "epoch": 0.880992041458449, + "grad_norm": 1.4509930616773992, + "learning_rate": 6.861039748031351e-06, + "loss": 1.0, + "step": 1190 + }, + { + "epoch": 0.8846936886914677, + "grad_norm": 1.4945840852689367, + "learning_rate": 6.8310024407117405e-06, + "loss": 1.0049, + "step": 1195 + }, + { + "epoch": 0.8883953359244864, + "grad_norm": 1.4989120949638919, + "learning_rate": 6.800888624023552e-06, + "loss": 0.9729, + "step": 1200 + }, + { + "epoch": 0.8883953359244864, + "eval_loss": 1.0306613445281982, + "eval_runtime": 8.685, + "eval_samples_per_second": 58.952, + "eval_steps_per_second": 14.738, + "step": 1200 + }, + { + "epoch": 0.8920969831575051, + "grad_norm": 1.4664191520171954, + "learning_rate": 6.770699556287939e-06, + "loss": 1.0008, + "step": 1205 + }, + { + "epoch": 0.8957986303905238, + "grad_norm": 1.5754399133388788, + "learning_rate": 6.740436498970453e-06, + "loss": 1.0126, + "step": 1210 + }, + { + "epoch": 0.8995002776235425, + "grad_norm": 1.60094861067482, + "learning_rate": 6.710100716628345e-06, + "loss": 1.0, + "step": 1215 + }, + { + "epoch": 0.9032019248565611, + "grad_norm": 1.4595385666093144, + "learning_rate": 6.679693476857712e-06, + "loss": 0.9861, + "step": 1220 + }, + { + "epoch": 0.9069035720895798, + "grad_norm": 1.5022484134921414, + "learning_rate": 6.649216050240539e-06, + "loss": 1.0129, + "step": 1225 + }, + { + "epoch": 0.9106052193225985, + "grad_norm": 1.4991474920735144, + "learning_rate": 6.618669710291607e-06, + "loss": 0.9906, + "step": 1230 + }, + { + "epoch": 0.9143068665556172, + "grad_norm": 1.9098143096411326, + "learning_rate": 6.588055733405266e-06, + "loss": 0.996, + "step": 1235 + }, + { + "epoch": 0.9180085137886359, + "grad_norm": 1.582034170801525, + "learning_rate": 6.557375398802124e-06, + "loss": 1.01, + "step": 1240 + }, + { + "epoch": 0.9217101610216546, + "grad_norm": 1.541103799700962, + "learning_rate": 6.526629988475567e-06, + "loss": 0.9986, + "step": 1245 + }, + { + "epoch": 0.9254118082546733, + "grad_norm": 1.7275592833521085, + "learning_rate": 6.495820787138209e-06, + "loss": 1.0112, + "step": 1250 + }, + { + "epoch": 0.929113455487692, + "grad_norm": 1.6073614158652252, + "learning_rate": 6.4649490821682035e-06, + "loss": 1.0329, + "step": 1255 + }, + { + "epoch": 0.9328151027207107, + "grad_norm": 1.520357101228401, + "learning_rate": 6.434016163555452e-06, + "loss": 0.9895, + "step": 1260 + }, + { + "epoch": 0.9365167499537294, + "grad_norm": 1.5144821625571732, + "learning_rate": 6.403023323847695e-06, + "loss": 0.9797, + "step": 1265 + }, + { + "epoch": 0.9402183971867482, + "grad_norm": 1.6223270323749646, + "learning_rate": 6.371971858096509e-06, + "loss": 1.0114, + "step": 1270 + }, + { + "epoch": 0.9439200444197668, + "grad_norm": 1.4650778858538542, + "learning_rate": 6.340863063803187e-06, + "loss": 1.0004, + "step": 1275 + }, + { + "epoch": 0.9476216916527855, + "grad_norm": 1.7363358348857403, + "learning_rate": 6.30969824086453e-06, + "loss": 0.9973, + "step": 1280 + }, + { + "epoch": 0.9513233388858042, + "grad_norm": 1.5673091008974074, + "learning_rate": 6.278478691518519e-06, + "loss": 0.9903, + "step": 1285 + }, + { + "epoch": 0.9550249861188229, + "grad_norm": 1.5737677986125558, + "learning_rate": 6.247205720289907e-06, + "loss": 1.0283, + "step": 1290 + }, + { + "epoch": 0.9587266333518416, + "grad_norm": 1.490823774845133, + "learning_rate": 6.215880633935709e-06, + "loss": 1.005, + "step": 1295 + }, + { + "epoch": 0.9624282805848603, + "grad_norm": 1.5221265311599004, + "learning_rate": 6.184504741390596e-06, + "loss": 1.016, + "step": 1300 + }, + { + "epoch": 0.9624282805848603, + "eval_loss": 1.0261740684509277, + "eval_runtime": 8.6903, + "eval_samples_per_second": 58.916, + "eval_steps_per_second": 14.729, + "step": 1300 + }, + { + "epoch": 0.966129927817879, + "grad_norm": 1.5497782760360854, + "learning_rate": 6.153079353712201e-06, + "loss": 0.9538, + "step": 1305 + }, + { + "epoch": 0.9698315750508977, + "grad_norm": 1.4891969599026476, + "learning_rate": 6.121605784026339e-06, + "loss": 1.0251, + "step": 1310 + }, + { + "epoch": 0.9735332222839164, + "grad_norm": 1.5148913492479763, + "learning_rate": 6.09008534747213e-06, + "loss": 0.9565, + "step": 1315 + }, + { + "epoch": 0.9772348695169351, + "grad_norm": 1.4933554149502895, + "learning_rate": 6.058519361147055e-06, + "loss": 0.9968, + "step": 1320 + }, + { + "epoch": 0.9809365167499537, + "grad_norm": 1.4313288190909095, + "learning_rate": 6.02690914405191e-06, + "loss": 0.9423, + "step": 1325 + }, + { + "epoch": 0.9846381639829724, + "grad_norm": 1.5414508677830214, + "learning_rate": 5.995256017035703e-06, + "loss": 0.9976, + "step": 1330 + }, + { + "epoch": 0.9883398112159911, + "grad_norm": 1.5902685807729529, + "learning_rate": 5.9635613027404495e-06, + "loss": 1.0087, + "step": 1335 + }, + { + "epoch": 0.9920414584490098, + "grad_norm": 1.4804073390294674, + "learning_rate": 5.931826325545912e-06, + "loss": 0.9951, + "step": 1340 + }, + { + "epoch": 0.9957431056820285, + "grad_norm": 1.5203977933330328, + "learning_rate": 5.900052411514257e-06, + "loss": 0.9927, + "step": 1345 + }, + { + "epoch": 0.9994447529150472, + "grad_norm": 1.4592290182162446, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.9849, + "step": 1350 + }, + { + "epoch": 1.0037016472330187, + "grad_norm": 1.6078346951337656, + "learning_rate": 5.836393085267777e-06, + "loss": 1.1427, + "step": 1355 + }, + { + "epoch": 1.0074032944660374, + "grad_norm": 1.6427326520740766, + "learning_rate": 5.804510333090287e-06, + "loss": 0.952, + "step": 1360 + }, + { + "epoch": 1.011104941699056, + "grad_norm": 1.5456535118472365, + "learning_rate": 5.772593964039203e-06, + "loss": 0.9688, + "step": 1365 + }, + { + "epoch": 1.0148065889320748, + "grad_norm": 1.5530197411372806, + "learning_rate": 5.740645311756246e-06, + "loss": 0.964, + "step": 1370 + }, + { + "epoch": 1.0185082361650935, + "grad_norm": 1.4947763198029176, + "learning_rate": 5.708665711232103e-06, + "loss": 0.9444, + "step": 1375 + }, + { + "epoch": 1.0222098833981121, + "grad_norm": 1.494845190296128, + "learning_rate": 5.6766564987506564e-06, + "loss": 0.9677, + "step": 1380 + }, + { + "epoch": 1.0259115306311308, + "grad_norm": 1.4684775854511547, + "learning_rate": 5.644619011833134e-06, + "loss": 0.9446, + "step": 1385 + }, + { + "epoch": 1.0296131778641495, + "grad_norm": 1.5375658212489818, + "learning_rate": 5.612554589182228e-06, + "loss": 0.9389, + "step": 1390 + }, + { + "epoch": 1.0333148250971682, + "grad_norm": 1.5529509671731727, + "learning_rate": 5.5804645706261515e-06, + "loss": 0.9572, + "step": 1395 + }, + { + "epoch": 1.037016472330187, + "grad_norm": 1.5523403019399546, + "learning_rate": 5.548350297062659e-06, + "loss": 0.9554, + "step": 1400 + }, + { + "epoch": 1.037016472330187, + "eval_loss": 1.0255022048950195, + "eval_runtime": 8.6881, + "eval_samples_per_second": 58.931, + "eval_steps_per_second": 14.733, + "step": 1400 + }, + { + "epoch": 1.0407181195632056, + "grad_norm": 1.5104158625012352, + "learning_rate": 5.516213110403009e-06, + "loss": 0.9575, + "step": 1405 + }, + { + "epoch": 1.0444197667962243, + "grad_norm": 1.4890138153731032, + "learning_rate": 5.484054353515896e-06, + "loss": 0.9427, + "step": 1410 + }, + { + "epoch": 1.048121414029243, + "grad_norm": 1.5548599476236042, + "learning_rate": 5.451875370171341e-06, + "loss": 0.9524, + "step": 1415 + }, + { + "epoch": 1.0518230612622617, + "grad_norm": 1.4627948267025965, + "learning_rate": 5.419677504984534e-06, + "loss": 0.9587, + "step": 1420 + }, + { + "epoch": 1.0555247084952804, + "grad_norm": 1.5347877324134909, + "learning_rate": 5.387462103359655e-06, + "loss": 0.9512, + "step": 1425 + }, + { + "epoch": 1.059226355728299, + "grad_norm": 1.5055307314944746, + "learning_rate": 5.3552305114336515e-06, + "loss": 0.9562, + "step": 1430 + }, + { + "epoch": 1.0629280029613177, + "grad_norm": 1.5673533023313868, + "learning_rate": 5.32298407601999e-06, + "loss": 0.9456, + "step": 1435 + }, + { + "epoch": 1.0666296501943364, + "grad_norm": 1.564595024828045, + "learning_rate": 5.290724144552379e-06, + "loss": 0.947, + "step": 1440 + }, + { + "epoch": 1.070331297427355, + "grad_norm": 1.5287981221284181, + "learning_rate": 5.258452065028473e-06, + "loss": 0.9536, + "step": 1445 + }, + { + "epoch": 1.0740329446603738, + "grad_norm": 1.4465299230775315, + "learning_rate": 5.2261691859535325e-06, + "loss": 0.9186, + "step": 1450 + }, + { + "epoch": 1.0777345918933925, + "grad_norm": 1.634117352035441, + "learning_rate": 5.193876856284085e-06, + "loss": 0.9328, + "step": 1455 + }, + { + "epoch": 1.0814362391264112, + "grad_norm": 1.4385431388650558, + "learning_rate": 5.161576425371554e-06, + "loss": 0.9479, + "step": 1460 + }, + { + "epoch": 1.0851378863594299, + "grad_norm": 1.5072383725336713, + "learning_rate": 5.1292692429058824e-06, + "loss": 0.9861, + "step": 1465 + }, + { + "epoch": 1.0888395335924486, + "grad_norm": 1.4625718771019833, + "learning_rate": 5.096956658859122e-06, + "loss": 0.9557, + "step": 1470 + }, + { + "epoch": 1.0925411808254673, + "grad_norm": 1.4758402229370147, + "learning_rate": 5.064640023429042e-06, + "loss": 0.944, + "step": 1475 + }, + { + "epoch": 1.096242828058486, + "grad_norm": 1.51962588520605, + "learning_rate": 5.032320686982697e-06, + "loss": 0.9213, + "step": 1480 + }, + { + "epoch": 1.0999444752915046, + "grad_norm": 1.5103027827969508, + "learning_rate": 5e-06, + "loss": 0.9411, + "step": 1485 + }, + { + "epoch": 1.1036461225245233, + "grad_norm": 1.5224293264101985, + "learning_rate": 4.967679313017304e-06, + "loss": 0.9547, + "step": 1490 + }, + { + "epoch": 1.107347769757542, + "grad_norm": 1.5430895119123675, + "learning_rate": 4.9353599765709585e-06, + "loss": 0.9118, + "step": 1495 + }, + { + "epoch": 1.1110494169905607, + "grad_norm": 1.481637878176428, + "learning_rate": 4.903043341140879e-06, + "loss": 0.9564, + "step": 1500 + }, + { + "epoch": 1.1110494169905607, + "eval_loss": 1.0229578018188477, + "eval_runtime": 8.679, + "eval_samples_per_second": 58.993, + "eval_steps_per_second": 14.748, + "step": 1500 + }, + { + "epoch": 1.1147510642235794, + "grad_norm": 1.5115835881244595, + "learning_rate": 4.870730757094121e-06, + "loss": 0.9338, + "step": 1505 + }, + { + "epoch": 1.118452711456598, + "grad_norm": 1.5449975680141834, + "learning_rate": 4.838423574628447e-06, + "loss": 0.9661, + "step": 1510 + }, + { + "epoch": 1.1221543586896168, + "grad_norm": 1.5861346370310576, + "learning_rate": 4.806123143715916e-06, + "loss": 0.9246, + "step": 1515 + }, + { + "epoch": 1.1258560059226355, + "grad_norm": 1.5449553152899103, + "learning_rate": 4.773830814046469e-06, + "loss": 0.9599, + "step": 1520 + }, + { + "epoch": 1.1295576531556542, + "grad_norm": 1.5015297636472575, + "learning_rate": 4.741547934971528e-06, + "loss": 0.9462, + "step": 1525 + }, + { + "epoch": 1.1332593003886728, + "grad_norm": 1.5234680183519724, + "learning_rate": 4.7092758554476215e-06, + "loss": 0.9413, + "step": 1530 + }, + { + "epoch": 1.1369609476216915, + "grad_norm": 1.4388073264946348, + "learning_rate": 4.677015923980012e-06, + "loss": 0.9286, + "step": 1535 + }, + { + "epoch": 1.1406625948547104, + "grad_norm": 1.5248461875650898, + "learning_rate": 4.644769488566351e-06, + "loss": 0.9435, + "step": 1540 + }, + { + "epoch": 1.1443642420877291, + "grad_norm": 1.5303284099305183, + "learning_rate": 4.6125378966403465e-06, + "loss": 0.9522, + "step": 1545 + }, + { + "epoch": 1.1480658893207478, + "grad_norm": 1.4682502346893527, + "learning_rate": 4.580322495015466e-06, + "loss": 0.9384, + "step": 1550 + }, + { + "epoch": 1.1517675365537665, + "grad_norm": 1.509753203698729, + "learning_rate": 4.548124629828661e-06, + "loss": 0.9573, + "step": 1555 + }, + { + "epoch": 1.1554691837867852, + "grad_norm": 1.4247250737749155, + "learning_rate": 4.515945646484105e-06, + "loss": 0.934, + "step": 1560 + }, + { + "epoch": 1.159170831019804, + "grad_norm": 1.4971184840487013, + "learning_rate": 4.483786889596993e-06, + "loss": 0.9446, + "step": 1565 + }, + { + "epoch": 1.1628724782528226, + "grad_norm": 1.4886733261293026, + "learning_rate": 4.451649702937343e-06, + "loss": 0.9591, + "step": 1570 + }, + { + "epoch": 1.1665741254858413, + "grad_norm": 1.517255771510319, + "learning_rate": 4.4195354293738484e-06, + "loss": 0.9446, + "step": 1575 + }, + { + "epoch": 1.17027577271886, + "grad_norm": 1.5799027115378548, + "learning_rate": 4.387445410817774e-06, + "loss": 0.9598, + "step": 1580 + }, + { + "epoch": 1.1739774199518787, + "grad_norm": 1.5335978469668663, + "learning_rate": 4.355380988166867e-06, + "loss": 0.9526, + "step": 1585 + }, + { + "epoch": 1.1776790671848973, + "grad_norm": 1.4421944530376716, + "learning_rate": 4.323343501249346e-06, + "loss": 0.936, + "step": 1590 + }, + { + "epoch": 1.181380714417916, + "grad_norm": 1.4733228432144814, + "learning_rate": 4.291334288767899e-06, + "loss": 0.9249, + "step": 1595 + }, + { + "epoch": 1.1850823616509347, + "grad_norm": 1.4103746479202812, + "learning_rate": 4.259354688243758e-06, + "loss": 0.9551, + "step": 1600 + }, + { + "epoch": 1.1850823616509347, + "eval_loss": 1.0203170776367188, + "eval_runtime": 8.6729, + "eval_samples_per_second": 59.035, + "eval_steps_per_second": 14.759, + "step": 1600 + }, + { + "epoch": 1.1887840088839534, + "grad_norm": 1.4794301636693654, + "learning_rate": 4.227406035960798e-06, + "loss": 0.928, + "step": 1605 + }, + { + "epoch": 1.192485656116972, + "grad_norm": 1.5324684655198086, + "learning_rate": 4.195489666909714e-06, + "loss": 0.9556, + "step": 1610 + }, + { + "epoch": 1.1961873033499908, + "grad_norm": 1.5550207701455285, + "learning_rate": 4.163606914732224e-06, + "loss": 0.9024, + "step": 1615 + }, + { + "epoch": 1.1998889505830095, + "grad_norm": 1.6078242471595632, + "learning_rate": 4.131759111665349e-06, + "loss": 0.9371, + "step": 1620 + }, + { + "epoch": 1.2035905978160282, + "grad_norm": 1.5130235230070461, + "learning_rate": 4.099947588485744e-06, + "loss": 0.9193, + "step": 1625 + }, + { + "epoch": 1.2072922450490469, + "grad_norm": 1.5880211247444604, + "learning_rate": 4.06817367445409e-06, + "loss": 0.9338, + "step": 1630 + }, + { + "epoch": 1.2109938922820656, + "grad_norm": 1.5601507103852923, + "learning_rate": 4.036438697259551e-06, + "loss": 0.946, + "step": 1635 + }, + { + "epoch": 1.2146955395150842, + "grad_norm": 1.460584421184121, + "learning_rate": 4.004743982964298e-06, + "loss": 0.9468, + "step": 1640 + }, + { + "epoch": 1.218397186748103, + "grad_norm": 1.5301475424382758, + "learning_rate": 3.9730908559480904e-06, + "loss": 0.9304, + "step": 1645 + }, + { + "epoch": 1.2220988339811216, + "grad_norm": 1.4152802250426284, + "learning_rate": 3.941480638852948e-06, + "loss": 0.9288, + "step": 1650 + }, + { + "epoch": 1.2258004812141403, + "grad_norm": 1.4558762027477072, + "learning_rate": 3.909914652527872e-06, + "loss": 0.9532, + "step": 1655 + }, + { + "epoch": 1.229502128447159, + "grad_norm": 1.475608066439469, + "learning_rate": 3.878394215973663e-06, + "loss": 0.9267, + "step": 1660 + }, + { + "epoch": 1.2332037756801777, + "grad_norm": 1.4527083725574788, + "learning_rate": 3.8469206462878e-06, + "loss": 0.9397, + "step": 1665 + }, + { + "epoch": 1.2369054229131964, + "grad_norm": 1.4252791523516992, + "learning_rate": 3.815495258609404e-06, + "loss": 0.9515, + "step": 1670 + }, + { + "epoch": 1.240607070146215, + "grad_norm": 1.458412822434971, + "learning_rate": 3.784119366064293e-06, + "loss": 0.9627, + "step": 1675 + }, + { + "epoch": 1.2443087173792338, + "grad_norm": 1.4371535265538804, + "learning_rate": 3.752794279710094e-06, + "loss": 0.9492, + "step": 1680 + }, + { + "epoch": 1.2480103646122525, + "grad_norm": 1.5956163025607901, + "learning_rate": 3.721521308481483e-06, + "loss": 0.941, + "step": 1685 + }, + { + "epoch": 1.2517120118452711, + "grad_norm": 1.4606215041268646, + "learning_rate": 3.690301759135471e-06, + "loss": 0.9218, + "step": 1690 + }, + { + "epoch": 1.2554136590782898, + "grad_norm": 1.551540798338896, + "learning_rate": 3.6591369361968127e-06, + "loss": 0.9452, + "step": 1695 + }, + { + "epoch": 1.2591153063113085, + "grad_norm": 1.438162871091009, + "learning_rate": 3.6280281419034934e-06, + "loss": 0.9515, + "step": 1700 + }, + { + "epoch": 1.2591153063113085, + "eval_loss": 1.018484354019165, + "eval_runtime": 8.675, + "eval_samples_per_second": 59.02, + "eval_steps_per_second": 14.755, + "step": 1700 + }, + { + "epoch": 1.2628169535443272, + "grad_norm": 1.5280110385731542, + "learning_rate": 3.596976676152306e-06, + "loss": 0.9312, + "step": 1705 + }, + { + "epoch": 1.266518600777346, + "grad_norm": 1.439021134519739, + "learning_rate": 3.5659838364445505e-06, + "loss": 0.9502, + "step": 1710 + }, + { + "epoch": 1.2702202480103646, + "grad_norm": 1.542129828765787, + "learning_rate": 3.535050917831797e-06, + "loss": 0.9245, + "step": 1715 + }, + { + "epoch": 1.2739218952433833, + "grad_norm": 1.4387624494665334, + "learning_rate": 3.504179212861793e-06, + "loss": 0.9646, + "step": 1720 + }, + { + "epoch": 1.277623542476402, + "grad_norm": 1.4576756291542614, + "learning_rate": 3.473370011524435e-06, + "loss": 0.9444, + "step": 1725 + }, + { + "epoch": 1.2813251897094207, + "grad_norm": 1.4116630998926993, + "learning_rate": 3.442624601197877e-06, + "loss": 0.935, + "step": 1730 + }, + { + "epoch": 1.2850268369424394, + "grad_norm": 1.448732573638826, + "learning_rate": 3.4119442665947346e-06, + "loss": 0.9205, + "step": 1735 + }, + { + "epoch": 1.288728484175458, + "grad_norm": 1.4616328188755885, + "learning_rate": 3.3813302897083955e-06, + "loss": 0.9515, + "step": 1740 + }, + { + "epoch": 1.2924301314084767, + "grad_norm": 1.4994123055203914, + "learning_rate": 3.350783949759462e-06, + "loss": 0.9406, + "step": 1745 + }, + { + "epoch": 1.2961317786414954, + "grad_norm": 1.5510537992943896, + "learning_rate": 3.3203065231422904e-06, + "loss": 0.9536, + "step": 1750 + }, + { + "epoch": 1.2998334258745141, + "grad_norm": 1.4979569186605917, + "learning_rate": 3.289899283371657e-06, + "loss": 0.9364, + "step": 1755 + }, + { + "epoch": 1.3035350731075328, + "grad_norm": 1.4342400603002827, + "learning_rate": 3.259563501029548e-06, + "loss": 0.9661, + "step": 1760 + }, + { + "epoch": 1.3072367203405515, + "grad_norm": 1.605308165753949, + "learning_rate": 3.2293004437120622e-06, + "loss": 0.9419, + "step": 1765 + }, + { + "epoch": 1.3109383675735702, + "grad_norm": 1.4535747016942286, + "learning_rate": 3.1991113759764493e-06, + "loss": 0.9399, + "step": 1770 + }, + { + "epoch": 1.3146400148065889, + "grad_norm": 1.5045929424819182, + "learning_rate": 3.1689975592882603e-06, + "loss": 0.9493, + "step": 1775 + }, + { + "epoch": 1.3183416620396076, + "grad_norm": 1.4298316802164306, + "learning_rate": 3.1389602519686515e-06, + "loss": 0.9684, + "step": 1780 + }, + { + "epoch": 1.3220433092726263, + "grad_norm": 1.448065974522092, + "learning_rate": 3.1090007091417884e-06, + "loss": 0.9337, + "step": 1785 + }, + { + "epoch": 1.325744956505645, + "grad_norm": 1.5957876876234358, + "learning_rate": 3.0791201826824117e-06, + "loss": 0.9697, + "step": 1790 + }, + { + "epoch": 1.3294466037386636, + "grad_norm": 1.439183790018425, + "learning_rate": 3.049319921163526e-06, + "loss": 0.9246, + "step": 1795 + }, + { + "epoch": 1.3331482509716823, + "grad_norm": 1.5233611691430629, + "learning_rate": 3.019601169804216e-06, + "loss": 0.9507, + "step": 1800 + }, + { + "epoch": 1.3331482509716823, + "eval_loss": 1.0166221857070923, + "eval_runtime": 8.6954, + "eval_samples_per_second": 58.882, + "eval_steps_per_second": 14.72, + "step": 1800 + }, + { + "epoch": 1.336849898204701, + "grad_norm": 1.535149119238128, + "learning_rate": 2.9899651704176324e-06, + "loss": 0.9488, + "step": 1805 + }, + { + "epoch": 1.3405515454377197, + "grad_norm": 1.8013917126510002, + "learning_rate": 2.9604131613590825e-06, + "loss": 0.9525, + "step": 1810 + }, + { + "epoch": 1.3442531926707386, + "grad_norm": 1.5758419968483395, + "learning_rate": 2.9309463774743047e-06, + "loss": 0.9296, + "step": 1815 + }, + { + "epoch": 1.3479548399037573, + "grad_norm": 1.6434309252154637, + "learning_rate": 2.901566050047855e-06, + "loss": 0.972, + "step": 1820 + }, + { + "epoch": 1.351656487136776, + "grad_norm": 1.4945819293763394, + "learning_rate": 2.8722734067516637e-06, + "loss": 0.9678, + "step": 1825 + }, + { + "epoch": 1.3553581343697947, + "grad_norm": 1.5747674704361272, + "learning_rate": 2.843069671593734e-06, + "loss": 0.9461, + "step": 1830 + }, + { + "epoch": 1.3590597816028134, + "grad_norm": 1.4849849381301548, + "learning_rate": 2.813956064866996e-06, + "loss": 0.9547, + "step": 1835 + }, + { + "epoch": 1.362761428835832, + "grad_norm": 1.4549463612438158, + "learning_rate": 2.784933803098326e-06, + "loss": 0.9395, + "step": 1840 + }, + { + "epoch": 1.3664630760688508, + "grad_norm": 1.465774668348825, + "learning_rate": 2.7560040989976894e-06, + "loss": 0.946, + "step": 1845 + }, + { + "epoch": 1.3701647233018694, + "grad_norm": 1.3769248284299485, + "learning_rate": 2.7271681614074973e-06, + "loss": 0.9447, + "step": 1850 + }, + { + "epoch": 1.3738663705348881, + "grad_norm": 1.486107645046795, + "learning_rate": 2.6984271952520723e-06, + "loss": 0.9292, + "step": 1855 + }, + { + "epoch": 1.3775680177679068, + "grad_norm": 1.4626507230942989, + "learning_rate": 2.6697824014873076e-06, + "loss": 0.9514, + "step": 1860 + }, + { + "epoch": 1.3812696650009255, + "grad_norm": 1.6022770155906472, + "learning_rate": 2.641234977050484e-06, + "loss": 0.9117, + "step": 1865 + }, + { + "epoch": 1.3849713122339442, + "grad_norm": 1.4831861982927594, + "learning_rate": 2.6127861148102552e-06, + "loss": 0.9551, + "step": 1870 + }, + { + "epoch": 1.388672959466963, + "grad_norm": 1.5391905518945717, + "learning_rate": 2.5844370035168077e-06, + "loss": 0.9673, + "step": 1875 + }, + { + "epoch": 1.3923746066999816, + "grad_norm": 1.4232605239839096, + "learning_rate": 2.5561888277521797e-06, + "loss": 0.9494, + "step": 1880 + }, + { + "epoch": 1.3960762539330003, + "grad_norm": 1.4657072720630537, + "learning_rate": 2.528042767880766e-06, + "loss": 0.948, + "step": 1885 + }, + { + "epoch": 1.399777901166019, + "grad_norm": 1.5609370972114847, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.95, + "step": 1890 + }, + { + "epoch": 1.4034795483990377, + "grad_norm": 1.4134009339502485, + "learning_rate": 2.4720616958912054e-06, + "loss": 0.9312, + "step": 1895 + }, + { + "epoch": 1.4071811956320563, + "grad_norm": 1.4265922385575842, + "learning_rate": 2.4442290229706344e-06, + "loss": 0.9208, + "step": 1900 + }, + { + "epoch": 1.4071811956320563, + "eval_loss": 1.0145511627197266, + "eval_runtime": 8.723, + "eval_samples_per_second": 58.696, + "eval_steps_per_second": 14.674, + "step": 1900 + }, + { + "epoch": 1.410882842865075, + "grad_norm": 1.5398189516335716, + "learning_rate": 2.4165031442406857e-06, + "loss": 0.934, + "step": 1905 + }, + { + "epoch": 1.4145844900980937, + "grad_norm": 1.4291248478737046, + "learning_rate": 2.3888852182413087e-06, + "loss": 0.9396, + "step": 1910 + }, + { + "epoch": 1.4182861373311124, + "grad_norm": 1.4682471847679497, + "learning_rate": 2.361376399001592e-06, + "loss": 0.929, + "step": 1915 + }, + { + "epoch": 1.421987784564131, + "grad_norm": 1.83568046169287, + "learning_rate": 2.333977835991545e-06, + "loss": 0.9679, + "step": 1920 + }, + { + "epoch": 1.4256894317971498, + "grad_norm": 1.4338451747010847, + "learning_rate": 2.3066906740740626e-06, + "loss": 0.9503, + "step": 1925 + }, + { + "epoch": 1.4293910790301685, + "grad_norm": 1.5848101818149134, + "learning_rate": 2.2795160534570866e-06, + "loss": 0.9386, + "step": 1930 + }, + { + "epoch": 1.4330927262631872, + "grad_norm": 1.5568704183626663, + "learning_rate": 2.2524551096459703e-06, + "loss": 0.9176, + "step": 1935 + }, + { + "epoch": 1.4367943734962059, + "grad_norm": 1.4342074642791622, + "learning_rate": 2.2255089733960162e-06, + "loss": 0.9268, + "step": 1940 + }, + { + "epoch": 1.4404960207292246, + "grad_norm": 1.5264043520803225, + "learning_rate": 2.1986787706652377e-06, + "loss": 0.9182, + "step": 1945 + }, + { + "epoch": 1.4441976679622432, + "grad_norm": 1.4394261424854324, + "learning_rate": 2.171965622567308e-06, + "loss": 0.9577, + "step": 1950 + }, + { + "epoch": 1.447899315195262, + "grad_norm": 1.6049354461149483, + "learning_rate": 2.1453706453247088e-06, + "loss": 0.9287, + "step": 1955 + }, + { + "epoch": 1.4516009624282806, + "grad_norm": 1.6106070007653557, + "learning_rate": 2.1188949502220987e-06, + "loss": 0.9276, + "step": 1960 + }, + { + "epoch": 1.4553026096612993, + "grad_norm": 1.4890495009473046, + "learning_rate": 2.0925396435598665e-06, + "loss": 0.9531, + "step": 1965 + }, + { + "epoch": 1.459004256894318, + "grad_norm": 1.4462853644582758, + "learning_rate": 2.066305826607911e-06, + "loss": 0.9197, + "step": 1970 + }, + { + "epoch": 1.4627059041273367, + "grad_norm": 1.4880943901640924, + "learning_rate": 2.0401945955596206e-06, + "loss": 0.9592, + "step": 1975 + }, + { + "epoch": 1.4664075513603554, + "grad_norm": 1.5124073104411913, + "learning_rate": 2.0142070414860704e-06, + "loss": 0.9504, + "step": 1980 + }, + { + "epoch": 1.470109198593374, + "grad_norm": 1.5207220675714455, + "learning_rate": 1.9883442502904284e-06, + "loss": 0.9408, + "step": 1985 + }, + { + "epoch": 1.4738108458263928, + "grad_norm": 1.4024329836356682, + "learning_rate": 1.962607302662582e-06, + "loss": 0.9414, + "step": 1990 + }, + { + "epoch": 1.4775124930594115, + "grad_norm": 1.598071795929375, + "learning_rate": 1.936997274033986e-06, + "loss": 0.9547, + "step": 1995 + }, + { + "epoch": 1.4812141402924301, + "grad_norm": 1.534264128372943, + "learning_rate": 1.9115152345327154e-06, + "loss": 0.9442, + "step": 2000 + }, + { + "epoch": 1.4812141402924301, + "eval_loss": 1.0129976272583008, + "eval_runtime": 8.7217, + "eval_samples_per_second": 58.704, + "eval_steps_per_second": 14.676, + "step": 2000 + }, + { + "epoch": 1.4849157875254488, + "grad_norm": 1.4981898335706558, + "learning_rate": 1.8861622489387555e-06, + "loss": 0.9297, + "step": 2005 + }, + { + "epoch": 1.4886174347584675, + "grad_norm": 1.4632628358653164, + "learning_rate": 1.8609393766395083e-06, + "loss": 0.9607, + "step": 2010 + }, + { + "epoch": 1.4923190819914862, + "grad_norm": 1.4831829971571602, + "learning_rate": 1.8358476715855262e-06, + "loss": 0.9172, + "step": 2015 + }, + { + "epoch": 1.496020729224505, + "grad_norm": 1.4304163837532902, + "learning_rate": 1.8108881822464697e-06, + "loss": 0.9579, + "step": 2020 + }, + { + "epoch": 1.4997223764575236, + "grad_norm": 1.4641568790604755, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.9592, + "step": 2025 + }, + { + "epoch": 1.5034240236905423, + "grad_norm": 1.4108335100882108, + "learning_rate": 1.7613700169247055e-06, + "loss": 0.9559, + "step": 2030 + }, + { + "epoch": 1.507125670923561, + "grad_norm": 1.3960221127799861, + "learning_rate": 1.7368134100837286e-06, + "loss": 0.9445, + "step": 2035 + }, + { + "epoch": 1.5108273181565797, + "grad_norm": 1.629241663992989, + "learning_rate": 1.7123931571546826e-06, + "loss": 0.9475, + "step": 2040 + }, + { + "epoch": 1.5145289653895984, + "grad_norm": 1.4769846296544475, + "learning_rate": 1.6881102785502618e-06, + "loss": 0.9241, + "step": 2045 + }, + { + "epoch": 1.518230612622617, + "grad_norm": 1.4450985025765188, + "learning_rate": 1.6639657889429017e-06, + "loss": 0.9312, + "step": 2050 + }, + { + "epoch": 1.5219322598556357, + "grad_norm": 1.5147785679068462, + "learning_rate": 1.639960697222388e-06, + "loss": 0.9539, + "step": 2055 + }, + { + "epoch": 1.5256339070886544, + "grad_norm": 1.4133261880328718, + "learning_rate": 1.6160960064536907e-06, + "loss": 0.9289, + "step": 2060 + }, + { + "epoch": 1.5293355543216731, + "grad_norm": 1.4899070137565427, + "learning_rate": 1.5923727138350548e-06, + "loss": 0.9452, + "step": 2065 + }, + { + "epoch": 1.5330372015546918, + "grad_norm": 1.604006362692692, + "learning_rate": 1.5687918106563326e-06, + "loss": 0.9676, + "step": 2070 + }, + { + "epoch": 1.5367388487877105, + "grad_norm": 1.573211074541449, + "learning_rate": 1.5453542822575624e-06, + "loss": 0.9245, + "step": 2075 + }, + { + "epoch": 1.5404404960207292, + "grad_norm": 1.4149807137951171, + "learning_rate": 1.52206110798779e-06, + "loss": 0.9213, + "step": 2080 + }, + { + "epoch": 1.5441421432537479, + "grad_norm": 1.6082035265990422, + "learning_rate": 1.4989132611641576e-06, + "loss": 0.9516, + "step": 2085 + }, + { + "epoch": 1.5478437904867666, + "grad_norm": 1.4214335025262166, + "learning_rate": 1.4759117090312197e-06, + "loss": 0.9676, + "step": 2090 + }, + { + "epoch": 1.5515454377197853, + "grad_norm": 1.4659084277987675, + "learning_rate": 1.453057412720536e-06, + "loss": 0.9635, + "step": 2095 + }, + { + "epoch": 1.555247084952804, + "grad_norm": 1.406907426871313, + "learning_rate": 1.4303513272105057e-06, + "loss": 0.9307, + "step": 2100 + }, + { + "epoch": 1.555247084952804, + "eval_loss": 1.0118414163589478, + "eval_runtime": 8.6982, + "eval_samples_per_second": 58.863, + "eval_steps_per_second": 14.716, + "step": 2100 + }, + { + "epoch": 1.5589487321858226, + "grad_norm": 1.5213179697546635, + "learning_rate": 1.4077944012864636e-06, + "loss": 0.925, + "step": 2105 + }, + { + "epoch": 1.5626503794188413, + "grad_norm": 1.4785816893254842, + "learning_rate": 1.3853875775010355e-06, + "loss": 0.9335, + "step": 2110 + }, + { + "epoch": 1.56635202665186, + "grad_norm": 1.6319160798858328, + "learning_rate": 1.3631317921347564e-06, + "loss": 0.9659, + "step": 2115 + }, + { + "epoch": 1.5700536738848787, + "grad_norm": 1.4378863109702957, + "learning_rate": 1.3410279751569399e-06, + "loss": 0.9543, + "step": 2120 + }, + { + "epoch": 1.5737553211178974, + "grad_norm": 1.5220881879950408, + "learning_rate": 1.3190770501868243e-06, + "loss": 0.9489, + "step": 2125 + }, + { + "epoch": 1.577456968350916, + "grad_norm": 1.5579237503089478, + "learning_rate": 1.297279934454978e-06, + "loss": 0.9388, + "step": 2130 + }, + { + "epoch": 1.5811586155839348, + "grad_norm": 1.4591792161712915, + "learning_rate": 1.2756375387649717e-06, + "loss": 0.9192, + "step": 2135 + }, + { + "epoch": 1.5848602628169535, + "grad_norm": 1.462271376262188, + "learning_rate": 1.25415076745532e-06, + "loss": 0.9513, + "step": 2140 + }, + { + "epoch": 1.5885619100499722, + "grad_norm": 1.4585813420478615, + "learning_rate": 1.2328205183616964e-06, + "loss": 0.9305, + "step": 2145 + }, + { + "epoch": 1.5922635572829908, + "grad_norm": 1.3996273344722394, + "learning_rate": 1.2116476827794104e-06, + "loss": 0.946, + "step": 2150 + }, + { + "epoch": 1.5959652045160095, + "grad_norm": 1.4264637319705211, + "learning_rate": 1.1906331454261704e-06, + "loss": 0.9572, + "step": 2155 + }, + { + "epoch": 1.5996668517490282, + "grad_norm": 1.4973661367660114, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.9275, + "step": 2160 + }, + { + "epoch": 1.603368498982047, + "grad_norm": 1.4060812708874741, + "learning_rate": 1.1490824711681026e-06, + "loss": 0.9268, + "step": 2165 + }, + { + "epoch": 1.6070701462150656, + "grad_norm": 1.491081236302123, + "learning_rate": 1.1285480704793378e-06, + "loss": 0.9279, + "step": 2170 + }, + { + "epoch": 1.6107717934480843, + "grad_norm": 1.5268525956740233, + "learning_rate": 1.1081754403792e-06, + "loss": 0.9523, + "step": 2175 + }, + { + "epoch": 1.614473440681103, + "grad_norm": 1.4071150743669196, + "learning_rate": 1.0879654321484012e-06, + "loss": 0.9361, + "step": 2180 + }, + { + "epoch": 1.6181750879141217, + "grad_norm": 1.438776125548561, + "learning_rate": 1.067918890272419e-06, + "loss": 0.9657, + "step": 2185 + }, + { + "epoch": 1.6218767351471404, + "grad_norm": 1.474475325920584, + "learning_rate": 1.0480366524062041e-06, + "loss": 0.8989, + "step": 2190 + }, + { + "epoch": 1.625578382380159, + "grad_norm": 1.5065652638225282, + "learning_rate": 1.0283195493391823e-06, + "loss": 0.9269, + "step": 2195 + }, + { + "epoch": 1.6292800296131777, + "grad_norm": 1.380340972623135, + "learning_rate": 1.008768404960535e-06, + "loss": 0.9382, + "step": 2200 + }, + { + "epoch": 1.6292800296131777, + "eval_loss": 1.0108721256256104, + "eval_runtime": 8.6862, + "eval_samples_per_second": 58.944, + "eval_steps_per_second": 14.736, + "step": 2200 + }, + { + "epoch": 1.6329816768461964, + "grad_norm": 1.508384357350366, + "learning_rate": 9.893840362247809e-07, + "loss": 0.9536, + "step": 2205 + }, + { + "epoch": 1.6366833240792151, + "grad_norm": 1.4302336284783967, + "learning_rate": 9.701672531176287e-07, + "loss": 0.9472, + "step": 2210 + }, + { + "epoch": 1.6403849713122338, + "grad_norm": 1.5021914595466455, + "learning_rate": 9.511188586221376e-07, + "loss": 0.9528, + "step": 2215 + }, + { + "epoch": 1.6440866185452525, + "grad_norm": 1.4896957386140668, + "learning_rate": 9.322396486851626e-07, + "loss": 0.9191, + "step": 2220 + }, + { + "epoch": 1.6477882657782712, + "grad_norm": 1.410027647026084, + "learning_rate": 9.135304121840976e-07, + "loss": 0.9528, + "step": 2225 + }, + { + "epoch": 1.65148991301129, + "grad_norm": 1.4647072795828986, + "learning_rate": 8.949919308939081e-07, + "loss": 0.9406, + "step": 2230 + }, + { + "epoch": 1.6551915602443086, + "grad_norm": 1.4421036811256112, + "learning_rate": 8.766249794544662e-07, + "loss": 0.9417, + "step": 2235 + }, + { + "epoch": 1.6588932074773273, + "grad_norm": 1.5154978969386064, + "learning_rate": 8.584303253381848e-07, + "loss": 0.9561, + "step": 2240 + }, + { + "epoch": 1.662594854710346, + "grad_norm": 1.4562633895782477, + "learning_rate": 8.404087288179425e-07, + "loss": 0.9466, + "step": 2245 + }, + { + "epoch": 1.6662965019433646, + "grad_norm": 1.3887325189281265, + "learning_rate": 8.225609429353187e-07, + "loss": 0.93, + "step": 2250 + }, + { + "epoch": 1.6699981491763833, + "grad_norm": 1.4325023361891722, + "learning_rate": 8.048877134691269e-07, + "loss": 0.9187, + "step": 2255 + }, + { + "epoch": 1.673699796409402, + "grad_norm": 1.42483840932513, + "learning_rate": 7.873897789042523e-07, + "loss": 0.9397, + "step": 2260 + }, + { + "epoch": 1.6774014436424207, + "grad_norm": 1.436754914963747, + "learning_rate": 7.700678704007947e-07, + "loss": 0.9232, + "step": 2265 + }, + { + "epoch": 1.6811030908754394, + "grad_norm": 1.4213278153304933, + "learning_rate": 7.529227117635135e-07, + "loss": 0.9438, + "step": 2270 + }, + { + "epoch": 1.684804738108458, + "grad_norm": 1.4376904196493825, + "learning_rate": 7.35955019411585e-07, + "loss": 0.9474, + "step": 2275 + }, + { + "epoch": 1.688506385341477, + "grad_norm": 1.440330895903481, + "learning_rate": 7.191655023486682e-07, + "loss": 0.9309, + "step": 2280 + }, + { + "epoch": 1.6922080325744957, + "grad_norm": 1.4504946046355287, + "learning_rate": 7.02554862133275e-07, + "loss": 0.9232, + "step": 2285 + }, + { + "epoch": 1.6959096798075144, + "grad_norm": 1.406976117754757, + "learning_rate": 6.86123792849458e-07, + "loss": 0.9404, + "step": 2290 + }, + { + "epoch": 1.699611327040533, + "grad_norm": 1.3998459625736135, + "learning_rate": 6.698729810778065e-07, + "loss": 0.9004, + "step": 2295 + }, + { + "epoch": 1.7033129742735518, + "grad_norm": 1.4323508272233076, + "learning_rate": 6.53803105866761e-07, + "loss": 0.9224, + "step": 2300 + }, + { + "epoch": 1.7033129742735518, + "eval_loss": 1.0102295875549316, + "eval_runtime": 8.7111, + "eval_samples_per_second": 58.775, + "eval_steps_per_second": 14.694, + "step": 2300 + }, + { + "epoch": 1.7070146215065705, + "grad_norm": 1.4397884549696236, + "learning_rate": 6.379148387042317e-07, + "loss": 0.9172, + "step": 2305 + }, + { + "epoch": 1.7107162687395892, + "grad_norm": 1.3919854340288227, + "learning_rate": 6.222088434895462e-07, + "loss": 0.9426, + "step": 2310 + }, + { + "epoch": 1.7144179159726078, + "grad_norm": 1.5011837528149663, + "learning_rate": 6.066857765057055e-07, + "loss": 0.91, + "step": 2315 + }, + { + "epoch": 1.7181195632056265, + "grad_norm": 1.4347878112668406, + "learning_rate": 5.9134628639196e-07, + "loss": 0.9355, + "step": 2320 + }, + { + "epoch": 1.7218212104386452, + "grad_norm": 1.3752936187697653, + "learning_rate": 5.76191014116711e-07, + "loss": 0.9344, + "step": 2325 + }, + { + "epoch": 1.725522857671664, + "grad_norm": 1.6317281996837152, + "learning_rate": 5.612205929507209e-07, + "loss": 0.9579, + "step": 2330 + }, + { + "epoch": 1.7292245049046826, + "grad_norm": 1.410242165503811, + "learning_rate": 5.464356484406535e-07, + "loss": 0.9463, + "step": 2335 + }, + { + "epoch": 1.7329261521377013, + "grad_norm": 1.6054001196121293, + "learning_rate": 5.318367983829393e-07, + "loss": 0.9412, + "step": 2340 + }, + { + "epoch": 1.73662779937072, + "grad_norm": 1.5108735566791855, + "learning_rate": 5.174246527979532e-07, + "loss": 0.9336, + "step": 2345 + }, + { + "epoch": 1.7403294466037387, + "grad_norm": 1.3879969337954057, + "learning_rate": 5.031998139045352e-07, + "loss": 0.9711, + "step": 2350 + }, + { + "epoch": 1.7440310938367574, + "grad_norm": 1.4140765269828401, + "learning_rate": 4.891628760948114e-07, + "loss": 0.9361, + "step": 2355 + }, + { + "epoch": 1.747732741069776, + "grad_norm": 1.380062448755359, + "learning_rate": 4.753144259093734e-07, + "loss": 0.9169, + "step": 2360 + }, + { + "epoch": 1.7514343883027947, + "grad_norm": 1.4140375581071665, + "learning_rate": 4.6165504201275635e-07, + "loss": 0.9439, + "step": 2365 + }, + { + "epoch": 1.7551360355358134, + "grad_norm": 1.4112810465379273, + "learning_rate": 4.481852951692672e-07, + "loss": 0.9548, + "step": 2370 + }, + { + "epoch": 1.7588376827688321, + "grad_norm": 1.4942125652112794, + "learning_rate": 4.349057482191299e-07, + "loss": 0.9697, + "step": 2375 + }, + { + "epoch": 1.7625393300018508, + "grad_norm": 1.4925795445437877, + "learning_rate": 4.2181695605497066e-07, + "loss": 0.9428, + "step": 2380 + }, + { + "epoch": 1.7662409772348695, + "grad_norm": 1.42841396976241, + "learning_rate": 4.089194655986306e-07, + "loss": 0.961, + "step": 2385 + }, + { + "epoch": 1.7699426244678882, + "grad_norm": 1.4682730700276878, + "learning_rate": 3.9621381577830855e-07, + "loss": 0.9654, + "step": 2390 + }, + { + "epoch": 1.7736442717009069, + "grad_norm": 1.4896297456091034, + "learning_rate": 3.837005375060482e-07, + "loss": 0.9333, + "step": 2395 + }, + { + "epoch": 1.7773459189339256, + "grad_norm": 1.4361351794217565, + "learning_rate": 3.7138015365554834e-07, + "loss": 0.9107, + "step": 2400 + }, + { + "epoch": 1.7773459189339256, + "eval_loss": 1.0095593929290771, + "eval_runtime": 8.7265, + "eval_samples_per_second": 58.672, + "eval_steps_per_second": 14.668, + "step": 2400 + }, + { + "epoch": 1.7810475661669443, + "grad_norm": 1.391141743720589, + "learning_rate": 3.592531790403159e-07, + "loss": 0.9338, + "step": 2405 + }, + { + "epoch": 1.784749213399963, + "grad_norm": 1.4629443997006013, + "learning_rate": 3.473201203921578e-07, + "loss": 0.9281, + "step": 2410 + }, + { + "epoch": 1.7884508606329816, + "grad_norm": 1.3944795908599104, + "learning_rate": 3.355814763399973e-07, + "loss": 0.9217, + "step": 2415 + }, + { + "epoch": 1.7921525078660003, + "grad_norm": 1.5108952168010161, + "learning_rate": 3.2403773738905185e-07, + "loss": 0.9312, + "step": 2420 + }, + { + "epoch": 1.795854155099019, + "grad_norm": 1.4747389522523875, + "learning_rate": 3.1268938590032495e-07, + "loss": 0.9405, + "step": 2425 + }, + { + "epoch": 1.7995558023320377, + "grad_norm": 1.4670095692146419, + "learning_rate": 3.015368960704584e-07, + "loss": 0.9341, + "step": 2430 + }, + { + "epoch": 1.8032574495650564, + "grad_norm": 1.420938494793171, + "learning_rate": 2.905807339119138e-07, + "loss": 0.9133, + "step": 2435 + }, + { + "epoch": 1.806959096798075, + "grad_norm": 1.485589398600447, + "learning_rate": 2.798213572335001e-07, + "loss": 0.9477, + "step": 2440 + }, + { + "epoch": 1.8106607440310938, + "grad_norm": 1.398836941771861, + "learning_rate": 2.6925921562124867e-07, + "loss": 0.9559, + "step": 2445 + }, + { + "epoch": 1.8143623912641127, + "grad_norm": 1.460425549307081, + "learning_rate": 2.5889475041961767e-07, + "loss": 0.912, + "step": 2450 + }, + { + "epoch": 1.8180640384971314, + "grad_norm": 1.374269506100965, + "learning_rate": 2.487283947130609e-07, + "loss": 0.9398, + "step": 2455 + }, + { + "epoch": 1.82176568573015, + "grad_norm": 1.4896413268024193, + "learning_rate": 2.3876057330792344e-07, + "loss": 0.941, + "step": 2460 + }, + { + "epoch": 1.8254673329631688, + "grad_norm": 1.4690641033644323, + "learning_rate": 2.289917027146943e-07, + "loss": 0.9454, + "step": 2465 + }, + { + "epoch": 1.8291689801961875, + "grad_norm": 1.477863619064755, + "learning_rate": 2.1942219113060215e-07, + "loss": 0.9168, + "step": 2470 + }, + { + "epoch": 1.8328706274292061, + "grad_norm": 1.470521817548036, + "learning_rate": 2.1005243842255552e-07, + "loss": 0.9643, + "step": 2475 + }, + { + "epoch": 1.8365722746622248, + "grad_norm": 1.4220441215199895, + "learning_rate": 2.0088283611044034e-07, + "loss": 0.9661, + "step": 2480 + }, + { + "epoch": 1.8402739218952435, + "grad_norm": 1.425688228229182, + "learning_rate": 1.919137673507543e-07, + "loss": 0.9178, + "step": 2485 + }, + { + "epoch": 1.8439755691282622, + "grad_norm": 1.4518435265086809, + "learning_rate": 1.8314560692059836e-07, + "loss": 0.955, + "step": 2490 + }, + { + "epoch": 1.847677216361281, + "grad_norm": 1.3744149843020026, + "learning_rate": 1.745787212020178e-07, + "loss": 0.9461, + "step": 2495 + }, + { + "epoch": 1.8513788635942996, + "grad_norm": 1.3888947471272055, + "learning_rate": 1.6621346816668993e-07, + "loss": 0.9126, + "step": 2500 + }, + { + "epoch": 1.8513788635942996, + "eval_loss": 1.009551763534546, + "eval_runtime": 8.7021, + "eval_samples_per_second": 58.836, + "eval_steps_per_second": 14.709, + "step": 2500 + }, + { + "epoch": 1.8550805108273183, + "grad_norm": 1.4437352870170306, + "learning_rate": 1.5805019736097105e-07, + "loss": 0.964, + "step": 2505 + }, + { + "epoch": 1.858782158060337, + "grad_norm": 1.489377218702996, + "learning_rate": 1.500892498912826e-07, + "loss": 0.953, + "step": 2510 + }, + { + "epoch": 1.8624838052933557, + "grad_norm": 1.4083382533340174, + "learning_rate": 1.4233095840986756e-07, + "loss": 0.9395, + "step": 2515 + }, + { + "epoch": 1.8661854525263744, + "grad_norm": 1.534772930422051, + "learning_rate": 1.3477564710088097e-07, + "loss": 0.9518, + "step": 2520 + }, + { + "epoch": 1.869887099759393, + "grad_norm": 1.4635427384257957, + "learning_rate": 1.2742363166685035e-07, + "loss": 0.9302, + "step": 2525 + }, + { + "epoch": 1.8735887469924117, + "grad_norm": 1.4435798379334432, + "learning_rate": 1.2027521931548214e-07, + "loss": 0.9367, + "step": 2530 + }, + { + "epoch": 1.8772903942254304, + "grad_norm": 1.5934946790816538, + "learning_rate": 1.1333070874682217e-07, + "loss": 0.9569, + "step": 2535 + }, + { + "epoch": 1.8809920414584491, + "grad_norm": 1.5279098622590994, + "learning_rate": 1.0659039014077943e-07, + "loss": 0.946, + "step": 2540 + }, + { + "epoch": 1.8846936886914678, + "grad_norm": 1.4305237726798707, + "learning_rate": 1.0005454514499413e-07, + "loss": 0.9393, + "step": 2545 + }, + { + "epoch": 1.8883953359244865, + "grad_norm": 1.3607031449762257, + "learning_rate": 9.372344686307655e-08, + "loss": 0.9208, + "step": 2550 + }, + { + "epoch": 1.8920969831575052, + "grad_norm": 1.4516808837913941, + "learning_rate": 8.759735984318896e-08, + "loss": 0.935, + "step": 2555 + }, + { + "epoch": 1.8957986303905239, + "grad_norm": 1.4761229374330997, + "learning_rate": 8.167654006699444e-08, + "loss": 0.9256, + "step": 2560 + }, + { + "epoch": 1.8995002776235426, + "grad_norm": 1.3922043238331803, + "learning_rate": 7.59612349389599e-08, + "loss": 0.9267, + "step": 2565 + }, + { + "epoch": 1.9032019248565613, + "grad_norm": 1.4640748538799848, + "learning_rate": 7.04516832760177e-08, + "loss": 0.9353, + "step": 2570 + }, + { + "epoch": 1.90690357208958, + "grad_norm": 1.4570342845008322, + "learning_rate": 6.514811529758747e-08, + "loss": 0.9688, + "step": 2575 + }, + { + "epoch": 1.9106052193225986, + "grad_norm": 1.5068310410001753, + "learning_rate": 6.005075261595495e-08, + "loss": 0.9386, + "step": 2580 + }, + { + "epoch": 1.9143068665556173, + "grad_norm": 1.6357739595126155, + "learning_rate": 5.515980822701439e-08, + "loss": 0.9142, + "step": 2585 + }, + { + "epoch": 1.918008513788636, + "grad_norm": 1.4936931467730468, + "learning_rate": 5.047548650136513e-08, + "loss": 0.9457, + "step": 2590 + }, + { + "epoch": 1.9217101610216547, + "grad_norm": 1.4313994134689743, + "learning_rate": 4.599798317577342e-08, + "loss": 0.9445, + "step": 2595 + }, + { + "epoch": 1.9254118082546734, + "grad_norm": 1.3922872350713908, + "learning_rate": 4.172748534499449e-08, + "loss": 0.9436, + "step": 2600 + }, + { + "epoch": 1.9254118082546734, + "eval_loss": 1.0094008445739746, + "eval_runtime": 8.7208, + "eval_samples_per_second": 58.71, + "eval_steps_per_second": 14.678, + "step": 2600 + }, + { + "epoch": 1.929113455487692, + "grad_norm": 1.4644306273860928, + "learning_rate": 3.766417145395218e-08, + "loss": 0.8973, + "step": 2605 + }, + { + "epoch": 1.9328151027207108, + "grad_norm": 1.4664829295420516, + "learning_rate": 3.3808211290284886e-08, + "loss": 0.9474, + "step": 2610 + }, + { + "epoch": 1.9365167499537295, + "grad_norm": 1.6215118114126437, + "learning_rate": 3.015976597725068e-08, + "loss": 0.9375, + "step": 2615 + }, + { + "epoch": 1.9402183971867482, + "grad_norm": 1.5227133782361597, + "learning_rate": 2.6718987966992683e-08, + "loss": 0.9311, + "step": 2620 + }, + { + "epoch": 1.9439200444197668, + "grad_norm": 1.4393447693954848, + "learning_rate": 2.3486021034170857e-08, + "loss": 0.9631, + "step": 2625 + }, + { + "epoch": 1.9476216916527855, + "grad_norm": 1.5109187749460615, + "learning_rate": 2.0461000269953457e-08, + "loss": 0.9268, + "step": 2630 + }, + { + "epoch": 1.9513233388858042, + "grad_norm": 1.4311275838119613, + "learning_rate": 1.7644052076371544e-08, + "loss": 0.9497, + "step": 2635 + }, + { + "epoch": 1.955024986118823, + "grad_norm": 1.5072494573524384, + "learning_rate": 1.5035294161039882e-08, + "loss": 0.9454, + "step": 2640 + }, + { + "epoch": 1.9587266333518416, + "grad_norm": 1.377355135058928, + "learning_rate": 1.2634835532233658e-08, + "loss": 0.9502, + "step": 2645 + }, + { + "epoch": 1.9624282805848603, + "grad_norm": 1.4159739187815894, + "learning_rate": 1.044277649433989e-08, + "loss": 0.9266, + "step": 2650 + }, + { + "epoch": 1.966129927817879, + "grad_norm": 1.4394933242463066, + "learning_rate": 8.459208643659122e-09, + "loss": 0.9562, + "step": 2655 + }, + { + "epoch": 1.9698315750508977, + "grad_norm": 1.5178638738982297, + "learning_rate": 6.6842148645840374e-09, + "loss": 0.9454, + "step": 2660 + }, + { + "epoch": 1.9735332222839164, + "grad_norm": 1.4200491669164597, + "learning_rate": 5.11786932613223e-09, + "loss": 0.9401, + "step": 2665 + }, + { + "epoch": 1.977234869516935, + "grad_norm": 1.4450080285866973, + "learning_rate": 3.760237478849793e-09, + "loss": 0.941, + "step": 2670 + }, + { + "epoch": 1.9809365167499537, + "grad_norm": 1.4766549402543119, + "learning_rate": 2.611376052073511e-09, + "loss": 0.9267, + "step": 2675 + }, + { + "epoch": 1.9846381639829724, + "grad_norm": 1.4163540389673057, + "learning_rate": 1.6713330515627512e-09, + "loss": 0.9383, + "step": 2680 + }, + { + "epoch": 1.9883398112159911, + "grad_norm": 1.6265977059825938, + "learning_rate": 9.401477574932927e-10, + "loss": 0.9718, + "step": 2685 + }, + { + "epoch": 1.9920414584490098, + "grad_norm": 1.4536437823952244, + "learning_rate": 4.178507228136397e-10, + "loss": 0.9253, + "step": 2690 + }, + { + "epoch": 1.9957431056820285, + "grad_norm": 1.4668631253091544, + "learning_rate": 1.0446377197104174e-10, + "loss": 0.9352, + "step": 2695 + }, + { + "epoch": 1.9994447529150472, + "grad_norm": 1.4405225324032018, + "learning_rate": 0.0, + "loss": 0.9419, + "step": 2700 + }, + { + "epoch": 1.9994447529150472, + "eval_loss": 1.0093477964401245, + "eval_runtime": 8.6956, + "eval_samples_per_second": 58.88, + "eval_steps_per_second": 14.72, + "step": 2700 + }, + { + "epoch": 1.9994447529150472, + "step": 2700, + "total_flos": 75972590174208.0, + "train_loss": 1.0003288196634363, + "train_runtime": 4442.862, + "train_samples_per_second": 38.911, + "train_steps_per_second": 0.608 + } + ], + "logging_steps": 5, + "max_steps": 2700, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 75972590174208.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}