{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 3750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 3.0283896923065186, "learning_rate": 5.333333333333335e-07, "loss": 1.5494, "step": 10 }, { "epoch": 0.08, "grad_norm": 2.980957508087158, "learning_rate": 1.066666666666667e-06, "loss": 1.5781, "step": 20 }, { "epoch": 0.12, "grad_norm": 1.3325070142745972, "learning_rate": 1.6000000000000001e-06, "loss": 1.271, "step": 30 }, { "epoch": 0.16, "grad_norm": 1.1161530017852783, "learning_rate": 2.133333333333334e-06, "loss": 1.1263, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.9847109317779541, "learning_rate": 2.666666666666667e-06, "loss": 1.4759, "step": 50 }, { "epoch": 0.24, "grad_norm": 1.3176578283309937, "learning_rate": 3.2000000000000003e-06, "loss": 1.1906, "step": 60 }, { "epoch": 0.28, "grad_norm": 2.9614243507385254, "learning_rate": 3.7333333333333337e-06, "loss": 1.3136, "step": 70 }, { "epoch": 0.32, "grad_norm": 1.0635404586791992, "learning_rate": 4.266666666666668e-06, "loss": 1.159, "step": 80 }, { "epoch": 0.36, "grad_norm": 1.4873822927474976, "learning_rate": 4.800000000000001e-06, "loss": 1.0645, "step": 90 }, { "epoch": 0.4, "grad_norm": 2.666663646697998, "learning_rate": 5.333333333333334e-06, "loss": 1.3174, "step": 100 }, { "epoch": 0.44, "grad_norm": 0.9259383678436279, "learning_rate": 5.8666666666666675e-06, "loss": 1.0041, "step": 110 }, { "epoch": 0.48, "grad_norm": 1.422500729560852, "learning_rate": 6.4000000000000006e-06, "loss": 1.1208, "step": 120 }, { "epoch": 0.52, "grad_norm": 1.513822317123413, "learning_rate": 6.9333333333333344e-06, "loss": 0.9806, "step": 130 }, { "epoch": 0.56, "grad_norm": 3.26381254196167, "learning_rate": 7.4666666666666675e-06, "loss": 0.9904, "step": 140 }, { "epoch": 0.6, "grad_norm": 2.899075984954834, "learning_rate": 8.000000000000001e-06, "loss": 0.9365, "step": 150 }, { "epoch": 0.64, "grad_norm": 0.7061178088188171, "learning_rate": 8.533333333333335e-06, "loss": 0.8843, "step": 160 }, { "epoch": 0.68, "grad_norm": 1.0236766338348389, "learning_rate": 9.066666666666667e-06, "loss": 0.9131, "step": 170 }, { "epoch": 0.72, "grad_norm": 1.3964245319366455, "learning_rate": 9.600000000000001e-06, "loss": 0.9142, "step": 180 }, { "epoch": 0.76, "grad_norm": 1.7908815145492554, "learning_rate": 1.0133333333333335e-05, "loss": 0.8973, "step": 190 }, { "epoch": 0.8, "grad_norm": 0.9264830946922302, "learning_rate": 1.0666666666666667e-05, "loss": 0.8407, "step": 200 }, { "epoch": 0.84, "grad_norm": 2.5779850482940674, "learning_rate": 1.1200000000000001e-05, "loss": 0.8228, "step": 210 }, { "epoch": 0.88, "grad_norm": 1.6980341672897339, "learning_rate": 1.1733333333333335e-05, "loss": 0.799, "step": 220 }, { "epoch": 0.92, "grad_norm": 1.9194531440734863, "learning_rate": 1.2266666666666667e-05, "loss": 0.7759, "step": 230 }, { "epoch": 0.96, "grad_norm": 2.0350003242492676, "learning_rate": 1.2800000000000001e-05, "loss": 0.7466, "step": 240 }, { "epoch": 1.0, "grad_norm": 3.474932909011841, "learning_rate": 1.3333333333333333e-05, "loss": 0.7317, "step": 250 }, { "epoch": 1.04, "grad_norm": 1.7944034337997437, "learning_rate": 1.3866666666666669e-05, "loss": 0.7046, "step": 260 }, { "epoch": 1.08, "grad_norm": 2.945058584213257, "learning_rate": 1.4400000000000001e-05, "loss": 0.6625, "step": 270 }, { "epoch": 1.12, "grad_norm": 1.820989966392517, "learning_rate": 1.4933333333333335e-05, "loss": 0.7307, "step": 280 }, { "epoch": 1.16, "grad_norm": 2.9544613361358643, "learning_rate": 1.546666666666667e-05, "loss": 0.7996, "step": 290 }, { "epoch": 1.2, "grad_norm": 0.9499707221984863, "learning_rate": 1.6000000000000003e-05, "loss": 0.706, "step": 300 }, { "epoch": 1.24, "grad_norm": 5.53220796585083, "learning_rate": 1.6533333333333333e-05, "loss": 0.6941, "step": 310 }, { "epoch": 1.28, "grad_norm": 2.7142622470855713, "learning_rate": 1.706666666666667e-05, "loss": 0.746, "step": 320 }, { "epoch": 1.32, "grad_norm": 4.010003089904785, "learning_rate": 1.76e-05, "loss": 0.7262, "step": 330 }, { "epoch": 1.3599999999999999, "grad_norm": 2.3094098567962646, "learning_rate": 1.8133333333333335e-05, "loss": 0.6552, "step": 340 }, { "epoch": 1.4, "grad_norm": 5.371938228607178, "learning_rate": 1.866666666666667e-05, "loss": 0.677, "step": 350 }, { "epoch": 1.44, "grad_norm": 1.662387728691101, "learning_rate": 1.9200000000000003e-05, "loss": 0.6551, "step": 360 }, { "epoch": 1.48, "grad_norm": 1.2856159210205078, "learning_rate": 1.9733333333333336e-05, "loss": 0.7076, "step": 370 }, { "epoch": 1.52, "grad_norm": 1.2962098121643066, "learning_rate": 1.999989169177959e-05, "loss": 0.6818, "step": 380 }, { "epoch": 1.56, "grad_norm": 1.3778997659683228, "learning_rate": 1.9999025240093045e-05, "loss": 0.6823, "step": 390 }, { "epoch": 1.6, "grad_norm": 1.4772292375564575, "learning_rate": 1.999729241179462e-05, "loss": 0.7704, "step": 400 }, { "epoch": 1.6400000000000001, "grad_norm": 1.13938307762146, "learning_rate": 1.999469335702714e-05, "loss": 0.6668, "step": 410 }, { "epoch": 1.6800000000000002, "grad_norm": 2.4890644550323486, "learning_rate": 1.9991228300988586e-05, "loss": 0.6448, "step": 420 }, { "epoch": 1.72, "grad_norm": 2.2066543102264404, "learning_rate": 1.998689754391257e-05, "loss": 0.7159, "step": 430 }, { "epoch": 1.76, "grad_norm": 1.4606579542160034, "learning_rate": 1.998170146104234e-05, "loss": 0.6443, "step": 440 }, { "epoch": 1.8, "grad_norm": 5.692836284637451, "learning_rate": 1.9975640502598243e-05, "loss": 0.7253, "step": 450 }, { "epoch": 1.8399999999999999, "grad_norm": 1.7549424171447754, "learning_rate": 1.9968715193738738e-05, "loss": 0.6349, "step": 460 }, { "epoch": 1.88, "grad_norm": 2.5602545738220215, "learning_rate": 1.9960926134514875e-05, "loss": 0.6793, "step": 470 }, { "epoch": 1.92, "grad_norm": 1.393797755241394, "learning_rate": 1.9952273999818312e-05, "loss": 0.6686, "step": 480 }, { "epoch": 1.96, "grad_norm": 0.6151896119117737, "learning_rate": 1.9942759539322845e-05, "loss": 0.6584, "step": 490 }, { "epoch": 2.0, "grad_norm": 2.0221006870269775, "learning_rate": 1.9932383577419432e-05, "loss": 0.6771, "step": 500 }, { "epoch": 2.04, "grad_norm": 2.0078063011169434, "learning_rate": 1.9921147013144782e-05, "loss": 0.6664, "step": 510 }, { "epoch": 2.08, "grad_norm": 2.788282871246338, "learning_rate": 1.990905082010344e-05, "loss": 0.6243, "step": 520 }, { "epoch": 2.12, "grad_norm": 2.064715623855591, "learning_rate": 1.9896096046383456e-05, "loss": 0.6253, "step": 530 }, { "epoch": 2.16, "grad_norm": 2.5293374061584473, "learning_rate": 1.988228381446553e-05, "loss": 0.6362, "step": 540 }, { "epoch": 2.2, "grad_norm": 1.461493730545044, "learning_rate": 1.9867615321125796e-05, "loss": 0.6517, "step": 550 }, { "epoch": 2.24, "grad_norm": 1.1433868408203125, "learning_rate": 1.985209183733209e-05, "loss": 0.6849, "step": 560 }, { "epoch": 2.2800000000000002, "grad_norm": 1.6532901525497437, "learning_rate": 1.983571470813386e-05, "loss": 0.6298, "step": 570 }, { "epoch": 2.32, "grad_norm": 3.705383539199829, "learning_rate": 1.9818485352545595e-05, "loss": 0.6588, "step": 580 }, { "epoch": 2.36, "grad_norm": 2.4615492820739746, "learning_rate": 1.980040526342388e-05, "loss": 0.6154, "step": 590 }, { "epoch": 2.4, "grad_norm": 0.8189066052436829, "learning_rate": 1.9781476007338058e-05, "loss": 0.6393, "step": 600 }, { "epoch": 2.44, "grad_norm": 1.1122651100158691, "learning_rate": 1.9761699224434476e-05, "loss": 0.6245, "step": 610 }, { "epoch": 2.48, "grad_norm": 1.4684017896652222, "learning_rate": 1.9741076628294387e-05, "loss": 0.6592, "step": 620 }, { "epoch": 2.52, "grad_norm": 0.9914065599441528, "learning_rate": 1.9719610005785466e-05, "loss": 0.6262, "step": 630 }, { "epoch": 2.56, "grad_norm": 1.7366482019424438, "learning_rate": 1.969730121690698e-05, "loss": 0.672, "step": 640 }, { "epoch": 2.6, "grad_norm": 3.544377326965332, "learning_rate": 1.967415219462864e-05, "loss": 0.6057, "step": 650 }, { "epoch": 2.64, "grad_norm": 1.9553754329681396, "learning_rate": 1.9650164944723116e-05, "loss": 0.6142, "step": 660 }, { "epoch": 2.68, "grad_norm": 2.1661672592163086, "learning_rate": 1.9625341545592226e-05, "loss": 0.6238, "step": 670 }, { "epoch": 2.7199999999999998, "grad_norm": 3.7167468070983887, "learning_rate": 1.9599684148086876e-05, "loss": 0.7166, "step": 680 }, { "epoch": 2.76, "grad_norm": 2.688824415206909, "learning_rate": 1.9573194975320672e-05, "loss": 0.6769, "step": 690 }, { "epoch": 2.8, "grad_norm": 4.10930061340332, "learning_rate": 1.954587632247732e-05, "loss": 0.6199, "step": 700 }, { "epoch": 2.84, "grad_norm": 1.5201390981674194, "learning_rate": 1.951773055661174e-05, "loss": 0.6242, "step": 710 }, { "epoch": 2.88, "grad_norm": 3.6892731189727783, "learning_rate": 1.9488760116444966e-05, "loss": 0.6245, "step": 720 }, { "epoch": 2.92, "grad_norm": 0.8859150409698486, "learning_rate": 1.9458967512152872e-05, "loss": 0.628, "step": 730 }, { "epoch": 2.96, "grad_norm": 1.4320142269134521, "learning_rate": 1.9428355325148632e-05, "loss": 0.5806, "step": 740 }, { "epoch": 3.0, "grad_norm": 1.4816261529922485, "learning_rate": 1.9396926207859085e-05, "loss": 0.5818, "step": 750 }, { "epoch": 3.04, "grad_norm": 2.1367580890655518, "learning_rate": 1.9364682883494892e-05, "loss": 0.6387, "step": 760 }, { "epoch": 3.08, "grad_norm": 2.2321407794952393, "learning_rate": 1.9331628145814587e-05, "loss": 0.6207, "step": 770 }, { "epoch": 3.12, "grad_norm": 3.410268783569336, "learning_rate": 1.9297764858882516e-05, "loss": 0.5868, "step": 780 }, { "epoch": 3.16, "grad_norm": 3.323219060897827, "learning_rate": 1.926309595682066e-05, "loss": 0.5444, "step": 790 }, { "epoch": 3.2, "grad_norm": 2.397799015045166, "learning_rate": 1.9227624443554425e-05, "loss": 0.5891, "step": 800 }, { "epoch": 3.24, "grad_norm": 9.090506553649902, "learning_rate": 1.9191353392552346e-05, "loss": 0.5453, "step": 810 }, { "epoch": 3.2800000000000002, "grad_norm": 1.3556101322174072, "learning_rate": 1.9154285946559792e-05, "loss": 0.6406, "step": 820 }, { "epoch": 3.32, "grad_norm": 1.056227684020996, "learning_rate": 1.911642531732666e-05, "loss": 0.5613, "step": 830 }, { "epoch": 3.36, "grad_norm": 1.2134612798690796, "learning_rate": 1.907777478532909e-05, "loss": 0.6439, "step": 840 }, { "epoch": 3.4, "grad_norm": 2.153582811355591, "learning_rate": 1.9038337699485207e-05, "loss": 0.6268, "step": 850 }, { "epoch": 3.44, "grad_norm": 1.4763509035110474, "learning_rate": 1.8998117476864984e-05, "loss": 0.6358, "step": 860 }, { "epoch": 3.48, "grad_norm": 2.120673656463623, "learning_rate": 1.895711760239413e-05, "loss": 0.5479, "step": 870 }, { "epoch": 3.52, "grad_norm": 3.2643983364105225, "learning_rate": 1.8915341628552166e-05, "loss": 0.5908, "step": 880 }, { "epoch": 3.56, "grad_norm": 1.7468228340148926, "learning_rate": 1.8872793175064594e-05, "loss": 0.6167, "step": 890 }, { "epoch": 3.6, "grad_norm": 1.6314669847488403, "learning_rate": 1.8829475928589272e-05, "loss": 0.605, "step": 900 }, { "epoch": 3.64, "grad_norm": 1.2006853818893433, "learning_rate": 1.8785393642396976e-05, "loss": 0.6374, "step": 910 }, { "epoch": 3.68, "grad_norm": 5.40316915512085, "learning_rate": 1.8740550136046195e-05, "loss": 0.5919, "step": 920 }, { "epoch": 3.7199999999999998, "grad_norm": 1.0310533046722412, "learning_rate": 1.869494929505219e-05, "loss": 0.596, "step": 930 }, { "epoch": 3.76, "grad_norm": 3.058582067489624, "learning_rate": 1.8653264281300622e-05, "loss": 0.5709, "step": 940 }, { "epoch": 3.8, "grad_norm": 0.8612210750579834, "learning_rate": 1.8606235443821602e-05, "loss": 0.6734, "step": 950 }, { "epoch": 3.84, "grad_norm": 1.0079221725463867, "learning_rate": 1.8558460909544564e-05, "loss": 0.5874, "step": 960 }, { "epoch": 3.88, "grad_norm": 1.8223471641540527, "learning_rate": 1.850994481794692e-05, "loss": 0.6199, "step": 970 }, { "epoch": 3.92, "grad_norm": 2.1497292518615723, "learning_rate": 1.846069137275914e-05, "loss": 0.5527, "step": 980 }, { "epoch": 3.96, "grad_norm": 1.5918537378311157, "learning_rate": 1.8410704841600506e-05, "loss": 0.5998, "step": 990 }, { "epoch": 4.0, "grad_norm": 4.21558952331543, "learning_rate": 1.8359989555609355e-05, "loss": 0.6357, "step": 1000 }, { "epoch": 4.04, "grad_norm": 2.512099504470825, "learning_rate": 1.830854990906779e-05, "loss": 0.6116, "step": 1010 }, { "epoch": 4.08, "grad_norm": 2.5760135650634766, "learning_rate": 1.825639035902093e-05, "loss": 0.54, "step": 1020 }, { "epoch": 4.12, "grad_norm": 1.5788276195526123, "learning_rate": 1.8203515424890738e-05, "loss": 0.6258, "step": 1030 }, { "epoch": 4.16, "grad_norm": 1.5123496055603027, "learning_rate": 1.814992968808442e-05, "loss": 0.5147, "step": 1040 }, { "epoch": 4.2, "grad_norm": 1.539919376373291, "learning_rate": 1.809563779159746e-05, "loss": 0.569, "step": 1050 }, { "epoch": 4.24, "grad_norm": 1.6823704242706299, "learning_rate": 1.8040644439611348e-05, "loss": 0.5588, "step": 1060 }, { "epoch": 4.28, "grad_norm": 1.1113232374191284, "learning_rate": 1.798495439708594e-05, "loss": 0.5692, "step": 1070 }, { "epoch": 4.32, "grad_norm": 1.6908786296844482, "learning_rate": 1.792857248934663e-05, "loss": 0.6102, "step": 1080 }, { "epoch": 4.36, "grad_norm": 1.7746518850326538, "learning_rate": 1.7871503601666233e-05, "loss": 0.5706, "step": 1090 }, { "epoch": 4.4, "grad_norm": 1.2888718843460083, "learning_rate": 1.7813752678841702e-05, "loss": 0.5964, "step": 1100 }, { "epoch": 4.44, "grad_norm": 2.7955427169799805, "learning_rate": 1.7755324724765688e-05, "loss": 0.6055, "step": 1110 }, { "epoch": 4.48, "grad_norm": 1.4672576189041138, "learning_rate": 1.7696224801992947e-05, "loss": 0.5548, "step": 1120 }, { "epoch": 4.52, "grad_norm": 2.2973852157592773, "learning_rate": 1.7636458031301725e-05, "loss": 0.5967, "step": 1130 }, { "epoch": 4.5600000000000005, "grad_norm": 2.6734001636505127, "learning_rate": 1.7576029591250036e-05, "loss": 0.5567, "step": 1140 }, { "epoch": 4.6, "grad_norm": 2.127830743789673, "learning_rate": 1.7514944717726962e-05, "loss": 0.6065, "step": 1150 }, { "epoch": 4.64, "grad_norm": 2.201108455657959, "learning_rate": 1.7453208703499006e-05, "loss": 0.566, "step": 1160 }, { "epoch": 4.68, "grad_norm": 3.8374786376953125, "learning_rate": 1.739082689775146e-05, "loss": 0.55, "step": 1170 }, { "epoch": 4.72, "grad_norm": 2.7282190322875977, "learning_rate": 1.732780470562496e-05, "loss": 0.5748, "step": 1180 }, { "epoch": 4.76, "grad_norm": 1.8128880262374878, "learning_rate": 1.7264147587747097e-05, "loss": 0.6309, "step": 1190 }, { "epoch": 4.8, "grad_norm": 2.522096633911133, "learning_rate": 1.7199861059759338e-05, "loss": 0.5504, "step": 1200 }, { "epoch": 4.84, "grad_norm": 1.186700463294983, "learning_rate": 1.7134950691839063e-05, "loss": 0.5741, "step": 1210 }, { "epoch": 4.88, "grad_norm": 4.312258720397949, "learning_rate": 1.7069422108216973e-05, "loss": 0.575, "step": 1220 }, { "epoch": 4.92, "grad_norm": 3.402963399887085, "learning_rate": 1.7003280986689733e-05, "loss": 0.5842, "step": 1230 }, { "epoch": 4.96, "grad_norm": 2.346266269683838, "learning_rate": 1.693653305812805e-05, "loss": 0.5877, "step": 1240 }, { "epoch": 5.0, "grad_norm": 1.304891586303711, "learning_rate": 1.686918410598009e-05, "loss": 0.6153, "step": 1250 }, { "epoch": 5.04, "grad_norm": 2.874284505844116, "learning_rate": 1.6801239965770366e-05, "loss": 0.5776, "step": 1260 }, { "epoch": 5.08, "grad_norm": 3.4333293437957764, "learning_rate": 1.6732706524594138e-05, "loss": 0.5099, "step": 1270 }, { "epoch": 5.12, "grad_norm": 1.4809489250183105, "learning_rate": 1.6663589720607287e-05, "loss": 0.5535, "step": 1280 }, { "epoch": 5.16, "grad_norm": 3.005042314529419, "learning_rate": 1.659389554251181e-05, "loss": 0.561, "step": 1290 }, { "epoch": 5.2, "grad_norm": 1.8546274900436401, "learning_rate": 1.652363002903693e-05, "loss": 0.555, "step": 1300 }, { "epoch": 5.24, "grad_norm": 10.558731079101562, "learning_rate": 1.6452799268415857e-05, "loss": 0.5333, "step": 1310 }, { "epoch": 5.28, "grad_norm": 2.2776925563812256, "learning_rate": 1.6381409397858257e-05, "loss": 0.5941, "step": 1320 }, { "epoch": 5.32, "grad_norm": 2.6703994274139404, "learning_rate": 1.6309466603018497e-05, "loss": 0.5676, "step": 1330 }, { "epoch": 5.36, "grad_norm": 3.8042726516723633, "learning_rate": 1.6236977117459693e-05, "loss": 0.5609, "step": 1340 }, { "epoch": 5.4, "grad_norm": 1.2701845169067383, "learning_rate": 1.616394722211357e-05, "loss": 0.5702, "step": 1350 }, { "epoch": 5.44, "grad_norm": 1.392269253730774, "learning_rate": 1.6090383244736256e-05, "loss": 0.5388, "step": 1360 }, { "epoch": 5.48, "grad_norm": 2.634445905685425, "learning_rate": 1.6016291559360023e-05, "loss": 0.573, "step": 1370 }, { "epoch": 5.52, "grad_norm": 3.3800487518310547, "learning_rate": 1.5941678585740976e-05, "loss": 0.5522, "step": 1380 }, { "epoch": 5.5600000000000005, "grad_norm": 2.935079336166382, "learning_rate": 1.5866550788802815e-05, "loss": 0.5615, "step": 1390 }, { "epoch": 5.6, "grad_norm": 1.9093431234359741, "learning_rate": 1.579091467807668e-05, "loss": 0.5537, "step": 1400 }, { "epoch": 5.64, "grad_norm": 2.825533628463745, "learning_rate": 1.5714776807137128e-05, "loss": 0.6197, "step": 1410 }, { "epoch": 5.68, "grad_norm": 3.172933578491211, "learning_rate": 1.5638143773034268e-05, "loss": 0.5415, "step": 1420 }, { "epoch": 5.72, "grad_norm": 1.2932432889938354, "learning_rate": 1.556102221572219e-05, "loss": 0.5792, "step": 1430 }, { "epoch": 5.76, "grad_norm": 2.1744470596313477, "learning_rate": 1.5483418817483607e-05, "loss": 0.52, "step": 1440 }, { "epoch": 5.8, "grad_norm": 2.4148924350738525, "learning_rate": 1.540534030235087e-05, "loss": 0.602, "step": 1450 }, { "epoch": 5.84, "grad_norm": 2.427771806716919, "learning_rate": 1.5326793435523374e-05, "loss": 0.515, "step": 1460 }, { "epoch": 5.88, "grad_norm": 1.566942811012268, "learning_rate": 1.5247785022781343e-05, "loss": 0.5795, "step": 1470 }, { "epoch": 5.92, "grad_norm": 1.7555649280548096, "learning_rate": 1.5168321909896171e-05, "loss": 0.5819, "step": 1480 }, { "epoch": 5.96, "grad_norm": 1.6367748975753784, "learning_rate": 1.5088410982037251e-05, "loss": 0.5244, "step": 1490 }, { "epoch": 6.0, "grad_norm": 1.9994490146636963, "learning_rate": 1.50080591631754e-05, "loss": 0.5823, "step": 1500 }, { "epoch": 6.04, "grad_norm": 6.0024261474609375, "learning_rate": 1.4927273415482916e-05, "loss": 0.5641, "step": 1510 }, { "epoch": 6.08, "grad_norm": 1.50034499168396, "learning_rate": 1.484606073873035e-05, "loss": 0.5325, "step": 1520 }, { "epoch": 6.12, "grad_norm": 12.008216857910156, "learning_rate": 1.4764428169679987e-05, "loss": 0.5384, "step": 1530 }, { "epoch": 6.16, "grad_norm": 1.6358847618103027, "learning_rate": 1.4682382781476146e-05, "loss": 0.5822, "step": 1540 }, { "epoch": 6.2, "grad_norm": 2.520883321762085, "learning_rate": 1.4599931683032327e-05, "loss": 0.5256, "step": 1550 }, { "epoch": 6.24, "grad_norm": 1.136460304260254, "learning_rate": 1.4517082018415231e-05, "loss": 0.5589, "step": 1560 }, { "epoch": 6.28, "grad_norm": 2.730435848236084, "learning_rate": 1.4433840966225772e-05, "loss": 0.4939, "step": 1570 }, { "epoch": 6.32, "grad_norm": 0.9575507044792175, "learning_rate": 1.4350215738977077e-05, "loss": 0.5277, "step": 1580 }, { "epoch": 6.36, "grad_norm": 1.9613964557647705, "learning_rate": 1.4266213582469543e-05, "loss": 0.5457, "step": 1590 }, { "epoch": 6.4, "grad_norm": 2.436429500579834, "learning_rate": 1.4181841775163014e-05, "loss": 0.547, "step": 1600 }, { "epoch": 6.44, "grad_norm": 2.1270251274108887, "learning_rate": 1.409710762754615e-05, "loss": 0.5422, "step": 1610 }, { "epoch": 6.48, "grad_norm": 2.4047000408172607, "learning_rate": 1.4012018481502975e-05, "loss": 0.5219, "step": 1620 }, { "epoch": 6.52, "grad_norm": 1.1183472871780396, "learning_rate": 1.3926581709676752e-05, "loss": 0.5088, "step": 1630 }, { "epoch": 6.5600000000000005, "grad_norm": 1.89410400390625, "learning_rate": 1.3840804714831164e-05, "loss": 0.5707, "step": 1640 }, { "epoch": 6.6, "grad_norm": 1.2478140592575073, "learning_rate": 1.3754694929208891e-05, "loss": 0.5893, "step": 1650 }, { "epoch": 6.64, "grad_norm": 4.104971885681152, "learning_rate": 1.3668259813887644e-05, "loss": 0.5768, "step": 1660 }, { "epoch": 6.68, "grad_norm": 1.4798212051391602, "learning_rate": 1.3581506858133677e-05, "loss": 0.5223, "step": 1670 }, { "epoch": 6.72, "grad_norm": 3.716698408126831, "learning_rate": 1.3494443578752893e-05, "loss": 0.5208, "step": 1680 }, { "epoch": 6.76, "grad_norm": 2.887411117553711, "learning_rate": 1.340707751943952e-05, "loss": 0.5415, "step": 1690 }, { "epoch": 6.8, "grad_norm": 3.6842939853668213, "learning_rate": 1.3319416250122484e-05, "loss": 0.5272, "step": 1700 }, { "epoch": 6.84, "grad_norm": 4.174267292022705, "learning_rate": 1.3231467366309523e-05, "loss": 0.5341, "step": 1710 }, { "epoch": 6.88, "grad_norm": 4.511580467224121, "learning_rate": 1.3143238488429042e-05, "loss": 0.573, "step": 1720 }, { "epoch": 6.92, "grad_norm": 1.930474042892456, "learning_rate": 1.3054737261169838e-05, "loss": 0.5134, "step": 1730 }, { "epoch": 6.96, "grad_norm": 1.2770944833755493, "learning_rate": 1.2965971352818736e-05, "loss": 0.4917, "step": 1740 }, { "epoch": 7.0, "grad_norm": 2.182473659515381, "learning_rate": 1.287694845459613e-05, "loss": 0.5725, "step": 1750 }, { "epoch": 7.04, "grad_norm": 4.948633670806885, "learning_rate": 1.2787676279989594e-05, "loss": 0.5122, "step": 1760 }, { "epoch": 7.08, "grad_norm": 2.5241198539733887, "learning_rate": 1.2698162564085536e-05, "loss": 0.4839, "step": 1770 }, { "epoch": 7.12, "grad_norm": 1.8158693313598633, "learning_rate": 1.2608415062898971e-05, "loss": 0.4502, "step": 1780 }, { "epoch": 7.16, "grad_norm": 3.540010929107666, "learning_rate": 1.2518441552701493e-05, "loss": 0.5585, "step": 1790 }, { "epoch": 7.2, "grad_norm": 1.9738972187042236, "learning_rate": 1.2428249829347509e-05, "loss": 0.4918, "step": 1800 }, { "epoch": 7.24, "grad_norm": 1.0376015901565552, "learning_rate": 1.2337847707598738e-05, "loss": 0.4989, "step": 1810 }, { "epoch": 7.28, "grad_norm": 2.5017688274383545, "learning_rate": 1.2247243020447104e-05, "loss": 0.4962, "step": 1820 }, { "epoch": 7.32, "grad_norm": 1.2260102033615112, "learning_rate": 1.2156443618436033e-05, "loss": 0.5316, "step": 1830 }, { "epoch": 7.36, "grad_norm": 2.4500176906585693, "learning_rate": 1.2065457368980236e-05, "loss": 0.4841, "step": 1840 }, { "epoch": 7.4, "grad_norm": 3.0221171379089355, "learning_rate": 1.197429215568403e-05, "loss": 0.5507, "step": 1850 }, { "epoch": 7.44, "grad_norm": 5.26624059677124, "learning_rate": 1.1882955877658252e-05, "loss": 0.532, "step": 1860 }, { "epoch": 7.48, "grad_norm": 2.411428213119507, "learning_rate": 1.1791456448835825e-05, "loss": 0.4802, "step": 1870 }, { "epoch": 7.52, "grad_norm": 2.334620952606201, "learning_rate": 1.169980179728606e-05, "loss": 0.5331, "step": 1880 }, { "epoch": 7.5600000000000005, "grad_norm": 2.4401047229766846, "learning_rate": 1.1607999864527718e-05, "loss": 0.4994, "step": 1890 }, { "epoch": 7.6, "grad_norm": 2.3867135047912598, "learning_rate": 1.1516058604840891e-05, "loss": 0.5124, "step": 1900 }, { "epoch": 7.64, "grad_norm": 2.3309555053710938, "learning_rate": 1.1423985984577813e-05, "loss": 0.574, "step": 1910 }, { "epoch": 7.68, "grad_norm": 1.1885383129119873, "learning_rate": 1.1331789981472603e-05, "loss": 0.5361, "step": 1920 }, { "epoch": 7.72, "grad_norm": 1.6586416959762573, "learning_rate": 1.1239478583950019e-05, "loss": 0.5388, "step": 1930 }, { "epoch": 7.76, "grad_norm": 1.3869335651397705, "learning_rate": 1.1147059790433296e-05, "loss": 0.536, "step": 1940 }, { "epoch": 7.8, "grad_norm": 1.5383076667785645, "learning_rate": 1.1054541608651121e-05, "loss": 0.5165, "step": 1950 }, { "epoch": 7.84, "grad_norm": 1.1627497673034668, "learning_rate": 1.0961932054943778e-05, "loss": 0.5369, "step": 1960 }, { "epoch": 7.88, "grad_norm": 1.4803476333618164, "learning_rate": 1.0869239153568575e-05, "loss": 0.548, "step": 1970 }, { "epoch": 7.92, "grad_norm": 1.503915786743164, "learning_rate": 1.0776470936004572e-05, "loss": 0.5377, "step": 1980 }, { "epoch": 7.96, "grad_norm": 1.9053574800491333, "learning_rate": 1.0683635440256689e-05, "loss": 0.5249, "step": 1990 }, { "epoch": 8.0, "grad_norm": 2.171719551086426, "learning_rate": 1.059074071015923e-05, "loss": 0.5162, "step": 2000 }, { "epoch": 8.04, "grad_norm": 3.7397103309631348, "learning_rate": 1.0497794794678923e-05, "loss": 0.5067, "step": 2010 }, { "epoch": 8.08, "grad_norm": 3.237569570541382, "learning_rate": 1.0404805747217525e-05, "loss": 0.4901, "step": 2020 }, { "epoch": 8.12, "grad_norm": 2.6131529808044434, "learning_rate": 1.0311781624914e-05, "loss": 0.4834, "step": 2030 }, { "epoch": 8.16, "grad_norm": 2.543020009994507, "learning_rate": 1.0228036587536431e-05, "loss": 0.4991, "step": 2040 }, { "epoch": 8.2, "grad_norm": 2.420510768890381, "learning_rate": 1.013496803077246e-05, "loss": 0.5326, "step": 2050 }, { "epoch": 8.24, "grad_norm": 1.7979626655578613, "learning_rate": 1.0041887779554041e-05, "loss": 0.501, "step": 2060 }, { "epoch": 8.28, "grad_norm": 3.0351650714874268, "learning_rate": 9.948803898922586e-06, "loss": 0.5263, "step": 2070 }, { "epoch": 8.32, "grad_norm": 2.1602799892425537, "learning_rate": 9.85572445423399e-06, "loss": 0.505, "step": 2080 }, { "epoch": 8.36, "grad_norm": 2.298388957977295, "learning_rate": 9.762657510459784e-06, "loss": 0.4962, "step": 2090 }, { "epoch": 8.4, "grad_norm": 1.9878581762313843, "learning_rate": 9.669611131488346e-06, "loss": 0.5086, "step": 2100 }, { "epoch": 8.44, "grad_norm": 3.1122074127197266, "learning_rate": 9.576593379426196e-06, "loss": 0.5105, "step": 2110 }, { "epoch": 8.48, "grad_norm": 1.8491990566253662, "learning_rate": 9.483612313899436e-06, "loss": 0.5028, "step": 2120 }, { "epoch": 8.52, "grad_norm": 2.2476413249969482, "learning_rate": 9.390675991355435e-06, "loss": 0.5273, "step": 2130 }, { "epoch": 8.56, "grad_norm": 3.6653342247009277, "learning_rate": 9.297792464364748e-06, "loss": 0.4313, "step": 2140 }, { "epoch": 8.6, "grad_norm": 0.8962536454200745, "learning_rate": 9.204969780923404e-06, "loss": 0.5045, "step": 2150 }, { "epoch": 8.64, "grad_norm": 6.7541823387146, "learning_rate": 9.112215983755573e-06, "loss": 0.4818, "step": 2160 }, { "epoch": 8.68, "grad_norm": 3.15523362159729, "learning_rate": 9.019539109616694e-06, "loss": 0.4779, "step": 2170 }, { "epoch": 8.72, "grad_norm": 1.048300862312317, "learning_rate": 8.926947188597133e-06, "loss": 0.4815, "step": 2180 }, { "epoch": 8.76, "grad_norm": 4.415710926055908, "learning_rate": 8.8344482434264e-06, "loss": 0.5259, "step": 2190 }, { "epoch": 8.8, "grad_norm": 4.474966049194336, "learning_rate": 8.742050288778e-06, "loss": 0.5378, "step": 2200 }, { "epoch": 8.84, "grad_norm": 3.487746477127075, "learning_rate": 8.649761330575009e-06, "loss": 0.5144, "step": 2210 }, { "epoch": 8.88, "grad_norm": 1.44117271900177, "learning_rate": 8.557589365296385e-06, "loss": 0.5383, "step": 2220 }, { "epoch": 8.92, "grad_norm": 2.9286913871765137, "learning_rate": 8.4655423792841e-06, "loss": 0.4653, "step": 2230 }, { "epoch": 8.96, "grad_norm": 1.4172818660736084, "learning_rate": 8.373628348051165e-06, "loss": 0.4868, "step": 2240 }, { "epoch": 9.0, "grad_norm": 1.9049030542373657, "learning_rate": 8.281855235590574e-06, "loss": 0.5606, "step": 2250 }, { "epoch": 9.04, "grad_norm": 3.6202874183654785, "learning_rate": 8.19023099368526e-06, "loss": 0.4717, "step": 2260 }, { "epoch": 9.08, "grad_norm": 1.4381736516952515, "learning_rate": 8.098763561219101e-06, "loss": 0.4578, "step": 2270 }, { "epoch": 9.12, "grad_norm": 3.8551642894744873, "learning_rate": 8.007460863489042e-06, "loss": 0.4553, "step": 2280 }, { "epoch": 9.16, "grad_norm": 2.2333943843841553, "learning_rate": 7.91633081151841e-06, "loss": 0.4861, "step": 2290 }, { "epoch": 9.2, "grad_norm": 3.517455816268921, "learning_rate": 7.825381301371452e-06, "loss": 0.4518, "step": 2300 }, { "epoch": 9.24, "grad_norm": 1.2912664413452148, "learning_rate": 7.734620213469166e-06, "loss": 0.4832, "step": 2310 }, { "epoch": 9.28, "grad_norm": 3.6948964595794678, "learning_rate": 7.644055411906493e-06, "loss": 0.4969, "step": 2320 }, { "epoch": 9.32, "grad_norm": 1.5953376293182373, "learning_rate": 7.553694743770928e-06, "loss": 0.4606, "step": 2330 }, { "epoch": 9.36, "grad_norm": 2.7939870357513428, "learning_rate": 7.463546038462602e-06, "loss": 0.5225, "step": 2340 }, { "epoch": 9.4, "grad_norm": 1.0297088623046875, "learning_rate": 7.373617107015889e-06, "loss": 0.529, "step": 2350 }, { "epoch": 9.44, "grad_norm": 2.87479305267334, "learning_rate": 7.283915741422611e-06, "loss": 0.5134, "step": 2360 }, { "epoch": 9.48, "grad_norm": 3.1623082160949707, "learning_rate": 7.194449713956908e-06, "loss": 0.4509, "step": 2370 }, { "epoch": 9.52, "grad_norm": 1.8917375802993774, "learning_rate": 7.105226776501772e-06, "loss": 0.5175, "step": 2380 }, { "epoch": 9.56, "grad_norm": 1.6095237731933594, "learning_rate": 7.016254659877398e-06, "loss": 0.4742, "step": 2390 }, { "epoch": 9.6, "grad_norm": 3.2498207092285156, "learning_rate": 6.927541073171333e-06, "loss": 0.4605, "step": 2400 }, { "epoch": 9.64, "grad_norm": 1.7395751476287842, "learning_rate": 6.839093703070512e-06, "loss": 0.4987, "step": 2410 }, { "epoch": 9.68, "grad_norm": 2.4571480751037598, "learning_rate": 6.750920213195238e-06, "loss": 0.4829, "step": 2420 }, { "epoch": 9.72, "grad_norm": 4.019631385803223, "learning_rate": 6.6630282434351535e-06, "loss": 0.4842, "step": 2430 }, { "epoch": 9.76, "grad_norm": 2.756540298461914, "learning_rate": 6.575425409287292e-06, "loss": 0.5198, "step": 2440 }, { "epoch": 9.8, "grad_norm": 2.0040042400360107, "learning_rate": 6.488119301196201e-06, "loss": 0.5239, "step": 2450 }, { "epoch": 9.84, "grad_norm": 7.419244766235352, "learning_rate": 6.4011174838962706e-06, "loss": 0.4636, "step": 2460 }, { "epoch": 9.88, "grad_norm": 2.879230260848999, "learning_rate": 6.314427495756283e-06, "loss": 0.4693, "step": 2470 }, { "epoch": 9.92, "grad_norm": 2.1217892169952393, "learning_rate": 6.228056848126236e-06, "loss": 0.475, "step": 2480 }, { "epoch": 9.96, "grad_norm": 2.1474809646606445, "learning_rate": 6.142013024686509e-06, "loss": 0.4995, "step": 2490 }, { "epoch": 10.0, "grad_norm": 4.3783721923828125, "learning_rate": 6.056303480799449e-06, "loss": 0.486, "step": 2500 }, { "epoch": 10.04, "grad_norm": 1.397594928741455, "learning_rate": 5.970935642863375e-06, "loss": 0.4537, "step": 2510 }, { "epoch": 10.08, "grad_norm": 3.2472903728485107, "learning_rate": 5.885916907669114e-06, "loss": 0.3856, "step": 2520 }, { "epoch": 10.12, "grad_norm": 3.1067910194396973, "learning_rate": 5.801254641759103e-06, "loss": 0.4705, "step": 2530 }, { "epoch": 10.16, "grad_norm": 2.5792055130004883, "learning_rate": 5.716956180789098e-06, "loss": 0.5011, "step": 2540 }, { "epoch": 10.2, "grad_norm": 3.1249446868896484, "learning_rate": 5.6330288288925805e-06, "loss": 0.462, "step": 2550 }, { "epoch": 10.24, "grad_norm": 4.708195209503174, "learning_rate": 5.549479858047875e-06, "loss": 0.5043, "step": 2560 }, { "epoch": 10.28, "grad_norm": 1.4850634336471558, "learning_rate": 5.466316507448049e-06, "loss": 0.5244, "step": 2570 }, { "epoch": 10.32, "grad_norm": 1.0298686027526855, "learning_rate": 5.3835459828736945e-06, "loss": 0.4362, "step": 2580 }, { "epoch": 10.36, "grad_norm": 2.251105546951294, "learning_rate": 5.30117545606854e-06, "loss": 0.4788, "step": 2590 }, { "epoch": 10.4, "grad_norm": 4.622702121734619, "learning_rate": 5.219212064118079e-06, "loss": 0.4265, "step": 2600 }, { "epoch": 10.44, "grad_norm": 3.4977996349334717, "learning_rate": 5.137662908831147e-06, "loss": 0.5, "step": 2610 }, { "epoch": 10.48, "grad_norm": 5.369349002838135, "learning_rate": 5.056535056124592e-06, "loss": 0.4409, "step": 2620 }, { "epoch": 10.52, "grad_norm": 2.318140983581543, "learning_rate": 4.97583553541102e-06, "loss": 0.4594, "step": 2630 }, { "epoch": 10.56, "grad_norm": 2.4116406440734863, "learning_rate": 4.895571338989754e-06, "loss": 0.4953, "step": 2640 }, { "epoch": 10.6, "grad_norm": 3.0506629943847656, "learning_rate": 4.8157494214409475e-06, "loss": 0.4795, "step": 2650 }, { "epoch": 10.64, "grad_norm": 1.8125630617141724, "learning_rate": 4.736376699023023e-06, "loss": 0.481, "step": 2660 }, { "epoch": 10.68, "grad_norm": 1.9686360359191895, "learning_rate": 4.6574600490733794e-06, "loss": 0.4713, "step": 2670 }, { "epoch": 10.72, "grad_norm": 2.8264060020446777, "learning_rate": 4.579006309412533e-06, "loss": 0.4501, "step": 2680 }, { "epoch": 10.76, "grad_norm": 1.9835346937179565, "learning_rate": 4.501022277751602e-06, "loss": 0.4754, "step": 2690 }, { "epoch": 10.8, "grad_norm": 4.487490653991699, "learning_rate": 4.423514711103355e-06, "loss": 0.5056, "step": 2700 }, { "epoch": 10.84, "grad_norm": 3.1984522342681885, "learning_rate": 4.346490325196704e-06, "loss": 0.4415, "step": 2710 }, { "epoch": 10.88, "grad_norm": 2.0367348194122314, "learning_rate": 4.26995579389485e-06, "loss": 0.5117, "step": 2720 }, { "epoch": 10.92, "grad_norm": 1.78911292552948, "learning_rate": 4.193917748616979e-06, "loss": 0.475, "step": 2730 }, { "epoch": 10.96, "grad_norm": 2.0589475631713867, "learning_rate": 4.118382777763711e-06, "loss": 0.4363, "step": 2740 }, { "epoch": 11.0, "grad_norm": 3.54664945602417, "learning_rate": 4.04335742614622e-06, "loss": 0.4665, "step": 2750 }, { "epoch": 11.04, "grad_norm": 1.714920997619629, "learning_rate": 3.968848194419163e-06, "loss": 0.4515, "step": 2760 }, { "epoch": 11.08, "grad_norm": 11.161652565002441, "learning_rate": 3.894861538517401e-06, "loss": 0.4285, "step": 2770 }, { "epoch": 11.12, "grad_norm": 2.627831220626831, "learning_rate": 3.821403869096658e-06, "loss": 0.4343, "step": 2780 }, { "epoch": 11.16, "grad_norm": 2.6865172386169434, "learning_rate": 3.748481550978017e-06, "loss": 0.4766, "step": 2790 }, { "epoch": 11.2, "grad_norm": 4.996657848358154, "learning_rate": 3.6761009025964657e-06, "loss": 0.4096, "step": 2800 }, { "epoch": 11.24, "grad_norm": 1.6282066106796265, "learning_rate": 3.604268195453421e-06, "loss": 0.4622, "step": 2810 }, { "epoch": 11.28, "grad_norm": 1.6030402183532715, "learning_rate": 3.5329896535733133e-06, "loss": 0.4437, "step": 2820 }, { "epoch": 11.32, "grad_norm": 2.8916399478912354, "learning_rate": 3.462271452964321e-06, "loss": 0.4871, "step": 2830 }, { "epoch": 11.36, "grad_norm": 2.375190019607544, "learning_rate": 3.3921197210832235e-06, "loss": 0.4575, "step": 2840 }, { "epoch": 11.4, "grad_norm": 4.021700382232666, "learning_rate": 3.3225405363045016e-06, "loss": 0.4699, "step": 2850 }, { "epoch": 11.44, "grad_norm": 1.7844669818878174, "learning_rate": 3.2535399273936407e-06, "loss": 0.4648, "step": 2860 }, { "epoch": 11.48, "grad_norm": 2.744528293609619, "learning_rate": 3.1851238729848033e-06, "loss": 0.3923, "step": 2870 }, { "epoch": 11.52, "grad_norm": 1.6203703880310059, "learning_rate": 3.11729830106276e-06, "loss": 0.4717, "step": 2880 }, { "epoch": 11.56, "grad_norm": 1.696370244026184, "learning_rate": 3.0500690884492836e-06, "loss": 0.4556, "step": 2890 }, { "epoch": 11.6, "grad_norm": 4.04744291305542, "learning_rate": 2.983442060293926e-06, "loss": 0.4785, "step": 2900 }, { "epoch": 11.64, "grad_norm": 2.629739284515381, "learning_rate": 2.917422989569311e-06, "loss": 0.463, "step": 2910 }, { "epoch": 11.68, "grad_norm": 2.43945050239563, "learning_rate": 2.852017596570901e-06, "loss": 0.4551, "step": 2920 }, { "epoch": 11.72, "grad_norm": 6.5788116455078125, "learning_rate": 2.7872315484213954e-06, "loss": 0.4501, "step": 2930 }, { "epoch": 11.76, "grad_norm": 2.3283305168151855, "learning_rate": 2.723070458579653e-06, "loss": 0.4338, "step": 2940 }, { "epoch": 11.8, "grad_norm": 4.168436527252197, "learning_rate": 2.6595398863543407e-06, "loss": 0.4744, "step": 2950 }, { "epoch": 11.84, "grad_norm": 3.3579213619232178, "learning_rate": 2.596645336422219e-06, "loss": 0.4257, "step": 2960 }, { "epoch": 11.88, "grad_norm": 4.208755970001221, "learning_rate": 2.5343922583512026e-06, "loss": 0.4676, "step": 2970 }, { "epoch": 11.92, "grad_norm": 2.752279281616211, "learning_rate": 2.472786046128156e-06, "loss": 0.455, "step": 2980 }, { "epoch": 11.96, "grad_norm": 3.5390079021453857, "learning_rate": 2.411832037691545e-06, "loss": 0.4646, "step": 2990 }, { "epoch": 12.0, "grad_norm": 2.805065870285034, "learning_rate": 2.3515355144689155e-06, "loss": 0.4774, "step": 3000 }, { "epoch": 12.04, "grad_norm": 2.3203487396240234, "learning_rate": 2.2919017009192703e-06, "loss": 0.4333, "step": 3010 }, { "epoch": 12.08, "grad_norm": 2.62695050239563, "learning_rate": 2.2329357640804118e-06, "loss": 0.456, "step": 3020 }, { "epoch": 12.12, "grad_norm": 1.8128643035888672, "learning_rate": 2.1746428131212126e-06, "loss": 0.4054, "step": 3030 }, { "epoch": 12.16, "grad_norm": 1.5641071796417236, "learning_rate": 2.117027898898948e-06, "loss": 0.4875, "step": 3040 }, { "epoch": 12.2, "grad_norm": 2.1929521560668945, "learning_rate": 2.0600960135216463e-06, "loss": 0.4041, "step": 3050 }, { "epoch": 12.24, "grad_norm": 1.746474027633667, "learning_rate": 2.003852089915548e-06, "loss": 0.5115, "step": 3060 }, { "epoch": 12.28, "grad_norm": 2.731269121170044, "learning_rate": 1.9483010013976766e-06, "loss": 0.4459, "step": 3070 }, { "epoch": 12.32, "grad_norm": 1.9771169424057007, "learning_rate": 1.8934475612536019e-06, "loss": 0.3677, "step": 3080 }, { "epoch": 12.36, "grad_norm": 3.043891191482544, "learning_rate": 1.8392965223203707e-06, "loss": 0.4353, "step": 3090 }, { "epoch": 12.4, "grad_norm": 3.361074447631836, "learning_rate": 1.7858525765747047e-06, "loss": 0.4578, "step": 3100 }, { "epoch": 12.44, "grad_norm": 3.8681182861328125, "learning_rate": 1.7331203547264452e-06, "loss": 0.4057, "step": 3110 }, { "epoch": 12.48, "grad_norm": 2.581637382507324, "learning_rate": 1.6811044258173425e-06, "loss": 0.4532, "step": 3120 }, { "epoch": 12.52, "grad_norm": 2.8616292476654053, "learning_rate": 1.629809296825139e-06, "loss": 0.4551, "step": 3130 }, { "epoch": 12.56, "grad_norm": 2.4617111682891846, "learning_rate": 1.579239412273078e-06, "loss": 0.4388, "step": 3140 }, { "epoch": 12.6, "grad_norm": 14.222563743591309, "learning_rate": 1.5293991538447882e-06, "loss": 0.412, "step": 3150 }, { "epoch": 12.64, "grad_norm": 3.5921924114227295, "learning_rate": 1.4802928400046457e-06, "loss": 0.4517, "step": 3160 }, { "epoch": 12.68, "grad_norm": 2.4046990871429443, "learning_rate": 1.4319247256235713e-06, "loss": 0.4893, "step": 3170 }, { "epoch": 12.72, "grad_norm": 2.5496039390563965, "learning_rate": 1.3842990016103886e-06, "loss": 0.4305, "step": 3180 }, { "epoch": 12.76, "grad_norm": 2.3980159759521484, "learning_rate": 1.3374197945486833e-06, "loss": 0.3833, "step": 3190 }, { "epoch": 12.8, "grad_norm": 1.515519142150879, "learning_rate": 1.2912911663392468e-06, "loss": 0.4513, "step": 3200 }, { "epoch": 12.84, "grad_norm": 2.939988136291504, "learning_rate": 1.245917113848144e-06, "loss": 0.4712, "step": 3210 }, { "epoch": 12.88, "grad_norm": 1.846997857093811, "learning_rate": 1.2013015685603813e-06, "loss": 0.4789, "step": 3220 }, { "epoch": 12.92, "grad_norm": 2.960897445678711, "learning_rate": 1.1574483962392768e-06, "loss": 0.4128, "step": 3230 }, { "epoch": 12.96, "grad_norm": 1.8638381958007812, "learning_rate": 1.114361396591498e-06, "loss": 0.4949, "step": 3240 }, { "epoch": 13.0, "grad_norm": 2.134097099304199, "learning_rate": 1.0720443029378303e-06, "loss": 0.4167, "step": 3250 }, { "epoch": 13.04, "grad_norm": 2.2456820011138916, "learning_rate": 1.0305007818897006e-06, "loss": 0.4483, "step": 3260 }, { "epoch": 13.08, "grad_norm": 9.045520782470703, "learning_rate": 9.897344330314862e-07, "loss": 0.454, "step": 3270 }, { "epoch": 13.12, "grad_norm": 2.646930694580078, "learning_rate": 9.497487886086132e-07, "loss": 0.4438, "step": 3280 }, { "epoch": 13.16, "grad_norm": 4.203260898590088, "learning_rate": 9.105473132215126e-07, "loss": 0.3904, "step": 3290 }, { "epoch": 13.2, "grad_norm": 3.177109479904175, "learning_rate": 8.721334035254203e-07, "loss": 0.4128, "step": 3300 }, { "epoch": 13.24, "grad_norm": 1.825671911239624, "learning_rate": 8.345103879360695e-07, "loss": 0.4479, "step": 3310 }, { "epoch": 13.28, "grad_norm": 3.2267651557922363, "learning_rate": 7.976815263412963e-07, "loss": 0.3928, "step": 3320 }, { "epoch": 13.32, "grad_norm": 3.6811180114746094, "learning_rate": 7.616500098185908e-07, "loss": 0.4163, "step": 3330 }, { "epoch": 13.36, "grad_norm": 3.258467435836792, "learning_rate": 7.264189603585892e-07, "loss": 0.4186, "step": 3340 }, { "epoch": 13.4, "grad_norm": 1.861440658569336, "learning_rate": 6.919914305945774e-07, "loss": 0.4416, "step": 3350 }, { "epoch": 13.44, "grad_norm": 3.303765296936035, "learning_rate": 6.58370403537989e-07, "loss": 0.3958, "step": 3360 }, { "epoch": 13.48, "grad_norm": 1.9865639209747314, "learning_rate": 6.255587923199313e-07, "loss": 0.4424, "step": 3370 }, { "epoch": 13.52, "grad_norm": 2.3877573013305664, "learning_rate": 5.935594399387856e-07, "loss": 0.4778, "step": 3380 }, { "epoch": 13.56, "grad_norm": 2.6566929817199707, "learning_rate": 5.623751190138682e-07, "loss": 0.4045, "step": 3390 }, { "epoch": 13.6, "grad_norm": 4.0699052810668945, "learning_rate": 5.320085315451862e-07, "loss": 0.4275, "step": 3400 }, { "epoch": 13.64, "grad_norm": 4.235281944274902, "learning_rate": 5.024623086793323e-07, "loss": 0.4346, "step": 3410 }, { "epoch": 13.68, "grad_norm": 1.7936820983886719, "learning_rate": 4.737390104814954e-07, "loss": 0.4343, "step": 3420 }, { "epoch": 13.72, "grad_norm": 4.244424343109131, "learning_rate": 4.458411257136486e-07, "loss": 0.4355, "step": 3430 }, { "epoch": 13.76, "grad_norm": 3.0799527168273926, "learning_rate": 4.1877107161890416e-07, "loss": 0.4407, "step": 3440 }, { "epoch": 13.8, "grad_norm": 3.4574971199035645, "learning_rate": 3.9253119371206684e-07, "loss": 0.4369, "step": 3450 }, { "epoch": 13.84, "grad_norm": 1.3924965858459473, "learning_rate": 3.671237655764104e-07, "loss": 0.4823, "step": 3460 }, { "epoch": 13.88, "grad_norm": 1.9487115144729614, "learning_rate": 3.4255098866667114e-07, "loss": 0.4566, "step": 3470 }, { "epoch": 13.92, "grad_norm": 2.6502346992492676, "learning_rate": 3.188149921183115e-07, "loss": 0.4824, "step": 3480 }, { "epoch": 13.96, "grad_norm": 3.1728081703186035, "learning_rate": 2.959178325630296e-07, "loss": 0.3983, "step": 3490 }, { "epoch": 14.0, "grad_norm": 2.273251533508301, "learning_rate": 2.7386149395056463e-07, "loss": 0.4541, "step": 3500 }, { "epoch": 14.04, "grad_norm": 2.2681076526641846, "learning_rate": 2.526478873767946e-07, "loss": 0.4667, "step": 3510 }, { "epoch": 14.08, "grad_norm": 2.3255577087402344, "learning_rate": 2.322788509181484e-07, "loss": 0.441, "step": 3520 }, { "epoch": 14.12, "grad_norm": 1.8558521270751953, "learning_rate": 2.1275614947233624e-07, "loss": 0.4294, "step": 3530 }, { "epoch": 14.16, "grad_norm": 3.000098943710327, "learning_rate": 1.9408147460544203e-07, "loss": 0.4245, "step": 3540 }, { "epoch": 14.2, "grad_norm": 3.5599377155303955, "learning_rate": 1.7625644440534384e-07, "loss": 0.3884, "step": 3550 }, { "epoch": 14.24, "grad_norm": 3.208889961242676, "learning_rate": 1.5928260334151847e-07, "loss": 0.4434, "step": 3560 }, { "epoch": 14.28, "grad_norm": 3.4625139236450195, "learning_rate": 1.4316142213121386e-07, "loss": 0.4474, "step": 3570 }, { "epoch": 14.32, "grad_norm": 4.883245944976807, "learning_rate": 1.2789429761202565e-07, "loss": 0.3927, "step": 3580 }, { "epoch": 14.36, "grad_norm": 4.052615165710449, "learning_rate": 1.134825526208605e-07, "loss": 0.3967, "step": 3590 }, { "epoch": 14.4, "grad_norm": 1.4959968328475952, "learning_rate": 9.992743587931674e-08, "loss": 0.4807, "step": 3600 }, { "epoch": 14.44, "grad_norm": 2.757499933242798, "learning_rate": 8.723012188549318e-08, "loss": 0.4506, "step": 3610 }, { "epoch": 14.48, "grad_norm": 2.0996549129486084, "learning_rate": 7.539171081221597e-08, "loss": 0.3934, "step": 3620 }, { "epoch": 14.52, "grad_norm": 2.4962105751037598, "learning_rate": 6.44132284117216e-08, "loss": 0.4549, "step": 3630 }, { "epoch": 14.56, "grad_norm": 1.9287108182907104, "learning_rate": 5.429562592677018e-08, "loss": 0.4208, "step": 3640 }, { "epoch": 14.6, "grad_norm": 4.647493362426758, "learning_rate": 4.503978000823028e-08, "loss": 0.4376, "step": 3650 }, { "epoch": 14.64, "grad_norm": 1.24240243434906, "learning_rate": 3.6646492639118567e-08, "loss": 0.4562, "step": 3660 }, { "epoch": 14.68, "grad_norm": 2.359651565551758, "learning_rate": 2.911649106511316e-08, "loss": 0.42, "step": 3670 }, { "epoch": 14.72, "grad_norm": 2.7108898162841797, "learning_rate": 2.2450427731534052e-08, "loss": 0.4489, "step": 3680 }, { "epoch": 14.76, "grad_norm": 1.4741981029510498, "learning_rate": 1.664888022682165e-08, "loss": 0.4532, "step": 3690 }, { "epoch": 14.8, "grad_norm": 1.676648497581482, "learning_rate": 1.1712351232480157e-08, "loss": 0.4375, "step": 3700 }, { "epoch": 14.84, "grad_norm": 1.2909196615219116, "learning_rate": 7.641268479531283e-09, "loss": 0.4574, "step": 3710 }, { "epoch": 14.88, "grad_norm": 3.2380595207214355, "learning_rate": 4.435984711446128e-09, "loss": 0.3538, "step": 3720 }, { "epoch": 14.92, "grad_norm": 1.9485658407211304, "learning_rate": 2.0967776535851802e-09, "loss": 0.3884, "step": 3730 }, { "epoch": 14.96, "grad_norm": 2.0014994144439697, "learning_rate": 6.238499891353389e-10, "loss": 0.4266, "step": 3740 }, { "epoch": 15.0, "grad_norm": 2.8281970024108887, "learning_rate": 1.7329341542859922e-11, "loss": 0.4743, "step": 3750 } ], "logging_steps": 10, "max_steps": 3750, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.815859931388314e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }