{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 3750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 3.0283896923065186,
"learning_rate": 5.333333333333335e-07,
"loss": 1.5494,
"step": 10
},
{
"epoch": 0.08,
"grad_norm": 2.980957508087158,
"learning_rate": 1.066666666666667e-06,
"loss": 1.5781,
"step": 20
},
{
"epoch": 0.12,
"grad_norm": 1.3325070142745972,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.271,
"step": 30
},
{
"epoch": 0.16,
"grad_norm": 1.1161530017852783,
"learning_rate": 2.133333333333334e-06,
"loss": 1.1263,
"step": 40
},
{
"epoch": 0.2,
"grad_norm": 0.9847109317779541,
"learning_rate": 2.666666666666667e-06,
"loss": 1.4759,
"step": 50
},
{
"epoch": 0.24,
"grad_norm": 1.3176578283309937,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.1906,
"step": 60
},
{
"epoch": 0.28,
"grad_norm": 2.9614243507385254,
"learning_rate": 3.7333333333333337e-06,
"loss": 1.3136,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 1.0635404586791992,
"learning_rate": 4.266666666666668e-06,
"loss": 1.159,
"step": 80
},
{
"epoch": 0.36,
"grad_norm": 1.4873822927474976,
"learning_rate": 4.800000000000001e-06,
"loss": 1.0645,
"step": 90
},
{
"epoch": 0.4,
"grad_norm": 2.666663646697998,
"learning_rate": 5.333333333333334e-06,
"loss": 1.3174,
"step": 100
},
{
"epoch": 0.44,
"grad_norm": 0.9259383678436279,
"learning_rate": 5.8666666666666675e-06,
"loss": 1.0041,
"step": 110
},
{
"epoch": 0.48,
"grad_norm": 1.422500729560852,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.1208,
"step": 120
},
{
"epoch": 0.52,
"grad_norm": 1.513822317123413,
"learning_rate": 6.9333333333333344e-06,
"loss": 0.9806,
"step": 130
},
{
"epoch": 0.56,
"grad_norm": 3.26381254196167,
"learning_rate": 7.4666666666666675e-06,
"loss": 0.9904,
"step": 140
},
{
"epoch": 0.6,
"grad_norm": 2.899075984954834,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9365,
"step": 150
},
{
"epoch": 0.64,
"grad_norm": 0.7061178088188171,
"learning_rate": 8.533333333333335e-06,
"loss": 0.8843,
"step": 160
},
{
"epoch": 0.68,
"grad_norm": 1.0236766338348389,
"learning_rate": 9.066666666666667e-06,
"loss": 0.9131,
"step": 170
},
{
"epoch": 0.72,
"grad_norm": 1.3964245319366455,
"learning_rate": 9.600000000000001e-06,
"loss": 0.9142,
"step": 180
},
{
"epoch": 0.76,
"grad_norm": 1.7908815145492554,
"learning_rate": 1.0133333333333335e-05,
"loss": 0.8973,
"step": 190
},
{
"epoch": 0.8,
"grad_norm": 0.9264830946922302,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.8407,
"step": 200
},
{
"epoch": 0.84,
"grad_norm": 2.5779850482940674,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.8228,
"step": 210
},
{
"epoch": 0.88,
"grad_norm": 1.6980341672897339,
"learning_rate": 1.1733333333333335e-05,
"loss": 0.799,
"step": 220
},
{
"epoch": 0.92,
"grad_norm": 1.9194531440734863,
"learning_rate": 1.2266666666666667e-05,
"loss": 0.7759,
"step": 230
},
{
"epoch": 0.96,
"grad_norm": 2.0350003242492676,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.7466,
"step": 240
},
{
"epoch": 1.0,
"grad_norm": 3.474932909011841,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.7317,
"step": 250
},
{
"epoch": 1.04,
"grad_norm": 1.7944034337997437,
"learning_rate": 1.3866666666666669e-05,
"loss": 0.7046,
"step": 260
},
{
"epoch": 1.08,
"grad_norm": 2.945058584213257,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.6625,
"step": 270
},
{
"epoch": 1.12,
"grad_norm": 1.820989966392517,
"learning_rate": 1.4933333333333335e-05,
"loss": 0.7307,
"step": 280
},
{
"epoch": 1.16,
"grad_norm": 2.9544613361358643,
"learning_rate": 1.546666666666667e-05,
"loss": 0.7996,
"step": 290
},
{
"epoch": 1.2,
"grad_norm": 0.9499707221984863,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.706,
"step": 300
},
{
"epoch": 1.24,
"grad_norm": 5.53220796585083,
"learning_rate": 1.6533333333333333e-05,
"loss": 0.6941,
"step": 310
},
{
"epoch": 1.28,
"grad_norm": 2.7142622470855713,
"learning_rate": 1.706666666666667e-05,
"loss": 0.746,
"step": 320
},
{
"epoch": 1.32,
"grad_norm": 4.010003089904785,
"learning_rate": 1.76e-05,
"loss": 0.7262,
"step": 330
},
{
"epoch": 1.3599999999999999,
"grad_norm": 2.3094098567962646,
"learning_rate": 1.8133333333333335e-05,
"loss": 0.6552,
"step": 340
},
{
"epoch": 1.4,
"grad_norm": 5.371938228607178,
"learning_rate": 1.866666666666667e-05,
"loss": 0.677,
"step": 350
},
{
"epoch": 1.44,
"grad_norm": 1.662387728691101,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.6551,
"step": 360
},
{
"epoch": 1.48,
"grad_norm": 1.2856159210205078,
"learning_rate": 1.9733333333333336e-05,
"loss": 0.7076,
"step": 370
},
{
"epoch": 1.52,
"grad_norm": 1.2962098121643066,
"learning_rate": 1.999989169177959e-05,
"loss": 0.6818,
"step": 380
},
{
"epoch": 1.56,
"grad_norm": 1.3778997659683228,
"learning_rate": 1.9999025240093045e-05,
"loss": 0.6823,
"step": 390
},
{
"epoch": 1.6,
"grad_norm": 1.4772292375564575,
"learning_rate": 1.999729241179462e-05,
"loss": 0.7704,
"step": 400
},
{
"epoch": 1.6400000000000001,
"grad_norm": 1.13938307762146,
"learning_rate": 1.999469335702714e-05,
"loss": 0.6668,
"step": 410
},
{
"epoch": 1.6800000000000002,
"grad_norm": 2.4890644550323486,
"learning_rate": 1.9991228300988586e-05,
"loss": 0.6448,
"step": 420
},
{
"epoch": 1.72,
"grad_norm": 2.2066543102264404,
"learning_rate": 1.998689754391257e-05,
"loss": 0.7159,
"step": 430
},
{
"epoch": 1.76,
"grad_norm": 1.4606579542160034,
"learning_rate": 1.998170146104234e-05,
"loss": 0.6443,
"step": 440
},
{
"epoch": 1.8,
"grad_norm": 5.692836284637451,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.7253,
"step": 450
},
{
"epoch": 1.8399999999999999,
"grad_norm": 1.7549424171447754,
"learning_rate": 1.9968715193738738e-05,
"loss": 0.6349,
"step": 460
},
{
"epoch": 1.88,
"grad_norm": 2.5602545738220215,
"learning_rate": 1.9960926134514875e-05,
"loss": 0.6793,
"step": 470
},
{
"epoch": 1.92,
"grad_norm": 1.393797755241394,
"learning_rate": 1.9952273999818312e-05,
"loss": 0.6686,
"step": 480
},
{
"epoch": 1.96,
"grad_norm": 0.6151896119117737,
"learning_rate": 1.9942759539322845e-05,
"loss": 0.6584,
"step": 490
},
{
"epoch": 2.0,
"grad_norm": 2.0221006870269775,
"learning_rate": 1.9932383577419432e-05,
"loss": 0.6771,
"step": 500
},
{
"epoch": 2.04,
"grad_norm": 2.0078063011169434,
"learning_rate": 1.9921147013144782e-05,
"loss": 0.6664,
"step": 510
},
{
"epoch": 2.08,
"grad_norm": 2.788282871246338,
"learning_rate": 1.990905082010344e-05,
"loss": 0.6243,
"step": 520
},
{
"epoch": 2.12,
"grad_norm": 2.064715623855591,
"learning_rate": 1.9896096046383456e-05,
"loss": 0.6253,
"step": 530
},
{
"epoch": 2.16,
"grad_norm": 2.5293374061584473,
"learning_rate": 1.988228381446553e-05,
"loss": 0.6362,
"step": 540
},
{
"epoch": 2.2,
"grad_norm": 1.461493730545044,
"learning_rate": 1.9867615321125796e-05,
"loss": 0.6517,
"step": 550
},
{
"epoch": 2.24,
"grad_norm": 1.1433868408203125,
"learning_rate": 1.985209183733209e-05,
"loss": 0.6849,
"step": 560
},
{
"epoch": 2.2800000000000002,
"grad_norm": 1.6532901525497437,
"learning_rate": 1.983571470813386e-05,
"loss": 0.6298,
"step": 570
},
{
"epoch": 2.32,
"grad_norm": 3.705383539199829,
"learning_rate": 1.9818485352545595e-05,
"loss": 0.6588,
"step": 580
},
{
"epoch": 2.36,
"grad_norm": 2.4615492820739746,
"learning_rate": 1.980040526342388e-05,
"loss": 0.6154,
"step": 590
},
{
"epoch": 2.4,
"grad_norm": 0.8189066052436829,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.6393,
"step": 600
},
{
"epoch": 2.44,
"grad_norm": 1.1122651100158691,
"learning_rate": 1.9761699224434476e-05,
"loss": 0.6245,
"step": 610
},
{
"epoch": 2.48,
"grad_norm": 1.4684017896652222,
"learning_rate": 1.9741076628294387e-05,
"loss": 0.6592,
"step": 620
},
{
"epoch": 2.52,
"grad_norm": 0.9914065599441528,
"learning_rate": 1.9719610005785466e-05,
"loss": 0.6262,
"step": 630
},
{
"epoch": 2.56,
"grad_norm": 1.7366482019424438,
"learning_rate": 1.969730121690698e-05,
"loss": 0.672,
"step": 640
},
{
"epoch": 2.6,
"grad_norm": 3.544377326965332,
"learning_rate": 1.967415219462864e-05,
"loss": 0.6057,
"step": 650
},
{
"epoch": 2.64,
"grad_norm": 1.9553754329681396,
"learning_rate": 1.9650164944723116e-05,
"loss": 0.6142,
"step": 660
},
{
"epoch": 2.68,
"grad_norm": 2.1661672592163086,
"learning_rate": 1.9625341545592226e-05,
"loss": 0.6238,
"step": 670
},
{
"epoch": 2.7199999999999998,
"grad_norm": 3.7167468070983887,
"learning_rate": 1.9599684148086876e-05,
"loss": 0.7166,
"step": 680
},
{
"epoch": 2.76,
"grad_norm": 2.688824415206909,
"learning_rate": 1.9573194975320672e-05,
"loss": 0.6769,
"step": 690
},
{
"epoch": 2.8,
"grad_norm": 4.10930061340332,
"learning_rate": 1.954587632247732e-05,
"loss": 0.6199,
"step": 700
},
{
"epoch": 2.84,
"grad_norm": 1.5201390981674194,
"learning_rate": 1.951773055661174e-05,
"loss": 0.6242,
"step": 710
},
{
"epoch": 2.88,
"grad_norm": 3.6892731189727783,
"learning_rate": 1.9488760116444966e-05,
"loss": 0.6245,
"step": 720
},
{
"epoch": 2.92,
"grad_norm": 0.8859150409698486,
"learning_rate": 1.9458967512152872e-05,
"loss": 0.628,
"step": 730
},
{
"epoch": 2.96,
"grad_norm": 1.4320142269134521,
"learning_rate": 1.9428355325148632e-05,
"loss": 0.5806,
"step": 740
},
{
"epoch": 3.0,
"grad_norm": 1.4816261529922485,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.5818,
"step": 750
},
{
"epoch": 3.04,
"grad_norm": 2.1367580890655518,
"learning_rate": 1.9364682883494892e-05,
"loss": 0.6387,
"step": 760
},
{
"epoch": 3.08,
"grad_norm": 2.2321407794952393,
"learning_rate": 1.9331628145814587e-05,
"loss": 0.6207,
"step": 770
},
{
"epoch": 3.12,
"grad_norm": 3.410268783569336,
"learning_rate": 1.9297764858882516e-05,
"loss": 0.5868,
"step": 780
},
{
"epoch": 3.16,
"grad_norm": 3.323219060897827,
"learning_rate": 1.926309595682066e-05,
"loss": 0.5444,
"step": 790
},
{
"epoch": 3.2,
"grad_norm": 2.397799015045166,
"learning_rate": 1.9227624443554425e-05,
"loss": 0.5891,
"step": 800
},
{
"epoch": 3.24,
"grad_norm": 9.090506553649902,
"learning_rate": 1.9191353392552346e-05,
"loss": 0.5453,
"step": 810
},
{
"epoch": 3.2800000000000002,
"grad_norm": 1.3556101322174072,
"learning_rate": 1.9154285946559792e-05,
"loss": 0.6406,
"step": 820
},
{
"epoch": 3.32,
"grad_norm": 1.056227684020996,
"learning_rate": 1.911642531732666e-05,
"loss": 0.5613,
"step": 830
},
{
"epoch": 3.36,
"grad_norm": 1.2134612798690796,
"learning_rate": 1.907777478532909e-05,
"loss": 0.6439,
"step": 840
},
{
"epoch": 3.4,
"grad_norm": 2.153582811355591,
"learning_rate": 1.9038337699485207e-05,
"loss": 0.6268,
"step": 850
},
{
"epoch": 3.44,
"grad_norm": 1.4763509035110474,
"learning_rate": 1.8998117476864984e-05,
"loss": 0.6358,
"step": 860
},
{
"epoch": 3.48,
"grad_norm": 2.120673656463623,
"learning_rate": 1.895711760239413e-05,
"loss": 0.5479,
"step": 870
},
{
"epoch": 3.52,
"grad_norm": 3.2643983364105225,
"learning_rate": 1.8915341628552166e-05,
"loss": 0.5908,
"step": 880
},
{
"epoch": 3.56,
"grad_norm": 1.7468228340148926,
"learning_rate": 1.8872793175064594e-05,
"loss": 0.6167,
"step": 890
},
{
"epoch": 3.6,
"grad_norm": 1.6314669847488403,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.605,
"step": 900
},
{
"epoch": 3.64,
"grad_norm": 1.2006853818893433,
"learning_rate": 1.8785393642396976e-05,
"loss": 0.6374,
"step": 910
},
{
"epoch": 3.68,
"grad_norm": 5.40316915512085,
"learning_rate": 1.8740550136046195e-05,
"loss": 0.5919,
"step": 920
},
{
"epoch": 3.7199999999999998,
"grad_norm": 1.0310533046722412,
"learning_rate": 1.869494929505219e-05,
"loss": 0.596,
"step": 930
},
{
"epoch": 3.76,
"grad_norm": 3.058582067489624,
"learning_rate": 1.8653264281300622e-05,
"loss": 0.5709,
"step": 940
},
{
"epoch": 3.8,
"grad_norm": 0.8612210750579834,
"learning_rate": 1.8606235443821602e-05,
"loss": 0.6734,
"step": 950
},
{
"epoch": 3.84,
"grad_norm": 1.0079221725463867,
"learning_rate": 1.8558460909544564e-05,
"loss": 0.5874,
"step": 960
},
{
"epoch": 3.88,
"grad_norm": 1.8223471641540527,
"learning_rate": 1.850994481794692e-05,
"loss": 0.6199,
"step": 970
},
{
"epoch": 3.92,
"grad_norm": 2.1497292518615723,
"learning_rate": 1.846069137275914e-05,
"loss": 0.5527,
"step": 980
},
{
"epoch": 3.96,
"grad_norm": 1.5918537378311157,
"learning_rate": 1.8410704841600506e-05,
"loss": 0.5998,
"step": 990
},
{
"epoch": 4.0,
"grad_norm": 4.21558952331543,
"learning_rate": 1.8359989555609355e-05,
"loss": 0.6357,
"step": 1000
},
{
"epoch": 4.04,
"grad_norm": 2.512099504470825,
"learning_rate": 1.830854990906779e-05,
"loss": 0.6116,
"step": 1010
},
{
"epoch": 4.08,
"grad_norm": 2.5760135650634766,
"learning_rate": 1.825639035902093e-05,
"loss": 0.54,
"step": 1020
},
{
"epoch": 4.12,
"grad_norm": 1.5788276195526123,
"learning_rate": 1.8203515424890738e-05,
"loss": 0.6258,
"step": 1030
},
{
"epoch": 4.16,
"grad_norm": 1.5123496055603027,
"learning_rate": 1.814992968808442e-05,
"loss": 0.5147,
"step": 1040
},
{
"epoch": 4.2,
"grad_norm": 1.539919376373291,
"learning_rate": 1.809563779159746e-05,
"loss": 0.569,
"step": 1050
},
{
"epoch": 4.24,
"grad_norm": 1.6823704242706299,
"learning_rate": 1.8040644439611348e-05,
"loss": 0.5588,
"step": 1060
},
{
"epoch": 4.28,
"grad_norm": 1.1113232374191284,
"learning_rate": 1.798495439708594e-05,
"loss": 0.5692,
"step": 1070
},
{
"epoch": 4.32,
"grad_norm": 1.6908786296844482,
"learning_rate": 1.792857248934663e-05,
"loss": 0.6102,
"step": 1080
},
{
"epoch": 4.36,
"grad_norm": 1.7746518850326538,
"learning_rate": 1.7871503601666233e-05,
"loss": 0.5706,
"step": 1090
},
{
"epoch": 4.4,
"grad_norm": 1.2888718843460083,
"learning_rate": 1.7813752678841702e-05,
"loss": 0.5964,
"step": 1100
},
{
"epoch": 4.44,
"grad_norm": 2.7955427169799805,
"learning_rate": 1.7755324724765688e-05,
"loss": 0.6055,
"step": 1110
},
{
"epoch": 4.48,
"grad_norm": 1.4672576189041138,
"learning_rate": 1.7696224801992947e-05,
"loss": 0.5548,
"step": 1120
},
{
"epoch": 4.52,
"grad_norm": 2.2973852157592773,
"learning_rate": 1.7636458031301725e-05,
"loss": 0.5967,
"step": 1130
},
{
"epoch": 4.5600000000000005,
"grad_norm": 2.6734001636505127,
"learning_rate": 1.7576029591250036e-05,
"loss": 0.5567,
"step": 1140
},
{
"epoch": 4.6,
"grad_norm": 2.127830743789673,
"learning_rate": 1.7514944717726962e-05,
"loss": 0.6065,
"step": 1150
},
{
"epoch": 4.64,
"grad_norm": 2.201108455657959,
"learning_rate": 1.7453208703499006e-05,
"loss": 0.566,
"step": 1160
},
{
"epoch": 4.68,
"grad_norm": 3.8374786376953125,
"learning_rate": 1.739082689775146e-05,
"loss": 0.55,
"step": 1170
},
{
"epoch": 4.72,
"grad_norm": 2.7282190322875977,
"learning_rate": 1.732780470562496e-05,
"loss": 0.5748,
"step": 1180
},
{
"epoch": 4.76,
"grad_norm": 1.8128880262374878,
"learning_rate": 1.7264147587747097e-05,
"loss": 0.6309,
"step": 1190
},
{
"epoch": 4.8,
"grad_norm": 2.522096633911133,
"learning_rate": 1.7199861059759338e-05,
"loss": 0.5504,
"step": 1200
},
{
"epoch": 4.84,
"grad_norm": 1.186700463294983,
"learning_rate": 1.7134950691839063e-05,
"loss": 0.5741,
"step": 1210
},
{
"epoch": 4.88,
"grad_norm": 4.312258720397949,
"learning_rate": 1.7069422108216973e-05,
"loss": 0.575,
"step": 1220
},
{
"epoch": 4.92,
"grad_norm": 3.402963399887085,
"learning_rate": 1.7003280986689733e-05,
"loss": 0.5842,
"step": 1230
},
{
"epoch": 4.96,
"grad_norm": 2.346266269683838,
"learning_rate": 1.693653305812805e-05,
"loss": 0.5877,
"step": 1240
},
{
"epoch": 5.0,
"grad_norm": 1.304891586303711,
"learning_rate": 1.686918410598009e-05,
"loss": 0.6153,
"step": 1250
},
{
"epoch": 5.04,
"grad_norm": 2.874284505844116,
"learning_rate": 1.6801239965770366e-05,
"loss": 0.5776,
"step": 1260
},
{
"epoch": 5.08,
"grad_norm": 3.4333293437957764,
"learning_rate": 1.6732706524594138e-05,
"loss": 0.5099,
"step": 1270
},
{
"epoch": 5.12,
"grad_norm": 1.4809489250183105,
"learning_rate": 1.6663589720607287e-05,
"loss": 0.5535,
"step": 1280
},
{
"epoch": 5.16,
"grad_norm": 3.005042314529419,
"learning_rate": 1.659389554251181e-05,
"loss": 0.561,
"step": 1290
},
{
"epoch": 5.2,
"grad_norm": 1.8546274900436401,
"learning_rate": 1.652363002903693e-05,
"loss": 0.555,
"step": 1300
},
{
"epoch": 5.24,
"grad_norm": 10.558731079101562,
"learning_rate": 1.6452799268415857e-05,
"loss": 0.5333,
"step": 1310
},
{
"epoch": 5.28,
"grad_norm": 2.2776925563812256,
"learning_rate": 1.6381409397858257e-05,
"loss": 0.5941,
"step": 1320
},
{
"epoch": 5.32,
"grad_norm": 2.6703994274139404,
"learning_rate": 1.6309466603018497e-05,
"loss": 0.5676,
"step": 1330
},
{
"epoch": 5.36,
"grad_norm": 3.8042726516723633,
"learning_rate": 1.6236977117459693e-05,
"loss": 0.5609,
"step": 1340
},
{
"epoch": 5.4,
"grad_norm": 1.2701845169067383,
"learning_rate": 1.616394722211357e-05,
"loss": 0.5702,
"step": 1350
},
{
"epoch": 5.44,
"grad_norm": 1.392269253730774,
"learning_rate": 1.6090383244736256e-05,
"loss": 0.5388,
"step": 1360
},
{
"epoch": 5.48,
"grad_norm": 2.634445905685425,
"learning_rate": 1.6016291559360023e-05,
"loss": 0.573,
"step": 1370
},
{
"epoch": 5.52,
"grad_norm": 3.3800487518310547,
"learning_rate": 1.5941678585740976e-05,
"loss": 0.5522,
"step": 1380
},
{
"epoch": 5.5600000000000005,
"grad_norm": 2.935079336166382,
"learning_rate": 1.5866550788802815e-05,
"loss": 0.5615,
"step": 1390
},
{
"epoch": 5.6,
"grad_norm": 1.9093431234359741,
"learning_rate": 1.579091467807668e-05,
"loss": 0.5537,
"step": 1400
},
{
"epoch": 5.64,
"grad_norm": 2.825533628463745,
"learning_rate": 1.5714776807137128e-05,
"loss": 0.6197,
"step": 1410
},
{
"epoch": 5.68,
"grad_norm": 3.172933578491211,
"learning_rate": 1.5638143773034268e-05,
"loss": 0.5415,
"step": 1420
},
{
"epoch": 5.72,
"grad_norm": 1.2932432889938354,
"learning_rate": 1.556102221572219e-05,
"loss": 0.5792,
"step": 1430
},
{
"epoch": 5.76,
"grad_norm": 2.1744470596313477,
"learning_rate": 1.5483418817483607e-05,
"loss": 0.52,
"step": 1440
},
{
"epoch": 5.8,
"grad_norm": 2.4148924350738525,
"learning_rate": 1.540534030235087e-05,
"loss": 0.602,
"step": 1450
},
{
"epoch": 5.84,
"grad_norm": 2.427771806716919,
"learning_rate": 1.5326793435523374e-05,
"loss": 0.515,
"step": 1460
},
{
"epoch": 5.88,
"grad_norm": 1.566942811012268,
"learning_rate": 1.5247785022781343e-05,
"loss": 0.5795,
"step": 1470
},
{
"epoch": 5.92,
"grad_norm": 1.7555649280548096,
"learning_rate": 1.5168321909896171e-05,
"loss": 0.5819,
"step": 1480
},
{
"epoch": 5.96,
"grad_norm": 1.6367748975753784,
"learning_rate": 1.5088410982037251e-05,
"loss": 0.5244,
"step": 1490
},
{
"epoch": 6.0,
"grad_norm": 1.9994490146636963,
"learning_rate": 1.50080591631754e-05,
"loss": 0.5823,
"step": 1500
},
{
"epoch": 6.04,
"grad_norm": 6.0024261474609375,
"learning_rate": 1.4927273415482916e-05,
"loss": 0.5641,
"step": 1510
},
{
"epoch": 6.08,
"grad_norm": 1.50034499168396,
"learning_rate": 1.484606073873035e-05,
"loss": 0.5325,
"step": 1520
},
{
"epoch": 6.12,
"grad_norm": 12.008216857910156,
"learning_rate": 1.4764428169679987e-05,
"loss": 0.5384,
"step": 1530
},
{
"epoch": 6.16,
"grad_norm": 1.6358847618103027,
"learning_rate": 1.4682382781476146e-05,
"loss": 0.5822,
"step": 1540
},
{
"epoch": 6.2,
"grad_norm": 2.520883321762085,
"learning_rate": 1.4599931683032327e-05,
"loss": 0.5256,
"step": 1550
},
{
"epoch": 6.24,
"grad_norm": 1.136460304260254,
"learning_rate": 1.4517082018415231e-05,
"loss": 0.5589,
"step": 1560
},
{
"epoch": 6.28,
"grad_norm": 2.730435848236084,
"learning_rate": 1.4433840966225772e-05,
"loss": 0.4939,
"step": 1570
},
{
"epoch": 6.32,
"grad_norm": 0.9575507044792175,
"learning_rate": 1.4350215738977077e-05,
"loss": 0.5277,
"step": 1580
},
{
"epoch": 6.36,
"grad_norm": 1.9613964557647705,
"learning_rate": 1.4266213582469543e-05,
"loss": 0.5457,
"step": 1590
},
{
"epoch": 6.4,
"grad_norm": 2.436429500579834,
"learning_rate": 1.4181841775163014e-05,
"loss": 0.547,
"step": 1600
},
{
"epoch": 6.44,
"grad_norm": 2.1270251274108887,
"learning_rate": 1.409710762754615e-05,
"loss": 0.5422,
"step": 1610
},
{
"epoch": 6.48,
"grad_norm": 2.4047000408172607,
"learning_rate": 1.4012018481502975e-05,
"loss": 0.5219,
"step": 1620
},
{
"epoch": 6.52,
"grad_norm": 1.1183472871780396,
"learning_rate": 1.3926581709676752e-05,
"loss": 0.5088,
"step": 1630
},
{
"epoch": 6.5600000000000005,
"grad_norm": 1.89410400390625,
"learning_rate": 1.3840804714831164e-05,
"loss": 0.5707,
"step": 1640
},
{
"epoch": 6.6,
"grad_norm": 1.2478140592575073,
"learning_rate": 1.3754694929208891e-05,
"loss": 0.5893,
"step": 1650
},
{
"epoch": 6.64,
"grad_norm": 4.104971885681152,
"learning_rate": 1.3668259813887644e-05,
"loss": 0.5768,
"step": 1660
},
{
"epoch": 6.68,
"grad_norm": 1.4798212051391602,
"learning_rate": 1.3581506858133677e-05,
"loss": 0.5223,
"step": 1670
},
{
"epoch": 6.72,
"grad_norm": 3.716698408126831,
"learning_rate": 1.3494443578752893e-05,
"loss": 0.5208,
"step": 1680
},
{
"epoch": 6.76,
"grad_norm": 2.887411117553711,
"learning_rate": 1.340707751943952e-05,
"loss": 0.5415,
"step": 1690
},
{
"epoch": 6.8,
"grad_norm": 3.6842939853668213,
"learning_rate": 1.3319416250122484e-05,
"loss": 0.5272,
"step": 1700
},
{
"epoch": 6.84,
"grad_norm": 4.174267292022705,
"learning_rate": 1.3231467366309523e-05,
"loss": 0.5341,
"step": 1710
},
{
"epoch": 6.88,
"grad_norm": 4.511580467224121,
"learning_rate": 1.3143238488429042e-05,
"loss": 0.573,
"step": 1720
},
{
"epoch": 6.92,
"grad_norm": 1.930474042892456,
"learning_rate": 1.3054737261169838e-05,
"loss": 0.5134,
"step": 1730
},
{
"epoch": 6.96,
"grad_norm": 1.2770944833755493,
"learning_rate": 1.2965971352818736e-05,
"loss": 0.4917,
"step": 1740
},
{
"epoch": 7.0,
"grad_norm": 2.182473659515381,
"learning_rate": 1.287694845459613e-05,
"loss": 0.5725,
"step": 1750
},
{
"epoch": 7.04,
"grad_norm": 4.948633670806885,
"learning_rate": 1.2787676279989594e-05,
"loss": 0.5122,
"step": 1760
},
{
"epoch": 7.08,
"grad_norm": 2.5241198539733887,
"learning_rate": 1.2698162564085536e-05,
"loss": 0.4839,
"step": 1770
},
{
"epoch": 7.12,
"grad_norm": 1.8158693313598633,
"learning_rate": 1.2608415062898971e-05,
"loss": 0.4502,
"step": 1780
},
{
"epoch": 7.16,
"grad_norm": 3.540010929107666,
"learning_rate": 1.2518441552701493e-05,
"loss": 0.5585,
"step": 1790
},
{
"epoch": 7.2,
"grad_norm": 1.9738972187042236,
"learning_rate": 1.2428249829347509e-05,
"loss": 0.4918,
"step": 1800
},
{
"epoch": 7.24,
"grad_norm": 1.0376015901565552,
"learning_rate": 1.2337847707598738e-05,
"loss": 0.4989,
"step": 1810
},
{
"epoch": 7.28,
"grad_norm": 2.5017688274383545,
"learning_rate": 1.2247243020447104e-05,
"loss": 0.4962,
"step": 1820
},
{
"epoch": 7.32,
"grad_norm": 1.2260102033615112,
"learning_rate": 1.2156443618436033e-05,
"loss": 0.5316,
"step": 1830
},
{
"epoch": 7.36,
"grad_norm": 2.4500176906585693,
"learning_rate": 1.2065457368980236e-05,
"loss": 0.4841,
"step": 1840
},
{
"epoch": 7.4,
"grad_norm": 3.0221171379089355,
"learning_rate": 1.197429215568403e-05,
"loss": 0.5507,
"step": 1850
},
{
"epoch": 7.44,
"grad_norm": 5.26624059677124,
"learning_rate": 1.1882955877658252e-05,
"loss": 0.532,
"step": 1860
},
{
"epoch": 7.48,
"grad_norm": 2.411428213119507,
"learning_rate": 1.1791456448835825e-05,
"loss": 0.4802,
"step": 1870
},
{
"epoch": 7.52,
"grad_norm": 2.334620952606201,
"learning_rate": 1.169980179728606e-05,
"loss": 0.5331,
"step": 1880
},
{
"epoch": 7.5600000000000005,
"grad_norm": 2.4401047229766846,
"learning_rate": 1.1607999864527718e-05,
"loss": 0.4994,
"step": 1890
},
{
"epoch": 7.6,
"grad_norm": 2.3867135047912598,
"learning_rate": 1.1516058604840891e-05,
"loss": 0.5124,
"step": 1900
},
{
"epoch": 7.64,
"grad_norm": 2.3309555053710938,
"learning_rate": 1.1423985984577813e-05,
"loss": 0.574,
"step": 1910
},
{
"epoch": 7.68,
"grad_norm": 1.1885383129119873,
"learning_rate": 1.1331789981472603e-05,
"loss": 0.5361,
"step": 1920
},
{
"epoch": 7.72,
"grad_norm": 1.6586416959762573,
"learning_rate": 1.1239478583950019e-05,
"loss": 0.5388,
"step": 1930
},
{
"epoch": 7.76,
"grad_norm": 1.3869335651397705,
"learning_rate": 1.1147059790433296e-05,
"loss": 0.536,
"step": 1940
},
{
"epoch": 7.8,
"grad_norm": 1.5383076667785645,
"learning_rate": 1.1054541608651121e-05,
"loss": 0.5165,
"step": 1950
},
{
"epoch": 7.84,
"grad_norm": 1.1627497673034668,
"learning_rate": 1.0961932054943778e-05,
"loss": 0.5369,
"step": 1960
},
{
"epoch": 7.88,
"grad_norm": 1.4803476333618164,
"learning_rate": 1.0869239153568575e-05,
"loss": 0.548,
"step": 1970
},
{
"epoch": 7.92,
"grad_norm": 1.503915786743164,
"learning_rate": 1.0776470936004572e-05,
"loss": 0.5377,
"step": 1980
},
{
"epoch": 7.96,
"grad_norm": 1.9053574800491333,
"learning_rate": 1.0683635440256689e-05,
"loss": 0.5249,
"step": 1990
},
{
"epoch": 8.0,
"grad_norm": 2.171719551086426,
"learning_rate": 1.059074071015923e-05,
"loss": 0.5162,
"step": 2000
},
{
"epoch": 8.04,
"grad_norm": 3.7397103309631348,
"learning_rate": 1.0497794794678923e-05,
"loss": 0.5067,
"step": 2010
},
{
"epoch": 8.08,
"grad_norm": 3.237569570541382,
"learning_rate": 1.0404805747217525e-05,
"loss": 0.4901,
"step": 2020
},
{
"epoch": 8.12,
"grad_norm": 2.6131529808044434,
"learning_rate": 1.0311781624914e-05,
"loss": 0.4834,
"step": 2030
},
{
"epoch": 8.16,
"grad_norm": 2.543020009994507,
"learning_rate": 1.0228036587536431e-05,
"loss": 0.4991,
"step": 2040
},
{
"epoch": 8.2,
"grad_norm": 2.420510768890381,
"learning_rate": 1.013496803077246e-05,
"loss": 0.5326,
"step": 2050
},
{
"epoch": 8.24,
"grad_norm": 1.7979626655578613,
"learning_rate": 1.0041887779554041e-05,
"loss": 0.501,
"step": 2060
},
{
"epoch": 8.28,
"grad_norm": 3.0351650714874268,
"learning_rate": 9.948803898922586e-06,
"loss": 0.5263,
"step": 2070
},
{
"epoch": 8.32,
"grad_norm": 2.1602799892425537,
"learning_rate": 9.85572445423399e-06,
"loss": 0.505,
"step": 2080
},
{
"epoch": 8.36,
"grad_norm": 2.298388957977295,
"learning_rate": 9.762657510459784e-06,
"loss": 0.4962,
"step": 2090
},
{
"epoch": 8.4,
"grad_norm": 1.9878581762313843,
"learning_rate": 9.669611131488346e-06,
"loss": 0.5086,
"step": 2100
},
{
"epoch": 8.44,
"grad_norm": 3.1122074127197266,
"learning_rate": 9.576593379426196e-06,
"loss": 0.5105,
"step": 2110
},
{
"epoch": 8.48,
"grad_norm": 1.8491990566253662,
"learning_rate": 9.483612313899436e-06,
"loss": 0.5028,
"step": 2120
},
{
"epoch": 8.52,
"grad_norm": 2.2476413249969482,
"learning_rate": 9.390675991355435e-06,
"loss": 0.5273,
"step": 2130
},
{
"epoch": 8.56,
"grad_norm": 3.6653342247009277,
"learning_rate": 9.297792464364748e-06,
"loss": 0.4313,
"step": 2140
},
{
"epoch": 8.6,
"grad_norm": 0.8962536454200745,
"learning_rate": 9.204969780923404e-06,
"loss": 0.5045,
"step": 2150
},
{
"epoch": 8.64,
"grad_norm": 6.7541823387146,
"learning_rate": 9.112215983755573e-06,
"loss": 0.4818,
"step": 2160
},
{
"epoch": 8.68,
"grad_norm": 3.15523362159729,
"learning_rate": 9.019539109616694e-06,
"loss": 0.4779,
"step": 2170
},
{
"epoch": 8.72,
"grad_norm": 1.048300862312317,
"learning_rate": 8.926947188597133e-06,
"loss": 0.4815,
"step": 2180
},
{
"epoch": 8.76,
"grad_norm": 4.415710926055908,
"learning_rate": 8.8344482434264e-06,
"loss": 0.5259,
"step": 2190
},
{
"epoch": 8.8,
"grad_norm": 4.474966049194336,
"learning_rate": 8.742050288778e-06,
"loss": 0.5378,
"step": 2200
},
{
"epoch": 8.84,
"grad_norm": 3.487746477127075,
"learning_rate": 8.649761330575009e-06,
"loss": 0.5144,
"step": 2210
},
{
"epoch": 8.88,
"grad_norm": 1.44117271900177,
"learning_rate": 8.557589365296385e-06,
"loss": 0.5383,
"step": 2220
},
{
"epoch": 8.92,
"grad_norm": 2.9286913871765137,
"learning_rate": 8.4655423792841e-06,
"loss": 0.4653,
"step": 2230
},
{
"epoch": 8.96,
"grad_norm": 1.4172818660736084,
"learning_rate": 8.373628348051165e-06,
"loss": 0.4868,
"step": 2240
},
{
"epoch": 9.0,
"grad_norm": 1.9049030542373657,
"learning_rate": 8.281855235590574e-06,
"loss": 0.5606,
"step": 2250
},
{
"epoch": 9.04,
"grad_norm": 3.6202874183654785,
"learning_rate": 8.19023099368526e-06,
"loss": 0.4717,
"step": 2260
},
{
"epoch": 9.08,
"grad_norm": 1.4381736516952515,
"learning_rate": 8.098763561219101e-06,
"loss": 0.4578,
"step": 2270
},
{
"epoch": 9.12,
"grad_norm": 3.8551642894744873,
"learning_rate": 8.007460863489042e-06,
"loss": 0.4553,
"step": 2280
},
{
"epoch": 9.16,
"grad_norm": 2.2333943843841553,
"learning_rate": 7.91633081151841e-06,
"loss": 0.4861,
"step": 2290
},
{
"epoch": 9.2,
"grad_norm": 3.517455816268921,
"learning_rate": 7.825381301371452e-06,
"loss": 0.4518,
"step": 2300
},
{
"epoch": 9.24,
"grad_norm": 1.2912664413452148,
"learning_rate": 7.734620213469166e-06,
"loss": 0.4832,
"step": 2310
},
{
"epoch": 9.28,
"grad_norm": 3.6948964595794678,
"learning_rate": 7.644055411906493e-06,
"loss": 0.4969,
"step": 2320
},
{
"epoch": 9.32,
"grad_norm": 1.5953376293182373,
"learning_rate": 7.553694743770928e-06,
"loss": 0.4606,
"step": 2330
},
{
"epoch": 9.36,
"grad_norm": 2.7939870357513428,
"learning_rate": 7.463546038462602e-06,
"loss": 0.5225,
"step": 2340
},
{
"epoch": 9.4,
"grad_norm": 1.0297088623046875,
"learning_rate": 7.373617107015889e-06,
"loss": 0.529,
"step": 2350
},
{
"epoch": 9.44,
"grad_norm": 2.87479305267334,
"learning_rate": 7.283915741422611e-06,
"loss": 0.5134,
"step": 2360
},
{
"epoch": 9.48,
"grad_norm": 3.1623082160949707,
"learning_rate": 7.194449713956908e-06,
"loss": 0.4509,
"step": 2370
},
{
"epoch": 9.52,
"grad_norm": 1.8917375802993774,
"learning_rate": 7.105226776501772e-06,
"loss": 0.5175,
"step": 2380
},
{
"epoch": 9.56,
"grad_norm": 1.6095237731933594,
"learning_rate": 7.016254659877398e-06,
"loss": 0.4742,
"step": 2390
},
{
"epoch": 9.6,
"grad_norm": 3.2498207092285156,
"learning_rate": 6.927541073171333e-06,
"loss": 0.4605,
"step": 2400
},
{
"epoch": 9.64,
"grad_norm": 1.7395751476287842,
"learning_rate": 6.839093703070512e-06,
"loss": 0.4987,
"step": 2410
},
{
"epoch": 9.68,
"grad_norm": 2.4571480751037598,
"learning_rate": 6.750920213195238e-06,
"loss": 0.4829,
"step": 2420
},
{
"epoch": 9.72,
"grad_norm": 4.019631385803223,
"learning_rate": 6.6630282434351535e-06,
"loss": 0.4842,
"step": 2430
},
{
"epoch": 9.76,
"grad_norm": 2.756540298461914,
"learning_rate": 6.575425409287292e-06,
"loss": 0.5198,
"step": 2440
},
{
"epoch": 9.8,
"grad_norm": 2.0040042400360107,
"learning_rate": 6.488119301196201e-06,
"loss": 0.5239,
"step": 2450
},
{
"epoch": 9.84,
"grad_norm": 7.419244766235352,
"learning_rate": 6.4011174838962706e-06,
"loss": 0.4636,
"step": 2460
},
{
"epoch": 9.88,
"grad_norm": 2.879230260848999,
"learning_rate": 6.314427495756283e-06,
"loss": 0.4693,
"step": 2470
},
{
"epoch": 9.92,
"grad_norm": 2.1217892169952393,
"learning_rate": 6.228056848126236e-06,
"loss": 0.475,
"step": 2480
},
{
"epoch": 9.96,
"grad_norm": 2.1474809646606445,
"learning_rate": 6.142013024686509e-06,
"loss": 0.4995,
"step": 2490
},
{
"epoch": 10.0,
"grad_norm": 4.3783721923828125,
"learning_rate": 6.056303480799449e-06,
"loss": 0.486,
"step": 2500
},
{
"epoch": 10.04,
"grad_norm": 1.397594928741455,
"learning_rate": 5.970935642863375e-06,
"loss": 0.4537,
"step": 2510
},
{
"epoch": 10.08,
"grad_norm": 3.2472903728485107,
"learning_rate": 5.885916907669114e-06,
"loss": 0.3856,
"step": 2520
},
{
"epoch": 10.12,
"grad_norm": 3.1067910194396973,
"learning_rate": 5.801254641759103e-06,
"loss": 0.4705,
"step": 2530
},
{
"epoch": 10.16,
"grad_norm": 2.5792055130004883,
"learning_rate": 5.716956180789098e-06,
"loss": 0.5011,
"step": 2540
},
{
"epoch": 10.2,
"grad_norm": 3.1249446868896484,
"learning_rate": 5.6330288288925805e-06,
"loss": 0.462,
"step": 2550
},
{
"epoch": 10.24,
"grad_norm": 4.708195209503174,
"learning_rate": 5.549479858047875e-06,
"loss": 0.5043,
"step": 2560
},
{
"epoch": 10.28,
"grad_norm": 1.4850634336471558,
"learning_rate": 5.466316507448049e-06,
"loss": 0.5244,
"step": 2570
},
{
"epoch": 10.32,
"grad_norm": 1.0298686027526855,
"learning_rate": 5.3835459828736945e-06,
"loss": 0.4362,
"step": 2580
},
{
"epoch": 10.36,
"grad_norm": 2.251105546951294,
"learning_rate": 5.30117545606854e-06,
"loss": 0.4788,
"step": 2590
},
{
"epoch": 10.4,
"grad_norm": 4.622702121734619,
"learning_rate": 5.219212064118079e-06,
"loss": 0.4265,
"step": 2600
},
{
"epoch": 10.44,
"grad_norm": 3.4977996349334717,
"learning_rate": 5.137662908831147e-06,
"loss": 0.5,
"step": 2610
},
{
"epoch": 10.48,
"grad_norm": 5.369349002838135,
"learning_rate": 5.056535056124592e-06,
"loss": 0.4409,
"step": 2620
},
{
"epoch": 10.52,
"grad_norm": 2.318140983581543,
"learning_rate": 4.97583553541102e-06,
"loss": 0.4594,
"step": 2630
},
{
"epoch": 10.56,
"grad_norm": 2.4116406440734863,
"learning_rate": 4.895571338989754e-06,
"loss": 0.4953,
"step": 2640
},
{
"epoch": 10.6,
"grad_norm": 3.0506629943847656,
"learning_rate": 4.8157494214409475e-06,
"loss": 0.4795,
"step": 2650
},
{
"epoch": 10.64,
"grad_norm": 1.8125630617141724,
"learning_rate": 4.736376699023023e-06,
"loss": 0.481,
"step": 2660
},
{
"epoch": 10.68,
"grad_norm": 1.9686360359191895,
"learning_rate": 4.6574600490733794e-06,
"loss": 0.4713,
"step": 2670
},
{
"epoch": 10.72,
"grad_norm": 2.8264060020446777,
"learning_rate": 4.579006309412533e-06,
"loss": 0.4501,
"step": 2680
},
{
"epoch": 10.76,
"grad_norm": 1.9835346937179565,
"learning_rate": 4.501022277751602e-06,
"loss": 0.4754,
"step": 2690
},
{
"epoch": 10.8,
"grad_norm": 4.487490653991699,
"learning_rate": 4.423514711103355e-06,
"loss": 0.5056,
"step": 2700
},
{
"epoch": 10.84,
"grad_norm": 3.1984522342681885,
"learning_rate": 4.346490325196704e-06,
"loss": 0.4415,
"step": 2710
},
{
"epoch": 10.88,
"grad_norm": 2.0367348194122314,
"learning_rate": 4.26995579389485e-06,
"loss": 0.5117,
"step": 2720
},
{
"epoch": 10.92,
"grad_norm": 1.78911292552948,
"learning_rate": 4.193917748616979e-06,
"loss": 0.475,
"step": 2730
},
{
"epoch": 10.96,
"grad_norm": 2.0589475631713867,
"learning_rate": 4.118382777763711e-06,
"loss": 0.4363,
"step": 2740
},
{
"epoch": 11.0,
"grad_norm": 3.54664945602417,
"learning_rate": 4.04335742614622e-06,
"loss": 0.4665,
"step": 2750
},
{
"epoch": 11.04,
"grad_norm": 1.714920997619629,
"learning_rate": 3.968848194419163e-06,
"loss": 0.4515,
"step": 2760
},
{
"epoch": 11.08,
"grad_norm": 11.161652565002441,
"learning_rate": 3.894861538517401e-06,
"loss": 0.4285,
"step": 2770
},
{
"epoch": 11.12,
"grad_norm": 2.627831220626831,
"learning_rate": 3.821403869096658e-06,
"loss": 0.4343,
"step": 2780
},
{
"epoch": 11.16,
"grad_norm": 2.6865172386169434,
"learning_rate": 3.748481550978017e-06,
"loss": 0.4766,
"step": 2790
},
{
"epoch": 11.2,
"grad_norm": 4.996657848358154,
"learning_rate": 3.6761009025964657e-06,
"loss": 0.4096,
"step": 2800
},
{
"epoch": 11.24,
"grad_norm": 1.6282066106796265,
"learning_rate": 3.604268195453421e-06,
"loss": 0.4622,
"step": 2810
},
{
"epoch": 11.28,
"grad_norm": 1.6030402183532715,
"learning_rate": 3.5329896535733133e-06,
"loss": 0.4437,
"step": 2820
},
{
"epoch": 11.32,
"grad_norm": 2.8916399478912354,
"learning_rate": 3.462271452964321e-06,
"loss": 0.4871,
"step": 2830
},
{
"epoch": 11.36,
"grad_norm": 2.375190019607544,
"learning_rate": 3.3921197210832235e-06,
"loss": 0.4575,
"step": 2840
},
{
"epoch": 11.4,
"grad_norm": 4.021700382232666,
"learning_rate": 3.3225405363045016e-06,
"loss": 0.4699,
"step": 2850
},
{
"epoch": 11.44,
"grad_norm": 1.7844669818878174,
"learning_rate": 3.2535399273936407e-06,
"loss": 0.4648,
"step": 2860
},
{
"epoch": 11.48,
"grad_norm": 2.744528293609619,
"learning_rate": 3.1851238729848033e-06,
"loss": 0.3923,
"step": 2870
},
{
"epoch": 11.52,
"grad_norm": 1.6203703880310059,
"learning_rate": 3.11729830106276e-06,
"loss": 0.4717,
"step": 2880
},
{
"epoch": 11.56,
"grad_norm": 1.696370244026184,
"learning_rate": 3.0500690884492836e-06,
"loss": 0.4556,
"step": 2890
},
{
"epoch": 11.6,
"grad_norm": 4.04744291305542,
"learning_rate": 2.983442060293926e-06,
"loss": 0.4785,
"step": 2900
},
{
"epoch": 11.64,
"grad_norm": 2.629739284515381,
"learning_rate": 2.917422989569311e-06,
"loss": 0.463,
"step": 2910
},
{
"epoch": 11.68,
"grad_norm": 2.43945050239563,
"learning_rate": 2.852017596570901e-06,
"loss": 0.4551,
"step": 2920
},
{
"epoch": 11.72,
"grad_norm": 6.5788116455078125,
"learning_rate": 2.7872315484213954e-06,
"loss": 0.4501,
"step": 2930
},
{
"epoch": 11.76,
"grad_norm": 2.3283305168151855,
"learning_rate": 2.723070458579653e-06,
"loss": 0.4338,
"step": 2940
},
{
"epoch": 11.8,
"grad_norm": 4.168436527252197,
"learning_rate": 2.6595398863543407e-06,
"loss": 0.4744,
"step": 2950
},
{
"epoch": 11.84,
"grad_norm": 3.3579213619232178,
"learning_rate": 2.596645336422219e-06,
"loss": 0.4257,
"step": 2960
},
{
"epoch": 11.88,
"grad_norm": 4.208755970001221,
"learning_rate": 2.5343922583512026e-06,
"loss": 0.4676,
"step": 2970
},
{
"epoch": 11.92,
"grad_norm": 2.752279281616211,
"learning_rate": 2.472786046128156e-06,
"loss": 0.455,
"step": 2980
},
{
"epoch": 11.96,
"grad_norm": 3.5390079021453857,
"learning_rate": 2.411832037691545e-06,
"loss": 0.4646,
"step": 2990
},
{
"epoch": 12.0,
"grad_norm": 2.805065870285034,
"learning_rate": 2.3515355144689155e-06,
"loss": 0.4774,
"step": 3000
},
{
"epoch": 12.04,
"grad_norm": 2.3203487396240234,
"learning_rate": 2.2919017009192703e-06,
"loss": 0.4333,
"step": 3010
},
{
"epoch": 12.08,
"grad_norm": 2.62695050239563,
"learning_rate": 2.2329357640804118e-06,
"loss": 0.456,
"step": 3020
},
{
"epoch": 12.12,
"grad_norm": 1.8128643035888672,
"learning_rate": 2.1746428131212126e-06,
"loss": 0.4054,
"step": 3030
},
{
"epoch": 12.16,
"grad_norm": 1.5641071796417236,
"learning_rate": 2.117027898898948e-06,
"loss": 0.4875,
"step": 3040
},
{
"epoch": 12.2,
"grad_norm": 2.1929521560668945,
"learning_rate": 2.0600960135216463e-06,
"loss": 0.4041,
"step": 3050
},
{
"epoch": 12.24,
"grad_norm": 1.746474027633667,
"learning_rate": 2.003852089915548e-06,
"loss": 0.5115,
"step": 3060
},
{
"epoch": 12.28,
"grad_norm": 2.731269121170044,
"learning_rate": 1.9483010013976766e-06,
"loss": 0.4459,
"step": 3070
},
{
"epoch": 12.32,
"grad_norm": 1.9771169424057007,
"learning_rate": 1.8934475612536019e-06,
"loss": 0.3677,
"step": 3080
},
{
"epoch": 12.36,
"grad_norm": 3.043891191482544,
"learning_rate": 1.8392965223203707e-06,
"loss": 0.4353,
"step": 3090
},
{
"epoch": 12.4,
"grad_norm": 3.361074447631836,
"learning_rate": 1.7858525765747047e-06,
"loss": 0.4578,
"step": 3100
},
{
"epoch": 12.44,
"grad_norm": 3.8681182861328125,
"learning_rate": 1.7331203547264452e-06,
"loss": 0.4057,
"step": 3110
},
{
"epoch": 12.48,
"grad_norm": 2.581637382507324,
"learning_rate": 1.6811044258173425e-06,
"loss": 0.4532,
"step": 3120
},
{
"epoch": 12.52,
"grad_norm": 2.8616292476654053,
"learning_rate": 1.629809296825139e-06,
"loss": 0.4551,
"step": 3130
},
{
"epoch": 12.56,
"grad_norm": 2.4617111682891846,
"learning_rate": 1.579239412273078e-06,
"loss": 0.4388,
"step": 3140
},
{
"epoch": 12.6,
"grad_norm": 14.222563743591309,
"learning_rate": 1.5293991538447882e-06,
"loss": 0.412,
"step": 3150
},
{
"epoch": 12.64,
"grad_norm": 3.5921924114227295,
"learning_rate": 1.4802928400046457e-06,
"loss": 0.4517,
"step": 3160
},
{
"epoch": 12.68,
"grad_norm": 2.4046990871429443,
"learning_rate": 1.4319247256235713e-06,
"loss": 0.4893,
"step": 3170
},
{
"epoch": 12.72,
"grad_norm": 2.5496039390563965,
"learning_rate": 1.3842990016103886e-06,
"loss": 0.4305,
"step": 3180
},
{
"epoch": 12.76,
"grad_norm": 2.3980159759521484,
"learning_rate": 1.3374197945486833e-06,
"loss": 0.3833,
"step": 3190
},
{
"epoch": 12.8,
"grad_norm": 1.515519142150879,
"learning_rate": 1.2912911663392468e-06,
"loss": 0.4513,
"step": 3200
},
{
"epoch": 12.84,
"grad_norm": 2.939988136291504,
"learning_rate": 1.245917113848144e-06,
"loss": 0.4712,
"step": 3210
},
{
"epoch": 12.88,
"grad_norm": 1.846997857093811,
"learning_rate": 1.2013015685603813e-06,
"loss": 0.4789,
"step": 3220
},
{
"epoch": 12.92,
"grad_norm": 2.960897445678711,
"learning_rate": 1.1574483962392768e-06,
"loss": 0.4128,
"step": 3230
},
{
"epoch": 12.96,
"grad_norm": 1.8638381958007812,
"learning_rate": 1.114361396591498e-06,
"loss": 0.4949,
"step": 3240
},
{
"epoch": 13.0,
"grad_norm": 2.134097099304199,
"learning_rate": 1.0720443029378303e-06,
"loss": 0.4167,
"step": 3250
},
{
"epoch": 13.04,
"grad_norm": 2.2456820011138916,
"learning_rate": 1.0305007818897006e-06,
"loss": 0.4483,
"step": 3260
},
{
"epoch": 13.08,
"grad_norm": 9.045520782470703,
"learning_rate": 9.897344330314862e-07,
"loss": 0.454,
"step": 3270
},
{
"epoch": 13.12,
"grad_norm": 2.646930694580078,
"learning_rate": 9.497487886086132e-07,
"loss": 0.4438,
"step": 3280
},
{
"epoch": 13.16,
"grad_norm": 4.203260898590088,
"learning_rate": 9.105473132215126e-07,
"loss": 0.3904,
"step": 3290
},
{
"epoch": 13.2,
"grad_norm": 3.177109479904175,
"learning_rate": 8.721334035254203e-07,
"loss": 0.4128,
"step": 3300
},
{
"epoch": 13.24,
"grad_norm": 1.825671911239624,
"learning_rate": 8.345103879360695e-07,
"loss": 0.4479,
"step": 3310
},
{
"epoch": 13.28,
"grad_norm": 3.2267651557922363,
"learning_rate": 7.976815263412963e-07,
"loss": 0.3928,
"step": 3320
},
{
"epoch": 13.32,
"grad_norm": 3.6811180114746094,
"learning_rate": 7.616500098185908e-07,
"loss": 0.4163,
"step": 3330
},
{
"epoch": 13.36,
"grad_norm": 3.258467435836792,
"learning_rate": 7.264189603585892e-07,
"loss": 0.4186,
"step": 3340
},
{
"epoch": 13.4,
"grad_norm": 1.861440658569336,
"learning_rate": 6.919914305945774e-07,
"loss": 0.4416,
"step": 3350
},
{
"epoch": 13.44,
"grad_norm": 3.303765296936035,
"learning_rate": 6.58370403537989e-07,
"loss": 0.3958,
"step": 3360
},
{
"epoch": 13.48,
"grad_norm": 1.9865639209747314,
"learning_rate": 6.255587923199313e-07,
"loss": 0.4424,
"step": 3370
},
{
"epoch": 13.52,
"grad_norm": 2.3877573013305664,
"learning_rate": 5.935594399387856e-07,
"loss": 0.4778,
"step": 3380
},
{
"epoch": 13.56,
"grad_norm": 2.6566929817199707,
"learning_rate": 5.623751190138682e-07,
"loss": 0.4045,
"step": 3390
},
{
"epoch": 13.6,
"grad_norm": 4.0699052810668945,
"learning_rate": 5.320085315451862e-07,
"loss": 0.4275,
"step": 3400
},
{
"epoch": 13.64,
"grad_norm": 4.235281944274902,
"learning_rate": 5.024623086793323e-07,
"loss": 0.4346,
"step": 3410
},
{
"epoch": 13.68,
"grad_norm": 1.7936820983886719,
"learning_rate": 4.737390104814954e-07,
"loss": 0.4343,
"step": 3420
},
{
"epoch": 13.72,
"grad_norm": 4.244424343109131,
"learning_rate": 4.458411257136486e-07,
"loss": 0.4355,
"step": 3430
},
{
"epoch": 13.76,
"grad_norm": 3.0799527168273926,
"learning_rate": 4.1877107161890416e-07,
"loss": 0.4407,
"step": 3440
},
{
"epoch": 13.8,
"grad_norm": 3.4574971199035645,
"learning_rate": 3.9253119371206684e-07,
"loss": 0.4369,
"step": 3450
},
{
"epoch": 13.84,
"grad_norm": 1.3924965858459473,
"learning_rate": 3.671237655764104e-07,
"loss": 0.4823,
"step": 3460
},
{
"epoch": 13.88,
"grad_norm": 1.9487115144729614,
"learning_rate": 3.4255098866667114e-07,
"loss": 0.4566,
"step": 3470
},
{
"epoch": 13.92,
"grad_norm": 2.6502346992492676,
"learning_rate": 3.188149921183115e-07,
"loss": 0.4824,
"step": 3480
},
{
"epoch": 13.96,
"grad_norm": 3.1728081703186035,
"learning_rate": 2.959178325630296e-07,
"loss": 0.3983,
"step": 3490
},
{
"epoch": 14.0,
"grad_norm": 2.273251533508301,
"learning_rate": 2.7386149395056463e-07,
"loss": 0.4541,
"step": 3500
},
{
"epoch": 14.04,
"grad_norm": 2.2681076526641846,
"learning_rate": 2.526478873767946e-07,
"loss": 0.4667,
"step": 3510
},
{
"epoch": 14.08,
"grad_norm": 2.3255577087402344,
"learning_rate": 2.322788509181484e-07,
"loss": 0.441,
"step": 3520
},
{
"epoch": 14.12,
"grad_norm": 1.8558521270751953,
"learning_rate": 2.1275614947233624e-07,
"loss": 0.4294,
"step": 3530
},
{
"epoch": 14.16,
"grad_norm": 3.000098943710327,
"learning_rate": 1.9408147460544203e-07,
"loss": 0.4245,
"step": 3540
},
{
"epoch": 14.2,
"grad_norm": 3.5599377155303955,
"learning_rate": 1.7625644440534384e-07,
"loss": 0.3884,
"step": 3550
},
{
"epoch": 14.24,
"grad_norm": 3.208889961242676,
"learning_rate": 1.5928260334151847e-07,
"loss": 0.4434,
"step": 3560
},
{
"epoch": 14.28,
"grad_norm": 3.4625139236450195,
"learning_rate": 1.4316142213121386e-07,
"loss": 0.4474,
"step": 3570
},
{
"epoch": 14.32,
"grad_norm": 4.883245944976807,
"learning_rate": 1.2789429761202565e-07,
"loss": 0.3927,
"step": 3580
},
{
"epoch": 14.36,
"grad_norm": 4.052615165710449,
"learning_rate": 1.134825526208605e-07,
"loss": 0.3967,
"step": 3590
},
{
"epoch": 14.4,
"grad_norm": 1.4959968328475952,
"learning_rate": 9.992743587931674e-08,
"loss": 0.4807,
"step": 3600
},
{
"epoch": 14.44,
"grad_norm": 2.757499933242798,
"learning_rate": 8.723012188549318e-08,
"loss": 0.4506,
"step": 3610
},
{
"epoch": 14.48,
"grad_norm": 2.0996549129486084,
"learning_rate": 7.539171081221597e-08,
"loss": 0.3934,
"step": 3620
},
{
"epoch": 14.52,
"grad_norm": 2.4962105751037598,
"learning_rate": 6.44132284117216e-08,
"loss": 0.4549,
"step": 3630
},
{
"epoch": 14.56,
"grad_norm": 1.9287108182907104,
"learning_rate": 5.429562592677018e-08,
"loss": 0.4208,
"step": 3640
},
{
"epoch": 14.6,
"grad_norm": 4.647493362426758,
"learning_rate": 4.503978000823028e-08,
"loss": 0.4376,
"step": 3650
},
{
"epoch": 14.64,
"grad_norm": 1.24240243434906,
"learning_rate": 3.6646492639118567e-08,
"loss": 0.4562,
"step": 3660
},
{
"epoch": 14.68,
"grad_norm": 2.359651565551758,
"learning_rate": 2.911649106511316e-08,
"loss": 0.42,
"step": 3670
},
{
"epoch": 14.72,
"grad_norm": 2.7108898162841797,
"learning_rate": 2.2450427731534052e-08,
"loss": 0.4489,
"step": 3680
},
{
"epoch": 14.76,
"grad_norm": 1.4741981029510498,
"learning_rate": 1.664888022682165e-08,
"loss": 0.4532,
"step": 3690
},
{
"epoch": 14.8,
"grad_norm": 1.676648497581482,
"learning_rate": 1.1712351232480157e-08,
"loss": 0.4375,
"step": 3700
},
{
"epoch": 14.84,
"grad_norm": 1.2909196615219116,
"learning_rate": 7.641268479531283e-09,
"loss": 0.4574,
"step": 3710
},
{
"epoch": 14.88,
"grad_norm": 3.2380595207214355,
"learning_rate": 4.435984711446128e-09,
"loss": 0.3538,
"step": 3720
},
{
"epoch": 14.92,
"grad_norm": 1.9485658407211304,
"learning_rate": 2.0967776535851802e-09,
"loss": 0.3884,
"step": 3730
},
{
"epoch": 14.96,
"grad_norm": 2.0014994144439697,
"learning_rate": 6.238499891353389e-10,
"loss": 0.4266,
"step": 3740
},
{
"epoch": 15.0,
"grad_norm": 2.8281970024108887,
"learning_rate": 1.7329341542859922e-11,
"loss": 0.4743,
"step": 3750
}
],
"logging_steps": 10,
"max_steps": 3750,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.815859931388314e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}