cbb-3b / checkpoint-732 /trainer_state.json
awilliamson's picture
Upload folder using huggingface_hub
5b9ca7f verified
{
"best_metric": 0.40529951453208923,
"best_model_checkpoint": "cbb-3b/checkpoint-732",
"epoch": 1.9986348122866895,
"eval_steps": 500,
"global_step": 732,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027303754266211604,
"grad_norm": 0.7549694776535034,
"learning_rate": 1.360544217687075e-06,
"loss": 1.2225,
"step": 1
},
{
"epoch": 0.005460750853242321,
"grad_norm": 0.7538214325904846,
"learning_rate": 2.72108843537415e-06,
"loss": 1.2103,
"step": 2
},
{
"epoch": 0.008191126279863481,
"grad_norm": 0.7328954935073853,
"learning_rate": 4.081632653061224e-06,
"loss": 1.1858,
"step": 3
},
{
"epoch": 0.010921501706484642,
"grad_norm": 0.7359272837638855,
"learning_rate": 5.4421768707483e-06,
"loss": 1.1885,
"step": 4
},
{
"epoch": 0.013651877133105802,
"grad_norm": 0.740386426448822,
"learning_rate": 6.802721088435375e-06,
"loss": 1.1781,
"step": 5
},
{
"epoch": 0.016382252559726963,
"grad_norm": 0.6984951496124268,
"learning_rate": 8.163265306122448e-06,
"loss": 1.1395,
"step": 6
},
{
"epoch": 0.01911262798634812,
"grad_norm": 0.6689624786376953,
"learning_rate": 9.523809523809523e-06,
"loss": 1.137,
"step": 7
},
{
"epoch": 0.021843003412969283,
"grad_norm": 0.6134174466133118,
"learning_rate": 1.08843537414966e-05,
"loss": 1.1531,
"step": 8
},
{
"epoch": 0.024573378839590442,
"grad_norm": 0.5647606253623962,
"learning_rate": 1.2244897959183674e-05,
"loss": 1.1201,
"step": 9
},
{
"epoch": 0.027303754266211604,
"grad_norm": 0.541833221912384,
"learning_rate": 1.360544217687075e-05,
"loss": 1.0989,
"step": 10
},
{
"epoch": 0.030034129692832763,
"grad_norm": 0.4785626232624054,
"learning_rate": 1.4965986394557824e-05,
"loss": 1.0664,
"step": 11
},
{
"epoch": 0.032764505119453925,
"grad_norm": 0.42421552538871765,
"learning_rate": 1.6326530612244897e-05,
"loss": 1.057,
"step": 12
},
{
"epoch": 0.03549488054607509,
"grad_norm": 0.384870707988739,
"learning_rate": 1.7687074829931973e-05,
"loss": 0.9794,
"step": 13
},
{
"epoch": 0.03822525597269624,
"grad_norm": 0.31449463963508606,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.9485,
"step": 14
},
{
"epoch": 0.040955631399317405,
"grad_norm": 0.29094135761260986,
"learning_rate": 2.0408163265306123e-05,
"loss": 0.9581,
"step": 15
},
{
"epoch": 0.04368600682593857,
"grad_norm": 0.2500893771648407,
"learning_rate": 2.17687074829932e-05,
"loss": 0.9363,
"step": 16
},
{
"epoch": 0.04641638225255973,
"grad_norm": 0.2445881962776184,
"learning_rate": 2.3129251700680275e-05,
"loss": 0.9186,
"step": 17
},
{
"epoch": 0.049146757679180884,
"grad_norm": 0.2477860301733017,
"learning_rate": 2.448979591836735e-05,
"loss": 0.9099,
"step": 18
},
{
"epoch": 0.05187713310580205,
"grad_norm": 0.24853268265724182,
"learning_rate": 2.5850340136054425e-05,
"loss": 0.912,
"step": 19
},
{
"epoch": 0.05460750853242321,
"grad_norm": 0.22501873970031738,
"learning_rate": 2.72108843537415e-05,
"loss": 0.8836,
"step": 20
},
{
"epoch": 0.05733788395904437,
"grad_norm": 0.21223071217536926,
"learning_rate": 2.857142857142857e-05,
"loss": 0.8651,
"step": 21
},
{
"epoch": 0.060068259385665526,
"grad_norm": 0.20172430574893951,
"learning_rate": 2.9931972789115647e-05,
"loss": 0.8393,
"step": 22
},
{
"epoch": 0.06279863481228669,
"grad_norm": 0.17902718484401703,
"learning_rate": 3.1292517006802724e-05,
"loss": 0.8033,
"step": 23
},
{
"epoch": 0.06552901023890785,
"grad_norm": 0.1813097447156906,
"learning_rate": 3.265306122448979e-05,
"loss": 0.8152,
"step": 24
},
{
"epoch": 0.06825938566552901,
"grad_norm": 0.19280143082141876,
"learning_rate": 3.401360544217687e-05,
"loss": 0.8051,
"step": 25
},
{
"epoch": 0.07098976109215017,
"grad_norm": 0.17157189548015594,
"learning_rate": 3.5374149659863946e-05,
"loss": 0.794,
"step": 26
},
{
"epoch": 0.07372013651877134,
"grad_norm": 0.1467738002538681,
"learning_rate": 3.673469387755102e-05,
"loss": 0.7874,
"step": 27
},
{
"epoch": 0.07645051194539249,
"grad_norm": 0.13913457095623016,
"learning_rate": 3.809523809523809e-05,
"loss": 0.7519,
"step": 28
},
{
"epoch": 0.07918088737201365,
"grad_norm": 0.13179022073745728,
"learning_rate": 3.945578231292517e-05,
"loss": 0.76,
"step": 29
},
{
"epoch": 0.08191126279863481,
"grad_norm": 0.1376553773880005,
"learning_rate": 4.0816326530612245e-05,
"loss": 0.7369,
"step": 30
},
{
"epoch": 0.08464163822525597,
"grad_norm": 0.14040575921535492,
"learning_rate": 4.217687074829932e-05,
"loss": 0.7463,
"step": 31
},
{
"epoch": 0.08737201365187713,
"grad_norm": 0.13217338919639587,
"learning_rate": 4.35374149659864e-05,
"loss": 0.7298,
"step": 32
},
{
"epoch": 0.0901023890784983,
"grad_norm": 0.11285194754600525,
"learning_rate": 4.4897959183673474e-05,
"loss": 0.7134,
"step": 33
},
{
"epoch": 0.09283276450511946,
"grad_norm": 0.10098642110824585,
"learning_rate": 4.625850340136055e-05,
"loss": 0.7238,
"step": 34
},
{
"epoch": 0.09556313993174062,
"grad_norm": 0.10341370850801468,
"learning_rate": 4.761904761904762e-05,
"loss": 0.6908,
"step": 35
},
{
"epoch": 0.09829351535836177,
"grad_norm": 0.09662918746471405,
"learning_rate": 4.89795918367347e-05,
"loss": 0.7,
"step": 36
},
{
"epoch": 0.10102389078498293,
"grad_norm": 0.09548471122980118,
"learning_rate": 5.034013605442177e-05,
"loss": 0.7207,
"step": 37
},
{
"epoch": 0.1037542662116041,
"grad_norm": 0.09512269496917725,
"learning_rate": 5.170068027210885e-05,
"loss": 0.7016,
"step": 38
},
{
"epoch": 0.10648464163822526,
"grad_norm": 0.0912129282951355,
"learning_rate": 5.3061224489795926e-05,
"loss": 0.6891,
"step": 39
},
{
"epoch": 0.10921501706484642,
"grad_norm": 0.08661182224750519,
"learning_rate": 5.4421768707483e-05,
"loss": 0.6982,
"step": 40
},
{
"epoch": 0.11194539249146758,
"grad_norm": 0.09124922007322311,
"learning_rate": 5.5782312925170065e-05,
"loss": 0.7051,
"step": 41
},
{
"epoch": 0.11467576791808874,
"grad_norm": 0.09174500405788422,
"learning_rate": 5.714285714285714e-05,
"loss": 0.6978,
"step": 42
},
{
"epoch": 0.1174061433447099,
"grad_norm": 0.0679943636059761,
"learning_rate": 5.850340136054422e-05,
"loss": 0.6889,
"step": 43
},
{
"epoch": 0.12013651877133105,
"grad_norm": 0.07204238325357437,
"learning_rate": 5.9863945578231295e-05,
"loss": 0.704,
"step": 44
},
{
"epoch": 0.12286689419795221,
"grad_norm": 0.08089234679937363,
"learning_rate": 6.122448979591838e-05,
"loss": 0.6838,
"step": 45
},
{
"epoch": 0.12559726962457338,
"grad_norm": 0.09053023904561996,
"learning_rate": 6.258503401360545e-05,
"loss": 0.6754,
"step": 46
},
{
"epoch": 0.12832764505119454,
"grad_norm": 0.07513958215713501,
"learning_rate": 6.394557823129253e-05,
"loss": 0.6894,
"step": 47
},
{
"epoch": 0.1310580204778157,
"grad_norm": 0.07480401545763016,
"learning_rate": 6.530612244897959e-05,
"loss": 0.6809,
"step": 48
},
{
"epoch": 0.13378839590443686,
"grad_norm": 0.07617643475532532,
"learning_rate": 6.666666666666667e-05,
"loss": 0.697,
"step": 49
},
{
"epoch": 0.13651877133105803,
"grad_norm": 0.06744271516799927,
"learning_rate": 6.802721088435374e-05,
"loss": 0.6921,
"step": 50
},
{
"epoch": 0.1392491467576792,
"grad_norm": 0.07185206562280655,
"learning_rate": 6.938775510204082e-05,
"loss": 0.6536,
"step": 51
},
{
"epoch": 0.14197952218430035,
"grad_norm": 0.07255382090806961,
"learning_rate": 7.074829931972789e-05,
"loss": 0.653,
"step": 52
},
{
"epoch": 0.1447098976109215,
"grad_norm": 0.07474930584430695,
"learning_rate": 7.210884353741498e-05,
"loss": 0.6888,
"step": 53
},
{
"epoch": 0.14744027303754267,
"grad_norm": 0.0754467323422432,
"learning_rate": 7.346938775510205e-05,
"loss": 0.6818,
"step": 54
},
{
"epoch": 0.15017064846416384,
"grad_norm": 0.07726683467626572,
"learning_rate": 7.482993197278913e-05,
"loss": 0.6835,
"step": 55
},
{
"epoch": 0.15290102389078497,
"grad_norm": 0.07462974637746811,
"learning_rate": 7.619047619047618e-05,
"loss": 0.667,
"step": 56
},
{
"epoch": 0.15563139931740613,
"grad_norm": 0.06939647346735,
"learning_rate": 7.755102040816327e-05,
"loss": 0.6668,
"step": 57
},
{
"epoch": 0.1583617747440273,
"grad_norm": 0.08218149840831757,
"learning_rate": 7.891156462585034e-05,
"loss": 0.6762,
"step": 58
},
{
"epoch": 0.16109215017064846,
"grad_norm": 0.0838819146156311,
"learning_rate": 8.027210884353742e-05,
"loss": 0.6685,
"step": 59
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.07441603392362595,
"learning_rate": 8.163265306122449e-05,
"loss": 0.6573,
"step": 60
},
{
"epoch": 0.16655290102389078,
"grad_norm": 0.0746053010225296,
"learning_rate": 8.299319727891157e-05,
"loss": 0.6582,
"step": 61
},
{
"epoch": 0.16928327645051194,
"grad_norm": 0.08602144569158554,
"learning_rate": 8.435374149659864e-05,
"loss": 0.6547,
"step": 62
},
{
"epoch": 0.1720136518771331,
"grad_norm": 0.08236663043498993,
"learning_rate": 8.571428571428571e-05,
"loss": 0.6081,
"step": 63
},
{
"epoch": 0.17474402730375427,
"grad_norm": 0.08744888752698898,
"learning_rate": 8.70748299319728e-05,
"loss": 0.6576,
"step": 64
},
{
"epoch": 0.17747440273037543,
"grad_norm": 0.08321461081504822,
"learning_rate": 8.843537414965987e-05,
"loss": 0.6137,
"step": 65
},
{
"epoch": 0.1802047781569966,
"grad_norm": 0.08639347553253174,
"learning_rate": 8.979591836734695e-05,
"loss": 0.6579,
"step": 66
},
{
"epoch": 0.18293515358361775,
"grad_norm": 0.09154847264289856,
"learning_rate": 9.115646258503402e-05,
"loss": 0.6391,
"step": 67
},
{
"epoch": 0.18566552901023892,
"grad_norm": 0.1094379723072052,
"learning_rate": 9.25170068027211e-05,
"loss": 0.61,
"step": 68
},
{
"epoch": 0.18839590443686008,
"grad_norm": 0.11089900881052017,
"learning_rate": 9.387755102040817e-05,
"loss": 0.6452,
"step": 69
},
{
"epoch": 0.19112627986348124,
"grad_norm": 0.11615785956382751,
"learning_rate": 9.523809523809524e-05,
"loss": 0.6463,
"step": 70
},
{
"epoch": 0.19385665529010238,
"grad_norm": 0.08359086513519287,
"learning_rate": 9.659863945578231e-05,
"loss": 0.6364,
"step": 71
},
{
"epoch": 0.19658703071672354,
"grad_norm": 0.0885363295674324,
"learning_rate": 9.79591836734694e-05,
"loss": 0.6092,
"step": 72
},
{
"epoch": 0.1993174061433447,
"grad_norm": 0.09258115291595459,
"learning_rate": 9.931972789115646e-05,
"loss": 0.6229,
"step": 73
},
{
"epoch": 0.20204778156996586,
"grad_norm": 0.08969170600175858,
"learning_rate": 0.00010068027210884355,
"loss": 0.6173,
"step": 74
},
{
"epoch": 0.20477815699658702,
"grad_norm": 0.10124260932207108,
"learning_rate": 0.00010204081632653062,
"loss": 0.6414,
"step": 75
},
{
"epoch": 0.2075085324232082,
"grad_norm": 0.08671349287033081,
"learning_rate": 0.0001034013605442177,
"loss": 0.6145,
"step": 76
},
{
"epoch": 0.21023890784982935,
"grad_norm": 0.09684890508651733,
"learning_rate": 0.00010476190476190477,
"loss": 0.6262,
"step": 77
},
{
"epoch": 0.2129692832764505,
"grad_norm": 0.08690830320119858,
"learning_rate": 0.00010612244897959185,
"loss": 0.6316,
"step": 78
},
{
"epoch": 0.21569965870307167,
"grad_norm": 0.10457205027341843,
"learning_rate": 0.00010748299319727892,
"loss": 0.639,
"step": 79
},
{
"epoch": 0.21843003412969283,
"grad_norm": 0.10080841183662415,
"learning_rate": 0.000108843537414966,
"loss": 0.592,
"step": 80
},
{
"epoch": 0.221160409556314,
"grad_norm": 0.08858262002468109,
"learning_rate": 0.00011020408163265306,
"loss": 0.6471,
"step": 81
},
{
"epoch": 0.22389078498293516,
"grad_norm": 0.08708172291517258,
"learning_rate": 0.00011156462585034013,
"loss": 0.6222,
"step": 82
},
{
"epoch": 0.22662116040955632,
"grad_norm": 0.1075206995010376,
"learning_rate": 0.00011292517006802721,
"loss": 0.5961,
"step": 83
},
{
"epoch": 0.22935153583617748,
"grad_norm": 0.11788732558488846,
"learning_rate": 0.00011428571428571428,
"loss": 0.609,
"step": 84
},
{
"epoch": 0.23208191126279865,
"grad_norm": 0.0956830084323883,
"learning_rate": 0.00011564625850340137,
"loss": 0.6042,
"step": 85
},
{
"epoch": 0.2348122866894198,
"grad_norm": 0.09799174964427948,
"learning_rate": 0.00011700680272108844,
"loss": 0.6045,
"step": 86
},
{
"epoch": 0.23754266211604094,
"grad_norm": 0.09177012741565704,
"learning_rate": 0.00011836734693877552,
"loss": 0.6068,
"step": 87
},
{
"epoch": 0.2402730375426621,
"grad_norm": 0.10407502949237823,
"learning_rate": 0.00011972789115646259,
"loss": 0.5993,
"step": 88
},
{
"epoch": 0.24300341296928327,
"grad_norm": 0.1047271341085434,
"learning_rate": 0.00012108843537414967,
"loss": 0.6144,
"step": 89
},
{
"epoch": 0.24573378839590443,
"grad_norm": 0.0866198018193245,
"learning_rate": 0.00012244897959183676,
"loss": 0.6203,
"step": 90
},
{
"epoch": 0.2484641638225256,
"grad_norm": 0.09400323033332825,
"learning_rate": 0.0001238095238095238,
"loss": 0.6056,
"step": 91
},
{
"epoch": 0.25119453924914675,
"grad_norm": 0.0817628726363182,
"learning_rate": 0.0001251700680272109,
"loss": 0.5853,
"step": 92
},
{
"epoch": 0.25392491467576794,
"grad_norm": 0.09105788916349411,
"learning_rate": 0.00012653061224489798,
"loss": 0.5952,
"step": 93
},
{
"epoch": 0.2566552901023891,
"grad_norm": 0.09889201074838638,
"learning_rate": 0.00012789115646258506,
"loss": 0.5994,
"step": 94
},
{
"epoch": 0.2593856655290102,
"grad_norm": 0.09481444954872131,
"learning_rate": 0.00012925170068027212,
"loss": 0.5918,
"step": 95
},
{
"epoch": 0.2621160409556314,
"grad_norm": 0.11730329692363739,
"learning_rate": 0.00013061224489795917,
"loss": 0.592,
"step": 96
},
{
"epoch": 0.26484641638225254,
"grad_norm": 0.15733356773853302,
"learning_rate": 0.00013197278911564626,
"loss": 0.5636,
"step": 97
},
{
"epoch": 0.2675767918088737,
"grad_norm": 0.20819880068302155,
"learning_rate": 0.00013333333333333334,
"loss": 0.6101,
"step": 98
},
{
"epoch": 0.27030716723549486,
"grad_norm": 0.18305541574954987,
"learning_rate": 0.0001346938775510204,
"loss": 0.5814,
"step": 99
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.10316050797700882,
"learning_rate": 0.00013605442176870748,
"loss": 0.5871,
"step": 100
},
{
"epoch": 0.2757679180887372,
"grad_norm": 0.13305549323558807,
"learning_rate": 0.00013741496598639456,
"loss": 0.5846,
"step": 101
},
{
"epoch": 0.2784982935153584,
"grad_norm": 0.0950811356306076,
"learning_rate": 0.00013877551020408165,
"loss": 0.5711,
"step": 102
},
{
"epoch": 0.2812286689419795,
"grad_norm": 0.1198628693819046,
"learning_rate": 0.0001401360544217687,
"loss": 0.5914,
"step": 103
},
{
"epoch": 0.2839590443686007,
"grad_norm": 0.08809541165828705,
"learning_rate": 0.00014149659863945578,
"loss": 0.5872,
"step": 104
},
{
"epoch": 0.28668941979522183,
"grad_norm": 0.09801067411899567,
"learning_rate": 0.00014285714285714287,
"loss": 0.566,
"step": 105
},
{
"epoch": 0.289419795221843,
"grad_norm": 0.08766568452119827,
"learning_rate": 0.00014421768707482995,
"loss": 0.5808,
"step": 106
},
{
"epoch": 0.29215017064846416,
"grad_norm": 0.09133429825305939,
"learning_rate": 0.000145578231292517,
"loss": 0.6037,
"step": 107
},
{
"epoch": 0.29488054607508535,
"grad_norm": 0.09074072539806366,
"learning_rate": 0.0001469387755102041,
"loss": 0.5897,
"step": 108
},
{
"epoch": 0.2976109215017065,
"grad_norm": 0.08934789896011353,
"learning_rate": 0.00014829931972789117,
"loss": 0.5998,
"step": 109
},
{
"epoch": 0.3003412969283277,
"grad_norm": 0.08707176148891449,
"learning_rate": 0.00014965986394557826,
"loss": 0.5762,
"step": 110
},
{
"epoch": 0.3030716723549488,
"grad_norm": 0.0948200449347496,
"learning_rate": 0.0001510204081632653,
"loss": 0.5734,
"step": 111
},
{
"epoch": 0.30580204778156994,
"grad_norm": 0.08889783173799515,
"learning_rate": 0.00015238095238095237,
"loss": 0.5867,
"step": 112
},
{
"epoch": 0.30853242320819113,
"grad_norm": 0.08152323961257935,
"learning_rate": 0.00015374149659863945,
"loss": 0.5527,
"step": 113
},
{
"epoch": 0.31126279863481227,
"grad_norm": 0.09019389748573303,
"learning_rate": 0.00015510204081632654,
"loss": 0.6007,
"step": 114
},
{
"epoch": 0.31399317406143346,
"grad_norm": 0.08257456868886948,
"learning_rate": 0.00015646258503401362,
"loss": 0.5569,
"step": 115
},
{
"epoch": 0.3167235494880546,
"grad_norm": 0.08834348618984222,
"learning_rate": 0.00015782312925170067,
"loss": 0.6026,
"step": 116
},
{
"epoch": 0.3194539249146758,
"grad_norm": 0.08634665608406067,
"learning_rate": 0.00015918367346938776,
"loss": 0.5926,
"step": 117
},
{
"epoch": 0.3221843003412969,
"grad_norm": 0.07867719978094101,
"learning_rate": 0.00016054421768707484,
"loss": 0.5707,
"step": 118
},
{
"epoch": 0.3249146757679181,
"grad_norm": 0.09690061956644058,
"learning_rate": 0.00016190476190476192,
"loss": 0.5793,
"step": 119
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.08276376128196716,
"learning_rate": 0.00016326530612244898,
"loss": 0.5459,
"step": 120
},
{
"epoch": 0.33037542662116043,
"grad_norm": 0.09276240319013596,
"learning_rate": 0.00016462585034013606,
"loss": 0.5732,
"step": 121
},
{
"epoch": 0.33310580204778156,
"grad_norm": 0.0819844901561737,
"learning_rate": 0.00016598639455782315,
"loss": 0.5349,
"step": 122
},
{
"epoch": 0.33583617747440275,
"grad_norm": 0.08146791905164719,
"learning_rate": 0.00016734693877551023,
"loss": 0.5656,
"step": 123
},
{
"epoch": 0.3385665529010239,
"grad_norm": 0.0879024788737297,
"learning_rate": 0.00016870748299319729,
"loss": 0.5758,
"step": 124
},
{
"epoch": 0.3412969283276451,
"grad_norm": 0.07890356332063675,
"learning_rate": 0.00017006802721088434,
"loss": 0.5332,
"step": 125
},
{
"epoch": 0.3440273037542662,
"grad_norm": 0.10049955546855927,
"learning_rate": 0.00017142857142857143,
"loss": 0.5671,
"step": 126
},
{
"epoch": 0.34675767918088735,
"grad_norm": 0.09643971920013428,
"learning_rate": 0.0001727891156462585,
"loss": 0.5812,
"step": 127
},
{
"epoch": 0.34948805460750854,
"grad_norm": 0.08666185289621353,
"learning_rate": 0.0001741496598639456,
"loss": 0.5487,
"step": 128
},
{
"epoch": 0.35221843003412967,
"grad_norm": 0.1031438484787941,
"learning_rate": 0.00017551020408163265,
"loss": 0.5558,
"step": 129
},
{
"epoch": 0.35494880546075086,
"grad_norm": 0.09404855966567993,
"learning_rate": 0.00017687074829931973,
"loss": 0.5615,
"step": 130
},
{
"epoch": 0.357679180887372,
"grad_norm": 0.09127198159694672,
"learning_rate": 0.00017823129251700681,
"loss": 0.5656,
"step": 131
},
{
"epoch": 0.3604095563139932,
"grad_norm": 0.08694130182266235,
"learning_rate": 0.0001795918367346939,
"loss": 0.5379,
"step": 132
},
{
"epoch": 0.3631399317406143,
"grad_norm": 0.09511597454547882,
"learning_rate": 0.00018095238095238095,
"loss": 0.5535,
"step": 133
},
{
"epoch": 0.3658703071672355,
"grad_norm": 0.09129739552736282,
"learning_rate": 0.00018231292517006804,
"loss": 0.5678,
"step": 134
},
{
"epoch": 0.36860068259385664,
"grad_norm": 0.09248334169387817,
"learning_rate": 0.00018367346938775512,
"loss": 0.5574,
"step": 135
},
{
"epoch": 0.37133105802047783,
"grad_norm": 0.09906318038702011,
"learning_rate": 0.0001850340136054422,
"loss": 0.5499,
"step": 136
},
{
"epoch": 0.37406143344709897,
"grad_norm": 0.09928654134273529,
"learning_rate": 0.00018639455782312926,
"loss": 0.5413,
"step": 137
},
{
"epoch": 0.37679180887372016,
"grad_norm": 0.07559472322463989,
"learning_rate": 0.00018775510204081634,
"loss": 0.5475,
"step": 138
},
{
"epoch": 0.3795221843003413,
"grad_norm": 0.08408834040164948,
"learning_rate": 0.00018911564625850343,
"loss": 0.5432,
"step": 139
},
{
"epoch": 0.3822525597269625,
"grad_norm": 0.08800789713859558,
"learning_rate": 0.00019047619047619048,
"loss": 0.5587,
"step": 140
},
{
"epoch": 0.3849829351535836,
"grad_norm": 0.09994784742593765,
"learning_rate": 0.00019183673469387756,
"loss": 0.555,
"step": 141
},
{
"epoch": 0.38771331058020475,
"grad_norm": 0.07616768032312393,
"learning_rate": 0.00019319727891156462,
"loss": 0.5621,
"step": 142
},
{
"epoch": 0.39044368600682594,
"grad_norm": 0.10337202996015549,
"learning_rate": 0.0001945578231292517,
"loss": 0.5282,
"step": 143
},
{
"epoch": 0.3931740614334471,
"grad_norm": 0.08526328206062317,
"learning_rate": 0.0001959183673469388,
"loss": 0.5439,
"step": 144
},
{
"epoch": 0.39590443686006827,
"grad_norm": 0.10538353770971298,
"learning_rate": 0.00019727891156462587,
"loss": 0.5481,
"step": 145
},
{
"epoch": 0.3986348122866894,
"grad_norm": 0.07550521194934845,
"learning_rate": 0.00019863945578231293,
"loss": 0.5414,
"step": 146
},
{
"epoch": 0.4013651877133106,
"grad_norm": 0.10045620799064636,
"learning_rate": 0.0002,
"loss": 0.5382,
"step": 147
},
{
"epoch": 0.4040955631399317,
"grad_norm": 0.08987366408109665,
"learning_rate": 0.00019999971548969982,
"loss": 0.5417,
"step": 148
},
{
"epoch": 0.4068259385665529,
"grad_norm": 0.0801815390586853,
"learning_rate": 0.0001999988619604182,
"loss": 0.5275,
"step": 149
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.08214934170246124,
"learning_rate": 0.00019999743941701188,
"loss": 0.543,
"step": 150
},
{
"epoch": 0.41228668941979524,
"grad_norm": 0.08146006613969803,
"learning_rate": 0.00019999544786757545,
"loss": 0.5409,
"step": 151
},
{
"epoch": 0.4150170648464164,
"grad_norm": 0.08081945031881332,
"learning_rate": 0.00019999288732344122,
"loss": 0.5509,
"step": 152
},
{
"epoch": 0.41774744027303756,
"grad_norm": 0.09135357290506363,
"learning_rate": 0.0001999897577991792,
"loss": 0.518,
"step": 153
},
{
"epoch": 0.4204778156996587,
"grad_norm": 0.09191333502531052,
"learning_rate": 0.0001999860593125971,
"loss": 0.5276,
"step": 154
},
{
"epoch": 0.4232081911262799,
"grad_norm": 0.08375995606184006,
"learning_rate": 0.00019998179188473997,
"loss": 0.5319,
"step": 155
},
{
"epoch": 0.425938566552901,
"grad_norm": 0.08481922000646591,
"learning_rate": 0.00019997695553989042,
"loss": 0.5437,
"step": 156
},
{
"epoch": 0.4286689419795222,
"grad_norm": 0.08768640458583832,
"learning_rate": 0.00019997155030556822,
"loss": 0.5445,
"step": 157
},
{
"epoch": 0.43139931740614335,
"grad_norm": 0.08787625283002853,
"learning_rate": 0.00019996557621253027,
"loss": 0.5479,
"step": 158
},
{
"epoch": 0.4341296928327645,
"grad_norm": 0.09505843371152878,
"learning_rate": 0.0001999590332947704,
"loss": 0.5263,
"step": 159
},
{
"epoch": 0.43686006825938567,
"grad_norm": 0.10003377497196198,
"learning_rate": 0.00019995192158951919,
"loss": 0.5228,
"step": 160
},
{
"epoch": 0.4395904436860068,
"grad_norm": 0.0675501748919487,
"learning_rate": 0.00019994424113724363,
"loss": 0.4977,
"step": 161
},
{
"epoch": 0.442320819112628,
"grad_norm": 0.09747067093849182,
"learning_rate": 0.00019993599198164715,
"loss": 0.5161,
"step": 162
},
{
"epoch": 0.44505119453924913,
"grad_norm": 0.0837995857000351,
"learning_rate": 0.0001999271741696691,
"loss": 0.5243,
"step": 163
},
{
"epoch": 0.4477815699658703,
"grad_norm": 0.0793512687087059,
"learning_rate": 0.00019991778775148465,
"loss": 0.5141,
"step": 164
},
{
"epoch": 0.45051194539249145,
"grad_norm": 0.07802822440862656,
"learning_rate": 0.00019990783278050448,
"loss": 0.515,
"step": 165
},
{
"epoch": 0.45324232081911264,
"grad_norm": 0.08355724066495895,
"learning_rate": 0.0001998973093133744,
"loss": 0.5176,
"step": 166
},
{
"epoch": 0.4559726962457338,
"grad_norm": 0.08045308291912079,
"learning_rate": 0.00019988621740997512,
"loss": 0.5151,
"step": 167
},
{
"epoch": 0.45870307167235497,
"grad_norm": 0.07589907944202423,
"learning_rate": 0.00019987455713342187,
"loss": 0.5249,
"step": 168
},
{
"epoch": 0.4614334470989761,
"grad_norm": 0.08553771674633026,
"learning_rate": 0.000199862328550064,
"loss": 0.5485,
"step": 169
},
{
"epoch": 0.4641638225255973,
"grad_norm": 0.08599649369716644,
"learning_rate": 0.00019984953172948465,
"loss": 0.53,
"step": 170
},
{
"epoch": 0.4668941979522184,
"grad_norm": 0.06906479597091675,
"learning_rate": 0.0001998361667445004,
"loss": 0.5336,
"step": 171
},
{
"epoch": 0.4696245733788396,
"grad_norm": 0.07526392489671707,
"learning_rate": 0.00019982223367116076,
"loss": 0.5013,
"step": 172
},
{
"epoch": 0.47235494880546075,
"grad_norm": 0.0722610279917717,
"learning_rate": 0.00019980773258874778,
"loss": 0.5217,
"step": 173
},
{
"epoch": 0.4750853242320819,
"grad_norm": 0.0773632749915123,
"learning_rate": 0.00019979266357977564,
"loss": 0.5184,
"step": 174
},
{
"epoch": 0.4778156996587031,
"grad_norm": 0.07160216569900513,
"learning_rate": 0.00019977702672999007,
"loss": 0.5009,
"step": 175
},
{
"epoch": 0.4805460750853242,
"grad_norm": 0.0764177069067955,
"learning_rate": 0.00019976082212836793,
"loss": 0.5126,
"step": 176
},
{
"epoch": 0.4832764505119454,
"grad_norm": 0.07116773724555969,
"learning_rate": 0.0001997440498671168,
"loss": 0.514,
"step": 177
},
{
"epoch": 0.48600682593856653,
"grad_norm": 0.08402683585882187,
"learning_rate": 0.00019972671004167433,
"loss": 0.5133,
"step": 178
},
{
"epoch": 0.4887372013651877,
"grad_norm": 0.07286666333675385,
"learning_rate": 0.00019970880275070762,
"loss": 0.5221,
"step": 179
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.08641263097524643,
"learning_rate": 0.00019969032809611287,
"loss": 0.4959,
"step": 180
},
{
"epoch": 0.49419795221843005,
"grad_norm": 0.08849737048149109,
"learning_rate": 0.0001996712861830147,
"loss": 0.4952,
"step": 181
},
{
"epoch": 0.4969283276450512,
"grad_norm": 0.08661802858114243,
"learning_rate": 0.00019965167711976552,
"loss": 0.5023,
"step": 182
},
{
"epoch": 0.49965870307167237,
"grad_norm": 0.08355259150266647,
"learning_rate": 0.0001996315010179449,
"loss": 0.5235,
"step": 183
},
{
"epoch": 0.5023890784982935,
"grad_norm": 0.07524804770946503,
"learning_rate": 0.00019961075799235903,
"loss": 0.5143,
"step": 184
},
{
"epoch": 0.5051194539249146,
"grad_norm": 0.08126044273376465,
"learning_rate": 0.00019958944816104,
"loss": 0.496,
"step": 185
},
{
"epoch": 0.5078498293515359,
"grad_norm": 0.08320248872041702,
"learning_rate": 0.00019956757164524516,
"loss": 0.5106,
"step": 186
},
{
"epoch": 0.510580204778157,
"grad_norm": 0.07375509291887283,
"learning_rate": 0.00019954512856945632,
"loss": 0.4811,
"step": 187
},
{
"epoch": 0.5133105802047782,
"grad_norm": 0.07187776267528534,
"learning_rate": 0.00019952211906137932,
"loss": 0.5104,
"step": 188
},
{
"epoch": 0.5160409556313993,
"grad_norm": 0.07441398501396179,
"learning_rate": 0.00019949854325194294,
"loss": 0.5304,
"step": 189
},
{
"epoch": 0.5187713310580204,
"grad_norm": 0.07976701855659485,
"learning_rate": 0.00019947440127529836,
"loss": 0.4945,
"step": 190
},
{
"epoch": 0.5215017064846417,
"grad_norm": 0.07280328124761581,
"learning_rate": 0.00019944969326881845,
"loss": 0.4848,
"step": 191
},
{
"epoch": 0.5242320819112628,
"grad_norm": 0.07618428766727448,
"learning_rate": 0.00019942441937309684,
"loss": 0.4858,
"step": 192
},
{
"epoch": 0.5269624573378839,
"grad_norm": 0.0665225088596344,
"learning_rate": 0.00019939857973194717,
"loss": 0.4955,
"step": 193
},
{
"epoch": 0.5296928327645051,
"grad_norm": 0.08379194140434265,
"learning_rate": 0.0001993721744924024,
"loss": 0.5067,
"step": 194
},
{
"epoch": 0.5324232081911263,
"grad_norm": 0.07564423978328705,
"learning_rate": 0.00019934520380471372,
"loss": 0.5159,
"step": 195
},
{
"epoch": 0.5351535836177475,
"grad_norm": 0.07225633412599564,
"learning_rate": 0.0001993176678223499,
"loss": 0.5144,
"step": 196
},
{
"epoch": 0.5378839590443686,
"grad_norm": 0.07224252074956894,
"learning_rate": 0.0001992895667019964,
"loss": 0.4859,
"step": 197
},
{
"epoch": 0.5406143344709897,
"grad_norm": 0.079926997423172,
"learning_rate": 0.0001992609006035543,
"loss": 0.4872,
"step": 198
},
{
"epoch": 0.543344709897611,
"grad_norm": 0.08545151352882385,
"learning_rate": 0.0001992316696901397,
"loss": 0.5105,
"step": 199
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.08008193224668503,
"learning_rate": 0.00019920187412808248,
"loss": 0.4903,
"step": 200
},
{
"epoch": 0.5488054607508532,
"grad_norm": 0.06717066466808319,
"learning_rate": 0.0001991715140869255,
"loss": 0.5037,
"step": 201
},
{
"epoch": 0.5515358361774744,
"grad_norm": 0.08613338321447372,
"learning_rate": 0.00019914058973942368,
"loss": 0.4999,
"step": 202
},
{
"epoch": 0.5542662116040956,
"grad_norm": 0.07288234680891037,
"learning_rate": 0.00019910910126154293,
"loss": 0.5019,
"step": 203
},
{
"epoch": 0.5569965870307167,
"grad_norm": 0.07831370085477829,
"learning_rate": 0.00019907704883245916,
"loss": 0.4595,
"step": 204
},
{
"epoch": 0.5597269624573379,
"grad_norm": 0.0916525200009346,
"learning_rate": 0.00019904443263455728,
"loss": 0.4994,
"step": 205
},
{
"epoch": 0.562457337883959,
"grad_norm": 0.07431495934724808,
"learning_rate": 0.00019901125285343022,
"loss": 0.5059,
"step": 206
},
{
"epoch": 0.5651877133105802,
"grad_norm": 0.07864730060100555,
"learning_rate": 0.0001989775096778777,
"loss": 0.4824,
"step": 207
},
{
"epoch": 0.5679180887372014,
"grad_norm": 0.06928006559610367,
"learning_rate": 0.0001989432032999054,
"loss": 0.4887,
"step": 208
},
{
"epoch": 0.5706484641638225,
"grad_norm": 0.07330948859453201,
"learning_rate": 0.0001989083339147237,
"loss": 0.4804,
"step": 209
},
{
"epoch": 0.5733788395904437,
"grad_norm": 0.07905860990285873,
"learning_rate": 0.0001988729017207465,
"loss": 0.5126,
"step": 210
},
{
"epoch": 0.5761092150170648,
"grad_norm": 0.07062509655952454,
"learning_rate": 0.00019883690691959035,
"loss": 0.5063,
"step": 211
},
{
"epoch": 0.578839590443686,
"grad_norm": 0.071404367685318,
"learning_rate": 0.00019880034971607308,
"loss": 0.495,
"step": 212
},
{
"epoch": 0.5815699658703072,
"grad_norm": 0.0727284774184227,
"learning_rate": 0.00019876323031821266,
"loss": 0.4994,
"step": 213
},
{
"epoch": 0.5843003412969283,
"grad_norm": 0.07198608666658401,
"learning_rate": 0.00019872554893722618,
"loss": 0.4903,
"step": 214
},
{
"epoch": 0.5870307167235495,
"grad_norm": 0.07637451589107513,
"learning_rate": 0.0001986873057875284,
"loss": 0.5057,
"step": 215
},
{
"epoch": 0.5897610921501707,
"grad_norm": 0.06596951186656952,
"learning_rate": 0.00019864850108673073,
"loss": 0.4932,
"step": 216
},
{
"epoch": 0.5924914675767918,
"grad_norm": 0.06999579071998596,
"learning_rate": 0.0001986091350556399,
"loss": 0.4887,
"step": 217
},
{
"epoch": 0.595221843003413,
"grad_norm": 0.06687980890274048,
"learning_rate": 0.00019856920791825683,
"loss": 0.472,
"step": 218
},
{
"epoch": 0.5979522184300341,
"grad_norm": 0.07001427561044693,
"learning_rate": 0.00019852871990177503,
"loss": 0.4692,
"step": 219
},
{
"epoch": 0.6006825938566553,
"grad_norm": 0.06714101880788803,
"learning_rate": 0.00019848767123657976,
"loss": 0.4813,
"step": 220
},
{
"epoch": 0.6034129692832765,
"grad_norm": 0.07292049378156662,
"learning_rate": 0.0001984460621562463,
"loss": 0.4885,
"step": 221
},
{
"epoch": 0.6061433447098976,
"grad_norm": 0.06814104318618774,
"learning_rate": 0.00019840389289753896,
"loss": 0.4938,
"step": 222
},
{
"epoch": 0.6088737201365187,
"grad_norm": 0.06866355985403061,
"learning_rate": 0.00019836116370040944,
"loss": 0.4776,
"step": 223
},
{
"epoch": 0.6116040955631399,
"grad_norm": 0.07145702093839645,
"learning_rate": 0.00019831787480799568,
"loss": 0.4883,
"step": 224
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.06319977343082428,
"learning_rate": 0.00019827402646662047,
"loss": 0.4882,
"step": 225
},
{
"epoch": 0.6170648464163823,
"grad_norm": 0.08186688274145126,
"learning_rate": 0.0001982296189257898,
"loss": 0.4917,
"step": 226
},
{
"epoch": 0.6197952218430034,
"grad_norm": 0.06892900168895721,
"learning_rate": 0.00019818465243819184,
"loss": 0.4808,
"step": 227
},
{
"epoch": 0.6225255972696245,
"grad_norm": 0.0752168744802475,
"learning_rate": 0.00019813912725969509,
"loss": 0.4858,
"step": 228
},
{
"epoch": 0.6252559726962458,
"grad_norm": 0.08079662919044495,
"learning_rate": 0.0001980930436493472,
"loss": 0.5101,
"step": 229
},
{
"epoch": 0.6279863481228669,
"grad_norm": 0.0717153325676918,
"learning_rate": 0.00019804640186937343,
"loss": 0.4799,
"step": 230
},
{
"epoch": 0.630716723549488,
"grad_norm": 0.08962002396583557,
"learning_rate": 0.0001979992021851751,
"loss": 0.5067,
"step": 231
},
{
"epoch": 0.6334470989761092,
"grad_norm": 0.08904211223125458,
"learning_rate": 0.00019795144486532814,
"loss": 0.4725,
"step": 232
},
{
"epoch": 0.6361774744027304,
"grad_norm": 0.06842932850122452,
"learning_rate": 0.00019790313018158156,
"loss": 0.4996,
"step": 233
},
{
"epoch": 0.6389078498293516,
"grad_norm": 0.08361311256885529,
"learning_rate": 0.0001978542584088558,
"loss": 0.4945,
"step": 234
},
{
"epoch": 0.6416382252559727,
"grad_norm": 0.07219431549310684,
"learning_rate": 0.00019780482982524142,
"loss": 0.4488,
"step": 235
},
{
"epoch": 0.6443686006825938,
"grad_norm": 0.07717226445674896,
"learning_rate": 0.00019775484471199715,
"loss": 0.4814,
"step": 236
},
{
"epoch": 0.647098976109215,
"grad_norm": 0.07770105451345444,
"learning_rate": 0.0001977043033535486,
"loss": 0.4731,
"step": 237
},
{
"epoch": 0.6498293515358362,
"grad_norm": 0.06878919899463654,
"learning_rate": 0.00019765320603748655,
"loss": 0.4833,
"step": 238
},
{
"epoch": 0.6525597269624573,
"grad_norm": 0.07085343450307846,
"learning_rate": 0.0001976015530545652,
"loss": 0.4907,
"step": 239
},
{
"epoch": 0.6552901023890785,
"grad_norm": 0.07935165613889694,
"learning_rate": 0.0001975493446987007,
"loss": 0.4794,
"step": 240
},
{
"epoch": 0.6580204778156996,
"grad_norm": 0.06543820351362228,
"learning_rate": 0.00019749658126696934,
"loss": 0.4906,
"step": 241
},
{
"epoch": 0.6607508532423209,
"grad_norm": 0.07727054506540298,
"learning_rate": 0.00019744326305960595,
"loss": 0.4868,
"step": 242
},
{
"epoch": 0.663481228668942,
"grad_norm": 0.06668544560670853,
"learning_rate": 0.00019738939038000205,
"loss": 0.475,
"step": 243
},
{
"epoch": 0.6662116040955631,
"grad_norm": 0.07048569619655609,
"learning_rate": 0.00019733496353470433,
"loss": 0.4878,
"step": 244
},
{
"epoch": 0.6689419795221843,
"grad_norm": 0.07110477238893509,
"learning_rate": 0.00019727998283341274,
"loss": 0.4663,
"step": 245
},
{
"epoch": 0.6716723549488055,
"grad_norm": 0.07245586067438126,
"learning_rate": 0.00019722444858897878,
"loss": 0.4899,
"step": 246
},
{
"epoch": 0.6744027303754266,
"grad_norm": 0.07484875619411469,
"learning_rate": 0.00019716836111740378,
"loss": 0.4831,
"step": 247
},
{
"epoch": 0.6771331058020478,
"grad_norm": 0.07812648266553879,
"learning_rate": 0.00019711172073783696,
"loss": 0.4654,
"step": 248
},
{
"epoch": 0.6798634812286689,
"grad_norm": 0.060632165521383286,
"learning_rate": 0.00019705452777257377,
"loss": 0.4706,
"step": 249
},
{
"epoch": 0.6825938566552902,
"grad_norm": 0.07092992216348648,
"learning_rate": 0.000196996782547054,
"loss": 0.4792,
"step": 250
},
{
"epoch": 0.6853242320819113,
"grad_norm": 0.06629595905542374,
"learning_rate": 0.00019693848538985983,
"loss": 0.4791,
"step": 251
},
{
"epoch": 0.6880546075085324,
"grad_norm": 0.06915664672851562,
"learning_rate": 0.00019687963663271409,
"loss": 0.4623,
"step": 252
},
{
"epoch": 0.6907849829351536,
"grad_norm": 0.0694665014743805,
"learning_rate": 0.00019682023661047836,
"loss": 0.48,
"step": 253
},
{
"epoch": 0.6935153583617747,
"grad_norm": 0.06899196654558182,
"learning_rate": 0.00019676028566115102,
"loss": 0.4855,
"step": 254
},
{
"epoch": 0.6962457337883959,
"grad_norm": 0.0740811675786972,
"learning_rate": 0.00019669978412586528,
"loss": 0.4833,
"step": 255
},
{
"epoch": 0.6989761092150171,
"grad_norm": 0.06517481803894043,
"learning_rate": 0.00019663873234888733,
"loss": 0.4523,
"step": 256
},
{
"epoch": 0.7017064846416382,
"grad_norm": 0.06481153517961502,
"learning_rate": 0.0001965771306776144,
"loss": 0.4689,
"step": 257
},
{
"epoch": 0.7044368600682593,
"grad_norm": 0.06042364612221718,
"learning_rate": 0.00019651497946257266,
"loss": 0.4757,
"step": 258
},
{
"epoch": 0.7071672354948806,
"grad_norm": 0.0717868059873581,
"learning_rate": 0.00019645227905741534,
"loss": 0.4773,
"step": 259
},
{
"epoch": 0.7098976109215017,
"grad_norm": 0.06427443772554398,
"learning_rate": 0.00019638902981892068,
"loss": 0.4875,
"step": 260
},
{
"epoch": 0.7126279863481229,
"grad_norm": 0.07786547392606735,
"learning_rate": 0.00019632523210698987,
"loss": 0.4758,
"step": 261
},
{
"epoch": 0.715358361774744,
"grad_norm": 0.07115910202264786,
"learning_rate": 0.00019626088628464498,
"loss": 0.4651,
"step": 262
},
{
"epoch": 0.7180887372013652,
"grad_norm": 0.06626811623573303,
"learning_rate": 0.00019619599271802706,
"loss": 0.4873,
"step": 263
},
{
"epoch": 0.7208191126279864,
"grad_norm": 0.07854583859443665,
"learning_rate": 0.00019613055177639384,
"loss": 0.4945,
"step": 264
},
{
"epoch": 0.7235494880546075,
"grad_norm": 0.0847892239689827,
"learning_rate": 0.00019606456383211777,
"loss": 0.4671,
"step": 265
},
{
"epoch": 0.7262798634812286,
"grad_norm": 0.06735772639513016,
"learning_rate": 0.00019599802926068384,
"loss": 0.4767,
"step": 266
},
{
"epoch": 0.7290102389078499,
"grad_norm": 0.07502768933773041,
"learning_rate": 0.00019593094844068748,
"loss": 0.462,
"step": 267
},
{
"epoch": 0.731740614334471,
"grad_norm": 0.07276903837919235,
"learning_rate": 0.00019586332175383238,
"loss": 0.4754,
"step": 268
},
{
"epoch": 0.7344709897610922,
"grad_norm": 0.07755447924137115,
"learning_rate": 0.00019579514958492826,
"loss": 0.492,
"step": 269
},
{
"epoch": 0.7372013651877133,
"grad_norm": 0.07876396179199219,
"learning_rate": 0.0001957264323218889,
"loss": 0.4737,
"step": 270
},
{
"epoch": 0.7399317406143344,
"grad_norm": 0.07997962832450867,
"learning_rate": 0.0001956571703557296,
"loss": 0.4592,
"step": 271
},
{
"epoch": 0.7426621160409557,
"grad_norm": 0.08079583197832108,
"learning_rate": 0.00019558736408056525,
"loss": 0.473,
"step": 272
},
{
"epoch": 0.7453924914675768,
"grad_norm": 0.0736604854464531,
"learning_rate": 0.00019551701389360795,
"loss": 0.4741,
"step": 273
},
{
"epoch": 0.7481228668941979,
"grad_norm": 0.0741550549864769,
"learning_rate": 0.00019544612019516472,
"loss": 0.4611,
"step": 274
},
{
"epoch": 0.7508532423208191,
"grad_norm": 0.06802786141633987,
"learning_rate": 0.00019537468338863537,
"loss": 0.4621,
"step": 275
},
{
"epoch": 0.7535836177474403,
"grad_norm": 0.06499720364809036,
"learning_rate": 0.00019530270388050998,
"loss": 0.4676,
"step": 276
},
{
"epoch": 0.7563139931740614,
"grad_norm": 0.06809037923812866,
"learning_rate": 0.00019523018208036677,
"loss": 0.475,
"step": 277
},
{
"epoch": 0.7590443686006826,
"grad_norm": 0.06455886363983154,
"learning_rate": 0.0001951571184008698,
"loss": 0.4807,
"step": 278
},
{
"epoch": 0.7617747440273037,
"grad_norm": 0.06833679229021072,
"learning_rate": 0.00019508351325776642,
"loss": 0.4751,
"step": 279
},
{
"epoch": 0.764505119453925,
"grad_norm": 0.07593976706266403,
"learning_rate": 0.00019500936706988502,
"loss": 0.4714,
"step": 280
},
{
"epoch": 0.7672354948805461,
"grad_norm": 0.0687364712357521,
"learning_rate": 0.00019493468025913276,
"loss": 0.4575,
"step": 281
},
{
"epoch": 0.7699658703071672,
"grad_norm": 0.07183225452899933,
"learning_rate": 0.00019485945325049288,
"loss": 0.4815,
"step": 282
},
{
"epoch": 0.7726962457337884,
"grad_norm": 0.06775309145450592,
"learning_rate": 0.00019478368647202264,
"loss": 0.4543,
"step": 283
},
{
"epoch": 0.7754266211604095,
"grad_norm": 0.06261654198169708,
"learning_rate": 0.00019470738035485058,
"loss": 0.4724,
"step": 284
},
{
"epoch": 0.7781569965870307,
"grad_norm": 0.06674676388502121,
"learning_rate": 0.00019463053533317425,
"loss": 0.4667,
"step": 285
},
{
"epoch": 0.7808873720136519,
"grad_norm": 0.06266098469495773,
"learning_rate": 0.0001945531518442576,
"loss": 0.4614,
"step": 286
},
{
"epoch": 0.783617747440273,
"grad_norm": 0.06769178062677383,
"learning_rate": 0.0001944752303284287,
"loss": 0.4609,
"step": 287
},
{
"epoch": 0.7863481228668942,
"grad_norm": 0.07618339359760284,
"learning_rate": 0.00019439677122907697,
"loss": 0.4822,
"step": 288
},
{
"epoch": 0.7890784982935154,
"grad_norm": 0.06216439977288246,
"learning_rate": 0.00019431777499265087,
"loss": 0.4573,
"step": 289
},
{
"epoch": 0.7918088737201365,
"grad_norm": 0.06998062878847122,
"learning_rate": 0.00019423824206865527,
"loss": 0.4683,
"step": 290
},
{
"epoch": 0.7945392491467577,
"grad_norm": 0.06178448721766472,
"learning_rate": 0.00019415817290964883,
"loss": 0.4643,
"step": 291
},
{
"epoch": 0.7972696245733788,
"grad_norm": 0.06611185520887375,
"learning_rate": 0.00019407756797124164,
"loss": 0.4712,
"step": 292
},
{
"epoch": 0.8,
"grad_norm": 0.06682468205690384,
"learning_rate": 0.00019399642771209238,
"loss": 0.474,
"step": 293
},
{
"epoch": 0.8027303754266212,
"grad_norm": 0.0632803738117218,
"learning_rate": 0.00019391475259390584,
"loss": 0.4776,
"step": 294
},
{
"epoch": 0.8054607508532423,
"grad_norm": 0.06498962640762329,
"learning_rate": 0.0001938325430814302,
"loss": 0.4735,
"step": 295
},
{
"epoch": 0.8081911262798634,
"grad_norm": 0.06621643900871277,
"learning_rate": 0.00019374979964245463,
"loss": 0.4785,
"step": 296
},
{
"epoch": 0.8109215017064847,
"grad_norm": 0.05847141519188881,
"learning_rate": 0.00019366652274780628,
"loss": 0.4702,
"step": 297
},
{
"epoch": 0.8136518771331058,
"grad_norm": 0.06962229311466217,
"learning_rate": 0.00019358271287134784,
"loss": 0.4612,
"step": 298
},
{
"epoch": 0.816382252559727,
"grad_norm": 0.06132384389638901,
"learning_rate": 0.00019349837048997478,
"loss": 0.4453,
"step": 299
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.06574399024248123,
"learning_rate": 0.00019341349608361267,
"loss": 0.4545,
"step": 300
},
{
"epoch": 0.8218430034129692,
"grad_norm": 0.06561442464590073,
"learning_rate": 0.00019332809013521428,
"loss": 0.4619,
"step": 301
},
{
"epoch": 0.8245733788395905,
"grad_norm": 0.06309875100851059,
"learning_rate": 0.00019324215313075706,
"loss": 0.465,
"step": 302
},
{
"epoch": 0.8273037542662116,
"grad_norm": 0.06544878333806992,
"learning_rate": 0.00019315568555924035,
"loss": 0.4571,
"step": 303
},
{
"epoch": 0.8300341296928327,
"grad_norm": 0.07011238485574722,
"learning_rate": 0.0001930686879126824,
"loss": 0.4579,
"step": 304
},
{
"epoch": 0.8327645051194539,
"grad_norm": 0.06445574760437012,
"learning_rate": 0.0001929811606861177,
"loss": 0.4695,
"step": 305
},
{
"epoch": 0.8354948805460751,
"grad_norm": 0.061930734664201736,
"learning_rate": 0.00019289310437759427,
"loss": 0.4449,
"step": 306
},
{
"epoch": 0.8382252559726963,
"grad_norm": 0.0658838227391243,
"learning_rate": 0.00019280451948817059,
"loss": 0.4726,
"step": 307
},
{
"epoch": 0.8409556313993174,
"grad_norm": 0.06302706897258759,
"learning_rate": 0.00019271540652191296,
"loss": 0.447,
"step": 308
},
{
"epoch": 0.8436860068259385,
"grad_norm": 0.08308806270360947,
"learning_rate": 0.0001926257659858925,
"loss": 0.4605,
"step": 309
},
{
"epoch": 0.8464163822525598,
"grad_norm": 0.06508838385343552,
"learning_rate": 0.00019253559839018235,
"loss": 0.4778,
"step": 310
},
{
"epoch": 0.8491467576791809,
"grad_norm": 0.07429094612598419,
"learning_rate": 0.00019244490424785468,
"loss": 0.4659,
"step": 311
},
{
"epoch": 0.851877133105802,
"grad_norm": 0.07138285785913467,
"learning_rate": 0.00019235368407497788,
"loss": 0.4564,
"step": 312
},
{
"epoch": 0.8546075085324232,
"grad_norm": 0.07202211022377014,
"learning_rate": 0.00019226193839061347,
"loss": 0.4377,
"step": 313
},
{
"epoch": 0.8573378839590444,
"grad_norm": 0.0779070258140564,
"learning_rate": 0.0001921696677168133,
"loss": 0.4532,
"step": 314
},
{
"epoch": 0.8600682593856656,
"grad_norm": 0.07717596739530563,
"learning_rate": 0.00019207687257861655,
"loss": 0.4654,
"step": 315
},
{
"epoch": 0.8627986348122867,
"grad_norm": 0.0708346962928772,
"learning_rate": 0.00019198355350404667,
"loss": 0.4584,
"step": 316
},
{
"epoch": 0.8655290102389078,
"grad_norm": 0.0656716600060463,
"learning_rate": 0.00019188971102410837,
"loss": 0.4504,
"step": 317
},
{
"epoch": 0.868259385665529,
"grad_norm": 0.06869971752166748,
"learning_rate": 0.00019179534567278475,
"loss": 0.4592,
"step": 318
},
{
"epoch": 0.8709897610921502,
"grad_norm": 0.06358928978443146,
"learning_rate": 0.00019170045798703406,
"loss": 0.4376,
"step": 319
},
{
"epoch": 0.8737201365187713,
"grad_norm": 0.06602993607521057,
"learning_rate": 0.0001916050485067868,
"loss": 0.4692,
"step": 320
},
{
"epoch": 0.8764505119453925,
"grad_norm": 0.06115058436989784,
"learning_rate": 0.00019150911777494258,
"loss": 0.462,
"step": 321
},
{
"epoch": 0.8791808873720136,
"grad_norm": 0.06374403834342957,
"learning_rate": 0.00019141266633736697,
"loss": 0.4325,
"step": 322
},
{
"epoch": 0.8819112627986349,
"grad_norm": 0.06459895521402359,
"learning_rate": 0.0001913156947428886,
"loss": 0.4605,
"step": 323
},
{
"epoch": 0.884641638225256,
"grad_norm": 0.06160016357898712,
"learning_rate": 0.00019121820354329577,
"loss": 0.4604,
"step": 324
},
{
"epoch": 0.8873720136518771,
"grad_norm": 0.06345291435718536,
"learning_rate": 0.00019112019329333346,
"loss": 0.4565,
"step": 325
},
{
"epoch": 0.8901023890784983,
"grad_norm": 0.06534894555807114,
"learning_rate": 0.00019102166455070024,
"loss": 0.4619,
"step": 326
},
{
"epoch": 0.8928327645051195,
"grad_norm": 0.06186550110578537,
"learning_rate": 0.00019092261787604492,
"loss": 0.4477,
"step": 327
},
{
"epoch": 0.8955631399317406,
"grad_norm": 0.058699868619441986,
"learning_rate": 0.00019082305383296352,
"loss": 0.4484,
"step": 328
},
{
"epoch": 0.8982935153583618,
"grad_norm": 0.05798410624265671,
"learning_rate": 0.00019072297298799589,
"loss": 0.4605,
"step": 329
},
{
"epoch": 0.9010238907849829,
"grad_norm": 0.06147664040327072,
"learning_rate": 0.00019062237591062272,
"loss": 0.4489,
"step": 330
},
{
"epoch": 0.903754266211604,
"grad_norm": 0.06032559648156166,
"learning_rate": 0.00019052126317326207,
"loss": 0.4412,
"step": 331
},
{
"epoch": 0.9064846416382253,
"grad_norm": 0.06326504051685333,
"learning_rate": 0.00019041963535126625,
"loss": 0.4547,
"step": 332
},
{
"epoch": 0.9092150170648464,
"grad_norm": 0.06808637827634811,
"learning_rate": 0.0001903174930229185,
"loss": 0.4513,
"step": 333
},
{
"epoch": 0.9119453924914676,
"grad_norm": 0.06384904682636261,
"learning_rate": 0.00019021483676942973,
"loss": 0.4542,
"step": 334
},
{
"epoch": 0.9146757679180887,
"grad_norm": 0.07148803770542145,
"learning_rate": 0.00019011166717493517,
"loss": 0.4569,
"step": 335
},
{
"epoch": 0.9174061433447099,
"grad_norm": 0.06942867487668991,
"learning_rate": 0.000190007984826491,
"loss": 0.4496,
"step": 336
},
{
"epoch": 0.9201365187713311,
"grad_norm": 0.06153569370508194,
"learning_rate": 0.00018990379031407124,
"loss": 0.464,
"step": 337
},
{
"epoch": 0.9228668941979522,
"grad_norm": 0.07417679578065872,
"learning_rate": 0.00018979908423056408,
"loss": 0.4396,
"step": 338
},
{
"epoch": 0.9255972696245733,
"grad_norm": 0.06745341420173645,
"learning_rate": 0.0001896938671717687,
"loss": 0.4584,
"step": 339
},
{
"epoch": 0.9283276450511946,
"grad_norm": 0.060262780636548996,
"learning_rate": 0.00018958813973639184,
"loss": 0.4363,
"step": 340
},
{
"epoch": 0.9310580204778157,
"grad_norm": 0.06427337974309921,
"learning_rate": 0.0001894819025260444,
"loss": 0.4352,
"step": 341
},
{
"epoch": 0.9337883959044369,
"grad_norm": 0.06150776520371437,
"learning_rate": 0.00018937515614523797,
"loss": 0.4644,
"step": 342
},
{
"epoch": 0.936518771331058,
"grad_norm": 0.06864424049854279,
"learning_rate": 0.0001892679012013815,
"loss": 0.4608,
"step": 343
},
{
"epoch": 0.9392491467576792,
"grad_norm": 0.06174071133136749,
"learning_rate": 0.00018916013830477766,
"loss": 0.4402,
"step": 344
},
{
"epoch": 0.9419795221843004,
"grad_norm": 0.0684589147567749,
"learning_rate": 0.00018905186806861957,
"loss": 0.4569,
"step": 345
},
{
"epoch": 0.9447098976109215,
"grad_norm": 0.05750627443194389,
"learning_rate": 0.00018894309110898712,
"loss": 0.4522,
"step": 346
},
{
"epoch": 0.9474402730375426,
"grad_norm": 0.0697883740067482,
"learning_rate": 0.00018883380804484367,
"loss": 0.4594,
"step": 347
},
{
"epoch": 0.9501706484641638,
"grad_norm": 0.06613462418317795,
"learning_rate": 0.00018872401949803237,
"loss": 0.4459,
"step": 348
},
{
"epoch": 0.952901023890785,
"grad_norm": 0.06346327811479568,
"learning_rate": 0.00018861372609327263,
"loss": 0.4316,
"step": 349
},
{
"epoch": 0.9556313993174061,
"grad_norm": 0.06382953375577927,
"learning_rate": 0.00018850292845815672,
"loss": 0.4358,
"step": 350
},
{
"epoch": 0.9583617747440273,
"grad_norm": 0.07121171057224274,
"learning_rate": 0.0001883916272231459,
"loss": 0.465,
"step": 351
},
{
"epoch": 0.9610921501706484,
"grad_norm": 0.06311832368373871,
"learning_rate": 0.0001882798230215672,
"loss": 0.4478,
"step": 352
},
{
"epoch": 0.9638225255972697,
"grad_norm": 0.06858519464731216,
"learning_rate": 0.00018816751648960956,
"loss": 0.4402,
"step": 353
},
{
"epoch": 0.9665529010238908,
"grad_norm": 0.06063356623053551,
"learning_rate": 0.00018805470826632024,
"loss": 0.4373,
"step": 354
},
{
"epoch": 0.9692832764505119,
"grad_norm": 0.06550437211990356,
"learning_rate": 0.0001879413989936013,
"loss": 0.4448,
"step": 355
},
{
"epoch": 0.9720136518771331,
"grad_norm": 0.06248946860432625,
"learning_rate": 0.00018782758931620584,
"loss": 0.4576,
"step": 356
},
{
"epoch": 0.9747440273037543,
"grad_norm": 0.07067371159791946,
"learning_rate": 0.00018771327988173435,
"loss": 0.4644,
"step": 357
},
{
"epoch": 0.9774744027303754,
"grad_norm": 0.06225898116827011,
"learning_rate": 0.00018759847134063108,
"loss": 0.4617,
"step": 358
},
{
"epoch": 0.9802047781569966,
"grad_norm": 0.061437107622623444,
"learning_rate": 0.0001874831643461803,
"loss": 0.4339,
"step": 359
},
{
"epoch": 0.9829351535836177,
"grad_norm": 0.059149857610464096,
"learning_rate": 0.00018736735955450251,
"loss": 0.4238,
"step": 360
},
{
"epoch": 0.985665529010239,
"grad_norm": 0.06511219590902328,
"learning_rate": 0.0001872510576245509,
"loss": 0.4394,
"step": 361
},
{
"epoch": 0.9883959044368601,
"grad_norm": 0.06580841541290283,
"learning_rate": 0.00018713425921810733,
"loss": 0.4218,
"step": 362
},
{
"epoch": 0.9911262798634812,
"grad_norm": 0.07789267599582672,
"learning_rate": 0.00018701696499977884,
"loss": 0.4524,
"step": 363
},
{
"epoch": 0.9938566552901024,
"grad_norm": 0.06430528312921524,
"learning_rate": 0.0001868991756369937,
"loss": 0.4503,
"step": 364
},
{
"epoch": 0.9965870307167235,
"grad_norm": 0.06355779618024826,
"learning_rate": 0.00018678089179999762,
"loss": 0.4556,
"step": 365
},
{
"epoch": 0.9993174061433447,
"grad_norm": 0.06800378113985062,
"learning_rate": 0.00018666211416184999,
"loss": 0.44,
"step": 366
},
{
"epoch": 0.9993174061433447,
"eval_loss": 0.4462641775608063,
"eval_runtime": 311.1378,
"eval_samples_per_second": 8.369,
"eval_steps_per_second": 1.048,
"step": 366
},
{
"epoch": 1.0020477815699658,
"grad_norm": 0.14618873596191406,
"learning_rate": 0.00018654284339842013,
"loss": 0.7832,
"step": 367
},
{
"epoch": 1.004778156996587,
"grad_norm": 0.10670002549886703,
"learning_rate": 0.00018642308018838316,
"loss": 0.4482,
"step": 368
},
{
"epoch": 1.0075085324232083,
"grad_norm": 0.07775750756263733,
"learning_rate": 0.00018630282521321645,
"loss": 0.4345,
"step": 369
},
{
"epoch": 1.0102389078498293,
"grad_norm": 0.07130205631256104,
"learning_rate": 0.0001861820791571956,
"loss": 0.4294,
"step": 370
},
{
"epoch": 1.0129692832764505,
"grad_norm": 0.07318615168333054,
"learning_rate": 0.00018606084270739049,
"loss": 0.449,
"step": 371
},
{
"epoch": 1.0156996587030718,
"grad_norm": 0.06613319367170334,
"learning_rate": 0.0001859391165536615,
"loss": 0.4435,
"step": 372
},
{
"epoch": 1.0184300341296928,
"grad_norm": 0.06562095880508423,
"learning_rate": 0.0001858169013886556,
"loss": 0.4288,
"step": 373
},
{
"epoch": 1.021160409556314,
"grad_norm": 0.060670241713523865,
"learning_rate": 0.00018569419790780218,
"loss": 0.4029,
"step": 374
},
{
"epoch": 1.023890784982935,
"grad_norm": 0.06414277106523514,
"learning_rate": 0.00018557100680930937,
"loss": 0.4357,
"step": 375
},
{
"epoch": 1.0266211604095563,
"grad_norm": 0.06078667938709259,
"learning_rate": 0.00018544732879415986,
"loss": 0.4188,
"step": 376
},
{
"epoch": 1.0293515358361776,
"grad_norm": 0.06345190107822418,
"learning_rate": 0.00018532316456610704,
"loss": 0.4501,
"step": 377
},
{
"epoch": 1.0320819112627986,
"grad_norm": 0.06139195337891579,
"learning_rate": 0.00018519851483167097,
"loss": 0.438,
"step": 378
},
{
"epoch": 1.0348122866894198,
"grad_norm": 0.059995777904987335,
"learning_rate": 0.00018507338030013427,
"loss": 0.4505,
"step": 379
},
{
"epoch": 1.0375426621160408,
"grad_norm": 0.06199508160352707,
"learning_rate": 0.00018494776168353827,
"loss": 0.4564,
"step": 380
},
{
"epoch": 1.040273037542662,
"grad_norm": 0.062205228954553604,
"learning_rate": 0.00018482165969667874,
"loss": 0.4519,
"step": 381
},
{
"epoch": 1.0430034129692833,
"grad_norm": 0.06433286517858505,
"learning_rate": 0.00018469507505710194,
"loss": 0.4394,
"step": 382
},
{
"epoch": 1.0457337883959044,
"grad_norm": 0.06373082101345062,
"learning_rate": 0.00018456800848510056,
"loss": 0.4456,
"step": 383
},
{
"epoch": 1.0484641638225256,
"grad_norm": 0.0655735656619072,
"learning_rate": 0.00018444046070370963,
"loss": 0.4527,
"step": 384
},
{
"epoch": 1.0511945392491469,
"grad_norm": 0.059250976890325546,
"learning_rate": 0.00018431243243870223,
"loss": 0.4338,
"step": 385
},
{
"epoch": 1.0539249146757679,
"grad_norm": 0.05919628590345383,
"learning_rate": 0.00018418392441858555,
"loss": 0.4252,
"step": 386
},
{
"epoch": 1.0566552901023891,
"grad_norm": 0.07075149565935135,
"learning_rate": 0.0001840549373745968,
"loss": 0.4478,
"step": 387
},
{
"epoch": 1.0593856655290101,
"grad_norm": 0.06196924299001694,
"learning_rate": 0.0001839254720406987,
"loss": 0.4446,
"step": 388
},
{
"epoch": 1.0621160409556314,
"grad_norm": 0.07002051174640656,
"learning_rate": 0.00018379552915357575,
"loss": 0.4668,
"step": 389
},
{
"epoch": 1.0648464163822526,
"grad_norm": 0.05986930802464485,
"learning_rate": 0.00018366510945262972,
"loss": 0.4361,
"step": 390
},
{
"epoch": 1.0675767918088737,
"grad_norm": 0.06568475067615509,
"learning_rate": 0.00018353421367997563,
"loss": 0.4432,
"step": 391
},
{
"epoch": 1.070307167235495,
"grad_norm": 0.063268281519413,
"learning_rate": 0.00018340284258043732,
"loss": 0.4479,
"step": 392
},
{
"epoch": 1.073037542662116,
"grad_norm": 0.06184746325016022,
"learning_rate": 0.00018327099690154344,
"loss": 0.4392,
"step": 393
},
{
"epoch": 1.0757679180887372,
"grad_norm": 0.06682950258255005,
"learning_rate": 0.00018313867739352304,
"loss": 0.4469,
"step": 394
},
{
"epoch": 1.0784982935153584,
"grad_norm": 0.06049386039376259,
"learning_rate": 0.00018300588480930143,
"loss": 0.4448,
"step": 395
},
{
"epoch": 1.0812286689419794,
"grad_norm": 0.058452919125556946,
"learning_rate": 0.0001828726199044957,
"loss": 0.4387,
"step": 396
},
{
"epoch": 1.0839590443686007,
"grad_norm": 0.06608898937702179,
"learning_rate": 0.0001827388834374107,
"loss": 0.4316,
"step": 397
},
{
"epoch": 1.086689419795222,
"grad_norm": 0.06221776083111763,
"learning_rate": 0.0001826046761690344,
"loss": 0.4362,
"step": 398
},
{
"epoch": 1.089419795221843,
"grad_norm": 0.0670786052942276,
"learning_rate": 0.00018246999886303383,
"loss": 0.4394,
"step": 399
},
{
"epoch": 1.0921501706484642,
"grad_norm": 0.061892326921224594,
"learning_rate": 0.00018233485228575063,
"loss": 0.4565,
"step": 400
},
{
"epoch": 1.0948805460750852,
"grad_norm": 0.06282811611890793,
"learning_rate": 0.00018219923720619663,
"loss": 0.4421,
"step": 401
},
{
"epoch": 1.0976109215017065,
"grad_norm": 0.061520010232925415,
"learning_rate": 0.0001820631543960496,
"loss": 0.4346,
"step": 402
},
{
"epoch": 1.1003412969283277,
"grad_norm": 0.05969773232936859,
"learning_rate": 0.0001819266046296487,
"loss": 0.4472,
"step": 403
},
{
"epoch": 1.1030716723549487,
"grad_norm": 0.060664501041173935,
"learning_rate": 0.00018178958868399033,
"loss": 0.453,
"step": 404
},
{
"epoch": 1.10580204778157,
"grad_norm": 0.0612984299659729,
"learning_rate": 0.00018165210733872336,
"loss": 0.4406,
"step": 405
},
{
"epoch": 1.108532423208191,
"grad_norm": 0.059849295765161514,
"learning_rate": 0.000181514161376145,
"loss": 0.4423,
"step": 406
},
{
"epoch": 1.1112627986348123,
"grad_norm": 0.059180960059165955,
"learning_rate": 0.0001813757515811962,
"loss": 0.4401,
"step": 407
},
{
"epoch": 1.1139931740614335,
"grad_norm": 0.05857124924659729,
"learning_rate": 0.00018123687874145721,
"loss": 0.4159,
"step": 408
},
{
"epoch": 1.1167235494880545,
"grad_norm": 0.06205347552895546,
"learning_rate": 0.00018109754364714305,
"loss": 0.4318,
"step": 409
},
{
"epoch": 1.1194539249146758,
"grad_norm": 0.06382250785827637,
"learning_rate": 0.0001809577470910992,
"loss": 0.4416,
"step": 410
},
{
"epoch": 1.122184300341297,
"grad_norm": 0.05814497917890549,
"learning_rate": 0.00018081748986879679,
"loss": 0.4392,
"step": 411
},
{
"epoch": 1.124914675767918,
"grad_norm": 0.058424465358257294,
"learning_rate": 0.00018067677277832834,
"loss": 0.4266,
"step": 412
},
{
"epoch": 1.1276450511945393,
"grad_norm": 0.05630108341574669,
"learning_rate": 0.00018053559662040302,
"loss": 0.4401,
"step": 413
},
{
"epoch": 1.1303754266211605,
"grad_norm": 0.06453561037778854,
"learning_rate": 0.00018039396219834237,
"loss": 0.4267,
"step": 414
},
{
"epoch": 1.1331058020477816,
"grad_norm": 0.06126587092876434,
"learning_rate": 0.00018025187031807532,
"loss": 0.4346,
"step": 415
},
{
"epoch": 1.1358361774744028,
"grad_norm": 0.057017982006073,
"learning_rate": 0.00018010932178813397,
"loss": 0.4367,
"step": 416
},
{
"epoch": 1.1385665529010238,
"grad_norm": 0.06581621617078781,
"learning_rate": 0.00017996631741964888,
"loss": 0.4157,
"step": 417
},
{
"epoch": 1.141296928327645,
"grad_norm": 0.055874526500701904,
"learning_rate": 0.00017982285802634426,
"loss": 0.4341,
"step": 418
},
{
"epoch": 1.144027303754266,
"grad_norm": 0.059336546808481216,
"learning_rate": 0.0001796789444245337,
"loss": 0.4029,
"step": 419
},
{
"epoch": 1.1467576791808873,
"grad_norm": 0.06833340972661972,
"learning_rate": 0.00017953457743311523,
"loss": 0.4564,
"step": 420
},
{
"epoch": 1.1494880546075086,
"grad_norm": 0.061153508722782135,
"learning_rate": 0.00017938975787356673,
"loss": 0.4496,
"step": 421
},
{
"epoch": 1.1522184300341296,
"grad_norm": 0.0649651363492012,
"learning_rate": 0.00017924448656994133,
"loss": 0.4323,
"step": 422
},
{
"epoch": 1.1549488054607508,
"grad_norm": 0.0639922022819519,
"learning_rate": 0.00017909876434886273,
"loss": 0.4421,
"step": 423
},
{
"epoch": 1.157679180887372,
"grad_norm": 0.06662526726722717,
"learning_rate": 0.00017895259203952032,
"loss": 0.4532,
"step": 424
},
{
"epoch": 1.1604095563139931,
"grad_norm": 0.05699828639626503,
"learning_rate": 0.0001788059704736647,
"loss": 0.4382,
"step": 425
},
{
"epoch": 1.1631399317406144,
"grad_norm": 0.06322555243968964,
"learning_rate": 0.00017865890048560277,
"loss": 0.4423,
"step": 426
},
{
"epoch": 1.1658703071672356,
"grad_norm": 0.05652053654193878,
"learning_rate": 0.00017851138291219301,
"loss": 0.4338,
"step": 427
},
{
"epoch": 1.1686006825938566,
"grad_norm": 0.06619950383901596,
"learning_rate": 0.00017836341859284093,
"loss": 0.4272,
"step": 428
},
{
"epoch": 1.1713310580204779,
"grad_norm": 0.060171984136104584,
"learning_rate": 0.00017821500836949386,
"loss": 0.4371,
"step": 429
},
{
"epoch": 1.174061433447099,
"grad_norm": 0.06065813824534416,
"learning_rate": 0.0001780661530866366,
"loss": 0.4064,
"step": 430
},
{
"epoch": 1.1767918088737201,
"grad_norm": 0.06799128651618958,
"learning_rate": 0.00017791685359128633,
"loss": 0.43,
"step": 431
},
{
"epoch": 1.1795221843003414,
"grad_norm": 0.059587378054857254,
"learning_rate": 0.000177767110732988,
"loss": 0.4366,
"step": 432
},
{
"epoch": 1.1822525597269624,
"grad_norm": 0.06191541254520416,
"learning_rate": 0.00017761692536380928,
"loss": 0.415,
"step": 433
},
{
"epoch": 1.1849829351535837,
"grad_norm": 0.0611693374812603,
"learning_rate": 0.00017746629833833585,
"loss": 0.4396,
"step": 434
},
{
"epoch": 1.1877133105802047,
"grad_norm": 0.06228373572230339,
"learning_rate": 0.00017731523051366658,
"loss": 0.431,
"step": 435
},
{
"epoch": 1.190443686006826,
"grad_norm": 0.06130995601415634,
"learning_rate": 0.00017716372274940843,
"loss": 0.4538,
"step": 436
},
{
"epoch": 1.1931740614334472,
"grad_norm": 0.06163164600729942,
"learning_rate": 0.00017701177590767183,
"loss": 0.4251,
"step": 437
},
{
"epoch": 1.1959044368600682,
"grad_norm": 0.061723340302705765,
"learning_rate": 0.00017685939085306562,
"loss": 0.4274,
"step": 438
},
{
"epoch": 1.1986348122866894,
"grad_norm": 0.06078750640153885,
"learning_rate": 0.00017670656845269214,
"loss": 0.4432,
"step": 439
},
{
"epoch": 1.2013651877133107,
"grad_norm": 0.05991605296730995,
"learning_rate": 0.00017655330957614234,
"loss": 0.4167,
"step": 440
},
{
"epoch": 1.2040955631399317,
"grad_norm": 0.05879712477326393,
"learning_rate": 0.00017639961509549078,
"loss": 0.4232,
"step": 441
},
{
"epoch": 1.206825938566553,
"grad_norm": 0.060264360159635544,
"learning_rate": 0.00017624548588529072,
"loss": 0.4361,
"step": 442
},
{
"epoch": 1.209556313993174,
"grad_norm": 0.06511180847883224,
"learning_rate": 0.00017609092282256912,
"loss": 0.4327,
"step": 443
},
{
"epoch": 1.2122866894197952,
"grad_norm": 0.06026393920183182,
"learning_rate": 0.00017593592678682166,
"loss": 0.4195,
"step": 444
},
{
"epoch": 1.2150170648464165,
"grad_norm": 0.06378287822008133,
"learning_rate": 0.0001757804986600077,
"loss": 0.4404,
"step": 445
},
{
"epoch": 1.2177474402730375,
"grad_norm": 0.0656813457608223,
"learning_rate": 0.0001756246393265453,
"loss": 0.4354,
"step": 446
},
{
"epoch": 1.2204778156996587,
"grad_norm": 0.05804288387298584,
"learning_rate": 0.00017546834967330617,
"loss": 0.4352,
"step": 447
},
{
"epoch": 1.2232081911262798,
"grad_norm": 0.06775437295436859,
"learning_rate": 0.00017531163058961066,
"loss": 0.4393,
"step": 448
},
{
"epoch": 1.225938566552901,
"grad_norm": 0.06272158026695251,
"learning_rate": 0.00017515448296722262,
"loss": 0.4178,
"step": 449
},
{
"epoch": 1.2286689419795223,
"grad_norm": 0.06508231163024902,
"learning_rate": 0.00017499690770034443,
"loss": 0.4322,
"step": 450
},
{
"epoch": 1.2313993174061433,
"grad_norm": 0.05709952861070633,
"learning_rate": 0.00017483890568561173,
"loss": 0.4337,
"step": 451
},
{
"epoch": 1.2341296928327645,
"grad_norm": 0.061706554144620895,
"learning_rate": 0.00017468047782208865,
"loss": 0.4126,
"step": 452
},
{
"epoch": 1.2368600682593858,
"grad_norm": 0.056757740676403046,
"learning_rate": 0.00017452162501126227,
"loss": 0.4287,
"step": 453
},
{
"epoch": 1.2395904436860068,
"grad_norm": 0.05650217831134796,
"learning_rate": 0.00017436234815703788,
"loss": 0.4224,
"step": 454
},
{
"epoch": 1.242320819112628,
"grad_norm": 0.05224541947245598,
"learning_rate": 0.0001742026481657335,
"loss": 0.4166,
"step": 455
},
{
"epoch": 1.245051194539249,
"grad_norm": 0.06731689721345901,
"learning_rate": 0.0001740425259460751,
"loss": 0.4538,
"step": 456
},
{
"epoch": 1.2477815699658703,
"grad_norm": 0.060736652463674545,
"learning_rate": 0.00017388198240919102,
"loss": 0.4329,
"step": 457
},
{
"epoch": 1.2505119453924913,
"grad_norm": 0.05695323646068573,
"learning_rate": 0.00017372101846860707,
"loss": 0.4412,
"step": 458
},
{
"epoch": 1.2532423208191126,
"grad_norm": 0.056898247450590134,
"learning_rate": 0.00017355963504024123,
"loss": 0.4418,
"step": 459
},
{
"epoch": 1.2559726962457338,
"grad_norm": 0.059471502900123596,
"learning_rate": 0.00017339783304239843,
"loss": 0.4136,
"step": 460
},
{
"epoch": 1.2587030716723548,
"grad_norm": 0.05504520982503891,
"learning_rate": 0.00017323561339576543,
"loss": 0.4263,
"step": 461
},
{
"epoch": 1.261433447098976,
"grad_norm": 0.059035494923591614,
"learning_rate": 0.0001730729770234054,
"loss": 0.4362,
"step": 462
},
{
"epoch": 1.2641638225255973,
"grad_norm": 0.05722351744771004,
"learning_rate": 0.00017290992485075282,
"loss": 0.4239,
"step": 463
},
{
"epoch": 1.2668941979522184,
"grad_norm": 0.057449549436569214,
"learning_rate": 0.0001727464578056081,
"loss": 0.4357,
"step": 464
},
{
"epoch": 1.2696245733788396,
"grad_norm": 0.0636393278837204,
"learning_rate": 0.00017258257681813244,
"loss": 0.433,
"step": 465
},
{
"epoch": 1.2723549488054609,
"grad_norm": 0.061772268265485764,
"learning_rate": 0.0001724182828208424,
"loss": 0.4365,
"step": 466
},
{
"epoch": 1.2750853242320819,
"grad_norm": 0.053929511457681656,
"learning_rate": 0.0001722535767486047,
"loss": 0.4346,
"step": 467
},
{
"epoch": 1.2778156996587031,
"grad_norm": 0.05948130041360855,
"learning_rate": 0.00017208845953863076,
"loss": 0.4342,
"step": 468
},
{
"epoch": 1.2805460750853244,
"grad_norm": 0.05833544209599495,
"learning_rate": 0.0001719229321304716,
"loss": 0.4309,
"step": 469
},
{
"epoch": 1.2832764505119454,
"grad_norm": 0.055491410195827484,
"learning_rate": 0.00017175699546601223,
"loss": 0.4279,
"step": 470
},
{
"epoch": 1.2860068259385666,
"grad_norm": 0.05924072489142418,
"learning_rate": 0.00017159065048946644,
"loss": 0.432,
"step": 471
},
{
"epoch": 1.2887372013651877,
"grad_norm": 0.05847487971186638,
"learning_rate": 0.00017142389814737142,
"loss": 0.424,
"step": 472
},
{
"epoch": 1.291467576791809,
"grad_norm": 0.05650070682168007,
"learning_rate": 0.00017125673938858237,
"loss": 0.4134,
"step": 473
},
{
"epoch": 1.29419795221843,
"grad_norm": 0.059648044407367706,
"learning_rate": 0.00017108917516426704,
"loss": 0.4279,
"step": 474
},
{
"epoch": 1.2969283276450512,
"grad_norm": 0.060436248779296875,
"learning_rate": 0.00017092120642790042,
"loss": 0.4091,
"step": 475
},
{
"epoch": 1.2996587030716724,
"grad_norm": 0.06787759065628052,
"learning_rate": 0.00017075283413525916,
"loss": 0.4107,
"step": 476
},
{
"epoch": 1.3023890784982934,
"grad_norm": 0.06723356992006302,
"learning_rate": 0.00017058405924441636,
"loss": 0.4339,
"step": 477
},
{
"epoch": 1.3051194539249147,
"grad_norm": 0.058346495032310486,
"learning_rate": 0.00017041488271573587,
"loss": 0.441,
"step": 478
},
{
"epoch": 1.307849829351536,
"grad_norm": 0.059269823133945465,
"learning_rate": 0.00017024530551186702,
"loss": 0.4338,
"step": 479
},
{
"epoch": 1.310580204778157,
"grad_norm": 0.05570577457547188,
"learning_rate": 0.000170075328597739,
"loss": 0.4176,
"step": 480
},
{
"epoch": 1.3133105802047782,
"grad_norm": 0.05658780783414841,
"learning_rate": 0.00016990495294055548,
"loss": 0.4327,
"step": 481
},
{
"epoch": 1.3160409556313994,
"grad_norm": 0.06438103318214417,
"learning_rate": 0.00016973417950978906,
"loss": 0.4451,
"step": 482
},
{
"epoch": 1.3187713310580205,
"grad_norm": 0.06003286689519882,
"learning_rate": 0.00016956300927717575,
"loss": 0.4245,
"step": 483
},
{
"epoch": 1.3215017064846417,
"grad_norm": 0.06092451140284538,
"learning_rate": 0.0001693914432167094,
"loss": 0.4331,
"step": 484
},
{
"epoch": 1.3242320819112627,
"grad_norm": 0.059084732085466385,
"learning_rate": 0.00016921948230463625,
"loss": 0.4261,
"step": 485
},
{
"epoch": 1.326962457337884,
"grad_norm": 0.059612493962049484,
"learning_rate": 0.00016904712751944931,
"loss": 0.4356,
"step": 486
},
{
"epoch": 1.329692832764505,
"grad_norm": 0.05373890697956085,
"learning_rate": 0.00016887437984188286,
"loss": 0.4221,
"step": 487
},
{
"epoch": 1.3324232081911263,
"grad_norm": 0.06069657579064369,
"learning_rate": 0.00016870124025490673,
"loss": 0.4343,
"step": 488
},
{
"epoch": 1.3351535836177475,
"grad_norm": 0.058680132031440735,
"learning_rate": 0.0001685277097437208,
"loss": 0.4376,
"step": 489
},
{
"epoch": 1.3378839590443685,
"grad_norm": 0.052157819271087646,
"learning_rate": 0.0001683537892957495,
"loss": 0.4194,
"step": 490
},
{
"epoch": 1.3406143344709898,
"grad_norm": 0.05680167302489281,
"learning_rate": 0.00016817947990063598,
"loss": 0.4214,
"step": 491
},
{
"epoch": 1.343344709897611,
"grad_norm": 0.061938587576150894,
"learning_rate": 0.0001680047825502366,
"loss": 0.4413,
"step": 492
},
{
"epoch": 1.346075085324232,
"grad_norm": 0.05423510819673538,
"learning_rate": 0.00016782969823861526,
"loss": 0.4188,
"step": 493
},
{
"epoch": 1.3488054607508533,
"grad_norm": 0.059597909450531006,
"learning_rate": 0.0001676542279620378,
"loss": 0.4188,
"step": 494
},
{
"epoch": 1.3515358361774745,
"grad_norm": 0.05773560330271721,
"learning_rate": 0.00016747837271896622,
"loss": 0.4354,
"step": 495
},
{
"epoch": 1.3542662116040955,
"grad_norm": 0.06316240131855011,
"learning_rate": 0.00016730213351005303,
"loss": 0.4248,
"step": 496
},
{
"epoch": 1.3569965870307168,
"grad_norm": 0.056602396070957184,
"learning_rate": 0.00016712551133813572,
"loss": 0.4227,
"step": 497
},
{
"epoch": 1.3597269624573378,
"grad_norm": 0.06384044885635376,
"learning_rate": 0.0001669485072082308,
"loss": 0.4398,
"step": 498
},
{
"epoch": 1.362457337883959,
"grad_norm": 0.06040973588824272,
"learning_rate": 0.00016677112212752824,
"loss": 0.4168,
"step": 499
},
{
"epoch": 1.36518771331058,
"grad_norm": 0.05779508873820305,
"learning_rate": 0.00016659335710538564,
"loss": 0.4097,
"step": 500
},
{
"epoch": 1.3679180887372013,
"grad_norm": 0.060474693775177,
"learning_rate": 0.00016641521315332265,
"loss": 0.4252,
"step": 501
},
{
"epoch": 1.3706484641638226,
"grad_norm": 0.05790797993540764,
"learning_rate": 0.00016623669128501504,
"loss": 0.4238,
"step": 502
},
{
"epoch": 1.3733788395904436,
"grad_norm": 0.06164141371846199,
"learning_rate": 0.00016605779251628903,
"loss": 0.4336,
"step": 503
},
{
"epoch": 1.3761092150170648,
"grad_norm": 0.055059127509593964,
"learning_rate": 0.00016587851786511543,
"loss": 0.4303,
"step": 504
},
{
"epoch": 1.378839590443686,
"grad_norm": 0.05771743133664131,
"learning_rate": 0.00016569886835160399,
"loss": 0.4352,
"step": 505
},
{
"epoch": 1.3815699658703071,
"grad_norm": 0.056050512939691544,
"learning_rate": 0.0001655188449979974,
"loss": 0.4233,
"step": 506
},
{
"epoch": 1.3843003412969284,
"grad_norm": 0.054744672030210495,
"learning_rate": 0.00016533844882866568,
"loss": 0.415,
"step": 507
},
{
"epoch": 1.3870307167235496,
"grad_norm": 0.060217492282390594,
"learning_rate": 0.00016515768087010013,
"loss": 0.3959,
"step": 508
},
{
"epoch": 1.3897610921501706,
"grad_norm": 0.0636279284954071,
"learning_rate": 0.00016497654215090772,
"loss": 0.4341,
"step": 509
},
{
"epoch": 1.3924914675767919,
"grad_norm": 0.05640679970383644,
"learning_rate": 0.00016479503370180507,
"loss": 0.3917,
"step": 510
},
{
"epoch": 1.395221843003413,
"grad_norm": 0.05939646065235138,
"learning_rate": 0.00016461315655561263,
"loss": 0.4378,
"step": 511
},
{
"epoch": 1.3979522184300341,
"grad_norm": 0.05862488970160484,
"learning_rate": 0.00016443091174724885,
"loss": 0.4017,
"step": 512
},
{
"epoch": 1.4006825938566552,
"grad_norm": 0.060345377773046494,
"learning_rate": 0.00016424830031372425,
"loss": 0.4248,
"step": 513
},
{
"epoch": 1.4034129692832764,
"grad_norm": 0.06127999722957611,
"learning_rate": 0.00016406532329413546,
"loss": 0.4129,
"step": 514
},
{
"epoch": 1.4061433447098977,
"grad_norm": 0.0599684976041317,
"learning_rate": 0.00016388198172965942,
"loss": 0.4223,
"step": 515
},
{
"epoch": 1.4088737201365187,
"grad_norm": 0.056950025260448456,
"learning_rate": 0.00016369827666354745,
"loss": 0.4293,
"step": 516
},
{
"epoch": 1.41160409556314,
"grad_norm": 0.05798695236444473,
"learning_rate": 0.00016351420914111916,
"loss": 0.4163,
"step": 517
},
{
"epoch": 1.4143344709897612,
"grad_norm": 0.056971821933984756,
"learning_rate": 0.0001633297802097567,
"loss": 0.4088,
"step": 518
},
{
"epoch": 1.4170648464163822,
"grad_norm": 0.06520035862922668,
"learning_rate": 0.0001631449909188987,
"loss": 0.4316,
"step": 519
},
{
"epoch": 1.4197952218430034,
"grad_norm": 0.054386623203754425,
"learning_rate": 0.00016295984232003426,
"loss": 0.4276,
"step": 520
},
{
"epoch": 1.4225255972696247,
"grad_norm": 0.06270336359739304,
"learning_rate": 0.00016277433546669703,
"loss": 0.4133,
"step": 521
},
{
"epoch": 1.4252559726962457,
"grad_norm": 0.05896778032183647,
"learning_rate": 0.00016258847141445928,
"loss": 0.4331,
"step": 522
},
{
"epoch": 1.427986348122867,
"grad_norm": 0.06417705118656158,
"learning_rate": 0.00016240225122092573,
"loss": 0.4306,
"step": 523
},
{
"epoch": 1.430716723549488,
"grad_norm": 0.06666136533021927,
"learning_rate": 0.00016221567594572762,
"loss": 0.4369,
"step": 524
},
{
"epoch": 1.4334470989761092,
"grad_norm": 0.06409899890422821,
"learning_rate": 0.00016202874665051674,
"loss": 0.442,
"step": 525
},
{
"epoch": 1.4361774744027302,
"grad_norm": 0.06460480391979218,
"learning_rate": 0.00016184146439895928,
"loss": 0.4114,
"step": 526
},
{
"epoch": 1.4389078498293515,
"grad_norm": 0.06045004725456238,
"learning_rate": 0.00016165383025672981,
"loss": 0.424,
"step": 527
},
{
"epoch": 1.4416382252559727,
"grad_norm": 0.0617341473698616,
"learning_rate": 0.00016146584529150526,
"loss": 0.4201,
"step": 528
},
{
"epoch": 1.4443686006825938,
"grad_norm": 0.06265206634998322,
"learning_rate": 0.0001612775105729588,
"loss": 0.4145,
"step": 529
},
{
"epoch": 1.447098976109215,
"grad_norm": 0.06431074440479279,
"learning_rate": 0.00016108882717275384,
"loss": 0.397,
"step": 530
},
{
"epoch": 1.4498293515358363,
"grad_norm": 0.05702768266201019,
"learning_rate": 0.0001608997961645377,
"loss": 0.4024,
"step": 531
},
{
"epoch": 1.4525597269624573,
"grad_norm": 0.06387649476528168,
"learning_rate": 0.00016071041862393578,
"loss": 0.4369,
"step": 532
},
{
"epoch": 1.4552901023890785,
"grad_norm": 0.06181952729821205,
"learning_rate": 0.0001605206956285454,
"loss": 0.4391,
"step": 533
},
{
"epoch": 1.4580204778156998,
"grad_norm": 0.060091473162174225,
"learning_rate": 0.00016033062825792935,
"loss": 0.4207,
"step": 534
},
{
"epoch": 1.4607508532423208,
"grad_norm": 0.059614650905132294,
"learning_rate": 0.0001601402175936102,
"loss": 0.409,
"step": 535
},
{
"epoch": 1.463481228668942,
"grad_norm": 0.06142239645123482,
"learning_rate": 0.00015994946471906382,
"loss": 0.4236,
"step": 536
},
{
"epoch": 1.466211604095563,
"grad_norm": 0.06790998578071594,
"learning_rate": 0.0001597583707197134,
"loss": 0.4131,
"step": 537
},
{
"epoch": 1.4689419795221843,
"grad_norm": 0.05919467657804489,
"learning_rate": 0.00015956693668292313,
"loss": 0.418,
"step": 538
},
{
"epoch": 1.4716723549488053,
"grad_norm": 0.06804287433624268,
"learning_rate": 0.00015937516369799216,
"loss": 0.4216,
"step": 539
},
{
"epoch": 1.4744027303754266,
"grad_norm": 0.061936333775520325,
"learning_rate": 0.00015918305285614822,
"loss": 0.4239,
"step": 540
},
{
"epoch": 1.4771331058020478,
"grad_norm": 0.06181802973151207,
"learning_rate": 0.00015899060525054157,
"loss": 0.4136,
"step": 541
},
{
"epoch": 1.4798634812286688,
"grad_norm": 0.05767858028411865,
"learning_rate": 0.0001587978219762388,
"loss": 0.4178,
"step": 542
},
{
"epoch": 1.48259385665529,
"grad_norm": 0.06959601491689682,
"learning_rate": 0.00015860470413021642,
"loss": 0.4271,
"step": 543
},
{
"epoch": 1.4853242320819113,
"grad_norm": 0.05592988058924675,
"learning_rate": 0.00015841125281135473,
"loss": 0.4165,
"step": 544
},
{
"epoch": 1.4880546075085324,
"grad_norm": 0.06603039801120758,
"learning_rate": 0.00015821746912043165,
"loss": 0.4359,
"step": 545
},
{
"epoch": 1.4907849829351536,
"grad_norm": 0.05518212914466858,
"learning_rate": 0.00015802335416011625,
"loss": 0.4284,
"step": 546
},
{
"epoch": 1.4935153583617748,
"grad_norm": 0.062445998191833496,
"learning_rate": 0.00015782890903496264,
"loss": 0.4171,
"step": 547
},
{
"epoch": 1.4962457337883959,
"grad_norm": 0.05508886277675629,
"learning_rate": 0.00015763413485140365,
"loss": 0.4001,
"step": 548
},
{
"epoch": 1.4989761092150171,
"grad_norm": 0.0545768216252327,
"learning_rate": 0.00015743903271774455,
"loss": 0.4081,
"step": 549
},
{
"epoch": 1.5017064846416384,
"grad_norm": 0.058887772262096405,
"learning_rate": 0.0001572436037441566,
"loss": 0.4224,
"step": 550
},
{
"epoch": 1.5044368600682594,
"grad_norm": 0.05538494512438774,
"learning_rate": 0.00015704784904267097,
"loss": 0.4254,
"step": 551
},
{
"epoch": 1.5071672354948804,
"grad_norm": 0.05865982919931412,
"learning_rate": 0.00015685176972717223,
"loss": 0.4142,
"step": 552
},
{
"epoch": 1.5098976109215017,
"grad_norm": 0.05798998102545738,
"learning_rate": 0.00015665536691339207,
"loss": 0.4298,
"step": 553
},
{
"epoch": 1.512627986348123,
"grad_norm": 0.05779840052127838,
"learning_rate": 0.00015645864171890295,
"loss": 0.4145,
"step": 554
},
{
"epoch": 1.515358361774744,
"grad_norm": 0.05778159946203232,
"learning_rate": 0.00015626159526311174,
"loss": 0.4249,
"step": 555
},
{
"epoch": 1.5180887372013652,
"grad_norm": 0.0566212497651577,
"learning_rate": 0.00015606422866725343,
"loss": 0.4366,
"step": 556
},
{
"epoch": 1.5208191126279864,
"grad_norm": 0.05623873695731163,
"learning_rate": 0.00015586654305438456,
"loss": 0.4297,
"step": 557
},
{
"epoch": 1.5235494880546074,
"grad_norm": 0.05833446979522705,
"learning_rate": 0.00015566853954937694,
"loss": 0.4361,
"step": 558
},
{
"epoch": 1.5262798634812287,
"grad_norm": 0.05821897089481354,
"learning_rate": 0.00015547021927891144,
"loss": 0.4309,
"step": 559
},
{
"epoch": 1.52901023890785,
"grad_norm": 0.05831674486398697,
"learning_rate": 0.00015527158337147112,
"loss": 0.4228,
"step": 560
},
{
"epoch": 1.531740614334471,
"grad_norm": 0.05716761201620102,
"learning_rate": 0.00015507263295733528,
"loss": 0.4237,
"step": 561
},
{
"epoch": 1.5344709897610922,
"grad_norm": 0.061434000730514526,
"learning_rate": 0.00015487336916857278,
"loss": 0.4307,
"step": 562
},
{
"epoch": 1.5372013651877134,
"grad_norm": 0.055752865970134735,
"learning_rate": 0.00015467379313903557,
"loss": 0.4089,
"step": 563
},
{
"epoch": 1.5399317406143345,
"grad_norm": 0.05673924833536148,
"learning_rate": 0.00015447390600435238,
"loss": 0.3955,
"step": 564
},
{
"epoch": 1.5426621160409555,
"grad_norm": 0.05844118818640709,
"learning_rate": 0.00015427370890192224,
"loss": 0.4266,
"step": 565
},
{
"epoch": 1.545392491467577,
"grad_norm": 0.05962743982672691,
"learning_rate": 0.00015407320297090786,
"loss": 0.4063,
"step": 566
},
{
"epoch": 1.548122866894198,
"grad_norm": 0.05776818096637726,
"learning_rate": 0.00015387238935222927,
"loss": 0.4236,
"step": 567
},
{
"epoch": 1.550853242320819,
"grad_norm": 0.05769157037138939,
"learning_rate": 0.00015367126918855738,
"loss": 0.4183,
"step": 568
},
{
"epoch": 1.5535836177474402,
"grad_norm": 0.05596569553017616,
"learning_rate": 0.0001534698436243073,
"loss": 0.4074,
"step": 569
},
{
"epoch": 1.5563139931740615,
"grad_norm": 0.05986526980996132,
"learning_rate": 0.00015326811380563204,
"loss": 0.4166,
"step": 570
},
{
"epoch": 1.5590443686006825,
"grad_norm": 0.05552714318037033,
"learning_rate": 0.0001530660808804158,
"loss": 0.3986,
"step": 571
},
{
"epoch": 1.5617747440273038,
"grad_norm": 0.05853855237364769,
"learning_rate": 0.00015286374599826754,
"loss": 0.3964,
"step": 572
},
{
"epoch": 1.564505119453925,
"grad_norm": 0.06155244633555412,
"learning_rate": 0.00015266111031051442,
"loss": 0.4041,
"step": 573
},
{
"epoch": 1.567235494880546,
"grad_norm": 0.061913736164569855,
"learning_rate": 0.00015245817497019524,
"loss": 0.4228,
"step": 574
},
{
"epoch": 1.5699658703071673,
"grad_norm": 0.05519396439194679,
"learning_rate": 0.00015225494113205393,
"loss": 0.4124,
"step": 575
},
{
"epoch": 1.5726962457337885,
"grad_norm": 0.05629811808466911,
"learning_rate": 0.00015205140995253283,
"loss": 0.418,
"step": 576
},
{
"epoch": 1.5754266211604095,
"grad_norm": 0.051916785538196564,
"learning_rate": 0.00015184758258976637,
"loss": 0.4327,
"step": 577
},
{
"epoch": 1.5781569965870306,
"grad_norm": 0.05583992972970009,
"learning_rate": 0.00015164346020357417,
"loss": 0.417,
"step": 578
},
{
"epoch": 1.580887372013652,
"grad_norm": 0.05611740052700043,
"learning_rate": 0.00015143904395545466,
"loss": 0.413,
"step": 579
},
{
"epoch": 1.583617747440273,
"grad_norm": 0.05637525022029877,
"learning_rate": 0.0001512343350085784,
"loss": 0.4113,
"step": 580
},
{
"epoch": 1.586348122866894,
"grad_norm": 0.059624236077070236,
"learning_rate": 0.0001510293345277815,
"loss": 0.4321,
"step": 581
},
{
"epoch": 1.5890784982935153,
"grad_norm": 0.05502263084053993,
"learning_rate": 0.0001508240436795589,
"loss": 0.409,
"step": 582
},
{
"epoch": 1.5918088737201366,
"grad_norm": 0.05809929221868515,
"learning_rate": 0.00015061846363205784,
"loss": 0.4129,
"step": 583
},
{
"epoch": 1.5945392491467576,
"grad_norm": 0.05428490787744522,
"learning_rate": 0.00015041259555507108,
"loss": 0.4181,
"step": 584
},
{
"epoch": 1.5972696245733788,
"grad_norm": 0.05276649072766304,
"learning_rate": 0.00015020644062003046,
"loss": 0.3996,
"step": 585
},
{
"epoch": 1.6,
"grad_norm": 0.06145811080932617,
"learning_rate": 0.00015000000000000001,
"loss": 0.4156,
"step": 586
},
{
"epoch": 1.6027303754266211,
"grad_norm": 0.05626256391406059,
"learning_rate": 0.00014979327486966938,
"loss": 0.4184,
"step": 587
},
{
"epoch": 1.6054607508532424,
"grad_norm": 0.06118204817175865,
"learning_rate": 0.0001495862664053471,
"loss": 0.4208,
"step": 588
},
{
"epoch": 1.6081911262798636,
"grad_norm": 0.06345456838607788,
"learning_rate": 0.0001493789757849541,
"loss": 0.4234,
"step": 589
},
{
"epoch": 1.6109215017064846,
"grad_norm": 0.058717817068099976,
"learning_rate": 0.00014917140418801655,
"loss": 0.4176,
"step": 590
},
{
"epoch": 1.6136518771331056,
"grad_norm": 0.05213068425655365,
"learning_rate": 0.00014896355279565976,
"loss": 0.3857,
"step": 591
},
{
"epoch": 1.6163822525597271,
"grad_norm": 0.056677792221307755,
"learning_rate": 0.00014875542279060085,
"loss": 0.4211,
"step": 592
},
{
"epoch": 1.6191126279863481,
"grad_norm": 0.058997780084609985,
"learning_rate": 0.00014854701535714244,
"loss": 0.4174,
"step": 593
},
{
"epoch": 1.6218430034129692,
"grad_norm": 0.0554414838552475,
"learning_rate": 0.00014833833168116582,
"loss": 0.4182,
"step": 594
},
{
"epoch": 1.6245733788395904,
"grad_norm": 0.06074132025241852,
"learning_rate": 0.00014812937295012406,
"loss": 0.4261,
"step": 595
},
{
"epoch": 1.6273037542662117,
"grad_norm": 0.05850062891840935,
"learning_rate": 0.00014792014035303535,
"loss": 0.4085,
"step": 596
},
{
"epoch": 1.6300341296928327,
"grad_norm": 0.06121140718460083,
"learning_rate": 0.00014771063508047636,
"loss": 0.4183,
"step": 597
},
{
"epoch": 1.632764505119454,
"grad_norm": 0.06299193948507309,
"learning_rate": 0.00014750085832457519,
"loss": 0.426,
"step": 598
},
{
"epoch": 1.6354948805460752,
"grad_norm": 0.06619743257761002,
"learning_rate": 0.00014729081127900476,
"loss": 0.4129,
"step": 599
},
{
"epoch": 1.6382252559726962,
"grad_norm": 0.05819617956876755,
"learning_rate": 0.0001470804951389761,
"loss": 0.4129,
"step": 600
},
{
"epoch": 1.6409556313993174,
"grad_norm": 0.06314659863710403,
"learning_rate": 0.00014686991110123135,
"loss": 0.3967,
"step": 601
},
{
"epoch": 1.6436860068259387,
"grad_norm": 0.05983169004321098,
"learning_rate": 0.00014665906036403706,
"loss": 0.4161,
"step": 602
},
{
"epoch": 1.6464163822525597,
"grad_norm": 0.06163496896624565,
"learning_rate": 0.00014644794412717736,
"loss": 0.4103,
"step": 603
},
{
"epoch": 1.6491467576791807,
"grad_norm": 0.06737516075372696,
"learning_rate": 0.00014623656359194712,
"loss": 0.4215,
"step": 604
},
{
"epoch": 1.6518771331058022,
"grad_norm": 0.058461885899305344,
"learning_rate": 0.00014602491996114516,
"loss": 0.4168,
"step": 605
},
{
"epoch": 1.6546075085324232,
"grad_norm": 0.06050106883049011,
"learning_rate": 0.0001458130144390673,
"loss": 0.4184,
"step": 606
},
{
"epoch": 1.6573378839590442,
"grad_norm": 0.059844836592674255,
"learning_rate": 0.00014560084823149965,
"loss": 0.4181,
"step": 607
},
{
"epoch": 1.6600682593856655,
"grad_norm": 0.05483812466263771,
"learning_rate": 0.0001453884225457116,
"loss": 0.3996,
"step": 608
},
{
"epoch": 1.6627986348122867,
"grad_norm": 0.06310712546110153,
"learning_rate": 0.00014517573859044907,
"loss": 0.4266,
"step": 609
},
{
"epoch": 1.6655290102389078,
"grad_norm": 0.06159716099500656,
"learning_rate": 0.00014496279757592766,
"loss": 0.4248,
"step": 610
},
{
"epoch": 1.668259385665529,
"grad_norm": 0.058709222823381424,
"learning_rate": 0.0001447496007138255,
"loss": 0.4067,
"step": 611
},
{
"epoch": 1.6709897610921502,
"grad_norm": 0.05836094543337822,
"learning_rate": 0.00014453614921727668,
"loss": 0.4005,
"step": 612
},
{
"epoch": 1.6737201365187713,
"grad_norm": 0.05980111286044121,
"learning_rate": 0.00014432244430086423,
"loss": 0.4222,
"step": 613
},
{
"epoch": 1.6764505119453925,
"grad_norm": 0.05967998504638672,
"learning_rate": 0.00014410848718061312,
"loss": 0.4075,
"step": 614
},
{
"epoch": 1.6791808873720138,
"grad_norm": 0.05903726816177368,
"learning_rate": 0.00014389427907398342,
"loss": 0.4007,
"step": 615
},
{
"epoch": 1.6819112627986348,
"grad_norm": 0.05877222120761871,
"learning_rate": 0.00014367982119986342,
"loss": 0.4234,
"step": 616
},
{
"epoch": 1.6846416382252558,
"grad_norm": 0.0625043734908104,
"learning_rate": 0.00014346511477856259,
"loss": 0.4165,
"step": 617
},
{
"epoch": 1.6873720136518773,
"grad_norm": 0.05730627477169037,
"learning_rate": 0.0001432501610318047,
"loss": 0.4221,
"step": 618
},
{
"epoch": 1.6901023890784983,
"grad_norm": 0.05606284737586975,
"learning_rate": 0.00014303496118272084,
"loss": 0.4201,
"step": 619
},
{
"epoch": 1.6928327645051193,
"grad_norm": 0.056516390293836594,
"learning_rate": 0.0001428195164558425,
"loss": 0.4241,
"step": 620
},
{
"epoch": 1.6955631399317406,
"grad_norm": 0.0579177550971508,
"learning_rate": 0.00014260382807709457,
"loss": 0.4147,
"step": 621
},
{
"epoch": 1.6982935153583618,
"grad_norm": 0.05802591145038605,
"learning_rate": 0.0001423878972737883,
"loss": 0.409,
"step": 622
},
{
"epoch": 1.7010238907849828,
"grad_norm": 0.05921417847275734,
"learning_rate": 0.0001421717252746145,
"loss": 0.4126,
"step": 623
},
{
"epoch": 1.703754266211604,
"grad_norm": 0.0596776120364666,
"learning_rate": 0.00014195531330963635,
"loss": 0.405,
"step": 624
},
{
"epoch": 1.7064846416382253,
"grad_norm": 0.057035986334085464,
"learning_rate": 0.0001417386626102825,
"loss": 0.4208,
"step": 625
},
{
"epoch": 1.7092150170648464,
"grad_norm": 0.05868854373693466,
"learning_rate": 0.00014152177440934012,
"loss": 0.4186,
"step": 626
},
{
"epoch": 1.7119453924914676,
"grad_norm": 0.058524154126644135,
"learning_rate": 0.0001413046499409477,
"loss": 0.4072,
"step": 627
},
{
"epoch": 1.7146757679180888,
"grad_norm": 0.05203258991241455,
"learning_rate": 0.0001410872904405882,
"loss": 0.3929,
"step": 628
},
{
"epoch": 1.7174061433447099,
"grad_norm": 0.059925347566604614,
"learning_rate": 0.00014086969714508196,
"loss": 0.4211,
"step": 629
},
{
"epoch": 1.7201365187713311,
"grad_norm": 0.0577407106757164,
"learning_rate": 0.00014065187129257964,
"loss": 0.4128,
"step": 630
},
{
"epoch": 1.7228668941979524,
"grad_norm": 0.06548412144184113,
"learning_rate": 0.00014043381412255526,
"loss": 0.4117,
"step": 631
},
{
"epoch": 1.7255972696245734,
"grad_norm": 0.060420285910367966,
"learning_rate": 0.00014021552687579902,
"loss": 0.4176,
"step": 632
},
{
"epoch": 1.7283276450511944,
"grad_norm": 0.05787500739097595,
"learning_rate": 0.00013999701079441028,
"loss": 0.4173,
"step": 633
},
{
"epoch": 1.7310580204778157,
"grad_norm": 0.10321489721536636,
"learning_rate": 0.00013977826712179058,
"loss": 0.4098,
"step": 634
},
{
"epoch": 1.733788395904437,
"grad_norm": 0.05935697257518768,
"learning_rate": 0.00013955929710263653,
"loss": 0.433,
"step": 635
},
{
"epoch": 1.736518771331058,
"grad_norm": 0.05731033533811569,
"learning_rate": 0.00013934010198293257,
"loss": 0.4117,
"step": 636
},
{
"epoch": 1.7392491467576792,
"grad_norm": 0.05932068079710007,
"learning_rate": 0.00013912068300994413,
"loss": 0.4,
"step": 637
},
{
"epoch": 1.7419795221843004,
"grad_norm": 0.06352514028549194,
"learning_rate": 0.0001389010414322104,
"loss": 0.4135,
"step": 638
},
{
"epoch": 1.7447098976109214,
"grad_norm": 0.0548391118645668,
"learning_rate": 0.0001386811784995371,
"loss": 0.3998,
"step": 639
},
{
"epoch": 1.7474402730375427,
"grad_norm": 0.05962222442030907,
"learning_rate": 0.00013846109546298971,
"loss": 0.3982,
"step": 640
},
{
"epoch": 1.750170648464164,
"grad_norm": 0.056578923016786575,
"learning_rate": 0.00013824079357488598,
"loss": 0.4187,
"step": 641
},
{
"epoch": 1.752901023890785,
"grad_norm": 0.05794934183359146,
"learning_rate": 0.0001380202740887891,
"loss": 0.406,
"step": 642
},
{
"epoch": 1.7556313993174062,
"grad_norm": 0.056768182665109634,
"learning_rate": 0.00013779953825950034,
"loss": 0.4129,
"step": 643
},
{
"epoch": 1.7583617747440274,
"grad_norm": 0.06082385033369064,
"learning_rate": 0.00013757858734305203,
"loss": 0.4226,
"step": 644
},
{
"epoch": 1.7610921501706485,
"grad_norm": 0.059198446571826935,
"learning_rate": 0.0001373574225967004,
"loss": 0.405,
"step": 645
},
{
"epoch": 1.7638225255972695,
"grad_norm": 0.06012206897139549,
"learning_rate": 0.00013713604527891844,
"loss": 0.4192,
"step": 646
},
{
"epoch": 1.7665529010238907,
"grad_norm": 0.06151711568236351,
"learning_rate": 0.00013691445664938866,
"loss": 0.4206,
"step": 647
},
{
"epoch": 1.769283276450512,
"grad_norm": 0.06284491717815399,
"learning_rate": 0.00013669265796899607,
"loss": 0.4118,
"step": 648
},
{
"epoch": 1.772013651877133,
"grad_norm": 0.06001686304807663,
"learning_rate": 0.00013647065049982078,
"loss": 0.4293,
"step": 649
},
{
"epoch": 1.7747440273037542,
"grad_norm": 0.05952538549900055,
"learning_rate": 0.0001362484355051311,
"loss": 0.4114,
"step": 650
},
{
"epoch": 1.7774744027303755,
"grad_norm": 0.057195715606212616,
"learning_rate": 0.00013602601424937604,
"loss": 0.4104,
"step": 651
},
{
"epoch": 1.7802047781569965,
"grad_norm": 0.05979065224528313,
"learning_rate": 0.00013580338799817844,
"loss": 0.4321,
"step": 652
},
{
"epoch": 1.7829351535836178,
"grad_norm": 0.06188386306166649,
"learning_rate": 0.00013558055801832748,
"loss": 0.4044,
"step": 653
},
{
"epoch": 1.785665529010239,
"grad_norm": 0.060921113938093185,
"learning_rate": 0.0001353575255777717,
"loss": 0.422,
"step": 654
},
{
"epoch": 1.78839590443686,
"grad_norm": 0.0592602975666523,
"learning_rate": 0.0001351342919456116,
"loss": 0.3936,
"step": 655
},
{
"epoch": 1.7911262798634813,
"grad_norm": 0.06046243757009506,
"learning_rate": 0.0001349108583920925,
"loss": 0.4251,
"step": 656
},
{
"epoch": 1.7938566552901025,
"grad_norm": 0.05771365761756897,
"learning_rate": 0.00013468722618859743,
"loss": 0.4073,
"step": 657
},
{
"epoch": 1.7965870307167235,
"grad_norm": 0.05681789293885231,
"learning_rate": 0.0001344633966076396,
"loss": 0.4074,
"step": 658
},
{
"epoch": 1.7993174061433446,
"grad_norm": 0.05813178792595863,
"learning_rate": 0.00013423937092285555,
"loss": 0.3896,
"step": 659
},
{
"epoch": 1.802047781569966,
"grad_norm": 0.05757216364145279,
"learning_rate": 0.00013401515040899746,
"loss": 0.4178,
"step": 660
},
{
"epoch": 1.804778156996587,
"grad_norm": 0.057594846934080124,
"learning_rate": 0.00013379073634192632,
"loss": 0.3785,
"step": 661
},
{
"epoch": 1.807508532423208,
"grad_norm": 0.06386829912662506,
"learning_rate": 0.00013356612999860436,
"loss": 0.4017,
"step": 662
},
{
"epoch": 1.8102389078498293,
"grad_norm": 0.059352222830057144,
"learning_rate": 0.000133341332657088,
"loss": 0.4053,
"step": 663
},
{
"epoch": 1.8129692832764506,
"grad_norm": 0.058490559458732605,
"learning_rate": 0.00013311634559652036,
"loss": 0.4036,
"step": 664
},
{
"epoch": 1.8156996587030716,
"grad_norm": 0.0580880232155323,
"learning_rate": 0.00013289117009712418,
"loss": 0.4075,
"step": 665
},
{
"epoch": 1.8184300341296928,
"grad_norm": 0.054440416395664215,
"learning_rate": 0.00013266580744019445,
"loss": 0.4139,
"step": 666
},
{
"epoch": 1.821160409556314,
"grad_norm": 0.058102305978536606,
"learning_rate": 0.00013244025890809112,
"loss": 0.4051,
"step": 667
},
{
"epoch": 1.823890784982935,
"grad_norm": 0.06036128103733063,
"learning_rate": 0.00013221452578423176,
"loss": 0.4091,
"step": 668
},
{
"epoch": 1.8266211604095564,
"grad_norm": 0.061323538422584534,
"learning_rate": 0.00013198860935308444,
"loss": 0.4273,
"step": 669
},
{
"epoch": 1.8293515358361776,
"grad_norm": 0.06144220754504204,
"learning_rate": 0.00013176251090016007,
"loss": 0.4228,
"step": 670
},
{
"epoch": 1.8320819112627986,
"grad_norm": 0.05480247363448143,
"learning_rate": 0.0001315362317120055,
"loss": 0.4078,
"step": 671
},
{
"epoch": 1.8348122866894196,
"grad_norm": 0.0559588298201561,
"learning_rate": 0.00013130977307619594,
"loss": 0.4015,
"step": 672
},
{
"epoch": 1.8375426621160411,
"grad_norm": 0.0562249980866909,
"learning_rate": 0.0001310831362813276,
"loss": 0.4216,
"step": 673
},
{
"epoch": 1.8402730375426621,
"grad_norm": 0.05529346689581871,
"learning_rate": 0.00013085632261701063,
"loss": 0.3991,
"step": 674
},
{
"epoch": 1.8430034129692832,
"grad_norm": 0.055582497268915176,
"learning_rate": 0.00013062933337386142,
"loss": 0.3956,
"step": 675
},
{
"epoch": 1.8457337883959044,
"grad_norm": 0.057054124772548676,
"learning_rate": 0.00013040216984349555,
"loss": 0.398,
"step": 676
},
{
"epoch": 1.8484641638225257,
"grad_norm": 0.057355768978595734,
"learning_rate": 0.00013017483331852035,
"loss": 0.4059,
"step": 677
},
{
"epoch": 1.8511945392491467,
"grad_norm": 0.056889165192842484,
"learning_rate": 0.00012994732509252744,
"loss": 0.3806,
"step": 678
},
{
"epoch": 1.853924914675768,
"grad_norm": 0.057586781680583954,
"learning_rate": 0.00012971964646008542,
"loss": 0.4104,
"step": 679
},
{
"epoch": 1.8566552901023892,
"grad_norm": 0.059306979179382324,
"learning_rate": 0.00012949179871673278,
"loss": 0.4033,
"step": 680
},
{
"epoch": 1.8593856655290102,
"grad_norm": 0.057881347835063934,
"learning_rate": 0.00012926378315896998,
"loss": 0.4135,
"step": 681
},
{
"epoch": 1.8621160409556314,
"grad_norm": 0.06169261038303375,
"learning_rate": 0.00012903560108425258,
"loss": 0.412,
"step": 682
},
{
"epoch": 1.8648464163822527,
"grad_norm": 0.05441267788410187,
"learning_rate": 0.00012880725379098352,
"loss": 0.3986,
"step": 683
},
{
"epoch": 1.8675767918088737,
"grad_norm": 0.061068952083587646,
"learning_rate": 0.00012857874257850605,
"loss": 0.418,
"step": 684
},
{
"epoch": 1.8703071672354947,
"grad_norm": 0.058384671807289124,
"learning_rate": 0.00012835006874709594,
"loss": 0.4074,
"step": 685
},
{
"epoch": 1.8730375426621162,
"grad_norm": 0.0570659376680851,
"learning_rate": 0.00012812123359795446,
"loss": 0.4149,
"step": 686
},
{
"epoch": 1.8757679180887372,
"grad_norm": 0.05798759683966637,
"learning_rate": 0.00012789223843320073,
"loss": 0.4022,
"step": 687
},
{
"epoch": 1.8784982935153582,
"grad_norm": 0.059756677597761154,
"learning_rate": 0.0001276630845558644,
"loss": 0.4152,
"step": 688
},
{
"epoch": 1.8812286689419795,
"grad_norm": 0.05982014164328575,
"learning_rate": 0.00012743377326987826,
"loss": 0.4127,
"step": 689
},
{
"epoch": 1.8839590443686007,
"grad_norm": 0.05929556116461754,
"learning_rate": 0.00012720430588007077,
"loss": 0.405,
"step": 690
},
{
"epoch": 1.8866894197952218,
"grad_norm": 0.05722184479236603,
"learning_rate": 0.00012697468369215863,
"loss": 0.3978,
"step": 691
},
{
"epoch": 1.889419795221843,
"grad_norm": 0.05866376683115959,
"learning_rate": 0.00012674490801273938,
"loss": 0.417,
"step": 692
},
{
"epoch": 1.8921501706484642,
"grad_norm": 0.055445022881031036,
"learning_rate": 0.00012651498014928402,
"loss": 0.4161,
"step": 693
},
{
"epoch": 1.8948805460750853,
"grad_norm": 0.06086587905883789,
"learning_rate": 0.00012628490141012937,
"loss": 0.402,
"step": 694
},
{
"epoch": 1.8976109215017065,
"grad_norm": 0.06076718121767044,
"learning_rate": 0.000126054673104471,
"loss": 0.414,
"step": 695
},
{
"epoch": 1.9003412969283278,
"grad_norm": 0.055698879063129425,
"learning_rate": 0.00012582429654235523,
"loss": 0.3926,
"step": 696
},
{
"epoch": 1.9030716723549488,
"grad_norm": 0.056595612317323685,
"learning_rate": 0.00012559377303467226,
"loss": 0.4135,
"step": 697
},
{
"epoch": 1.9058020477815698,
"grad_norm": 0.05591044947504997,
"learning_rate": 0.00012536310389314832,
"loss": 0.4074,
"step": 698
},
{
"epoch": 1.9085324232081913,
"grad_norm": 0.06135864555835724,
"learning_rate": 0.0001251322904303383,
"loss": 0.4203,
"step": 699
},
{
"epoch": 1.9112627986348123,
"grad_norm": 0.058106984943151474,
"learning_rate": 0.00012490133395961844,
"loss": 0.4046,
"step": 700
},
{
"epoch": 1.9139931740614333,
"grad_norm": 0.059473518282175064,
"learning_rate": 0.00012467023579517856,
"loss": 0.4027,
"step": 701
},
{
"epoch": 1.9167235494880546,
"grad_norm": 0.057781342417001724,
"learning_rate": 0.00012443899725201482,
"loss": 0.4163,
"step": 702
},
{
"epoch": 1.9194539249146758,
"grad_norm": 0.0613093338906765,
"learning_rate": 0.00012420761964592223,
"loss": 0.4127,
"step": 703
},
{
"epoch": 1.9221843003412968,
"grad_norm": 0.05781256780028343,
"learning_rate": 0.000123976104293487,
"loss": 0.398,
"step": 704
},
{
"epoch": 1.924914675767918,
"grad_norm": 0.057743050158023834,
"learning_rate": 0.00012374445251207914,
"loss": 0.3969,
"step": 705
},
{
"epoch": 1.9276450511945393,
"grad_norm": 0.0608978345990181,
"learning_rate": 0.00012351266561984507,
"loss": 0.4037,
"step": 706
},
{
"epoch": 1.9303754266211604,
"grad_norm": 0.05937394127249718,
"learning_rate": 0.00012328074493569993,
"loss": 0.3964,
"step": 707
},
{
"epoch": 1.9331058020477816,
"grad_norm": 0.06584876775741577,
"learning_rate": 0.0001230486917793202,
"loss": 0.4186,
"step": 708
},
{
"epoch": 1.9358361774744028,
"grad_norm": 0.06222471594810486,
"learning_rate": 0.00012281650747113612,
"loss": 0.4178,
"step": 709
},
{
"epoch": 1.9385665529010239,
"grad_norm": 0.05962240695953369,
"learning_rate": 0.0001225841933323242,
"loss": 0.3898,
"step": 710
},
{
"epoch": 1.9412969283276449,
"grad_norm": 0.06118809059262276,
"learning_rate": 0.00012235175068479984,
"loss": 0.3926,
"step": 711
},
{
"epoch": 1.9440273037542664,
"grad_norm": 0.05581739544868469,
"learning_rate": 0.00012211918085120954,
"loss": 0.3907,
"step": 712
},
{
"epoch": 1.9467576791808874,
"grad_norm": 0.05898397043347359,
"learning_rate": 0.00012188648515492355,
"loss": 0.3979,
"step": 713
},
{
"epoch": 1.9494880546075084,
"grad_norm": 0.06158998981118202,
"learning_rate": 0.00012165366492002832,
"loss": 0.4138,
"step": 714
},
{
"epoch": 1.9522184300341296,
"grad_norm": 0.06278332322835922,
"learning_rate": 0.00012142072147131898,
"loss": 0.4141,
"step": 715
},
{
"epoch": 1.954948805460751,
"grad_norm": 0.06232950836420059,
"learning_rate": 0.00012118765613429173,
"loss": 0.4058,
"step": 716
},
{
"epoch": 1.957679180887372,
"grad_norm": 0.05726422742009163,
"learning_rate": 0.0001209544702351363,
"loss": 0.4021,
"step": 717
},
{
"epoch": 1.9604095563139932,
"grad_norm": 0.05597952753305435,
"learning_rate": 0.00012072116510072858,
"loss": 0.3965,
"step": 718
},
{
"epoch": 1.9631399317406144,
"grad_norm": 0.0619698166847229,
"learning_rate": 0.00012048774205862279,
"loss": 0.4112,
"step": 719
},
{
"epoch": 1.9658703071672354,
"grad_norm": 0.05994318053126335,
"learning_rate": 0.0001202542024370441,
"loss": 0.4186,
"step": 720
},
{
"epoch": 1.9686006825938567,
"grad_norm": 0.06278800964355469,
"learning_rate": 0.00012002054756488115,
"loss": 0.4122,
"step": 721
},
{
"epoch": 1.971331058020478,
"grad_norm": 0.06267794966697693,
"learning_rate": 0.00011978677877167822,
"loss": 0.4057,
"step": 722
},
{
"epoch": 1.974061433447099,
"grad_norm": 0.06913238018751144,
"learning_rate": 0.00011955289738762796,
"loss": 0.4069,
"step": 723
},
{
"epoch": 1.9767918088737202,
"grad_norm": 0.06418196856975555,
"learning_rate": 0.00011931890474356358,
"loss": 0.4078,
"step": 724
},
{
"epoch": 1.9795221843003414,
"grad_norm": 0.06403093785047531,
"learning_rate": 0.00011908480217095141,
"loss": 0.4062,
"step": 725
},
{
"epoch": 1.9822525597269625,
"grad_norm": 0.0585256926715374,
"learning_rate": 0.00011885059100188341,
"loss": 0.409,
"step": 726
},
{
"epoch": 1.9849829351535835,
"grad_norm": 0.06400654464960098,
"learning_rate": 0.00011861627256906929,
"loss": 0.4113,
"step": 727
},
{
"epoch": 1.9877133105802047,
"grad_norm": 0.06193806603550911,
"learning_rate": 0.00011838184820582923,
"loss": 0.4194,
"step": 728
},
{
"epoch": 1.990443686006826,
"grad_norm": 0.05865743011236191,
"learning_rate": 0.00011814731924608616,
"loss": 0.4002,
"step": 729
},
{
"epoch": 1.993174061433447,
"grad_norm": 0.05942784622311592,
"learning_rate": 0.00011791268702435816,
"loss": 0.4047,
"step": 730
},
{
"epoch": 1.9959044368600682,
"grad_norm": 0.056138355284929276,
"learning_rate": 0.0001176779528757509,
"loss": 0.4084,
"step": 731
},
{
"epoch": 1.9986348122866895,
"grad_norm": 0.058754485100507736,
"learning_rate": 0.00011744311813595006,
"loss": 0.3986,
"step": 732
},
{
"epoch": 1.9986348122866895,
"eval_loss": 0.40529951453208923,
"eval_runtime": 310.303,
"eval_samples_per_second": 8.392,
"eval_steps_per_second": 1.051,
"step": 732
}
],
"logging_steps": 1,
"max_steps": 1464,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0372438099073434e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}