{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 874,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002288329519450801,
"grad_norm": 18.883691787719727,
"learning_rate": 2.2727272727272728e-06,
"loss": 10.3613,
"step": 1
},
{
"epoch": 0.004576659038901602,
"grad_norm": 23.170867919921875,
"learning_rate": 4.5454545454545455e-06,
"loss": 11.4465,
"step": 2
},
{
"epoch": 0.006864988558352402,
"grad_norm": 22.565134048461914,
"learning_rate": 6.818181818181818e-06,
"loss": 11.6681,
"step": 3
},
{
"epoch": 0.009153318077803204,
"grad_norm": 17.173921585083008,
"learning_rate": 9.090909090909091e-06,
"loss": 9.774,
"step": 4
},
{
"epoch": 0.011441647597254004,
"grad_norm": 20.478288650512695,
"learning_rate": 1.1363636363636365e-05,
"loss": 9.4622,
"step": 5
},
{
"epoch": 0.013729977116704805,
"grad_norm": 16.507568359375,
"learning_rate": 1.3636363636363637e-05,
"loss": 9.8348,
"step": 6
},
{
"epoch": 0.016018306636155607,
"grad_norm": 19.91631317138672,
"learning_rate": 1.590909090909091e-05,
"loss": 8.3954,
"step": 7
},
{
"epoch": 0.018306636155606407,
"grad_norm": 18.219541549682617,
"learning_rate": 1.8181818181818182e-05,
"loss": 8.8841,
"step": 8
},
{
"epoch": 0.020594965675057208,
"grad_norm": 18.8386173248291,
"learning_rate": 2.0454545454545457e-05,
"loss": 7.7361,
"step": 9
},
{
"epoch": 0.02288329519450801,
"grad_norm": 39.84943771362305,
"learning_rate": 2.272727272727273e-05,
"loss": 7.8007,
"step": 10
},
{
"epoch": 0.02517162471395881,
"grad_norm": 37.528114318847656,
"learning_rate": 2.5e-05,
"loss": 7.3203,
"step": 11
},
{
"epoch": 0.02745995423340961,
"grad_norm": 17.135168075561523,
"learning_rate": 2.7272727272727273e-05,
"loss": 6.4416,
"step": 12
},
{
"epoch": 0.029748283752860413,
"grad_norm": 22.87267303466797,
"learning_rate": 2.954545454545455e-05,
"loss": 6.7089,
"step": 13
},
{
"epoch": 0.032036613272311214,
"grad_norm": 15.40108871459961,
"learning_rate": 3.181818181818182e-05,
"loss": 5.8722,
"step": 14
},
{
"epoch": 0.034324942791762014,
"grad_norm": 14.930537223815918,
"learning_rate": 3.409090909090909e-05,
"loss": 5.4715,
"step": 15
},
{
"epoch": 0.036613272311212815,
"grad_norm": 24.848709106445312,
"learning_rate": 3.6363636363636364e-05,
"loss": 6.0102,
"step": 16
},
{
"epoch": 0.038901601830663615,
"grad_norm": 9.122722625732422,
"learning_rate": 3.8636363636363636e-05,
"loss": 5.1952,
"step": 17
},
{
"epoch": 0.041189931350114416,
"grad_norm": 19.82050895690918,
"learning_rate": 4.0909090909090915e-05,
"loss": 5.4025,
"step": 18
},
{
"epoch": 0.043478260869565216,
"grad_norm": 14.143157958984375,
"learning_rate": 4.318181818181819e-05,
"loss": 5.4865,
"step": 19
},
{
"epoch": 0.04576659038901602,
"grad_norm": 13.207295417785645,
"learning_rate": 4.545454545454546e-05,
"loss": 4.9825,
"step": 20
},
{
"epoch": 0.04805491990846682,
"grad_norm": 14.648661613464355,
"learning_rate": 4.772727272727273e-05,
"loss": 5.2058,
"step": 21
},
{
"epoch": 0.05034324942791762,
"grad_norm": 14.701571464538574,
"learning_rate": 5e-05,
"loss": 4.878,
"step": 22
},
{
"epoch": 0.05263157894736842,
"grad_norm": 11.080320358276367,
"learning_rate": 5.2272727272727274e-05,
"loss": 4.8911,
"step": 23
},
{
"epoch": 0.05491990846681922,
"grad_norm": 7.584497451782227,
"learning_rate": 5.4545454545454546e-05,
"loss": 4.755,
"step": 24
},
{
"epoch": 0.057208237986270026,
"grad_norm": 5.899540424346924,
"learning_rate": 5.6818181818181825e-05,
"loss": 4.7097,
"step": 25
},
{
"epoch": 0.059496567505720827,
"grad_norm": 25.04722785949707,
"learning_rate": 5.90909090909091e-05,
"loss": 4.2626,
"step": 26
},
{
"epoch": 0.06178489702517163,
"grad_norm": 22.6114501953125,
"learning_rate": 6.136363636363636e-05,
"loss": 4.7042,
"step": 27
},
{
"epoch": 0.06407322654462243,
"grad_norm": 9.285355567932129,
"learning_rate": 6.363636363636364e-05,
"loss": 4.4734,
"step": 28
},
{
"epoch": 0.06636155606407322,
"grad_norm": 6.6501874923706055,
"learning_rate": 6.59090909090909e-05,
"loss": 4.381,
"step": 29
},
{
"epoch": 0.06864988558352403,
"grad_norm": 5.746420860290527,
"learning_rate": 6.818181818181818e-05,
"loss": 4.2421,
"step": 30
},
{
"epoch": 0.07093821510297482,
"grad_norm": 3.9305100440979004,
"learning_rate": 7.045454545454546e-05,
"loss": 4.1085,
"step": 31
},
{
"epoch": 0.07322654462242563,
"grad_norm": 9.795059204101562,
"learning_rate": 7.272727272727273e-05,
"loss": 4.0933,
"step": 32
},
{
"epoch": 0.07551487414187644,
"grad_norm": 5.907273769378662,
"learning_rate": 7.500000000000001e-05,
"loss": 4.1663,
"step": 33
},
{
"epoch": 0.07780320366132723,
"grad_norm": 6.9800190925598145,
"learning_rate": 7.727272727272727e-05,
"loss": 4.1722,
"step": 34
},
{
"epoch": 0.08009153318077804,
"grad_norm": 7.007209777832031,
"learning_rate": 7.954545454545455e-05,
"loss": 4.1487,
"step": 35
},
{
"epoch": 0.08237986270022883,
"grad_norm": 7.624848365783691,
"learning_rate": 8.181818181818183e-05,
"loss": 4.2597,
"step": 36
},
{
"epoch": 0.08466819221967964,
"grad_norm": 6.604410171508789,
"learning_rate": 8.40909090909091e-05,
"loss": 4.1198,
"step": 37
},
{
"epoch": 0.08695652173913043,
"grad_norm": 5.09440803527832,
"learning_rate": 8.636363636363637e-05,
"loss": 3.8203,
"step": 38
},
{
"epoch": 0.08924485125858124,
"grad_norm": 5.450263500213623,
"learning_rate": 8.863636363636364e-05,
"loss": 3.9002,
"step": 39
},
{
"epoch": 0.09153318077803203,
"grad_norm": 3.4505696296691895,
"learning_rate": 9.090909090909092e-05,
"loss": 3.9174,
"step": 40
},
{
"epoch": 0.09382151029748284,
"grad_norm": 7.434864044189453,
"learning_rate": 9.318181818181818e-05,
"loss": 4.0441,
"step": 41
},
{
"epoch": 0.09610983981693363,
"grad_norm": 12.046749114990234,
"learning_rate": 9.545454545454546e-05,
"loss": 3.7604,
"step": 42
},
{
"epoch": 0.09839816933638444,
"grad_norm": 9.222173690795898,
"learning_rate": 9.772727272727274e-05,
"loss": 4.1223,
"step": 43
},
{
"epoch": 0.10068649885583524,
"grad_norm": 4.6726789474487305,
"learning_rate": 0.0001,
"loss": 3.9827,
"step": 44
},
{
"epoch": 0.10297482837528604,
"grad_norm": 4.8803791999816895,
"learning_rate": 9.999964183507702e-05,
"loss": 3.8722,
"step": 45
},
{
"epoch": 0.10526315789473684,
"grad_norm": 6.589260101318359,
"learning_rate": 9.999856734543933e-05,
"loss": 3.7152,
"step": 46
},
{
"epoch": 0.10755148741418764,
"grad_norm": 3.9649579524993896,
"learning_rate": 9.999677654648072e-05,
"loss": 3.606,
"step": 47
},
{
"epoch": 0.10983981693363844,
"grad_norm": 7.821356296539307,
"learning_rate": 9.999426946385727e-05,
"loss": 3.9462,
"step": 48
},
{
"epoch": 0.11212814645308924,
"grad_norm": 7.566446304321289,
"learning_rate": 9.999104613348688e-05,
"loss": 3.5093,
"step": 49
},
{
"epoch": 0.11441647597254005,
"grad_norm": 6.507759094238281,
"learning_rate": 9.998710660154898e-05,
"loss": 3.8733,
"step": 50
},
{
"epoch": 0.11670480549199085,
"grad_norm": 3.9552855491638184,
"learning_rate": 9.998245092448362e-05,
"loss": 3.6431,
"step": 51
},
{
"epoch": 0.11899313501144165,
"grad_norm": 6.334136486053467,
"learning_rate": 9.997707916899079e-05,
"loss": 3.602,
"step": 52
},
{
"epoch": 0.12128146453089245,
"grad_norm": 3.8587405681610107,
"learning_rate": 9.99709914120295e-05,
"loss": 3.5833,
"step": 53
},
{
"epoch": 0.12356979405034325,
"grad_norm": 8.523442268371582,
"learning_rate": 9.996418774081658e-05,
"loss": 3.7635,
"step": 54
},
{
"epoch": 0.12585812356979406,
"grad_norm": 6.120156288146973,
"learning_rate": 9.995666825282547e-05,
"loss": 3.5929,
"step": 55
},
{
"epoch": 0.12814645308924486,
"grad_norm": 6.545000076293945,
"learning_rate": 9.994843305578486e-05,
"loss": 3.5276,
"step": 56
},
{
"epoch": 0.13043478260869565,
"grad_norm": 6.0784759521484375,
"learning_rate": 9.99394822676771e-05,
"loss": 3.7363,
"step": 57
},
{
"epoch": 0.13272311212814644,
"grad_norm": 3.7013518810272217,
"learning_rate": 9.99298160167365e-05,
"loss": 3.4895,
"step": 58
},
{
"epoch": 0.13501144164759726,
"grad_norm": 7.4772419929504395,
"learning_rate": 9.991943444144757e-05,
"loss": 3.6833,
"step": 59
},
{
"epoch": 0.13729977116704806,
"grad_norm": 19.202077865600586,
"learning_rate": 9.990833769054293e-05,
"loss": 3.531,
"step": 60
},
{
"epoch": 0.13958810068649885,
"grad_norm": 5.234053611755371,
"learning_rate": 9.989652592300128e-05,
"loss": 3.6795,
"step": 61
},
{
"epoch": 0.14187643020594964,
"grad_norm": 4.528527736663818,
"learning_rate": 9.988399930804504e-05,
"loss": 3.6353,
"step": 62
},
{
"epoch": 0.14416475972540047,
"grad_norm": 4.144354820251465,
"learning_rate": 9.987075802513797e-05,
"loss": 3.7929,
"step": 63
},
{
"epoch": 0.14645308924485126,
"grad_norm": 7.227776050567627,
"learning_rate": 9.985680226398261e-05,
"loss": 3.6075,
"step": 64
},
{
"epoch": 0.14874141876430205,
"grad_norm": 3.7296457290649414,
"learning_rate": 9.98421322245175e-05,
"loss": 3.5842,
"step": 65
},
{
"epoch": 0.15102974828375287,
"grad_norm": 4.071051597595215,
"learning_rate": 9.98267481169144e-05,
"loss": 3.5031,
"step": 66
},
{
"epoch": 0.15331807780320367,
"grad_norm": 4.512979030609131,
"learning_rate": 9.981065016157522e-05,
"loss": 3.529,
"step": 67
},
{
"epoch": 0.15560640732265446,
"grad_norm": 4.703476905822754,
"learning_rate": 9.979383858912885e-05,
"loss": 3.4393,
"step": 68
},
{
"epoch": 0.15789473684210525,
"grad_norm": 3.3399972915649414,
"learning_rate": 9.977631364042795e-05,
"loss": 3.3321,
"step": 69
},
{
"epoch": 0.16018306636155608,
"grad_norm": 4.721877574920654,
"learning_rate": 9.975807556654537e-05,
"loss": 3.3929,
"step": 70
},
{
"epoch": 0.16247139588100687,
"grad_norm": 4.589676856994629,
"learning_rate": 9.973912462877066e-05,
"loss": 3.5313,
"step": 71
},
{
"epoch": 0.16475972540045766,
"grad_norm": 3.2385528087615967,
"learning_rate": 9.971946109860626e-05,
"loss": 3.3098,
"step": 72
},
{
"epoch": 0.16704805491990846,
"grad_norm": 10.516329765319824,
"learning_rate": 9.969908525776364e-05,
"loss": 3.2536,
"step": 73
},
{
"epoch": 0.16933638443935928,
"grad_norm": 4.250394821166992,
"learning_rate": 9.967799739815925e-05,
"loss": 3.5101,
"step": 74
},
{
"epoch": 0.17162471395881007,
"grad_norm": 6.82457160949707,
"learning_rate": 9.965619782191036e-05,
"loss": 3.3573,
"step": 75
},
{
"epoch": 0.17391304347826086,
"grad_norm": 3.3551783561706543,
"learning_rate": 9.963368684133072e-05,
"loss": 3.2689,
"step": 76
},
{
"epoch": 0.17620137299771166,
"grad_norm": 3.7537643909454346,
"learning_rate": 9.961046477892608e-05,
"loss": 3.4124,
"step": 77
},
{
"epoch": 0.17848970251716248,
"grad_norm": 4.550995826721191,
"learning_rate": 9.958653196738954e-05,
"loss": 3.2057,
"step": 78
},
{
"epoch": 0.18077803203661327,
"grad_norm": 3.5284054279327393,
"learning_rate": 9.956188874959687e-05,
"loss": 3.3048,
"step": 79
},
{
"epoch": 0.18306636155606407,
"grad_norm": 3.11651873588562,
"learning_rate": 9.953653547860151e-05,
"loss": 3.2424,
"step": 80
},
{
"epoch": 0.1853546910755149,
"grad_norm": 3.3010475635528564,
"learning_rate": 9.951047251762954e-05,
"loss": 3.3064,
"step": 81
},
{
"epoch": 0.18764302059496568,
"grad_norm": 3.0703697204589844,
"learning_rate": 9.948370024007454e-05,
"loss": 3.2671,
"step": 82
},
{
"epoch": 0.18993135011441648,
"grad_norm": 3.3035237789154053,
"learning_rate": 9.94562190294921e-05,
"loss": 3.2557,
"step": 83
},
{
"epoch": 0.19221967963386727,
"grad_norm": 4.406396865844727,
"learning_rate": 9.942802927959443e-05,
"loss": 3.2696,
"step": 84
},
{
"epoch": 0.1945080091533181,
"grad_norm": 3.550173759460449,
"learning_rate": 9.939913139424476e-05,
"loss": 3.195,
"step": 85
},
{
"epoch": 0.19679633867276888,
"grad_norm": 3.505958318710327,
"learning_rate": 9.936952578745142e-05,
"loss": 3.1276,
"step": 86
},
{
"epoch": 0.19908466819221968,
"grad_norm": 3.840190887451172,
"learning_rate": 9.933921288336201e-05,
"loss": 3.164,
"step": 87
},
{
"epoch": 0.20137299771167047,
"grad_norm": 3.551823139190674,
"learning_rate": 9.93081931162573e-05,
"loss": 3.1447,
"step": 88
},
{
"epoch": 0.2036613272311213,
"grad_norm": 2.611680269241333,
"learning_rate": 9.927646693054496e-05,
"loss": 2.9314,
"step": 89
},
{
"epoch": 0.20594965675057209,
"grad_norm": 3.5281317234039307,
"learning_rate": 9.92440347807533e-05,
"loss": 3.1775,
"step": 90
},
{
"epoch": 0.20823798627002288,
"grad_norm": 2.898061513900757,
"learning_rate": 9.921089713152462e-05,
"loss": 3.1434,
"step": 91
},
{
"epoch": 0.21052631578947367,
"grad_norm": 2.3443377017974854,
"learning_rate": 9.91770544576087e-05,
"loss": 3.1309,
"step": 92
},
{
"epoch": 0.2128146453089245,
"grad_norm": 3.889735460281372,
"learning_rate": 9.914250724385588e-05,
"loss": 3.413,
"step": 93
},
{
"epoch": 0.2151029748283753,
"grad_norm": 2.6670336723327637,
"learning_rate": 9.910725598521013e-05,
"loss": 3.3243,
"step": 94
},
{
"epoch": 0.21739130434782608,
"grad_norm": 3.1559407711029053,
"learning_rate": 9.907130118670207e-05,
"loss": 3.1358,
"step": 95
},
{
"epoch": 0.21967963386727687,
"grad_norm": 3.0841243267059326,
"learning_rate": 9.90346433634416e-05,
"loss": 3.0006,
"step": 96
},
{
"epoch": 0.2219679633867277,
"grad_norm": 3.750990867614746,
"learning_rate": 9.899728304061054e-05,
"loss": 2.9871,
"step": 97
},
{
"epoch": 0.2242562929061785,
"grad_norm": 3.793957471847534,
"learning_rate": 9.89592207534552e-05,
"loss": 3.1504,
"step": 98
},
{
"epoch": 0.22654462242562928,
"grad_norm": 2.926502227783203,
"learning_rate": 9.892045704727864e-05,
"loss": 3.0773,
"step": 99
},
{
"epoch": 0.2288329519450801,
"grad_norm": 3.8088338375091553,
"learning_rate": 9.888099247743283e-05,
"loss": 3.0159,
"step": 100
},
{
"epoch": 0.2311212814645309,
"grad_norm": 3.136817455291748,
"learning_rate": 9.884082760931078e-05,
"loss": 3.0799,
"step": 101
},
{
"epoch": 0.2334096109839817,
"grad_norm": 3.4826202392578125,
"learning_rate": 9.879996301833833e-05,
"loss": 2.9949,
"step": 102
},
{
"epoch": 0.23569794050343248,
"grad_norm": 2.8044424057006836,
"learning_rate": 9.875839928996605e-05,
"loss": 2.9263,
"step": 103
},
{
"epoch": 0.2379862700228833,
"grad_norm": 3.0698440074920654,
"learning_rate": 9.871613701966067e-05,
"loss": 2.901,
"step": 104
},
{
"epoch": 0.2402745995423341,
"grad_norm": 3.5167031288146973,
"learning_rate": 9.867317681289674e-05,
"loss": 2.8163,
"step": 105
},
{
"epoch": 0.2425629290617849,
"grad_norm": 4.114626407623291,
"learning_rate": 9.862951928514782e-05,
"loss": 2.8116,
"step": 106
},
{
"epoch": 0.2448512585812357,
"grad_norm": 3.7168688774108887,
"learning_rate": 9.858516506187769e-05,
"loss": 2.8926,
"step": 107
},
{
"epoch": 0.2471395881006865,
"grad_norm": 4.157980442047119,
"learning_rate": 9.854011477853146e-05,
"loss": 3.0417,
"step": 108
},
{
"epoch": 0.2494279176201373,
"grad_norm": 3.974653959274292,
"learning_rate": 9.849436908052636e-05,
"loss": 2.9427,
"step": 109
},
{
"epoch": 0.2517162471395881,
"grad_norm": 3.3569509983062744,
"learning_rate": 9.844792862324258e-05,
"loss": 3.0074,
"step": 110
},
{
"epoch": 0.2540045766590389,
"grad_norm": 3.318108081817627,
"learning_rate": 9.840079407201381e-05,
"loss": 2.9424,
"step": 111
},
{
"epoch": 0.2562929061784897,
"grad_norm": 2.5115249156951904,
"learning_rate": 9.835296610211779e-05,
"loss": 2.8719,
"step": 112
},
{
"epoch": 0.2585812356979405,
"grad_norm": 4.036254405975342,
"learning_rate": 9.830444539876655e-05,
"loss": 2.8323,
"step": 113
},
{
"epoch": 0.2608695652173913,
"grad_norm": 2.956651210784912,
"learning_rate": 9.825523265709666e-05,
"loss": 2.8217,
"step": 114
},
{
"epoch": 0.2631578947368421,
"grad_norm": 20.366439819335938,
"learning_rate": 9.820532858215924e-05,
"loss": 2.9269,
"step": 115
},
{
"epoch": 0.2654462242562929,
"grad_norm": 5.692528247833252,
"learning_rate": 9.815473388890983e-05,
"loss": 2.9319,
"step": 116
},
{
"epoch": 0.26773455377574373,
"grad_norm": 4.276037693023682,
"learning_rate": 9.810344930219824e-05,
"loss": 2.8439,
"step": 117
},
{
"epoch": 0.2700228832951945,
"grad_norm": 5.107546806335449,
"learning_rate": 9.805147555675805e-05,
"loss": 2.8193,
"step": 118
},
{
"epoch": 0.2723112128146453,
"grad_norm": 4.974969387054443,
"learning_rate": 9.799881339719615e-05,
"loss": 2.8969,
"step": 119
},
{
"epoch": 0.2745995423340961,
"grad_norm": 2.5996475219726562,
"learning_rate": 9.794546357798208e-05,
"loss": 2.7867,
"step": 120
},
{
"epoch": 0.2768878718535469,
"grad_norm": 5.79841947555542,
"learning_rate": 9.789142686343723e-05,
"loss": 2.9682,
"step": 121
},
{
"epoch": 0.2791762013729977,
"grad_norm": 3.4821016788482666,
"learning_rate": 9.783670402772379e-05,
"loss": 2.8194,
"step": 122
},
{
"epoch": 0.2814645308924485,
"grad_norm": 3.6274449825286865,
"learning_rate": 9.778129585483377e-05,
"loss": 2.9552,
"step": 123
},
{
"epoch": 0.2837528604118993,
"grad_norm": 5.462944507598877,
"learning_rate": 9.772520313857775e-05,
"loss": 2.82,
"step": 124
},
{
"epoch": 0.28604118993135014,
"grad_norm": 2.919630765914917,
"learning_rate": 9.766842668257348e-05,
"loss": 2.8958,
"step": 125
},
{
"epoch": 0.28832951945080093,
"grad_norm": 4.07204008102417,
"learning_rate": 9.761096730023432e-05,
"loss": 2.6548,
"step": 126
},
{
"epoch": 0.2906178489702517,
"grad_norm": 3.1327707767486572,
"learning_rate": 9.755282581475769e-05,
"loss": 2.7502,
"step": 127
},
{
"epoch": 0.2929061784897025,
"grad_norm": 3.4713220596313477,
"learning_rate": 9.749400305911322e-05,
"loss": 2.8989,
"step": 128
},
{
"epoch": 0.2951945080091533,
"grad_norm": 3.0974678993225098,
"learning_rate": 9.743449987603083e-05,
"loss": 2.7687,
"step": 129
},
{
"epoch": 0.2974828375286041,
"grad_norm": 2.840522527694702,
"learning_rate": 9.737431711798864e-05,
"loss": 2.6381,
"step": 130
},
{
"epoch": 0.2997711670480549,
"grad_norm": 3.248396635055542,
"learning_rate": 9.731345564720074e-05,
"loss": 2.803,
"step": 131
},
{
"epoch": 0.30205949656750575,
"grad_norm": 2.489511251449585,
"learning_rate": 9.725191633560491e-05,
"loss": 2.7353,
"step": 132
},
{
"epoch": 0.30434782608695654,
"grad_norm": 3.164222240447998,
"learning_rate": 9.718970006485006e-05,
"loss": 2.8043,
"step": 133
},
{
"epoch": 0.30663615560640733,
"grad_norm": 2.9290547370910645,
"learning_rate": 9.712680772628364e-05,
"loss": 2.8801,
"step": 134
},
{
"epoch": 0.30892448512585813,
"grad_norm": 2.126338243484497,
"learning_rate": 9.70632402209388e-05,
"loss": 2.6804,
"step": 135
},
{
"epoch": 0.3112128146453089,
"grad_norm": 3.28828501701355,
"learning_rate": 9.69989984595216e-05,
"loss": 2.7758,
"step": 136
},
{
"epoch": 0.3135011441647597,
"grad_norm": 2.5393142700195312,
"learning_rate": 9.693408336239783e-05,
"loss": 2.7694,
"step": 137
},
{
"epoch": 0.3157894736842105,
"grad_norm": 2.971083641052246,
"learning_rate": 9.686849585957994e-05,
"loss": 2.8087,
"step": 138
},
{
"epoch": 0.3180778032036613,
"grad_norm": 2.3476786613464355,
"learning_rate": 9.680223689071364e-05,
"loss": 2.6401,
"step": 139
},
{
"epoch": 0.32036613272311215,
"grad_norm": 3.1489346027374268,
"learning_rate": 9.673530740506447e-05,
"loss": 2.6452,
"step": 140
},
{
"epoch": 0.32265446224256294,
"grad_norm": 2.252417802810669,
"learning_rate": 9.666770836150421e-05,
"loss": 2.7393,
"step": 141
},
{
"epoch": 0.32494279176201374,
"grad_norm": 2.2149484157562256,
"learning_rate": 9.659944072849707e-05,
"loss": 2.6799,
"step": 142
},
{
"epoch": 0.32723112128146453,
"grad_norm": 2.29628849029541,
"learning_rate": 9.653050548408593e-05,
"loss": 2.7648,
"step": 143
},
{
"epoch": 0.3295194508009153,
"grad_norm": 2.487936019897461,
"learning_rate": 9.646090361587827e-05,
"loss": 2.7027,
"step": 144
},
{
"epoch": 0.3318077803203661,
"grad_norm": 2.9859704971313477,
"learning_rate": 9.639063612103198e-05,
"loss": 2.7749,
"step": 145
},
{
"epoch": 0.3340961098398169,
"grad_norm": 2.6007204055786133,
"learning_rate": 9.631970400624113e-05,
"loss": 2.7246,
"step": 146
},
{
"epoch": 0.33638443935926776,
"grad_norm": 33.11263656616211,
"learning_rate": 9.624810828772155e-05,
"loss": 2.7545,
"step": 147
},
{
"epoch": 0.33867276887871856,
"grad_norm": 3.515178918838501,
"learning_rate": 9.617584999119625e-05,
"loss": 2.732,
"step": 148
},
{
"epoch": 0.34096109839816935,
"grad_norm": 2.4641125202178955,
"learning_rate": 9.610293015188067e-05,
"loss": 2.5388,
"step": 149
},
{
"epoch": 0.34324942791762014,
"grad_norm": 3.40997314453125,
"learning_rate": 9.602934981446803e-05,
"loss": 2.7129,
"step": 150
},
{
"epoch": 0.34553775743707094,
"grad_norm": 2.843931198120117,
"learning_rate": 9.59551100331141e-05,
"loss": 2.5792,
"step": 151
},
{
"epoch": 0.34782608695652173,
"grad_norm": 3.463437795639038,
"learning_rate": 9.588021187142235e-05,
"loss": 2.7224,
"step": 152
},
{
"epoch": 0.3501144164759725,
"grad_norm": 3.4558653831481934,
"learning_rate": 9.580465640242851e-05,
"loss": 2.7847,
"step": 153
},
{
"epoch": 0.3524027459954233,
"grad_norm": 2.8027448654174805,
"learning_rate": 9.572844470858537e-05,
"loss": 2.636,
"step": 154
},
{
"epoch": 0.35469107551487417,
"grad_norm": 3.39350962638855,
"learning_rate": 9.565157788174712e-05,
"loss": 2.7577,
"step": 155
},
{
"epoch": 0.35697940503432496,
"grad_norm": 2.9632396697998047,
"learning_rate": 9.557405702315381e-05,
"loss": 2.6417,
"step": 156
},
{
"epoch": 0.35926773455377575,
"grad_norm": 2.825023889541626,
"learning_rate": 9.549588324341555e-05,
"loss": 2.5702,
"step": 157
},
{
"epoch": 0.36155606407322655,
"grad_norm": 3.1311004161834717,
"learning_rate": 9.541705766249655e-05,
"loss": 2.7682,
"step": 158
},
{
"epoch": 0.36384439359267734,
"grad_norm": 3.473098039627075,
"learning_rate": 9.533758140969912e-05,
"loss": 2.6545,
"step": 159
},
{
"epoch": 0.36613272311212813,
"grad_norm": 2.7823293209075928,
"learning_rate": 9.525745562364756e-05,
"loss": 2.6668,
"step": 160
},
{
"epoch": 0.3684210526315789,
"grad_norm": 2.742286205291748,
"learning_rate": 9.517668145227167e-05,
"loss": 2.7998,
"step": 161
},
{
"epoch": 0.3707093821510298,
"grad_norm": 4.77061653137207,
"learning_rate": 9.509526005279044e-05,
"loss": 2.789,
"step": 162
},
{
"epoch": 0.37299771167048057,
"grad_norm": 3.353771448135376,
"learning_rate": 9.501319259169543e-05,
"loss": 2.5644,
"step": 163
},
{
"epoch": 0.37528604118993136,
"grad_norm": 2.2853267192840576,
"learning_rate": 9.493048024473412e-05,
"loss": 2.6974,
"step": 164
},
{
"epoch": 0.37757437070938216,
"grad_norm": 3.146768569946289,
"learning_rate": 9.484712419689292e-05,
"loss": 2.568,
"step": 165
},
{
"epoch": 0.37986270022883295,
"grad_norm": 5.74379301071167,
"learning_rate": 9.476312564238034e-05,
"loss": 2.6581,
"step": 166
},
{
"epoch": 0.38215102974828374,
"grad_norm": 2.7834126949310303,
"learning_rate": 9.467848578460985e-05,
"loss": 2.6585,
"step": 167
},
{
"epoch": 0.38443935926773454,
"grad_norm": 4.0173139572143555,
"learning_rate": 9.459320583618252e-05,
"loss": 2.6218,
"step": 168
},
{
"epoch": 0.38672768878718533,
"grad_norm": 2.463062286376953,
"learning_rate": 9.450728701886983e-05,
"loss": 2.5853,
"step": 169
},
{
"epoch": 0.3890160183066362,
"grad_norm": 2.0359222888946533,
"learning_rate": 9.442073056359604e-05,
"loss": 2.4268,
"step": 170
},
{
"epoch": 0.391304347826087,
"grad_norm": 2.32696533203125,
"learning_rate": 9.433353771042059e-05,
"loss": 2.6057,
"step": 171
},
{
"epoch": 0.39359267734553777,
"grad_norm": 2.851733446121216,
"learning_rate": 9.424570970852034e-05,
"loss": 2.5317,
"step": 172
},
{
"epoch": 0.39588100686498856,
"grad_norm": 2.5211381912231445,
"learning_rate": 9.415724781617165e-05,
"loss": 2.577,
"step": 173
},
{
"epoch": 0.39816933638443935,
"grad_norm": 2.574113130569458,
"learning_rate": 9.406815330073244e-05,
"loss": 2.4746,
"step": 174
},
{
"epoch": 0.40045766590389015,
"grad_norm": 2.246934413909912,
"learning_rate": 9.397842743862391e-05,
"loss": 2.7172,
"step": 175
},
{
"epoch": 0.40274599542334094,
"grad_norm": 2.0411953926086426,
"learning_rate": 9.388807151531229e-05,
"loss": 2.3795,
"step": 176
},
{
"epoch": 0.40503432494279173,
"grad_norm": 2.0988056659698486,
"learning_rate": 9.37970868252905e-05,
"loss": 2.4858,
"step": 177
},
{
"epoch": 0.4073226544622426,
"grad_norm": 2.6199228763580322,
"learning_rate": 9.37054746720595e-05,
"loss": 2.5046,
"step": 178
},
{
"epoch": 0.4096109839816934,
"grad_norm": 2.879153251647949,
"learning_rate": 9.36132363681097e-05,
"loss": 2.5783,
"step": 179
},
{
"epoch": 0.41189931350114417,
"grad_norm": 2.188169002532959,
"learning_rate": 9.352037323490208e-05,
"loss": 2.5765,
"step": 180
},
{
"epoch": 0.41418764302059496,
"grad_norm": 2.8007380962371826,
"learning_rate": 9.342688660284935e-05,
"loss": 2.5559,
"step": 181
},
{
"epoch": 0.41647597254004576,
"grad_norm": 2.118467092514038,
"learning_rate": 9.333277781129678e-05,
"loss": 2.6657,
"step": 182
},
{
"epoch": 0.41876430205949655,
"grad_norm": 2.415208101272583,
"learning_rate": 9.32380482085031e-05,
"loss": 2.5692,
"step": 183
},
{
"epoch": 0.42105263157894735,
"grad_norm": 2.284472942352295,
"learning_rate": 9.314269915162114e-05,
"loss": 2.8171,
"step": 184
},
{
"epoch": 0.4233409610983982,
"grad_norm": 2.6300644874572754,
"learning_rate": 9.304673200667842e-05,
"loss": 2.5209,
"step": 185
},
{
"epoch": 0.425629290617849,
"grad_norm": 2.484619617462158,
"learning_rate": 9.295014814855753e-05,
"loss": 2.6103,
"step": 186
},
{
"epoch": 0.4279176201372998,
"grad_norm": 1.9404324293136597,
"learning_rate": 9.285294896097649e-05,
"loss": 2.6948,
"step": 187
},
{
"epoch": 0.4302059496567506,
"grad_norm": 5.251960754394531,
"learning_rate": 9.275513583646884e-05,
"loss": 2.5566,
"step": 188
},
{
"epoch": 0.43249427917620137,
"grad_norm": 3.4060168266296387,
"learning_rate": 9.265671017636383e-05,
"loss": 2.4573,
"step": 189
},
{
"epoch": 0.43478260869565216,
"grad_norm": 2.6057965755462646,
"learning_rate": 9.255767339076622e-05,
"loss": 2.578,
"step": 190
},
{
"epoch": 0.43707093821510296,
"grad_norm": 2.6781222820281982,
"learning_rate": 9.24580268985361e-05,
"loss": 2.6097,
"step": 191
},
{
"epoch": 0.43935926773455375,
"grad_norm": 2.630892038345337,
"learning_rate": 9.23577721272686e-05,
"loss": 2.4862,
"step": 192
},
{
"epoch": 0.4416475972540046,
"grad_norm": 2.559825897216797,
"learning_rate": 9.225691051327341e-05,
"loss": 2.4985,
"step": 193
},
{
"epoch": 0.4439359267734554,
"grad_norm": 2.574370861053467,
"learning_rate": 9.215544350155422e-05,
"loss": 2.5045,
"step": 194
},
{
"epoch": 0.4462242562929062,
"grad_norm": 2.611264944076538,
"learning_rate": 9.205337254578804e-05,
"loss": 2.5606,
"step": 195
},
{
"epoch": 0.448512585812357,
"grad_norm": 2.938662052154541,
"learning_rate": 9.195069910830427e-05,
"loss": 2.3961,
"step": 196
},
{
"epoch": 0.45080091533180777,
"grad_norm": 2.3292245864868164,
"learning_rate": 9.184742466006389e-05,
"loss": 2.3445,
"step": 197
},
{
"epoch": 0.45308924485125857,
"grad_norm": 3.314525604248047,
"learning_rate": 9.174355068063828e-05,
"loss": 2.3837,
"step": 198
},
{
"epoch": 0.45537757437070936,
"grad_norm": 2.9725730419158936,
"learning_rate": 9.163907865818806e-05,
"loss": 2.4219,
"step": 199
},
{
"epoch": 0.4576659038901602,
"grad_norm": 2.6355316638946533,
"learning_rate": 9.15340100894418e-05,
"loss": 2.5177,
"step": 200
},
{
"epoch": 0.459954233409611,
"grad_norm": 2.6601266860961914,
"learning_rate": 9.142834647967455e-05,
"loss": 2.5016,
"step": 201
},
{
"epoch": 0.4622425629290618,
"grad_norm": 2.8524606227874756,
"learning_rate": 9.132208934268622e-05,
"loss": 2.6128,
"step": 202
},
{
"epoch": 0.4645308924485126,
"grad_norm": 2.9725794792175293,
"learning_rate": 9.121524020078002e-05,
"loss": 2.4071,
"step": 203
},
{
"epoch": 0.4668192219679634,
"grad_norm": 2.737095832824707,
"learning_rate": 9.110780058474052e-05,
"loss": 2.5999,
"step": 204
},
{
"epoch": 0.4691075514874142,
"grad_norm": 2.458143711090088,
"learning_rate": 9.099977203381176e-05,
"loss": 2.4793,
"step": 205
},
{
"epoch": 0.47139588100686497,
"grad_norm": 2.4118494987487793,
"learning_rate": 9.08911560956753e-05,
"loss": 2.5604,
"step": 206
},
{
"epoch": 0.47368421052631576,
"grad_norm": 2.5340561866760254,
"learning_rate": 9.078195432642787e-05,
"loss": 2.3189,
"step": 207
},
{
"epoch": 0.4759725400457666,
"grad_norm": 15.165704727172852,
"learning_rate": 9.067216829055922e-05,
"loss": 2.3839,
"step": 208
},
{
"epoch": 0.4782608695652174,
"grad_norm": 3.408928394317627,
"learning_rate": 9.056179956092962e-05,
"loss": 2.4059,
"step": 209
},
{
"epoch": 0.4805491990846682,
"grad_norm": 2.3760733604431152,
"learning_rate": 9.045084971874738e-05,
"loss": 2.4665,
"step": 210
},
{
"epoch": 0.482837528604119,
"grad_norm": 2.2348382472991943,
"learning_rate": 9.033932035354616e-05,
"loss": 2.5414,
"step": 211
},
{
"epoch": 0.4851258581235698,
"grad_norm": 2.8291633129119873,
"learning_rate": 9.022721306316222e-05,
"loss": 2.4842,
"step": 212
},
{
"epoch": 0.4874141876430206,
"grad_norm": 2.2343130111694336,
"learning_rate": 9.011452945371153e-05,
"loss": 2.2786,
"step": 213
},
{
"epoch": 0.4897025171624714,
"grad_norm": 2.3048455715179443,
"learning_rate": 9.000127113956674e-05,
"loss": 2.6093,
"step": 214
},
{
"epoch": 0.4919908466819222,
"grad_norm": 2.0562024116516113,
"learning_rate": 8.988743974333405e-05,
"loss": 2.746,
"step": 215
},
{
"epoch": 0.494279176201373,
"grad_norm": 2.252657651901245,
"learning_rate": 8.977303689583e-05,
"loss": 2.3817,
"step": 216
},
{
"epoch": 0.4965675057208238,
"grad_norm": 2.080920696258545,
"learning_rate": 8.965806423605807e-05,
"loss": 2.3491,
"step": 217
},
{
"epoch": 0.4988558352402746,
"grad_norm": 2.2473278045654297,
"learning_rate": 8.954252341118523e-05,
"loss": 2.3824,
"step": 218
},
{
"epoch": 0.5011441647597255,
"grad_norm": 1.9687806367874146,
"learning_rate": 8.94264160765183e-05,
"loss": 2.3748,
"step": 219
},
{
"epoch": 0.5034324942791762,
"grad_norm": 6.516245365142822,
"learning_rate": 8.930974389548023e-05,
"loss": 2.3854,
"step": 220
},
{
"epoch": 0.505720823798627,
"grad_norm": 2.8898024559020996,
"learning_rate": 8.919250853958639e-05,
"loss": 2.4238,
"step": 221
},
{
"epoch": 0.5080091533180778,
"grad_norm": 2.3418991565704346,
"learning_rate": 8.90747116884204e-05,
"loss": 2.3012,
"step": 222
},
{
"epoch": 0.5102974828375286,
"grad_norm": 1.9986119270324707,
"learning_rate": 8.895635502961033e-05,
"loss": 2.3529,
"step": 223
},
{
"epoch": 0.5125858123569794,
"grad_norm": 2.4174418449401855,
"learning_rate": 8.883744025880428e-05,
"loss": 2.4234,
"step": 224
},
{
"epoch": 0.5148741418764302,
"grad_norm": 3.099142074584961,
"learning_rate": 8.871796907964625e-05,
"loss": 2.4744,
"step": 225
},
{
"epoch": 0.517162471395881,
"grad_norm": 2.573162317276001,
"learning_rate": 8.859794320375168e-05,
"loss": 2.4386,
"step": 226
},
{
"epoch": 0.5194508009153318,
"grad_norm": 2.158879518508911,
"learning_rate": 8.847736435068288e-05,
"loss": 2.453,
"step": 227
},
{
"epoch": 0.5217391304347826,
"grad_norm": 2.082505941390991,
"learning_rate": 8.835623424792452e-05,
"loss": 2.3861,
"step": 228
},
{
"epoch": 0.5240274599542334,
"grad_norm": 2.180189847946167,
"learning_rate": 8.823455463085873e-05,
"loss": 2.3747,
"step": 229
},
{
"epoch": 0.5263157894736842,
"grad_norm": 2.4317615032196045,
"learning_rate": 8.811232724274035e-05,
"loss": 2.4164,
"step": 230
},
{
"epoch": 0.528604118993135,
"grad_norm": 2.0874435901641846,
"learning_rate": 8.798955383467189e-05,
"loss": 2.3647,
"step": 231
},
{
"epoch": 0.5308924485125858,
"grad_norm": 2.1150388717651367,
"learning_rate": 8.786623616557847e-05,
"loss": 2.3115,
"step": 232
},
{
"epoch": 0.5331807780320366,
"grad_norm": 2.2048499584198,
"learning_rate": 8.774237600218266e-05,
"loss": 2.5097,
"step": 233
},
{
"epoch": 0.5354691075514875,
"grad_norm": 2.2788608074188232,
"learning_rate": 8.761797511897906e-05,
"loss": 2.2931,
"step": 234
},
{
"epoch": 0.5377574370709383,
"grad_norm": 2.0588808059692383,
"learning_rate": 8.749303529820903e-05,
"loss": 2.4971,
"step": 235
},
{
"epoch": 0.540045766590389,
"grad_norm": 2.104788303375244,
"learning_rate": 8.736755832983497e-05,
"loss": 2.3433,
"step": 236
},
{
"epoch": 0.5423340961098398,
"grad_norm": 1.8593131303787231,
"learning_rate": 8.724154601151484e-05,
"loss": 2.285,
"step": 237
},
{
"epoch": 0.5446224256292906,
"grad_norm": 1.9205878973007202,
"learning_rate": 8.711500014857634e-05,
"loss": 2.1847,
"step": 238
},
{
"epoch": 0.5469107551487414,
"grad_norm": 2.350421905517578,
"learning_rate": 8.698792255399104e-05,
"loss": 2.3161,
"step": 239
},
{
"epoch": 0.5491990846681922,
"grad_norm": 2.14412784576416,
"learning_rate": 8.686031504834843e-05,
"loss": 2.2517,
"step": 240
},
{
"epoch": 0.551487414187643,
"grad_norm": 1.8842304944992065,
"learning_rate": 8.673217945982978e-05,
"loss": 2.36,
"step": 241
},
{
"epoch": 0.5537757437070938,
"grad_norm": 1.9713618755340576,
"learning_rate": 8.660351762418203e-05,
"loss": 2.194,
"step": 242
},
{
"epoch": 0.5560640732265446,
"grad_norm": 1.9737403392791748,
"learning_rate": 8.647433138469144e-05,
"loss": 2.4538,
"step": 243
},
{
"epoch": 0.5583524027459954,
"grad_norm": 2.11395525932312,
"learning_rate": 8.634462259215719e-05,
"loss": 2.4534,
"step": 244
},
{
"epoch": 0.5606407322654462,
"grad_norm": 2.621248245239258,
"learning_rate": 8.621439310486486e-05,
"loss": 2.5009,
"step": 245
},
{
"epoch": 0.562929061784897,
"grad_norm": 2.1076266765594482,
"learning_rate": 8.608364478855983e-05,
"loss": 2.2991,
"step": 246
},
{
"epoch": 0.5652173913043478,
"grad_norm": 2.0468740463256836,
"learning_rate": 8.595237951642055e-05,
"loss": 2.4584,
"step": 247
},
{
"epoch": 0.5675057208237986,
"grad_norm": 2.757575750350952,
"learning_rate": 8.58205991690316e-05,
"loss": 2.3874,
"step": 248
},
{
"epoch": 0.5697940503432495,
"grad_norm": 2.700618267059326,
"learning_rate": 8.568830563435694e-05,
"loss": 2.2581,
"step": 249
},
{
"epoch": 0.5720823798627003,
"grad_norm": 2.136777400970459,
"learning_rate": 8.555550080771273e-05,
"loss": 2.3313,
"step": 250
},
{
"epoch": 0.5743707093821511,
"grad_norm": 2.2393014430999756,
"learning_rate": 8.542218659174017e-05,
"loss": 2.3745,
"step": 251
},
{
"epoch": 0.5766590389016019,
"grad_norm": 1.9977840185165405,
"learning_rate": 8.528836489637828e-05,
"loss": 2.4189,
"step": 252
},
{
"epoch": 0.5789473684210527,
"grad_norm": 1.8844605684280396,
"learning_rate": 8.515403763883659e-05,
"loss": 2.2503,
"step": 253
},
{
"epoch": 0.5812356979405034,
"grad_norm": 2.225182056427002,
"learning_rate": 8.501920674356754e-05,
"loss": 2.3373,
"step": 254
},
{
"epoch": 0.5835240274599542,
"grad_norm": 3.804654598236084,
"learning_rate": 8.488387414223904e-05,
"loss": 2.4731,
"step": 255
},
{
"epoch": 0.585812356979405,
"grad_norm": 2.7391419410705566,
"learning_rate": 8.47480417737067e-05,
"loss": 2.3385,
"step": 256
},
{
"epoch": 0.5881006864988558,
"grad_norm": 1.8760191202163696,
"learning_rate": 8.461171158398612e-05,
"loss": 2.2315,
"step": 257
},
{
"epoch": 0.5903890160183066,
"grad_norm": 2.5383033752441406,
"learning_rate": 8.447488552622498e-05,
"loss": 2.2305,
"step": 258
},
{
"epoch": 0.5926773455377574,
"grad_norm": 2.4613940715789795,
"learning_rate": 8.433756556067506e-05,
"loss": 2.3998,
"step": 259
},
{
"epoch": 0.5949656750572082,
"grad_norm": 2.1040103435516357,
"learning_rate": 8.419975365466415e-05,
"loss": 2.4298,
"step": 260
},
{
"epoch": 0.597254004576659,
"grad_norm": 2.3999195098876953,
"learning_rate": 8.406145178256788e-05,
"loss": 2.3064,
"step": 261
},
{
"epoch": 0.5995423340961098,
"grad_norm": 2.633197069168091,
"learning_rate": 8.392266192578143e-05,
"loss": 2.4663,
"step": 262
},
{
"epoch": 0.6018306636155606,
"grad_norm": 2.4251699447631836,
"learning_rate": 8.37833860726911e-05,
"loss": 2.3587,
"step": 263
},
{
"epoch": 0.6041189931350115,
"grad_norm": 1.8132442235946655,
"learning_rate": 8.364362621864595e-05,
"loss": 2.3555,
"step": 264
},
{
"epoch": 0.6064073226544623,
"grad_norm": 2.6563608646392822,
"learning_rate": 8.350338436592905e-05,
"loss": 2.2702,
"step": 265
},
{
"epoch": 0.6086956521739131,
"grad_norm": 2.4801723957061768,
"learning_rate": 8.336266252372889e-05,
"loss": 2.266,
"step": 266
},
{
"epoch": 0.6109839816933639,
"grad_norm": 2.3440706729888916,
"learning_rate": 8.322146270811059e-05,
"loss": 2.3181,
"step": 267
},
{
"epoch": 0.6132723112128147,
"grad_norm": 2.256977081298828,
"learning_rate": 8.307978694198699e-05,
"loss": 2.2594,
"step": 268
},
{
"epoch": 0.6155606407322655,
"grad_norm": 2.4283649921417236,
"learning_rate": 8.293763725508969e-05,
"loss": 2.171,
"step": 269
},
{
"epoch": 0.6178489702517163,
"grad_norm": 1.9902546405792236,
"learning_rate": 8.279501568393994e-05,
"loss": 2.3791,
"step": 270
},
{
"epoch": 0.620137299771167,
"grad_norm": 2.388878583908081,
"learning_rate": 8.265192427181954e-05,
"loss": 2.2571,
"step": 271
},
{
"epoch": 0.6224256292906178,
"grad_norm": 2.6152727603912354,
"learning_rate": 8.250836506874142e-05,
"loss": 2.3477,
"step": 272
},
{
"epoch": 0.6247139588100686,
"grad_norm": 1.8113371133804321,
"learning_rate": 8.236434013142045e-05,
"loss": 2.1395,
"step": 273
},
{
"epoch": 0.6270022883295194,
"grad_norm": 9.44183349609375,
"learning_rate": 8.221985152324385e-05,
"loss": 2.4124,
"step": 274
},
{
"epoch": 0.6292906178489702,
"grad_norm": 2.5722267627716064,
"learning_rate": 8.207490131424167e-05,
"loss": 2.3806,
"step": 275
},
{
"epoch": 0.631578947368421,
"grad_norm": 2.5060784816741943,
"learning_rate": 8.192949158105713e-05,
"loss": 2.4453,
"step": 276
},
{
"epoch": 0.6338672768878718,
"grad_norm": 1.9694451093673706,
"learning_rate": 8.178362440691685e-05,
"loss": 2.2709,
"step": 277
},
{
"epoch": 0.6361556064073226,
"grad_norm": 2.1653637886047363,
"learning_rate": 8.163730188160105e-05,
"loss": 2.2756,
"step": 278
},
{
"epoch": 0.6384439359267735,
"grad_norm": 2.000991106033325,
"learning_rate": 8.149052610141357e-05,
"loss": 2.3256,
"step": 279
},
{
"epoch": 0.6407322654462243,
"grad_norm": 1.9745527505874634,
"learning_rate": 8.134329916915184e-05,
"loss": 2.1619,
"step": 280
},
{
"epoch": 0.6430205949656751,
"grad_norm": 1.7634015083312988,
"learning_rate": 8.119562319407679e-05,
"loss": 2.3815,
"step": 281
},
{
"epoch": 0.6453089244851259,
"grad_norm": 2.338313341140747,
"learning_rate": 8.104750029188257e-05,
"loss": 2.2965,
"step": 282
},
{
"epoch": 0.6475972540045767,
"grad_norm": 5.942075729370117,
"learning_rate": 8.089893258466632e-05,
"loss": 2.2091,
"step": 283
},
{
"epoch": 0.6498855835240275,
"grad_norm": 2.5060672760009766,
"learning_rate": 8.074992220089769e-05,
"loss": 2.4674,
"step": 284
},
{
"epoch": 0.6521739130434783,
"grad_norm": 2.6582159996032715,
"learning_rate": 8.060047127538835e-05,
"loss": 2.3614,
"step": 285
},
{
"epoch": 0.6544622425629291,
"grad_norm": 2.005200147628784,
"learning_rate": 8.045058194926153e-05,
"loss": 2.2056,
"step": 286
},
{
"epoch": 0.6567505720823799,
"grad_norm": 2.410688877105713,
"learning_rate": 8.030025636992113e-05,
"loss": 2.2247,
"step": 287
},
{
"epoch": 0.6590389016018307,
"grad_norm": 1.9305633306503296,
"learning_rate": 8.014949669102117e-05,
"loss": 2.0635,
"step": 288
},
{
"epoch": 0.6613272311212814,
"grad_norm": 2.0201923847198486,
"learning_rate": 7.999830507243478e-05,
"loss": 2.1299,
"step": 289
},
{
"epoch": 0.6636155606407322,
"grad_norm": 1.8325549364089966,
"learning_rate": 7.984668368022335e-05,
"loss": 2.1106,
"step": 290
},
{
"epoch": 0.665903890160183,
"grad_norm": 2.2241806983947754,
"learning_rate": 7.969463468660545e-05,
"loss": 2.2576,
"step": 291
},
{
"epoch": 0.6681922196796338,
"grad_norm": 2.101597547531128,
"learning_rate": 7.954216026992571e-05,
"loss": 2.3561,
"step": 292
},
{
"epoch": 0.6704805491990846,
"grad_norm": 2.0303845405578613,
"learning_rate": 7.938926261462366e-05,
"loss": 2.1977,
"step": 293
},
{
"epoch": 0.6727688787185355,
"grad_norm": 2.016955852508545,
"learning_rate": 7.923594391120236e-05,
"loss": 2.3022,
"step": 294
},
{
"epoch": 0.6750572082379863,
"grad_norm": 1.99684476852417,
"learning_rate": 7.908220635619708e-05,
"loss": 2.1994,
"step": 295
},
{
"epoch": 0.6773455377574371,
"grad_norm": 2.2181754112243652,
"learning_rate": 7.892805215214381e-05,
"loss": 2.2126,
"step": 296
},
{
"epoch": 0.6796338672768879,
"grad_norm": 2.0173442363739014,
"learning_rate": 7.877348350754767e-05,
"loss": 2.2134,
"step": 297
},
{
"epoch": 0.6819221967963387,
"grad_norm": 1.8784480094909668,
"learning_rate": 7.861850263685134e-05,
"loss": 2.3381,
"step": 298
},
{
"epoch": 0.6842105263157895,
"grad_norm": 2.2438318729400635,
"learning_rate": 7.846311176040331e-05,
"loss": 2.3408,
"step": 299
},
{
"epoch": 0.6864988558352403,
"grad_norm": 2.977372884750366,
"learning_rate": 7.830731310442599e-05,
"loss": 2.2358,
"step": 300
},
{
"epoch": 0.6887871853546911,
"grad_norm": 2.1407394409179688,
"learning_rate": 7.815110890098397e-05,
"loss": 2.2125,
"step": 301
},
{
"epoch": 0.6910755148741419,
"grad_norm": 1.6845601797103882,
"learning_rate": 7.799450138795185e-05,
"loss": 2.0579,
"step": 302
},
{
"epoch": 0.6933638443935927,
"grad_norm": 1.834087610244751,
"learning_rate": 7.78374928089824e-05,
"loss": 2.3022,
"step": 303
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.8871235847473145,
"learning_rate": 7.768008541347423e-05,
"loss": 2.2783,
"step": 304
},
{
"epoch": 0.6979405034324943,
"grad_norm": 1.9850207567214966,
"learning_rate": 7.752228145653964e-05,
"loss": 2.3613,
"step": 305
},
{
"epoch": 0.700228832951945,
"grad_norm": 1.9508417844772339,
"learning_rate": 7.73640831989723e-05,
"loss": 2.3295,
"step": 306
},
{
"epoch": 0.7025171624713958,
"grad_norm": 2.0530943870544434,
"learning_rate": 7.72054929072149e-05,
"loss": 2.2485,
"step": 307
},
{
"epoch": 0.7048054919908466,
"grad_norm": 2.0443074703216553,
"learning_rate": 7.704651285332663e-05,
"loss": 2.1668,
"step": 308
},
{
"epoch": 0.7070938215102975,
"grad_norm": 2.0727176666259766,
"learning_rate": 7.688714531495061e-05,
"loss": 2.1517,
"step": 309
},
{
"epoch": 0.7093821510297483,
"grad_norm": 2.033459186553955,
"learning_rate": 7.672739257528134e-05,
"loss": 2.2783,
"step": 310
},
{
"epoch": 0.7116704805491991,
"grad_norm": 2.2704668045043945,
"learning_rate": 7.656725692303195e-05,
"loss": 2.3378,
"step": 311
},
{
"epoch": 0.7139588100686499,
"grad_norm": 2.021702766418457,
"learning_rate": 7.640674065240136e-05,
"loss": 2.1894,
"step": 312
},
{
"epoch": 0.7162471395881007,
"grad_norm": 2.0539019107818604,
"learning_rate": 7.624584606304147e-05,
"loss": 2.2705,
"step": 313
},
{
"epoch": 0.7185354691075515,
"grad_norm": 1.6023483276367188,
"learning_rate": 7.608457546002424e-05,
"loss": 2.1957,
"step": 314
},
{
"epoch": 0.7208237986270023,
"grad_norm": 1.9021177291870117,
"learning_rate": 7.592293115380855e-05,
"loss": 2.1939,
"step": 315
},
{
"epoch": 0.7231121281464531,
"grad_norm": 1.7106837034225464,
"learning_rate": 7.576091546020725e-05,
"loss": 2.1383,
"step": 316
},
{
"epoch": 0.7254004576659039,
"grad_norm": 2.3384652137756348,
"learning_rate": 7.559853070035389e-05,
"loss": 2.1954,
"step": 317
},
{
"epoch": 0.7276887871853547,
"grad_norm": 2.0455210208892822,
"learning_rate": 7.543577920066944e-05,
"loss": 2.2671,
"step": 318
},
{
"epoch": 0.7299771167048055,
"grad_norm": 2.6084513664245605,
"learning_rate": 7.527266329282905e-05,
"loss": 2.1935,
"step": 319
},
{
"epoch": 0.7322654462242563,
"grad_norm": 1.9567663669586182,
"learning_rate": 7.510918531372857e-05,
"loss": 2.3201,
"step": 320
},
{
"epoch": 0.7345537757437071,
"grad_norm": 2.3495213985443115,
"learning_rate": 7.494534760545113e-05,
"loss": 2.2716,
"step": 321
},
{
"epoch": 0.7368421052631579,
"grad_norm": 2.139066696166992,
"learning_rate": 7.478115251523352e-05,
"loss": 2.3472,
"step": 322
},
{
"epoch": 0.7391304347826086,
"grad_norm": 1.9608685970306396,
"learning_rate": 7.461660239543261e-05,
"loss": 2.2516,
"step": 323
},
{
"epoch": 0.7414187643020596,
"grad_norm": 1.9583091735839844,
"learning_rate": 7.445169960349167e-05,
"loss": 2.0278,
"step": 324
},
{
"epoch": 0.7437070938215103,
"grad_norm": 1.9751293659210205,
"learning_rate": 7.42864465019065e-05,
"loss": 2.111,
"step": 325
},
{
"epoch": 0.7459954233409611,
"grad_norm": 1.8538637161254883,
"learning_rate": 7.412084545819168e-05,
"loss": 2.3414,
"step": 326
},
{
"epoch": 0.7482837528604119,
"grad_norm": 2.0090718269348145,
"learning_rate": 7.395489884484665e-05,
"loss": 2.3156,
"step": 327
},
{
"epoch": 0.7505720823798627,
"grad_norm": 1.8915284872055054,
"learning_rate": 7.378860903932159e-05,
"loss": 2.2654,
"step": 328
},
{
"epoch": 0.7528604118993135,
"grad_norm": 1.6930265426635742,
"learning_rate": 7.362197842398355e-05,
"loss": 2.0445,
"step": 329
},
{
"epoch": 0.7551487414187643,
"grad_norm": 1.9732853174209595,
"learning_rate": 7.34550093860822e-05,
"loss": 2.2913,
"step": 330
},
{
"epoch": 0.7574370709382151,
"grad_norm": 1.865723967552185,
"learning_rate": 7.32877043177156e-05,
"loss": 2.0363,
"step": 331
},
{
"epoch": 0.7597254004576659,
"grad_norm": 2.0198216438293457,
"learning_rate": 7.31200656157961e-05,
"loss": 2.1827,
"step": 332
},
{
"epoch": 0.7620137299771167,
"grad_norm": 1.8572312593460083,
"learning_rate": 7.295209568201574e-05,
"loss": 2.3125,
"step": 333
},
{
"epoch": 0.7643020594965675,
"grad_norm": 1.9783533811569214,
"learning_rate": 7.278379692281208e-05,
"loss": 2.0752,
"step": 334
},
{
"epoch": 0.7665903890160183,
"grad_norm": 1.9235711097717285,
"learning_rate": 7.261517174933362e-05,
"loss": 2.2604,
"step": 335
},
{
"epoch": 0.7688787185354691,
"grad_norm": 1.8910268545150757,
"learning_rate": 7.244622257740523e-05,
"loss": 2.5354,
"step": 336
},
{
"epoch": 0.7711670480549199,
"grad_norm": 1.8306984901428223,
"learning_rate": 7.227695182749361e-05,
"loss": 2.3317,
"step": 337
},
{
"epoch": 0.7734553775743707,
"grad_norm": 1.7641491889953613,
"learning_rate": 7.210736192467256e-05,
"loss": 2.1461,
"step": 338
},
{
"epoch": 0.7757437070938215,
"grad_norm": 2.0800931453704834,
"learning_rate": 7.193745529858826e-05,
"loss": 2.3707,
"step": 339
},
{
"epoch": 0.7780320366132724,
"grad_norm": 2.155435800552368,
"learning_rate": 7.176723438342446e-05,
"loss": 2.3194,
"step": 340
},
{
"epoch": 0.7803203661327232,
"grad_norm": 1.8292967081069946,
"learning_rate": 7.159670161786759e-05,
"loss": 2.2795,
"step": 341
},
{
"epoch": 0.782608695652174,
"grad_norm": 2.0984854698181152,
"learning_rate": 7.142585944507185e-05,
"loss": 2.2426,
"step": 342
},
{
"epoch": 0.7848970251716247,
"grad_norm": 2.123730421066284,
"learning_rate": 7.125471031262417e-05,
"loss": 2.0311,
"step": 343
},
{
"epoch": 0.7871853546910755,
"grad_norm": 2.2667717933654785,
"learning_rate": 7.10832566725092e-05,
"loss": 2.1289,
"step": 344
},
{
"epoch": 0.7894736842105263,
"grad_norm": 1.7086976766586304,
"learning_rate": 7.091150098107414e-05,
"loss": 2.4501,
"step": 345
},
{
"epoch": 0.7917620137299771,
"grad_norm": 2.096912384033203,
"learning_rate": 7.073944569899354e-05,
"loss": 2.2119,
"step": 346
},
{
"epoch": 0.7940503432494279,
"grad_norm": 1.7574291229248047,
"learning_rate": 7.056709329123408e-05,
"loss": 2.2372,
"step": 347
},
{
"epoch": 0.7963386727688787,
"grad_norm": 1.718612790107727,
"learning_rate": 7.039444622701922e-05,
"loss": 2.2722,
"step": 348
},
{
"epoch": 0.7986270022883295,
"grad_norm": 2.0621182918548584,
"learning_rate": 7.022150697979384e-05,
"loss": 1.9942,
"step": 349
},
{
"epoch": 0.8009153318077803,
"grad_norm": 2.8785934448242188,
"learning_rate": 7.00482780271889e-05,
"loss": 2.2271,
"step": 350
},
{
"epoch": 0.8032036613272311,
"grad_norm": 1.8720818758010864,
"learning_rate": 6.98747618509857e-05,
"loss": 2.1045,
"step": 351
},
{
"epoch": 0.8054919908466819,
"grad_norm": 2.2784407138824463,
"learning_rate": 6.97009609370806e-05,
"loss": 2.22,
"step": 352
},
{
"epoch": 0.8077803203661327,
"grad_norm": 1.8243550062179565,
"learning_rate": 6.952687777544922e-05,
"loss": 2.1709,
"step": 353
},
{
"epoch": 0.8100686498855835,
"grad_norm": 1.965592384338379,
"learning_rate": 6.935251486011087e-05,
"loss": 2.2081,
"step": 354
},
{
"epoch": 0.8123569794050344,
"grad_norm": 1.8083536624908447,
"learning_rate": 6.917787468909271e-05,
"loss": 2.0548,
"step": 355
},
{
"epoch": 0.8146453089244852,
"grad_norm": 2.135918140411377,
"learning_rate": 6.900295976439413e-05,
"loss": 2.1584,
"step": 356
},
{
"epoch": 0.816933638443936,
"grad_norm": 1.90550696849823,
"learning_rate": 6.882777259195071e-05,
"loss": 2.0983,
"step": 357
},
{
"epoch": 0.8192219679633868,
"grad_norm": 2.0657262802124023,
"learning_rate": 6.865231568159846e-05,
"loss": 2.2419,
"step": 358
},
{
"epoch": 0.8215102974828375,
"grad_norm": 1.8104684352874756,
"learning_rate": 6.847659154703785e-05,
"loss": 2.0536,
"step": 359
},
{
"epoch": 0.8237986270022883,
"grad_norm": 1.855124831199646,
"learning_rate": 6.830060270579768e-05,
"loss": 2.2918,
"step": 360
},
{
"epoch": 0.8260869565217391,
"grad_norm": 1.8047131299972534,
"learning_rate": 6.812435167919918e-05,
"loss": 2.2204,
"step": 361
},
{
"epoch": 0.8283752860411899,
"grad_norm": 1.9419913291931152,
"learning_rate": 6.794784099231972e-05,
"loss": 2.1344,
"step": 362
},
{
"epoch": 0.8306636155606407,
"grad_norm": 2.067949056625366,
"learning_rate": 6.777107317395679e-05,
"loss": 2.1645,
"step": 363
},
{
"epoch": 0.8329519450800915,
"grad_norm": 1.7515795230865479,
"learning_rate": 6.759405075659166e-05,
"loss": 2.585,
"step": 364
},
{
"epoch": 0.8352402745995423,
"grad_norm": 1.828723430633545,
"learning_rate": 6.741677627635317e-05,
"loss": 2.1944,
"step": 365
},
{
"epoch": 0.8375286041189931,
"grad_norm": 2.222910165786743,
"learning_rate": 6.723925227298132e-05,
"loss": 2.1099,
"step": 366
},
{
"epoch": 0.8398169336384439,
"grad_norm": 1.6402628421783447,
"learning_rate": 6.706148128979095e-05,
"loss": 2.3331,
"step": 367
},
{
"epoch": 0.8421052631578947,
"grad_norm": 2.122103214263916,
"learning_rate": 6.688346587363533e-05,
"loss": 2.167,
"step": 368
},
{
"epoch": 0.8443935926773455,
"grad_norm": 1.705405592918396,
"learning_rate": 6.67052085748695e-05,
"loss": 2.1447,
"step": 369
},
{
"epoch": 0.8466819221967964,
"grad_norm": 1.9684945344924927,
"learning_rate": 6.652671194731396e-05,
"loss": 2.1688,
"step": 370
},
{
"epoch": 0.8489702517162472,
"grad_norm": 1.9052337408065796,
"learning_rate": 6.634797854821795e-05,
"loss": 2.1049,
"step": 371
},
{
"epoch": 0.851258581235698,
"grad_norm": 1.8532124757766724,
"learning_rate": 6.616901093822283e-05,
"loss": 2.2438,
"step": 372
},
{
"epoch": 0.8535469107551488,
"grad_norm": 1.8264321088790894,
"learning_rate": 6.598981168132539e-05,
"loss": 2.1135,
"step": 373
},
{
"epoch": 0.8558352402745996,
"grad_norm": 1.6481093168258667,
"learning_rate": 6.58103833448412e-05,
"loss": 2.1073,
"step": 374
},
{
"epoch": 0.8581235697940504,
"grad_norm": 1.8912767171859741,
"learning_rate": 6.563072849936766e-05,
"loss": 2.0309,
"step": 375
},
{
"epoch": 0.8604118993135011,
"grad_norm": 1.7808010578155518,
"learning_rate": 6.545084971874738e-05,
"loss": 2.3258,
"step": 376
},
{
"epoch": 0.8627002288329519,
"grad_norm": 1.9393279552459717,
"learning_rate": 6.527074958003109e-05,
"loss": 2.1877,
"step": 377
},
{
"epoch": 0.8649885583524027,
"grad_norm": 1.8000222444534302,
"learning_rate": 6.509043066344092e-05,
"loss": 2.172,
"step": 378
},
{
"epoch": 0.8672768878718535,
"grad_norm": 1.6351038217544556,
"learning_rate": 6.490989555233327e-05,
"loss": 1.9942,
"step": 379
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.7084710597991943,
"learning_rate": 6.472914683316195e-05,
"loss": 2.0928,
"step": 380
},
{
"epoch": 0.8718535469107551,
"grad_norm": 2.094414234161377,
"learning_rate": 6.454818709544097e-05,
"loss": 2.1815,
"step": 381
},
{
"epoch": 0.8741418764302059,
"grad_norm": 2.299128293991089,
"learning_rate": 6.436701893170756e-05,
"loss": 2.5142,
"step": 382
},
{
"epoch": 0.8764302059496567,
"grad_norm": 2.0357043743133545,
"learning_rate": 6.4185644937485e-05,
"loss": 2.2609,
"step": 383
},
{
"epoch": 0.8787185354691075,
"grad_norm": 1.9757893085479736,
"learning_rate": 6.400406771124536e-05,
"loss": 2.0867,
"step": 384
},
{
"epoch": 0.8810068649885584,
"grad_norm": 1.849135160446167,
"learning_rate": 6.382228985437237e-05,
"loss": 2.2756,
"step": 385
},
{
"epoch": 0.8832951945080092,
"grad_norm": 2.372392177581787,
"learning_rate": 6.364031397112416e-05,
"loss": 2.1671,
"step": 386
},
{
"epoch": 0.88558352402746,
"grad_norm": 1.765740156173706,
"learning_rate": 6.345814266859581e-05,
"loss": 1.9694,
"step": 387
},
{
"epoch": 0.8878718535469108,
"grad_norm": 2.085186719894409,
"learning_rate": 6.327577855668216e-05,
"loss": 2.1867,
"step": 388
},
{
"epoch": 0.8901601830663616,
"grad_norm": 2.0762360095977783,
"learning_rate": 6.309322424804034e-05,
"loss": 2.3435,
"step": 389
},
{
"epoch": 0.8924485125858124,
"grad_norm": 1.8886313438415527,
"learning_rate": 6.291048235805234e-05,
"loss": 1.9982,
"step": 390
},
{
"epoch": 0.8947368421052632,
"grad_norm": 1.963932752609253,
"learning_rate": 6.272755550478757e-05,
"loss": 2.1239,
"step": 391
},
{
"epoch": 0.897025171624714,
"grad_norm": 1.6394884586334229,
"learning_rate": 6.254444630896529e-05,
"loss": 2.2411,
"step": 392
},
{
"epoch": 0.8993135011441648,
"grad_norm": 2.1238386631011963,
"learning_rate": 6.236115739391716e-05,
"loss": 2.1889,
"step": 393
},
{
"epoch": 0.9016018306636155,
"grad_norm": 1.8659511804580688,
"learning_rate": 6.21776913855496e-05,
"loss": 1.9483,
"step": 394
},
{
"epoch": 0.9038901601830663,
"grad_norm": 1.7186646461486816,
"learning_rate": 6.199405091230614e-05,
"loss": 1.8723,
"step": 395
},
{
"epoch": 0.9061784897025171,
"grad_norm": 2.3829565048217773,
"learning_rate": 6.181023860512984e-05,
"loss": 2.188,
"step": 396
},
{
"epoch": 0.9084668192219679,
"grad_norm": 1.926705002784729,
"learning_rate": 6.162625709742551e-05,
"loss": 2.1078,
"step": 397
},
{
"epoch": 0.9107551487414187,
"grad_norm": 1.8216861486434937,
"learning_rate": 6.144210902502207e-05,
"loss": 2.0508,
"step": 398
},
{
"epoch": 0.9130434782608695,
"grad_norm": 1.8305362462997437,
"learning_rate": 6.125779702613471e-05,
"loss": 2.0667,
"step": 399
},
{
"epoch": 0.9153318077803204,
"grad_norm": 1.6279104948043823,
"learning_rate": 6.107332374132715e-05,
"loss": 2.0211,
"step": 400
},
{
"epoch": 0.9176201372997712,
"grad_norm": 1.9483661651611328,
"learning_rate": 6.088869181347379e-05,
"loss": 1.9574,
"step": 401
},
{
"epoch": 0.919908466819222,
"grad_norm": 1.8065564632415771,
"learning_rate": 6.0703903887721837e-05,
"loss": 2.1856,
"step": 402
},
{
"epoch": 0.9221967963386728,
"grad_norm": 1.7617079019546509,
"learning_rate": 6.051896261145341e-05,
"loss": 2.0909,
"step": 403
},
{
"epoch": 0.9244851258581236,
"grad_norm": 2.038517951965332,
"learning_rate": 6.0333870634247645e-05,
"loss": 2.0191,
"step": 404
},
{
"epoch": 0.9267734553775744,
"grad_norm": 1.9872255325317383,
"learning_rate": 6.0148630607842706e-05,
"loss": 2.0042,
"step": 405
},
{
"epoch": 0.9290617848970252,
"grad_norm": 1.7905998229980469,
"learning_rate": 5.9963245186097725e-05,
"loss": 2.5172,
"step": 406
},
{
"epoch": 0.931350114416476,
"grad_norm": 1.7833902835845947,
"learning_rate": 5.977771702495497e-05,
"loss": 2.2731,
"step": 407
},
{
"epoch": 0.9336384439359268,
"grad_norm": 1.748382806777954,
"learning_rate": 5.95920487824016e-05,
"loss": 2.2315,
"step": 408
},
{
"epoch": 0.9359267734553776,
"grad_norm": 1.7928653955459595,
"learning_rate": 5.940624311843169e-05,
"loss": 1.9035,
"step": 409
},
{
"epoch": 0.9382151029748284,
"grad_norm": 1.7036387920379639,
"learning_rate": 5.922030269500809e-05,
"loss": 1.9166,
"step": 410
},
{
"epoch": 0.9405034324942791,
"grad_norm": 2.600257396697998,
"learning_rate": 5.9034230176024316e-05,
"loss": 1.9559,
"step": 411
},
{
"epoch": 0.9427917620137299,
"grad_norm": 1.8736798763275146,
"learning_rate": 5.8848028227266325e-05,
"loss": 1.9929,
"step": 412
},
{
"epoch": 0.9450800915331807,
"grad_norm": 1.917090892791748,
"learning_rate": 5.866169951637439e-05,
"loss": 2.1785,
"step": 413
},
{
"epoch": 0.9473684210526315,
"grad_norm": 2.0305333137512207,
"learning_rate": 5.847524671280484e-05,
"loss": 2.1782,
"step": 414
},
{
"epoch": 0.9496567505720824,
"grad_norm": 1.9978338479995728,
"learning_rate": 5.8288672487791854e-05,
"loss": 2.1131,
"step": 415
},
{
"epoch": 0.9519450800915332,
"grad_norm": 1.714215874671936,
"learning_rate": 5.810197951430911e-05,
"loss": 2.0684,
"step": 416
},
{
"epoch": 0.954233409610984,
"grad_norm": 1.8952488899230957,
"learning_rate": 5.7915170467031635e-05,
"loss": 2.2021,
"step": 417
},
{
"epoch": 0.9565217391304348,
"grad_norm": 2.1692068576812744,
"learning_rate": 5.772824802229733e-05,
"loss": 2.1982,
"step": 418
},
{
"epoch": 0.9588100686498856,
"grad_norm": 1.9838628768920898,
"learning_rate": 5.7541214858068705e-05,
"loss": 1.9893,
"step": 419
},
{
"epoch": 0.9610983981693364,
"grad_norm": 1.965591311454773,
"learning_rate": 5.735407365389453e-05,
"loss": 2.2526,
"step": 420
},
{
"epoch": 0.9633867276887872,
"grad_norm": 1.8901221752166748,
"learning_rate": 5.716682709087139e-05,
"loss": 2.1193,
"step": 421
},
{
"epoch": 0.965675057208238,
"grad_norm": 1.8736552000045776,
"learning_rate": 5.697947785160532e-05,
"loss": 1.9734,
"step": 422
},
{
"epoch": 0.9679633867276888,
"grad_norm": 1.5453659296035767,
"learning_rate": 5.679202862017338e-05,
"loss": 2.1037,
"step": 423
},
{
"epoch": 0.9702517162471396,
"grad_norm": 2.2312018871307373,
"learning_rate": 5.660448208208513e-05,
"loss": 2.2064,
"step": 424
},
{
"epoch": 0.9725400457665904,
"grad_norm": 2.2896294593811035,
"learning_rate": 5.641684092424421e-05,
"loss": 2.1111,
"step": 425
},
{
"epoch": 0.9748283752860412,
"grad_norm": 1.7300447225570679,
"learning_rate": 5.622910783490988e-05,
"loss": 2.1577,
"step": 426
},
{
"epoch": 0.977116704805492,
"grad_norm": 1.988135576248169,
"learning_rate": 5.604128550365845e-05,
"loss": 2.0322,
"step": 427
},
{
"epoch": 0.9794050343249427,
"grad_norm": 1.9037761688232422,
"learning_rate": 5.585337662134471e-05,
"loss": 2.351,
"step": 428
},
{
"epoch": 0.9816933638443935,
"grad_norm": 1.5671565532684326,
"learning_rate": 5.56653838800635e-05,
"loss": 1.8901,
"step": 429
},
{
"epoch": 0.9839816933638444,
"grad_norm": 1.687756896018982,
"learning_rate": 5.5477309973111046e-05,
"loss": 2.2379,
"step": 430
},
{
"epoch": 0.9862700228832952,
"grad_norm": 1.8213844299316406,
"learning_rate": 5.52891575949464e-05,
"loss": 2.1647,
"step": 431
},
{
"epoch": 0.988558352402746,
"grad_norm": 2.0748496055603027,
"learning_rate": 5.510092944115286e-05,
"loss": 2.0475,
"step": 432
},
{
"epoch": 0.9908466819221968,
"grad_norm": 1.5618698596954346,
"learning_rate": 5.4912628208399294e-05,
"loss": 1.9637,
"step": 433
},
{
"epoch": 0.9931350114416476,
"grad_norm": 2.596681594848633,
"learning_rate": 5.472425659440157e-05,
"loss": 2.408,
"step": 434
},
{
"epoch": 0.9954233409610984,
"grad_norm": 1.877379059791565,
"learning_rate": 5.4535817297883876e-05,
"loss": 2.009,
"step": 435
},
{
"epoch": 0.9977116704805492,
"grad_norm": 2.1219913959503174,
"learning_rate": 5.4347313018540056e-05,
"loss": 2.0394,
"step": 436
},
{
"epoch": 1.0,
"grad_norm": 1.9108868837356567,
"learning_rate": 5.415874645699492e-05,
"loss": 2.2033,
"step": 437
},
{
"epoch": 1.002288329519451,
"grad_norm": 1.572576642036438,
"learning_rate": 5.397012031476562e-05,
"loss": 1.9969,
"step": 438
},
{
"epoch": 1.0045766590389016,
"grad_norm": 1.759139895439148,
"learning_rate": 5.3781437294222845e-05,
"loss": 1.8446,
"step": 439
},
{
"epoch": 1.0068649885583525,
"grad_norm": 1.6574208736419678,
"learning_rate": 5.359270009855216e-05,
"loss": 1.8921,
"step": 440
},
{
"epoch": 1.0091533180778032,
"grad_norm": 1.57534921169281,
"learning_rate": 5.340391143171535e-05,
"loss": 1.7728,
"step": 441
},
{
"epoch": 1.011441647597254,
"grad_norm": 2.2976841926574707,
"learning_rate": 5.321507399841148e-05,
"loss": 1.9545,
"step": 442
},
{
"epoch": 1.0137299771167048,
"grad_norm": 2.1588919162750244,
"learning_rate": 5.302619050403836e-05,
"loss": 1.8841,
"step": 443
},
{
"epoch": 1.0160183066361557,
"grad_norm": 1.7370262145996094,
"learning_rate": 5.2837263654653715e-05,
"loss": 1.6826,
"step": 444
},
{
"epoch": 1.0183066361556063,
"grad_norm": 1.791210651397705,
"learning_rate": 5.264829615693632e-05,
"loss": 1.8758,
"step": 445
},
{
"epoch": 1.0205949656750573,
"grad_norm": 1.655311942100525,
"learning_rate": 5.2459290718147344e-05,
"loss": 1.7852,
"step": 446
},
{
"epoch": 1.022883295194508,
"grad_norm": 1.5473575592041016,
"learning_rate": 5.2270250046091565e-05,
"loss": 1.8484,
"step": 447
},
{
"epoch": 1.0251716247139588,
"grad_norm": 1.887047529220581,
"learning_rate": 5.2081176849078464e-05,
"loss": 1.9274,
"step": 448
},
{
"epoch": 1.0274599542334095,
"grad_norm": 1.7386250495910645,
"learning_rate": 5.1892073835883524e-05,
"loss": 1.8835,
"step": 449
},
{
"epoch": 1.0297482837528604,
"grad_norm": 1.702992558479309,
"learning_rate": 5.170294371570939e-05,
"loss": 1.7701,
"step": 450
},
{
"epoch": 1.032036613272311,
"grad_norm": 1.6595892906188965,
"learning_rate": 5.151378919814708e-05,
"loss": 1.8446,
"step": 451
},
{
"epoch": 1.034324942791762,
"grad_norm": 1.7467985153198242,
"learning_rate": 5.132461299313709e-05,
"loss": 1.9419,
"step": 452
},
{
"epoch": 1.036613272311213,
"grad_norm": 2.760329484939575,
"learning_rate": 5.113541781093067e-05,
"loss": 1.9143,
"step": 453
},
{
"epoch": 1.0389016018306636,
"grad_norm": 1.7901750802993774,
"learning_rate": 5.094620636205095e-05,
"loss": 1.8533,
"step": 454
},
{
"epoch": 1.0411899313501145,
"grad_norm": 1.7374346256256104,
"learning_rate": 5.0756981357254086e-05,
"loss": 1.7943,
"step": 455
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.5613045692443848,
"learning_rate": 5.056774550749043e-05,
"loss": 1.6596,
"step": 456
},
{
"epoch": 1.045766590389016,
"grad_norm": 1.5850396156311035,
"learning_rate": 5.037850152386574e-05,
"loss": 1.9496,
"step": 457
},
{
"epoch": 1.0480549199084668,
"grad_norm": 1.5969269275665283,
"learning_rate": 5.018925211760227e-05,
"loss": 1.9164,
"step": 458
},
{
"epoch": 1.0503432494279177,
"grad_norm": 1.6653907299041748,
"learning_rate": 5e-05,
"loss": 1.6898,
"step": 459
},
{
"epoch": 1.0526315789473684,
"grad_norm": 1.9547151327133179,
"learning_rate": 4.981074788239773e-05,
"loss": 2.0595,
"step": 460
},
{
"epoch": 1.0549199084668193,
"grad_norm": 1.6709939241409302,
"learning_rate": 4.962149847613428e-05,
"loss": 1.8511,
"step": 461
},
{
"epoch": 1.05720823798627,
"grad_norm": 1.5872114896774292,
"learning_rate": 4.943225449250958e-05,
"loss": 1.866,
"step": 462
},
{
"epoch": 1.0594965675057209,
"grad_norm": 1.5942639112472534,
"learning_rate": 4.9243018642745926e-05,
"loss": 1.7682,
"step": 463
},
{
"epoch": 1.0617848970251715,
"grad_norm": 1.9201196432113647,
"learning_rate": 4.9053793637949067e-05,
"loss": 1.9174,
"step": 464
},
{
"epoch": 1.0640732265446224,
"grad_norm": 3.5494394302368164,
"learning_rate": 4.886458218906934e-05,
"loss": 1.8625,
"step": 465
},
{
"epoch": 1.0663615560640731,
"grad_norm": 1.7486943006515503,
"learning_rate": 4.8675387006862914e-05,
"loss": 1.8216,
"step": 466
},
{
"epoch": 1.068649885583524,
"grad_norm": 2.6902172565460205,
"learning_rate": 4.8486210801852946e-05,
"loss": 2.0201,
"step": 467
},
{
"epoch": 1.070938215102975,
"grad_norm": 1.8246062994003296,
"learning_rate": 4.829705628429061e-05,
"loss": 1.8379,
"step": 468
},
{
"epoch": 1.0732265446224256,
"grad_norm": 1.6337534189224243,
"learning_rate": 4.810792616411649e-05,
"loss": 1.782,
"step": 469
},
{
"epoch": 1.0755148741418765,
"grad_norm": 1.5864002704620361,
"learning_rate": 4.7918823150921555e-05,
"loss": 1.7724,
"step": 470
},
{
"epoch": 1.0778032036613272,
"grad_norm": 1.5951626300811768,
"learning_rate": 4.772974995390845e-05,
"loss": 1.9832,
"step": 471
},
{
"epoch": 1.080091533180778,
"grad_norm": 7.005580425262451,
"learning_rate": 4.754070928185266e-05,
"loss": 1.8087,
"step": 472
},
{
"epoch": 1.0823798627002288,
"grad_norm": 1.8255860805511475,
"learning_rate": 4.735170384306371e-05,
"loss": 1.8183,
"step": 473
},
{
"epoch": 1.0846681922196797,
"grad_norm": 1.7794055938720703,
"learning_rate": 4.7162736345346303e-05,
"loss": 1.9306,
"step": 474
},
{
"epoch": 1.0869565217391304,
"grad_norm": 3.2093281745910645,
"learning_rate": 4.6973809495961635e-05,
"loss": 1.9895,
"step": 475
},
{
"epoch": 1.0892448512585813,
"grad_norm": 1.7342705726623535,
"learning_rate": 4.6784926001588544e-05,
"loss": 1.7491,
"step": 476
},
{
"epoch": 1.091533180778032,
"grad_norm": 1.744227647781372,
"learning_rate": 4.659608856828467e-05,
"loss": 1.8527,
"step": 477
},
{
"epoch": 1.0938215102974829,
"grad_norm": 1.5981770753860474,
"learning_rate": 4.640729990144784e-05,
"loss": 1.8146,
"step": 478
},
{
"epoch": 1.0961098398169336,
"grad_norm": 1.6575838327407837,
"learning_rate": 4.621856270577718e-05,
"loss": 1.6702,
"step": 479
},
{
"epoch": 1.0983981693363845,
"grad_norm": 2.154350519180298,
"learning_rate": 4.6029879685234395e-05,
"loss": 1.8477,
"step": 480
},
{
"epoch": 1.1006864988558351,
"grad_norm": 1.6603792905807495,
"learning_rate": 4.584125354300508e-05,
"loss": 1.6785,
"step": 481
},
{
"epoch": 1.102974828375286,
"grad_norm": 2.6443369388580322,
"learning_rate": 4.565268698145997e-05,
"loss": 1.9084,
"step": 482
},
{
"epoch": 1.1052631578947367,
"grad_norm": 1.8790538311004639,
"learning_rate": 4.5464182702116135e-05,
"loss": 1.8971,
"step": 483
},
{
"epoch": 1.1075514874141876,
"grad_norm": 1.7948126792907715,
"learning_rate": 4.527574340559844e-05,
"loss": 1.8418,
"step": 484
},
{
"epoch": 1.1098398169336385,
"grad_norm": 1.5553560256958008,
"learning_rate": 4.508737179160072e-05,
"loss": 1.7311,
"step": 485
},
{
"epoch": 1.1121281464530892,
"grad_norm": 1.7662110328674316,
"learning_rate": 4.4899070558847154e-05,
"loss": 1.9865,
"step": 486
},
{
"epoch": 1.1144164759725401,
"grad_norm": 1.7766494750976562,
"learning_rate": 4.47108424050536e-05,
"loss": 1.7453,
"step": 487
},
{
"epoch": 1.1167048054919908,
"grad_norm": 1.8124011754989624,
"learning_rate": 4.452269002688897e-05,
"loss": 1.7079,
"step": 488
},
{
"epoch": 1.1189931350114417,
"grad_norm": 1.914129376411438,
"learning_rate": 4.433461611993651e-05,
"loss": 1.8638,
"step": 489
},
{
"epoch": 1.1212814645308924,
"grad_norm": 2.121391773223877,
"learning_rate": 4.4146623378655296e-05,
"loss": 1.8618,
"step": 490
},
{
"epoch": 1.1235697940503433,
"grad_norm": 1.6572754383087158,
"learning_rate": 4.3958714496341576e-05,
"loss": 1.7002,
"step": 491
},
{
"epoch": 1.125858123569794,
"grad_norm": 1.6960413455963135,
"learning_rate": 4.3770892165090126e-05,
"loss": 1.7757,
"step": 492
},
{
"epoch": 1.1281464530892449,
"grad_norm": 1.6742331981658936,
"learning_rate": 4.358315907575579e-05,
"loss": 1.9089,
"step": 493
},
{
"epoch": 1.1304347826086956,
"grad_norm": 1.587895393371582,
"learning_rate": 4.3395517917914895e-05,
"loss": 1.8139,
"step": 494
},
{
"epoch": 1.1327231121281465,
"grad_norm": 1.6868927478790283,
"learning_rate": 4.3207971379826634e-05,
"loss": 1.8067,
"step": 495
},
{
"epoch": 1.1350114416475972,
"grad_norm": 1.5678075551986694,
"learning_rate": 4.3020522148394676e-05,
"loss": 1.7984,
"step": 496
},
{
"epoch": 1.137299771167048,
"grad_norm": 1.5832632780075073,
"learning_rate": 4.283317290912863e-05,
"loss": 1.9427,
"step": 497
},
{
"epoch": 1.139588100686499,
"grad_norm": 2.134333372116089,
"learning_rate": 4.2645926346105484e-05,
"loss": 1.9557,
"step": 498
},
{
"epoch": 1.1418764302059496,
"grad_norm": 1.93275785446167,
"learning_rate": 4.2458785141931314e-05,
"loss": 1.8765,
"step": 499
},
{
"epoch": 1.1441647597254005,
"grad_norm": 1.618548035621643,
"learning_rate": 4.22717519777027e-05,
"loss": 1.8844,
"step": 500
},
{
"epoch": 1.1464530892448512,
"grad_norm": 1.5336642265319824,
"learning_rate": 4.208482953296838e-05,
"loss": 1.7973,
"step": 501
},
{
"epoch": 1.1487414187643021,
"grad_norm": 1.706456184387207,
"learning_rate": 4.189802048569089e-05,
"loss": 1.8691,
"step": 502
},
{
"epoch": 1.1510297482837528,
"grad_norm": 1.6857644319534302,
"learning_rate": 4.171132751220818e-05,
"loss": 2.0299,
"step": 503
},
{
"epoch": 1.1533180778032037,
"grad_norm": 1.629396915435791,
"learning_rate": 4.1524753287195165e-05,
"loss": 1.8305,
"step": 504
},
{
"epoch": 1.1556064073226544,
"grad_norm": 1.6303800344467163,
"learning_rate": 4.1338300483625615e-05,
"loss": 1.9312,
"step": 505
},
{
"epoch": 1.1578947368421053,
"grad_norm": 1.5684703588485718,
"learning_rate": 4.1151971772733686e-05,
"loss": 1.8142,
"step": 506
},
{
"epoch": 1.160183066361556,
"grad_norm": 1.51244056224823,
"learning_rate": 4.0965769823975696e-05,
"loss": 1.6742,
"step": 507
},
{
"epoch": 1.162471395881007,
"grad_norm": 1.5296814441680908,
"learning_rate": 4.07796973049919e-05,
"loss": 1.8872,
"step": 508
},
{
"epoch": 1.1647597254004576,
"grad_norm": 1.592588186264038,
"learning_rate": 4.059375688156832e-05,
"loss": 1.8124,
"step": 509
},
{
"epoch": 1.1670480549199085,
"grad_norm": 1.806203842163086,
"learning_rate": 4.04079512175984e-05,
"loss": 1.9555,
"step": 510
},
{
"epoch": 1.1693363844393594,
"grad_norm": 1.7168980836868286,
"learning_rate": 4.022228297504503e-05,
"loss": 2.0149,
"step": 511
},
{
"epoch": 1.17162471395881,
"grad_norm": 1.7011675834655762,
"learning_rate": 4.003675481390228e-05,
"loss": 1.907,
"step": 512
},
{
"epoch": 1.1739130434782608,
"grad_norm": 1.7287613153457642,
"learning_rate": 3.985136939215731e-05,
"loss": 1.7924,
"step": 513
},
{
"epoch": 1.1762013729977117,
"grad_norm": 2.2461440563201904,
"learning_rate": 3.966612936575235e-05,
"loss": 1.9193,
"step": 514
},
{
"epoch": 1.1784897025171626,
"grad_norm": 1.623975396156311,
"learning_rate": 3.94810373885466e-05,
"loss": 1.8328,
"step": 515
},
{
"epoch": 1.1807780320366132,
"grad_norm": 1.6067254543304443,
"learning_rate": 3.929609611227817e-05,
"loss": 1.8484,
"step": 516
},
{
"epoch": 1.1830663615560641,
"grad_norm": 1.6115238666534424,
"learning_rate": 3.911130818652621e-05,
"loss": 1.7091,
"step": 517
},
{
"epoch": 1.1853546910755148,
"grad_norm": 1.6530563831329346,
"learning_rate": 3.8926676258672866e-05,
"loss": 1.9627,
"step": 518
},
{
"epoch": 1.1876430205949657,
"grad_norm": 1.493896484375,
"learning_rate": 3.87422029738653e-05,
"loss": 1.6187,
"step": 519
},
{
"epoch": 1.1899313501144164,
"grad_norm": 6.4736647605896,
"learning_rate": 3.855789097497794e-05,
"loss": 1.7145,
"step": 520
},
{
"epoch": 1.1922196796338673,
"grad_norm": 1.6431195735931396,
"learning_rate": 3.837374290257449e-05,
"loss": 1.8529,
"step": 521
},
{
"epoch": 1.194508009153318,
"grad_norm": 1.9539908170700073,
"learning_rate": 3.818976139487017e-05,
"loss": 1.8584,
"step": 522
},
{
"epoch": 1.196796338672769,
"grad_norm": 1.7939807176589966,
"learning_rate": 3.800594908769385e-05,
"loss": 1.9842,
"step": 523
},
{
"epoch": 1.1990846681922196,
"grad_norm": 1.9472920894622803,
"learning_rate": 3.7822308614450406e-05,
"loss": 1.8557,
"step": 524
},
{
"epoch": 1.2013729977116705,
"grad_norm": 1.4854570627212524,
"learning_rate": 3.763884260608284e-05,
"loss": 1.7276,
"step": 525
},
{
"epoch": 1.2036613272311212,
"grad_norm": 1.7153141498565674,
"learning_rate": 3.745555369103471e-05,
"loss": 1.8041,
"step": 526
},
{
"epoch": 1.205949656750572,
"grad_norm": 1.8074935674667358,
"learning_rate": 3.727244449521245e-05,
"loss": 1.8617,
"step": 527
},
{
"epoch": 1.208237986270023,
"grad_norm": 1.734864354133606,
"learning_rate": 3.708951764194767e-05,
"loss": 1.7278,
"step": 528
},
{
"epoch": 1.2105263157894737,
"grad_norm": 1.639548659324646,
"learning_rate": 3.690677575195967e-05,
"loss": 2.0274,
"step": 529
},
{
"epoch": 1.2128146453089246,
"grad_norm": 1.52273428440094,
"learning_rate": 3.6724221443317855e-05,
"loss": 1.6321,
"step": 530
},
{
"epoch": 1.2151029748283753,
"grad_norm": 2.152561664581299,
"learning_rate": 3.65418573314042e-05,
"loss": 1.8368,
"step": 531
},
{
"epoch": 1.2173913043478262,
"grad_norm": 2.027317523956299,
"learning_rate": 3.635968602887585e-05,
"loss": 1.8566,
"step": 532
},
{
"epoch": 1.2196796338672768,
"grad_norm": 1.5637290477752686,
"learning_rate": 3.6177710145627635e-05,
"loss": 1.762,
"step": 533
},
{
"epoch": 1.2219679633867278,
"grad_norm": 2.137563705444336,
"learning_rate": 3.599593228875465e-05,
"loss": 1.8883,
"step": 534
},
{
"epoch": 1.2242562929061784,
"grad_norm": 1.9149296283721924,
"learning_rate": 3.581435506251501e-05,
"loss": 1.779,
"step": 535
},
{
"epoch": 1.2265446224256293,
"grad_norm": 1.6718182563781738,
"learning_rate": 3.563298106829244e-05,
"loss": 1.8807,
"step": 536
},
{
"epoch": 1.22883295194508,
"grad_norm": 1.6503219604492188,
"learning_rate": 3.545181290455904e-05,
"loss": 1.9217,
"step": 537
},
{
"epoch": 1.231121281464531,
"grad_norm": 1.499015212059021,
"learning_rate": 3.527085316683805e-05,
"loss": 1.7497,
"step": 538
},
{
"epoch": 1.2334096109839816,
"grad_norm": 2.4825656414031982,
"learning_rate": 3.509010444766674e-05,
"loss": 1.8272,
"step": 539
},
{
"epoch": 1.2356979405034325,
"grad_norm": 2.2124814987182617,
"learning_rate": 3.490956933655909e-05,
"loss": 1.9529,
"step": 540
},
{
"epoch": 1.2379862700228834,
"grad_norm": 1.762706995010376,
"learning_rate": 3.4729250419968906e-05,
"loss": 1.7847,
"step": 541
},
{
"epoch": 1.240274599542334,
"grad_norm": 1.6821390390396118,
"learning_rate": 3.4549150281252636e-05,
"loss": 1.8104,
"step": 542
},
{
"epoch": 1.2425629290617848,
"grad_norm": 1.624935507774353,
"learning_rate": 3.4369271500632336e-05,
"loss": 2.032,
"step": 543
},
{
"epoch": 1.2448512585812357,
"grad_norm": 1.6413863897323608,
"learning_rate": 3.41896166551588e-05,
"loss": 1.9053,
"step": 544
},
{
"epoch": 1.2471395881006866,
"grad_norm": 1.6230844259262085,
"learning_rate": 3.4010188318674614e-05,
"loss": 1.8971,
"step": 545
},
{
"epoch": 1.2494279176201373,
"grad_norm": 1.674910068511963,
"learning_rate": 3.383098906177719e-05,
"loss": 1.7863,
"step": 546
},
{
"epoch": 1.2517162471395882,
"grad_norm": 1.662138819694519,
"learning_rate": 3.365202145178205e-05,
"loss": 1.8283,
"step": 547
},
{
"epoch": 1.2540045766590389,
"grad_norm": 1.662114143371582,
"learning_rate": 3.347328805268605e-05,
"loss": 1.9365,
"step": 548
},
{
"epoch": 1.2562929061784898,
"grad_norm": 1.711565375328064,
"learning_rate": 3.329479142513051e-05,
"loss": 1.77,
"step": 549
},
{
"epoch": 1.2585812356979404,
"grad_norm": 1.7137374877929688,
"learning_rate": 3.3116534126364685e-05,
"loss": 1.7756,
"step": 550
},
{
"epoch": 1.2608695652173914,
"grad_norm": 1.5832325220108032,
"learning_rate": 3.293851871020905e-05,
"loss": 1.889,
"step": 551
},
{
"epoch": 1.263157894736842,
"grad_norm": 1.812990665435791,
"learning_rate": 3.2760747727018694e-05,
"loss": 1.9248,
"step": 552
},
{
"epoch": 1.265446224256293,
"grad_norm": 1.8253705501556396,
"learning_rate": 3.2583223723646836e-05,
"loss": 1.8042,
"step": 553
},
{
"epoch": 1.2677345537757438,
"grad_norm": 1.796898365020752,
"learning_rate": 3.240594924340835e-05,
"loss": 1.8171,
"step": 554
},
{
"epoch": 1.2700228832951945,
"grad_norm": 1.658321738243103,
"learning_rate": 3.2228926826043224e-05,
"loss": 1.9478,
"step": 555
},
{
"epoch": 1.2723112128146452,
"grad_norm": 1.7326852083206177,
"learning_rate": 3.205215900768029e-05,
"loss": 1.926,
"step": 556
},
{
"epoch": 1.2745995423340961,
"grad_norm": 1.5755597352981567,
"learning_rate": 3.187564832080084e-05,
"loss": 1.7267,
"step": 557
},
{
"epoch": 1.276887871853547,
"grad_norm": 1.575656771659851,
"learning_rate": 3.169939729420233e-05,
"loss": 1.7673,
"step": 558
},
{
"epoch": 1.2791762013729977,
"grad_norm": 1.5404809713363647,
"learning_rate": 3.152340845296216e-05,
"loss": 1.6433,
"step": 559
},
{
"epoch": 1.2814645308924484,
"grad_norm": 17.19310188293457,
"learning_rate": 3.1347684318401536e-05,
"loss": 1.7839,
"step": 560
},
{
"epoch": 1.2837528604118993,
"grad_norm": 1.5951805114746094,
"learning_rate": 3.11722274080493e-05,
"loss": 1.8625,
"step": 561
},
{
"epoch": 1.2860411899313502,
"grad_norm": 1.5244046449661255,
"learning_rate": 3.099704023560587e-05,
"loss": 1.7633,
"step": 562
},
{
"epoch": 1.2883295194508009,
"grad_norm": 1.6716521978378296,
"learning_rate": 3.08221253109073e-05,
"loss": 1.8701,
"step": 563
},
{
"epoch": 1.2906178489702518,
"grad_norm": 1.4705312252044678,
"learning_rate": 3.0647485139889145e-05,
"loss": 1.6322,
"step": 564
},
{
"epoch": 1.2929061784897025,
"grad_norm": 1.5014140605926514,
"learning_rate": 3.0473122224550787e-05,
"loss": 1.8418,
"step": 565
},
{
"epoch": 1.2951945080091534,
"grad_norm": 1.6687349081039429,
"learning_rate": 3.0299039062919416e-05,
"loss": 1.7194,
"step": 566
},
{
"epoch": 1.297482837528604,
"grad_norm": 1.6681313514709473,
"learning_rate": 3.01252381490143e-05,
"loss": 1.6571,
"step": 567
},
{
"epoch": 1.299771167048055,
"grad_norm": 1.585770845413208,
"learning_rate": 2.995172197281113e-05,
"loss": 1.7169,
"step": 568
},
{
"epoch": 1.3020594965675056,
"grad_norm": 1.4088305234909058,
"learning_rate": 2.9778493020206154e-05,
"loss": 1.6839,
"step": 569
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.600642442703247,
"learning_rate": 2.96055537729808e-05,
"loss": 1.728,
"step": 570
},
{
"epoch": 1.3066361556064074,
"grad_norm": 1.5582667589187622,
"learning_rate": 2.943290670876595e-05,
"loss": 1.627,
"step": 571
},
{
"epoch": 1.3089244851258581,
"grad_norm": 1.6819640398025513,
"learning_rate": 2.926055430100647e-05,
"loss": 1.6819,
"step": 572
},
{
"epoch": 1.3112128146453088,
"grad_norm": 1.786377191543579,
"learning_rate": 2.908849901892587e-05,
"loss": 1.8909,
"step": 573
},
{
"epoch": 1.3135011441647597,
"grad_norm": 4.219162940979004,
"learning_rate": 2.8916743327490803e-05,
"loss": 1.8257,
"step": 574
},
{
"epoch": 1.3157894736842106,
"grad_norm": 1.5954387187957764,
"learning_rate": 2.8745289687375843e-05,
"loss": 1.8058,
"step": 575
},
{
"epoch": 1.3180778032036613,
"grad_norm": 1.7538143396377563,
"learning_rate": 2.8574140554928175e-05,
"loss": 1.8492,
"step": 576
},
{
"epoch": 1.3203661327231122,
"grad_norm": 1.6355198621749878,
"learning_rate": 2.8403298382132437e-05,
"loss": 1.7393,
"step": 577
},
{
"epoch": 1.322654462242563,
"grad_norm": 1.6124778985977173,
"learning_rate": 2.8232765616575563e-05,
"loss": 1.8442,
"step": 578
},
{
"epoch": 1.3249427917620138,
"grad_norm": 2.022512435913086,
"learning_rate": 2.8062544701411742e-05,
"loss": 1.7092,
"step": 579
},
{
"epoch": 1.3272311212814645,
"grad_norm": 1.6726174354553223,
"learning_rate": 2.789263807532746e-05,
"loss": 1.8071,
"step": 580
},
{
"epoch": 1.3295194508009154,
"grad_norm": 1.6086249351501465,
"learning_rate": 2.7723048172506395e-05,
"loss": 1.6439,
"step": 581
},
{
"epoch": 1.331807780320366,
"grad_norm": 1.6463301181793213,
"learning_rate": 2.7553777422594774e-05,
"loss": 1.7961,
"step": 582
},
{
"epoch": 1.334096109839817,
"grad_norm": 1.545799732208252,
"learning_rate": 2.7384828250666396e-05,
"loss": 1.7943,
"step": 583
},
{
"epoch": 1.3363844393592679,
"grad_norm": 1.6468026638031006,
"learning_rate": 2.721620307718793e-05,
"loss": 1.904,
"step": 584
},
{
"epoch": 1.3386727688787186,
"grad_norm": 1.6222645044326782,
"learning_rate": 2.7047904317984273e-05,
"loss": 1.8544,
"step": 585
},
{
"epoch": 1.3409610983981692,
"grad_norm": 1.4929869174957275,
"learning_rate": 2.687993438420392e-05,
"loss": 1.7437,
"step": 586
},
{
"epoch": 1.3432494279176201,
"grad_norm": 1.4991482496261597,
"learning_rate": 2.6712295682284403e-05,
"loss": 1.7539,
"step": 587
},
{
"epoch": 1.345537757437071,
"grad_norm": 1.6057149171829224,
"learning_rate": 2.65449906139178e-05,
"loss": 1.8114,
"step": 588
},
{
"epoch": 1.3478260869565217,
"grad_norm": 1.633756399154663,
"learning_rate": 2.6378021576016466e-05,
"loss": 1.8485,
"step": 589
},
{
"epoch": 1.3501144164759724,
"grad_norm": 1.4756066799163818,
"learning_rate": 2.6211390960678413e-05,
"loss": 1.7935,
"step": 590
},
{
"epoch": 1.3524027459954233,
"grad_norm": 1.4749020338058472,
"learning_rate": 2.604510115515336e-05,
"loss": 1.7995,
"step": 591
},
{
"epoch": 1.3546910755148742,
"grad_norm": 1.4876452684402466,
"learning_rate": 2.5879154541808337e-05,
"loss": 1.6321,
"step": 592
},
{
"epoch": 1.356979405034325,
"grad_norm": 1.6898012161254883,
"learning_rate": 2.5713553498093505e-05,
"loss": 1.7485,
"step": 593
},
{
"epoch": 1.3592677345537758,
"grad_norm": 1.4891339540481567,
"learning_rate": 2.554830039650834e-05,
"loss": 1.8683,
"step": 594
},
{
"epoch": 1.3615560640732265,
"grad_norm": 1.6826235055923462,
"learning_rate": 2.5383397604567394e-05,
"loss": 1.8593,
"step": 595
},
{
"epoch": 1.3638443935926774,
"grad_norm": 3.089046001434326,
"learning_rate": 2.5218847484766495e-05,
"loss": 1.6196,
"step": 596
},
{
"epoch": 1.366132723112128,
"grad_norm": 1.8149257898330688,
"learning_rate": 2.5054652394548893e-05,
"loss": 1.7911,
"step": 597
},
{
"epoch": 1.368421052631579,
"grad_norm": 1.497933030128479,
"learning_rate": 2.4890814686271448e-05,
"loss": 1.7172,
"step": 598
},
{
"epoch": 1.3707093821510297,
"grad_norm": 1.5548969507217407,
"learning_rate": 2.4727336707170973e-05,
"loss": 1.9158,
"step": 599
},
{
"epoch": 1.3729977116704806,
"grad_norm": 1.4351743459701538,
"learning_rate": 2.456422079933056e-05,
"loss": 1.6162,
"step": 600
},
{
"epoch": 1.3752860411899315,
"grad_norm": 1.5610346794128418,
"learning_rate": 2.4401469299646133e-05,
"loss": 1.7969,
"step": 601
},
{
"epoch": 1.3775743707093822,
"grad_norm": 1.6332178115844727,
"learning_rate": 2.4239084539792745e-05,
"loss": 1.9592,
"step": 602
},
{
"epoch": 1.3798627002288328,
"grad_norm": 1.6217281818389893,
"learning_rate": 2.4077068846191453e-05,
"loss": 1.8602,
"step": 603
},
{
"epoch": 1.3821510297482837,
"grad_norm": 1.6409474611282349,
"learning_rate": 2.391542453997578e-05,
"loss": 1.6992,
"step": 604
},
{
"epoch": 1.3844393592677346,
"grad_norm": 1.5256273746490479,
"learning_rate": 2.375415393695854e-05,
"loss": 1.7704,
"step": 605
},
{
"epoch": 1.3867276887871853,
"grad_norm": 1.5644723176956177,
"learning_rate": 2.3593259347598657e-05,
"loss": 1.946,
"step": 606
},
{
"epoch": 1.3890160183066362,
"grad_norm": 1.4548919200897217,
"learning_rate": 2.3432743076968066e-05,
"loss": 1.8726,
"step": 607
},
{
"epoch": 1.391304347826087,
"grad_norm": 2.5512516498565674,
"learning_rate": 2.3272607424718675e-05,
"loss": 1.8147,
"step": 608
},
{
"epoch": 1.3935926773455378,
"grad_norm": 1.6066017150878906,
"learning_rate": 2.3112854685049397e-05,
"loss": 1.6665,
"step": 609
},
{
"epoch": 1.3958810068649885,
"grad_norm": 1.8042073249816895,
"learning_rate": 2.29534871466734e-05,
"loss": 1.823,
"step": 610
},
{
"epoch": 1.3981693363844394,
"grad_norm": 1.5607564449310303,
"learning_rate": 2.2794507092785106e-05,
"loss": 1.7391,
"step": 611
},
{
"epoch": 1.40045766590389,
"grad_norm": 1.4327287673950195,
"learning_rate": 2.2635916801027706e-05,
"loss": 1.651,
"step": 612
},
{
"epoch": 1.402745995423341,
"grad_norm": 1.4062193632125854,
"learning_rate": 2.2477718543460373e-05,
"loss": 1.5554,
"step": 613
},
{
"epoch": 1.4050343249427917,
"grad_norm": 1.5095700025558472,
"learning_rate": 2.2319914586525777e-05,
"loss": 1.7648,
"step": 614
},
{
"epoch": 1.4073226544622426,
"grad_norm": 1.5155692100524902,
"learning_rate": 2.21625071910176e-05,
"loss": 1.6781,
"step": 615
},
{
"epoch": 1.4096109839816933,
"grad_norm": 1.6352697610855103,
"learning_rate": 2.2005498612048155e-05,
"loss": 1.8137,
"step": 616
},
{
"epoch": 1.4118993135011442,
"grad_norm": 1.5272334814071655,
"learning_rate": 2.1848891099016057e-05,
"loss": 1.8767,
"step": 617
},
{
"epoch": 1.414187643020595,
"grad_norm": 2.0643205642700195,
"learning_rate": 2.1692686895574005e-05,
"loss": 1.9925,
"step": 618
},
{
"epoch": 1.4164759725400458,
"grad_norm": 1.6766259670257568,
"learning_rate": 2.1536888239596714e-05,
"loss": 1.7768,
"step": 619
},
{
"epoch": 1.4187643020594964,
"grad_norm": 1.6757736206054688,
"learning_rate": 2.1381497363148673e-05,
"loss": 1.7757,
"step": 620
},
{
"epoch": 1.4210526315789473,
"grad_norm": 1.4198200702667236,
"learning_rate": 2.1226516492452336e-05,
"loss": 1.7404,
"step": 621
},
{
"epoch": 1.4233409610983982,
"grad_norm": 1.577269196510315,
"learning_rate": 2.1071947847856222e-05,
"loss": 1.9389,
"step": 622
},
{
"epoch": 1.425629290617849,
"grad_norm": 1.3547738790512085,
"learning_rate": 2.091779364380293e-05,
"loss": 1.6328,
"step": 623
},
{
"epoch": 1.4279176201372998,
"grad_norm": 1.553648591041565,
"learning_rate": 2.0764056088797645e-05,
"loss": 1.9481,
"step": 624
},
{
"epoch": 1.4302059496567505,
"grad_norm": 1.61307954788208,
"learning_rate": 2.061073738537635e-05,
"loss": 1.7224,
"step": 625
},
{
"epoch": 1.4324942791762014,
"grad_norm": 1.4605125188827515,
"learning_rate": 2.045783973007429e-05,
"loss": 1.6555,
"step": 626
},
{
"epoch": 1.434782608695652,
"grad_norm": 1.5534050464630127,
"learning_rate": 2.030536531339456e-05,
"loss": 1.7746,
"step": 627
},
{
"epoch": 1.437070938215103,
"grad_norm": 1.6186987161636353,
"learning_rate": 2.0153316319776662e-05,
"loss": 1.755,
"step": 628
},
{
"epoch": 1.4393592677345537,
"grad_norm": 1.6184231042861938,
"learning_rate": 2.000169492756523e-05,
"loss": 1.7425,
"step": 629
},
{
"epoch": 1.4416475972540046,
"grad_norm": 1.6221272945404053,
"learning_rate": 1.985050330897883e-05,
"loss": 1.9271,
"step": 630
},
{
"epoch": 1.4439359267734555,
"grad_norm": 1.4895519018173218,
"learning_rate": 1.9699743630078883e-05,
"loss": 1.7055,
"step": 631
},
{
"epoch": 1.4462242562929062,
"grad_norm": 1.3880146741867065,
"learning_rate": 1.954941805073848e-05,
"loss": 1.7598,
"step": 632
},
{
"epoch": 1.4485125858123569,
"grad_norm": 1.4680176973342896,
"learning_rate": 1.9399528724611644e-05,
"loss": 1.7733,
"step": 633
},
{
"epoch": 1.4508009153318078,
"grad_norm": 1.5890253782272339,
"learning_rate": 1.9250077799102322e-05,
"loss": 1.8409,
"step": 634
},
{
"epoch": 1.4530892448512587,
"grad_norm": 1.583875060081482,
"learning_rate": 1.9101067415333684e-05,
"loss": 1.8627,
"step": 635
},
{
"epoch": 1.4553775743707094,
"grad_norm": 2.7736949920654297,
"learning_rate": 1.8952499708117432e-05,
"loss": 1.8298,
"step": 636
},
{
"epoch": 1.4576659038901603,
"grad_norm": 1.6217132806777954,
"learning_rate": 1.8804376805923223e-05,
"loss": 1.9238,
"step": 637
},
{
"epoch": 1.459954233409611,
"grad_norm": 1.5518221855163574,
"learning_rate": 1.8656700830848174e-05,
"loss": 1.7901,
"step": 638
},
{
"epoch": 1.4622425629290619,
"grad_norm": 1.5829041004180908,
"learning_rate": 1.850947389858643e-05,
"loss": 1.9257,
"step": 639
},
{
"epoch": 1.4645308924485125,
"grad_norm": 1.4454208612442017,
"learning_rate": 1.8362698118398967e-05,
"loss": 1.622,
"step": 640
},
{
"epoch": 1.4668192219679634,
"grad_norm": 1.4906951189041138,
"learning_rate": 1.821637559308315e-05,
"loss": 1.6691,
"step": 641
},
{
"epoch": 1.4691075514874141,
"grad_norm": 1.5857839584350586,
"learning_rate": 1.8070508418942876e-05,
"loss": 1.9164,
"step": 642
},
{
"epoch": 1.471395881006865,
"grad_norm": 1.5303192138671875,
"learning_rate": 1.7925098685758345e-05,
"loss": 1.5674,
"step": 643
},
{
"epoch": 1.4736842105263157,
"grad_norm": 1.7008835077285767,
"learning_rate": 1.7780148476756147e-05,
"loss": 1.7663,
"step": 644
},
{
"epoch": 1.4759725400457666,
"grad_norm": 2.85447359085083,
"learning_rate": 1.763565986857955e-05,
"loss": 1.8347,
"step": 645
},
{
"epoch": 1.4782608695652173,
"grad_norm": 1.470250129699707,
"learning_rate": 1.7491634931258587e-05,
"loss": 1.8086,
"step": 646
},
{
"epoch": 1.4805491990846682,
"grad_norm": 1.4214550256729126,
"learning_rate": 1.7348075728180478e-05,
"loss": 1.5806,
"step": 647
},
{
"epoch": 1.482837528604119,
"grad_norm": 1.6169315576553345,
"learning_rate": 1.7204984316060063e-05,
"loss": 1.8988,
"step": 648
},
{
"epoch": 1.4851258581235698,
"grad_norm": 1.4714914560317993,
"learning_rate": 1.7062362744910322e-05,
"loss": 1.5895,
"step": 649
},
{
"epoch": 1.4874141876430205,
"grad_norm": 1.4366496801376343,
"learning_rate": 1.6920213058013022e-05,
"loss": 1.6835,
"step": 650
},
{
"epoch": 1.4897025171624714,
"grad_norm": 1.4395853281021118,
"learning_rate": 1.6778537291889407e-05,
"loss": 1.7919,
"step": 651
},
{
"epoch": 1.4919908466819223,
"grad_norm": 1.5093960762023926,
"learning_rate": 1.6637337476271124e-05,
"loss": 1.6612,
"step": 652
},
{
"epoch": 1.494279176201373,
"grad_norm": 1.5296788215637207,
"learning_rate": 1.6496615634070956e-05,
"loss": 1.8203,
"step": 653
},
{
"epoch": 1.4965675057208239,
"grad_norm": 1.3884508609771729,
"learning_rate": 1.6356373781354058e-05,
"loss": 1.5958,
"step": 654
},
{
"epoch": 1.4988558352402745,
"grad_norm": 1.5440672636032104,
"learning_rate": 1.6216613927308905e-05,
"loss": 1.7568,
"step": 655
},
{
"epoch": 1.5011441647597255,
"grad_norm": 1.3419849872589111,
"learning_rate": 1.6077338074218596e-05,
"loss": 1.5438,
"step": 656
},
{
"epoch": 1.5034324942791764,
"grad_norm": 1.6140450239181519,
"learning_rate": 1.5938548217432136e-05,
"loss": 1.8422,
"step": 657
},
{
"epoch": 1.505720823798627,
"grad_norm": 40.070556640625,
"learning_rate": 1.580024634533587e-05,
"loss": 1.9379,
"step": 658
},
{
"epoch": 1.5080091533180777,
"grad_norm": 1.5266087055206299,
"learning_rate": 1.566243443932496e-05,
"loss": 1.6635,
"step": 659
},
{
"epoch": 1.5102974828375286,
"grad_norm": 1.3794541358947754,
"learning_rate": 1.5525114473775014e-05,
"loss": 1.6518,
"step": 660
},
{
"epoch": 1.5125858123569795,
"grad_norm": 1.374881625175476,
"learning_rate": 1.5388288416013896e-05,
"loss": 1.6466,
"step": 661
},
{
"epoch": 1.5148741418764302,
"grad_norm": 1.4669468402862549,
"learning_rate": 1.5251958226293306e-05,
"loss": 1.8786,
"step": 662
},
{
"epoch": 1.517162471395881,
"grad_norm": 1.533560872077942,
"learning_rate": 1.5116125857760966e-05,
"loss": 1.785,
"step": 663
},
{
"epoch": 1.5194508009153318,
"grad_norm": 1.5260666608810425,
"learning_rate": 1.4980793256432474e-05,
"loss": 1.7887,
"step": 664
},
{
"epoch": 1.5217391304347827,
"grad_norm": 1.4833292961120605,
"learning_rate": 1.4845962361163413e-05,
"loss": 1.6844,
"step": 665
},
{
"epoch": 1.5240274599542334,
"grad_norm": 2.452692985534668,
"learning_rate": 1.4711635103621719e-05,
"loss": 1.8198,
"step": 666
},
{
"epoch": 1.526315789473684,
"grad_norm": 1.4687340259552002,
"learning_rate": 1.4577813408259838e-05,
"loss": 1.7256,
"step": 667
},
{
"epoch": 1.528604118993135,
"grad_norm": 1.4038817882537842,
"learning_rate": 1.4444499192287275e-05,
"loss": 1.5841,
"step": 668
},
{
"epoch": 1.5308924485125859,
"grad_norm": 1.4188700914382935,
"learning_rate": 1.4311694365643047e-05,
"loss": 1.708,
"step": 669
},
{
"epoch": 1.5331807780320366,
"grad_norm": 1.54819917678833,
"learning_rate": 1.4179400830968415e-05,
"loss": 1.8999,
"step": 670
},
{
"epoch": 1.5354691075514875,
"grad_norm": 1.4917936325073242,
"learning_rate": 1.4047620483579477e-05,
"loss": 1.7434,
"step": 671
},
{
"epoch": 1.5377574370709381,
"grad_norm": 1.4572731256484985,
"learning_rate": 1.3916355211440164e-05,
"loss": 1.8177,
"step": 672
},
{
"epoch": 1.540045766590389,
"grad_norm": 1.4481701850891113,
"learning_rate": 1.378560689513515e-05,
"loss": 1.6619,
"step": 673
},
{
"epoch": 1.54233409610984,
"grad_norm": 1.562588095664978,
"learning_rate": 1.3655377407842812e-05,
"loss": 1.7962,
"step": 674
},
{
"epoch": 1.5446224256292906,
"grad_norm": 1.4607584476470947,
"learning_rate": 1.3525668615308562e-05,
"loss": 1.7012,
"step": 675
},
{
"epoch": 1.5469107551487413,
"grad_norm": 1.6339269876480103,
"learning_rate": 1.3396482375817975e-05,
"loss": 1.6341,
"step": 676
},
{
"epoch": 1.5491990846681922,
"grad_norm": 1.4561293125152588,
"learning_rate": 1.3267820540170229e-05,
"loss": 1.7071,
"step": 677
},
{
"epoch": 1.5514874141876431,
"grad_norm": 2.308603286743164,
"learning_rate": 1.3139684951651588e-05,
"loss": 1.9484,
"step": 678
},
{
"epoch": 1.5537757437070938,
"grad_norm": 1.3794751167297363,
"learning_rate": 1.3012077446008968e-05,
"loss": 1.5538,
"step": 679
},
{
"epoch": 1.5560640732265445,
"grad_norm": 1.586301326751709,
"learning_rate": 1.2884999851423673e-05,
"loss": 1.7967,
"step": 680
},
{
"epoch": 1.5583524027459954,
"grad_norm": 1.3982346057891846,
"learning_rate": 1.2758453988485164e-05,
"loss": 1.6258,
"step": 681
},
{
"epoch": 1.5606407322654463,
"grad_norm": 1.8608312606811523,
"learning_rate": 1.2632441670165056e-05,
"loss": 1.6441,
"step": 682
},
{
"epoch": 1.562929061784897,
"grad_norm": 1.565237045288086,
"learning_rate": 1.2506964701790985e-05,
"loss": 1.5952,
"step": 683
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.4737681150436401,
"learning_rate": 1.2382024881020937e-05,
"loss": 1.761,
"step": 684
},
{
"epoch": 1.5675057208237986,
"grad_norm": 1.516098141670227,
"learning_rate": 1.2257623997817347e-05,
"loss": 1.8565,
"step": 685
},
{
"epoch": 1.5697940503432495,
"grad_norm": 1.5380991697311401,
"learning_rate": 1.213376383442153e-05,
"loss": 1.7487,
"step": 686
},
{
"epoch": 1.5720823798627004,
"grad_norm": 1.5555819272994995,
"learning_rate": 1.2010446165328126e-05,
"loss": 1.842,
"step": 687
},
{
"epoch": 1.574370709382151,
"grad_norm": 1.60054612159729,
"learning_rate": 1.188767275725966e-05,
"loss": 1.7022,
"step": 688
},
{
"epoch": 1.5766590389016018,
"grad_norm": 1.4314377307891846,
"learning_rate": 1.1765445369141276e-05,
"loss": 1.6684,
"step": 689
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.430720329284668,
"learning_rate": 1.164376575207547e-05,
"loss": 1.7769,
"step": 690
},
{
"epoch": 1.5812356979405036,
"grad_norm": 1.3903356790542603,
"learning_rate": 1.152263564931712e-05,
"loss": 1.5938,
"step": 691
},
{
"epoch": 1.5835240274599542,
"grad_norm": 1.6598271131515503,
"learning_rate": 1.140205679624834e-05,
"loss": 1.9105,
"step": 692
},
{
"epoch": 1.585812356979405,
"grad_norm": 1.535503625869751,
"learning_rate": 1.1282030920353747e-05,
"loss": 1.8986,
"step": 693
},
{
"epoch": 1.5881006864988558,
"grad_norm": 1.6116646528244019,
"learning_rate": 1.1162559741195733e-05,
"loss": 1.8399,
"step": 694
},
{
"epoch": 1.5903890160183067,
"grad_norm": 1.3860000371932983,
"learning_rate": 1.1043644970389671e-05,
"loss": 1.723,
"step": 695
},
{
"epoch": 1.5926773455377574,
"grad_norm": 1.3762991428375244,
"learning_rate": 1.092528831157959e-05,
"loss": 1.6605,
"step": 696
},
{
"epoch": 1.594965675057208,
"grad_norm": 1.5220414400100708,
"learning_rate": 1.0807491460413622e-05,
"loss": 1.5274,
"step": 697
},
{
"epoch": 1.597254004576659,
"grad_norm": 1.4220688343048096,
"learning_rate": 1.0690256104519764e-05,
"loss": 1.7568,
"step": 698
},
{
"epoch": 1.59954233409611,
"grad_norm": 1.6418180465698242,
"learning_rate": 1.0573583923481711e-05,
"loss": 1.7716,
"step": 699
},
{
"epoch": 1.6018306636155606,
"grad_norm": 1.4398093223571777,
"learning_rate": 1.0457476588814774e-05,
"loss": 1.8126,
"step": 700
},
{
"epoch": 1.6041189931350115,
"grad_norm": 1.4509756565093994,
"learning_rate": 1.0341935763941935e-05,
"loss": 1.7863,
"step": 701
},
{
"epoch": 1.6064073226544622,
"grad_norm": 1.427012324333191,
"learning_rate": 1.0226963104170002e-05,
"loss": 1.6395,
"step": 702
},
{
"epoch": 1.608695652173913,
"grad_norm": 1.4785486459732056,
"learning_rate": 1.011256025666597e-05,
"loss": 1.7012,
"step": 703
},
{
"epoch": 1.610983981693364,
"grad_norm": 1.5970810651779175,
"learning_rate": 9.998728860433276e-06,
"loss": 1.7864,
"step": 704
},
{
"epoch": 1.6132723112128147,
"grad_norm": 1.4483610391616821,
"learning_rate": 9.885470546288478e-06,
"loss": 1.7952,
"step": 705
},
{
"epoch": 1.6155606407322654,
"grad_norm": 1.4754085540771484,
"learning_rate": 9.772786936837785e-06,
"loss": 1.7347,
"step": 706
},
{
"epoch": 1.6178489702517163,
"grad_norm": 1.3727151155471802,
"learning_rate": 9.660679646453851e-06,
"loss": 1.7124,
"step": 707
},
{
"epoch": 1.6201372997711672,
"grad_norm": 1.4896659851074219,
"learning_rate": 9.549150281252633e-06,
"loss": 1.6849,
"step": 708
},
{
"epoch": 1.6224256292906178,
"grad_norm": 1.4439793825149536,
"learning_rate": 9.438200439070388e-06,
"loss": 1.7911,
"step": 709
},
{
"epoch": 1.6247139588100685,
"grad_norm": 1.432722568511963,
"learning_rate": 9.327831709440792e-06,
"loss": 1.7975,
"step": 710
},
{
"epoch": 1.6270022883295194,
"grad_norm": 1.6031569242477417,
"learning_rate": 9.218045673572123e-06,
"loss": 1.9478,
"step": 711
},
{
"epoch": 1.6292906178489703,
"grad_norm": 1.4716615676879883,
"learning_rate": 9.108843904324715e-06,
"loss": 1.6608,
"step": 712
},
{
"epoch": 1.631578947368421,
"grad_norm": 1.4718276262283325,
"learning_rate": 9.000227966188234e-06,
"loss": 1.8744,
"step": 713
},
{
"epoch": 1.6338672768878717,
"grad_norm": 1.4385418891906738,
"learning_rate": 8.8921994152595e-06,
"loss": 1.7512,
"step": 714
},
{
"epoch": 1.6361556064073226,
"grad_norm": 1.469067096710205,
"learning_rate": 8.78475979922e-06,
"loss": 1.6451,
"step": 715
},
{
"epoch": 1.6384439359267735,
"grad_norm": 1.410254716873169,
"learning_rate": 8.677910657313782e-06,
"loss": 1.7576,
"step": 716
},
{
"epoch": 1.6407322654462244,
"grad_norm": 1.4825024604797363,
"learning_rate": 8.571653520325463e-06,
"loss": 1.6878,
"step": 717
},
{
"epoch": 1.643020594965675,
"grad_norm": 1.3992328643798828,
"learning_rate": 8.465989910558209e-06,
"loss": 1.7016,
"step": 718
},
{
"epoch": 1.6453089244851258,
"grad_norm": 1.3796439170837402,
"learning_rate": 8.360921341811956e-06,
"loss": 1.7485,
"step": 719
},
{
"epoch": 1.6475972540045767,
"grad_norm": 1.6624534130096436,
"learning_rate": 8.256449319361748e-06,
"loss": 1.9316,
"step": 720
},
{
"epoch": 1.6498855835240276,
"grad_norm": 1.3875632286071777,
"learning_rate": 8.15257533993613e-06,
"loss": 1.6544,
"step": 721
},
{
"epoch": 1.6521739130434783,
"grad_norm": 1.4452712535858154,
"learning_rate": 8.049300891695744e-06,
"loss": 1.5939,
"step": 722
},
{
"epoch": 1.654462242562929,
"grad_norm": 1.422812581062317,
"learning_rate": 7.946627454211968e-06,
"loss": 1.6288,
"step": 723
},
{
"epoch": 1.6567505720823799,
"grad_norm": 2.14966082572937,
"learning_rate": 7.844556498445788e-06,
"loss": 1.6796,
"step": 724
},
{
"epoch": 1.6590389016018308,
"grad_norm": 1.511987328529358,
"learning_rate": 7.7430894867266e-06,
"loss": 1.6898,
"step": 725
},
{
"epoch": 1.6613272311212814,
"grad_norm": 1.4629607200622559,
"learning_rate": 7.642227872731417e-06,
"loss": 1.6181,
"step": 726
},
{
"epoch": 1.6636155606407321,
"grad_norm": 1.5010524988174438,
"learning_rate": 7.541973101463912e-06,
"loss": 1.7704,
"step": 727
},
{
"epoch": 1.665903890160183,
"grad_norm": 1.4450128078460693,
"learning_rate": 7.4423266092337855e-06,
"loss": 1.7728,
"step": 728
},
{
"epoch": 1.668192219679634,
"grad_norm": 1.4205902814865112,
"learning_rate": 7.343289823636168e-06,
"loss": 1.6339,
"step": 729
},
{
"epoch": 1.6704805491990846,
"grad_norm": 1.498461365699768,
"learning_rate": 7.244864163531162e-06,
"loss": 1.6626,
"step": 730
},
{
"epoch": 1.6727688787185355,
"grad_norm": 1.4677972793579102,
"learning_rate": 7.147051039023528e-06,
"loss": 1.6955,
"step": 731
},
{
"epoch": 1.6750572082379862,
"grad_norm": 1.485467791557312,
"learning_rate": 7.049851851442468e-06,
"loss": 1.6536,
"step": 732
},
{
"epoch": 1.677345537757437,
"grad_norm": 1.4434114694595337,
"learning_rate": 6.953267993321588e-06,
"loss": 1.8493,
"step": 733
},
{
"epoch": 1.679633867276888,
"grad_norm": 1.5006053447723389,
"learning_rate": 6.857300848378856e-06,
"loss": 1.7167,
"step": 734
},
{
"epoch": 1.6819221967963387,
"grad_norm": 1.4473903179168701,
"learning_rate": 6.761951791496901e-06,
"loss": 1.7096,
"step": 735
},
{
"epoch": 1.6842105263157894,
"grad_norm": 1.5299594402313232,
"learning_rate": 6.667222188703226e-06,
"loss": 1.6376,
"step": 736
},
{
"epoch": 1.6864988558352403,
"grad_norm": 1.4059666395187378,
"learning_rate": 6.573113397150654e-06,
"loss": 1.868,
"step": 737
},
{
"epoch": 1.6887871853546912,
"grad_norm": 1.4118850231170654,
"learning_rate": 6.479626765097918e-06,
"loss": 1.7095,
"step": 738
},
{
"epoch": 1.6910755148741419,
"grad_norm": 1.4745031595230103,
"learning_rate": 6.386763631890313e-06,
"loss": 1.7882,
"step": 739
},
{
"epoch": 1.6933638443935926,
"grad_norm": 1.4812159538269043,
"learning_rate": 6.294525327940515e-06,
"loss": 1.745,
"step": 740
},
{
"epoch": 1.6956521739130435,
"grad_norm": 1.4774779081344604,
"learning_rate": 6.202913174709507e-06,
"loss": 1.5398,
"step": 741
},
{
"epoch": 1.6979405034324944,
"grad_norm": 1.4957542419433594,
"learning_rate": 6.111928484687723e-06,
"loss": 1.7825,
"step": 742
},
{
"epoch": 1.700228832951945,
"grad_norm": 1.4538252353668213,
"learning_rate": 6.02157256137611e-06,
"loss": 1.6752,
"step": 743
},
{
"epoch": 1.7025171624713957,
"grad_norm": 1.4226707220077515,
"learning_rate": 5.931846699267557e-06,
"loss": 1.7249,
"step": 744
},
{
"epoch": 1.7048054919908466,
"grad_norm": 1.4320975542068481,
"learning_rate": 5.842752183828354e-06,
"loss": 1.7245,
"step": 745
},
{
"epoch": 1.7070938215102975,
"grad_norm": 1.3623591661453247,
"learning_rate": 5.7542902914796745e-06,
"loss": 1.7421,
"step": 746
},
{
"epoch": 1.7093821510297484,
"grad_norm": 4.592003345489502,
"learning_rate": 5.666462289579422e-06,
"loss": 1.9731,
"step": 747
},
{
"epoch": 1.7116704805491991,
"grad_norm": 1.4294320344924927,
"learning_rate": 5.579269436403967e-06,
"loss": 1.7351,
"step": 748
},
{
"epoch": 1.7139588100686498,
"grad_norm": 1.4926899671554565,
"learning_rate": 5.4927129811301715e-06,
"loss": 1.7125,
"step": 749
},
{
"epoch": 1.7162471395881007,
"grad_norm": 1.4961239099502563,
"learning_rate": 5.4067941638174806e-06,
"loss": 1.9168,
"step": 750
},
{
"epoch": 1.7185354691075516,
"grad_norm": 1.4223458766937256,
"learning_rate": 5.3215142153901605e-06,
"loss": 1.7701,
"step": 751
},
{
"epoch": 1.7208237986270023,
"grad_norm": 1.3997184038162231,
"learning_rate": 5.2368743576196536e-06,
"loss": 1.7696,
"step": 752
},
{
"epoch": 1.723112128146453,
"grad_norm": 1.7406820058822632,
"learning_rate": 5.152875803107083e-06,
"loss": 1.6607,
"step": 753
},
{
"epoch": 1.7254004576659039,
"grad_norm": 1.4031175374984741,
"learning_rate": 5.0695197552659e-06,
"loss": 1.652,
"step": 754
},
{
"epoch": 1.7276887871853548,
"grad_norm": 1.4126728773117065,
"learning_rate": 4.986807408304567e-06,
"loss": 1.8364,
"step": 755
},
{
"epoch": 1.7299771167048055,
"grad_norm": 1.3579936027526855,
"learning_rate": 4.9047399472095746e-06,
"loss": 1.5031,
"step": 756
},
{
"epoch": 1.7322654462242562,
"grad_norm": 1.5867729187011719,
"learning_rate": 4.82331854772834e-06,
"loss": 1.7306,
"step": 757
},
{
"epoch": 1.734553775743707,
"grad_norm": 1.3981932401657104,
"learning_rate": 4.742544376352443e-06,
"loss": 1.5772,
"step": 758
},
{
"epoch": 1.736842105263158,
"grad_norm": 1.5310570001602173,
"learning_rate": 4.662418590300871e-06,
"loss": 1.7998,
"step": 759
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.4550405740737915,
"learning_rate": 4.582942337503465e-06,
"loss": 1.6577,
"step": 760
},
{
"epoch": 1.7414187643020596,
"grad_norm": 1.438949465751648,
"learning_rate": 4.504116756584465e-06,
"loss": 1.753,
"step": 761
},
{
"epoch": 1.7437070938215102,
"grad_norm": 1.4468332529067993,
"learning_rate": 4.425942976846187e-06,
"loss": 1.8617,
"step": 762
},
{
"epoch": 1.7459954233409611,
"grad_norm": 1.3982702493667603,
"learning_rate": 4.348422118252892e-06,
"loss": 1.6861,
"step": 763
},
{
"epoch": 1.748283752860412,
"grad_norm": 1.480664610862732,
"learning_rate": 4.271555291414636e-06,
"loss": 1.6337,
"step": 764
},
{
"epoch": 1.7505720823798627,
"grad_norm": 1.4461660385131836,
"learning_rate": 4.195343597571488e-06,
"loss": 1.8426,
"step": 765
},
{
"epoch": 1.7528604118993134,
"grad_norm": 1.3636289834976196,
"learning_rate": 4.119788128577667e-06,
"loss": 1.6556,
"step": 766
},
{
"epoch": 1.7551487414187643,
"grad_norm": 1.5137560367584229,
"learning_rate": 4.044889966885895e-06,
"loss": 1.6594,
"step": 767
},
{
"epoch": 1.7574370709382152,
"grad_norm": 1.3906878232955933,
"learning_rate": 3.9706501855319765e-06,
"loss": 1.5104,
"step": 768
},
{
"epoch": 1.759725400457666,
"grad_norm": 1.517803430557251,
"learning_rate": 3.897069848119323e-06,
"loss": 1.5927,
"step": 769
},
{
"epoch": 1.7620137299771166,
"grad_norm": 1.4511746168136597,
"learning_rate": 3.824150008803767e-06,
"loss": 1.7718,
"step": 770
},
{
"epoch": 1.7643020594965675,
"grad_norm": 1.3890513181686401,
"learning_rate": 3.7518917122784604e-06,
"loss": 1.8953,
"step": 771
},
{
"epoch": 1.7665903890160184,
"grad_norm": 2.190563201904297,
"learning_rate": 3.680295993758881e-06,
"loss": 1.8052,
"step": 772
},
{
"epoch": 1.768878718535469,
"grad_norm": 1.86270010471344,
"learning_rate": 3.609363878968036e-06,
"loss": 1.5349,
"step": 773
},
{
"epoch": 1.7711670480549198,
"grad_norm": 1.4694937467575073,
"learning_rate": 3.539096384121743e-06,
"loss": 1.7833,
"step": 774
},
{
"epoch": 1.7734553775743707,
"grad_norm": 1.5534601211547852,
"learning_rate": 3.469494515914079e-06,
"loss": 1.5243,
"step": 775
},
{
"epoch": 1.7757437070938216,
"grad_norm": 1.8910963535308838,
"learning_rate": 3.40055927150294e-06,
"loss": 1.7936,
"step": 776
},
{
"epoch": 1.7780320366132725,
"grad_norm": 1.5681207180023193,
"learning_rate": 3.332291638495816e-06,
"loss": 1.7809,
"step": 777
},
{
"epoch": 1.7803203661327232,
"grad_norm": 1.430917739868164,
"learning_rate": 3.2646925949355312e-06,
"loss": 1.6679,
"step": 778
},
{
"epoch": 1.7826086956521738,
"grad_norm": 1.7073776721954346,
"learning_rate": 3.1977631092863615e-06,
"loss": 1.6635,
"step": 779
},
{
"epoch": 1.7848970251716247,
"grad_norm": 1.5938652753829956,
"learning_rate": 3.1315041404200663e-06,
"loss": 1.8343,
"step": 780
},
{
"epoch": 1.7871853546910756,
"grad_norm": 1.510628581047058,
"learning_rate": 3.065916637602173e-06,
"loss": 1.7529,
"step": 781
},
{
"epoch": 1.7894736842105263,
"grad_norm": 1.308749794960022,
"learning_rate": 3.00100154047841e-06,
"loss": 1.6496,
"step": 782
},
{
"epoch": 1.791762013729977,
"grad_norm": 1.3793610334396362,
"learning_rate": 2.936759779061199e-06,
"loss": 1.6751,
"step": 783
},
{
"epoch": 1.794050343249428,
"grad_norm": 1.4116971492767334,
"learning_rate": 2.8731922737163685e-06,
"loss": 1.8891,
"step": 784
},
{
"epoch": 1.7963386727688788,
"grad_norm": 1.6015205383300781,
"learning_rate": 2.810299935149935e-06,
"loss": 1.8412,
"step": 785
},
{
"epoch": 1.7986270022883295,
"grad_norm": 1.4557969570159912,
"learning_rate": 2.7480836643950956e-06,
"loss": 1.6544,
"step": 786
},
{
"epoch": 1.8009153318077802,
"grad_norm": 1.485110878944397,
"learning_rate": 2.6865443527992696e-06,
"loss": 1.5934,
"step": 787
},
{
"epoch": 1.803203661327231,
"grad_norm": 1.4106863737106323,
"learning_rate": 2.6256828820113766e-06,
"loss": 1.6499,
"step": 788
},
{
"epoch": 1.805491990846682,
"grad_norm": 1.4385408163070679,
"learning_rate": 2.5655001239691835e-06,
"loss": 1.8236,
"step": 789
},
{
"epoch": 1.8077803203661327,
"grad_norm": 1.4379700422286987,
"learning_rate": 2.5059969408867843e-06,
"loss": 1.5647,
"step": 790
},
{
"epoch": 1.8100686498855834,
"grad_norm": 1.428528904914856,
"learning_rate": 2.4471741852423237e-06,
"loss": 1.7437,
"step": 791
},
{
"epoch": 1.8123569794050343,
"grad_norm": 1.483229160308838,
"learning_rate": 2.3890326997656975e-06,
"loss": 1.7859,
"step": 792
},
{
"epoch": 1.8146453089244852,
"grad_norm": 1.3773958683013916,
"learning_rate": 2.331573317426533e-06,
"loss": 1.7202,
"step": 793
},
{
"epoch": 1.816933638443936,
"grad_norm": 1.645362138748169,
"learning_rate": 2.274796861422246e-06,
"loss": 1.7313,
"step": 794
},
{
"epoch": 1.8192219679633868,
"grad_norm": 1.5220423936843872,
"learning_rate": 2.2187041451662282e-06,
"loss": 1.8495,
"step": 795
},
{
"epoch": 1.8215102974828374,
"grad_norm": 2.0332138538360596,
"learning_rate": 2.163295972276219e-06,
"loss": 1.8125,
"step": 796
},
{
"epoch": 1.8237986270022883,
"grad_norm": 1.386008381843567,
"learning_rate": 2.1085731365627746e-06,
"loss": 1.7602,
"step": 797
},
{
"epoch": 1.8260869565217392,
"grad_norm": 1.4969429969787598,
"learning_rate": 2.054536422017922e-06,
"loss": 1.7125,
"step": 798
},
{
"epoch": 1.82837528604119,
"grad_norm": 1.5347836017608643,
"learning_rate": 2.0011866028038617e-06,
"loss": 1.6985,
"step": 799
},
{
"epoch": 1.8306636155606406,
"grad_norm": 1.453011155128479,
"learning_rate": 1.9485244432419667e-06,
"loss": 1.5477,
"step": 800
},
{
"epoch": 1.8329519450800915,
"grad_norm": 1.4869728088378906,
"learning_rate": 1.896550697801769e-06,
"loss": 1.7949,
"step": 801
},
{
"epoch": 1.8352402745995424,
"grad_norm": 1.3592677116394043,
"learning_rate": 1.8452661110901715e-06,
"loss": 1.6919,
"step": 802
},
{
"epoch": 1.837528604118993,
"grad_norm": 1.4468754529953003,
"learning_rate": 1.7946714178407652e-06,
"loss": 1.8576,
"step": 803
},
{
"epoch": 1.8398169336384438,
"grad_norm": 1.399828314781189,
"learning_rate": 1.7447673429033362e-06,
"loss": 1.7356,
"step": 804
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.4425907135009766,
"learning_rate": 1.695554601233451e-06,
"loss": 1.7861,
"step": 805
},
{
"epoch": 1.8443935926773456,
"grad_norm": 1.4491031169891357,
"learning_rate": 1.6470338978822108e-06,
"loss": 1.733,
"step": 806
},
{
"epoch": 1.8466819221967965,
"grad_norm": 1.4042807817459106,
"learning_rate": 1.5992059279861914e-06,
"loss": 1.548,
"step": 807
},
{
"epoch": 1.8489702517162472,
"grad_norm": 2.0380630493164062,
"learning_rate": 1.5520713767574246e-06,
"loss": 1.8301,
"step": 808
},
{
"epoch": 1.8512585812356979,
"grad_norm": 1.3438981771469116,
"learning_rate": 1.5056309194736384e-06,
"loss": 1.684,
"step": 809
},
{
"epoch": 1.8535469107551488,
"grad_norm": 1.5010857582092285,
"learning_rate": 1.4598852214685488e-06,
"loss": 1.8225,
"step": 810
},
{
"epoch": 1.8558352402745997,
"grad_norm": 1.4094516038894653,
"learning_rate": 1.414834938122306e-06,
"loss": 1.727,
"step": 811
},
{
"epoch": 1.8581235697940504,
"grad_norm": 1.5290316343307495,
"learning_rate": 1.3704807148521903e-06,
"loss": 1.854,
"step": 812
},
{
"epoch": 1.860411899313501,
"grad_norm": 1.5701794624328613,
"learning_rate": 1.3268231871032655e-06,
"loss": 1.6217,
"step": 813
},
{
"epoch": 1.862700228832952,
"grad_norm": 1.4407206773757935,
"learning_rate": 1.2838629803393342e-06,
"loss": 1.855,
"step": 814
},
{
"epoch": 1.8649885583524028,
"grad_norm": 1.4647456407546997,
"learning_rate": 1.2416007100339577e-06,
"loss": 1.6763,
"step": 815
},
{
"epoch": 1.8672768878718535,
"grad_norm": 1.4444900751113892,
"learning_rate": 1.2000369816616674e-06,
"loss": 1.677,
"step": 816
},
{
"epoch": 1.8695652173913042,
"grad_norm": 1.545384407043457,
"learning_rate": 1.1591723906892337e-06,
"loss": 1.8448,
"step": 817
},
{
"epoch": 1.8718535469107551,
"grad_norm": 1.5062963962554932,
"learning_rate": 1.119007522567167e-06,
"loss": 1.757,
"step": 818
},
{
"epoch": 1.874141876430206,
"grad_norm": 1.4100571870803833,
"learning_rate": 1.0795429527213686e-06,
"loss": 1.5666,
"step": 819
},
{
"epoch": 1.8764302059496567,
"grad_norm": 1.8375623226165771,
"learning_rate": 1.0407792465447986e-06,
"loss": 1.7731,
"step": 820
},
{
"epoch": 1.8787185354691074,
"grad_norm": 1.4669498205184937,
"learning_rate": 1.002716959389466e-06,
"loss": 1.7061,
"step": 821
},
{
"epoch": 1.8810068649885583,
"grad_norm": 1.501181721687317,
"learning_rate": 9.653566365584176e-07,
"loss": 1.6322,
"step": 822
},
{
"epoch": 1.8832951945080092,
"grad_norm": 1.3494288921356201,
"learning_rate": 9.28698813297929e-07,
"loss": 1.5071,
"step": 823
},
{
"epoch": 1.88558352402746,
"grad_norm": 1.3176014423370361,
"learning_rate": 8.927440147898702e-07,
"loss": 1.5119,
"step": 824
},
{
"epoch": 1.8878718535469108,
"grad_norm": 1.6312108039855957,
"learning_rate": 8.574927561441349e-07,
"loss": 1.7968,
"step": 825
},
{
"epoch": 1.8901601830663615,
"grad_norm": 1.461529016494751,
"learning_rate": 8.229455423913013e-07,
"loss": 1.6752,
"step": 826
},
{
"epoch": 1.8924485125858124,
"grad_norm": 1.4274225234985352,
"learning_rate": 7.891028684753777e-07,
"loss": 1.5143,
"step": 827
},
{
"epoch": 1.8947368421052633,
"grad_norm": 1.6452929973602295,
"learning_rate": 7.559652192467126e-07,
"loss": 1.8208,
"step": 828
},
{
"epoch": 1.897025171624714,
"grad_norm": 1.5423994064331055,
"learning_rate": 7.235330694550402e-07,
"loss": 1.8131,
"step": 829
},
{
"epoch": 1.8993135011441646,
"grad_norm": 1.342574119567871,
"learning_rate": 6.918068837427128e-07,
"loss": 1.6335,
"step": 830
},
{
"epoch": 1.9016018306636155,
"grad_norm": 1.461423635482788,
"learning_rate": 6.607871166379897e-07,
"loss": 1.7961,
"step": 831
},
{
"epoch": 1.9038901601830664,
"grad_norm": 1.3837182521820068,
"learning_rate": 6.304742125485874e-07,
"loss": 1.6387,
"step": 832
},
{
"epoch": 1.9061784897025171,
"grad_norm": 1.3506449460983276,
"learning_rate": 6.008686057552448e-07,
"loss": 1.8114,
"step": 833
},
{
"epoch": 1.9084668192219678,
"grad_norm": 1.571703314781189,
"learning_rate": 5.719707204055735e-07,
"loss": 1.853,
"step": 834
},
{
"epoch": 1.9107551487414187,
"grad_norm": 1.483771562576294,
"learning_rate": 5.437809705079233e-07,
"loss": 1.8256,
"step": 835
},
{
"epoch": 1.9130434782608696,
"grad_norm": 1.5069774389266968,
"learning_rate": 5.162997599254704e-07,
"loss": 1.9386,
"step": 836
},
{
"epoch": 1.9153318077803205,
"grad_norm": 1.3011687994003296,
"learning_rate": 4.895274823704555e-07,
"loss": 1.6496,
"step": 837
},
{
"epoch": 1.9176201372997712,
"grad_norm": 1.726117730140686,
"learning_rate": 4.634645213984934e-07,
"loss": 1.7189,
"step": 838
},
{
"epoch": 1.919908466819222,
"grad_norm": 1.4472270011901855,
"learning_rate": 4.381112504031337e-07,
"loss": 1.773,
"step": 839
},
{
"epoch": 1.9221967963386728,
"grad_norm": 1.3641666173934937,
"learning_rate": 4.134680326104645e-07,
"loss": 1.8951,
"step": 840
},
{
"epoch": 1.9244851258581237,
"grad_norm": 3.5742650032043457,
"learning_rate": 3.895352210739278e-07,
"loss": 1.6608,
"step": 841
},
{
"epoch": 1.9267734553775744,
"grad_norm": 1.3817075490951538,
"learning_rate": 3.663131586692792e-07,
"loss": 1.6727,
"step": 842
},
{
"epoch": 1.929061784897025,
"grad_norm": 1.5136713981628418,
"learning_rate": 3.4380217808964166e-07,
"loss": 1.596,
"step": 843
},
{
"epoch": 1.931350114416476,
"grad_norm": 1.4291810989379883,
"learning_rate": 3.2200260184075406e-07,
"loss": 1.6071,
"step": 844
},
{
"epoch": 1.9336384439359269,
"grad_norm": 1.459362506866455,
"learning_rate": 3.0091474223636895e-07,
"loss": 1.6072,
"step": 845
},
{
"epoch": 1.9359267734553776,
"grad_norm": 1.4230263233184814,
"learning_rate": 2.805389013937454e-07,
"loss": 1.7475,
"step": 846
},
{
"epoch": 1.9382151029748282,
"grad_norm": 1.586031436920166,
"learning_rate": 2.6087537122934103e-07,
"loss": 1.9043,
"step": 847
},
{
"epoch": 1.9405034324942791,
"grad_norm": 1.896700382232666,
"learning_rate": 2.419244334546267e-07,
"loss": 1.83,
"step": 848
},
{
"epoch": 1.94279176201373,
"grad_norm": 1.346601963043213,
"learning_rate": 2.2368635957205618e-07,
"loss": 1.7708,
"step": 849
},
{
"epoch": 1.9450800915331807,
"grad_norm": 1.3309531211853027,
"learning_rate": 2.061614108711474e-07,
"loss": 1.7702,
"step": 850
},
{
"epoch": 1.9473684210526314,
"grad_norm": 1.325856328010559,
"learning_rate": 1.8934983842479047e-07,
"loss": 1.6979,
"step": 851
},
{
"epoch": 1.9496567505720823,
"grad_norm": 1.456886887550354,
"learning_rate": 1.732518830856067e-07,
"loss": 1.7145,
"step": 852
},
{
"epoch": 1.9519450800915332,
"grad_norm": 1.3818657398223877,
"learning_rate": 1.5786777548250641e-07,
"loss": 1.8091,
"step": 853
},
{
"epoch": 1.9542334096109841,
"grad_norm": 1.3843435049057007,
"learning_rate": 1.431977360173975e-07,
"loss": 1.7547,
"step": 854
},
{
"epoch": 1.9565217391304348,
"grad_norm": 1.768879771232605,
"learning_rate": 1.2924197486203215e-07,
"loss": 1.9333,
"step": 855
},
{
"epoch": 1.9588100686498855,
"grad_norm": 1.4703888893127441,
"learning_rate": 1.16000691954965e-07,
"loss": 1.6836,
"step": 856
},
{
"epoch": 1.9610983981693364,
"grad_norm": 3.4643313884735107,
"learning_rate": 1.0347407699872191e-07,
"loss": 1.72,
"step": 857
},
{
"epoch": 1.9633867276887873,
"grad_norm": 1.4141038656234741,
"learning_rate": 9.1662309457069e-08,
"loss": 1.7119,
"step": 858
},
{
"epoch": 1.965675057208238,
"grad_norm": 1.3972052335739136,
"learning_rate": 8.056555855243675e-08,
"loss": 1.7246,
"step": 859
},
{
"epoch": 1.9679633867276887,
"grad_norm": 1.461970329284668,
"learning_rate": 7.018398326350539e-08,
"loss": 1.7522,
"step": 860
},
{
"epoch": 1.9702517162471396,
"grad_norm": 1.4914907217025757,
"learning_rate": 6.051773232291225e-08,
"loss": 1.8131,
"step": 861
},
{
"epoch": 1.9725400457665905,
"grad_norm": 1.4054067134857178,
"learning_rate": 5.15669442151423e-08,
"loss": 1.8239,
"step": 862
},
{
"epoch": 1.9748283752860412,
"grad_norm": 1.414573073387146,
"learning_rate": 4.333174717453536e-08,
"loss": 1.4845,
"step": 863
},
{
"epoch": 1.9771167048054918,
"grad_norm": 1.2987114191055298,
"learning_rate": 3.581225918342646e-08,
"loss": 1.7815,
"step": 864
},
{
"epoch": 1.9794050343249427,
"grad_norm": 1.4163333177566528,
"learning_rate": 2.9008587970502653e-08,
"loss": 1.6575,
"step": 865
},
{
"epoch": 1.9816933638443937,
"grad_norm": 1.4055818319320679,
"learning_rate": 2.292083100920994e-08,
"loss": 1.7272,
"step": 866
},
{
"epoch": 1.9839816933638446,
"grad_norm": 1.4559320211410522,
"learning_rate": 1.7549075516393178e-08,
"loss": 1.6477,
"step": 867
},
{
"epoch": 1.9862700228832952,
"grad_norm": 1.3513697385787964,
"learning_rate": 1.2893398451024886e-08,
"loss": 1.6163,
"step": 868
},
{
"epoch": 1.988558352402746,
"grad_norm": 1.2909469604492188,
"learning_rate": 8.953866513111697e-09,
"loss": 1.6992,
"step": 869
},
{
"epoch": 1.9908466819221968,
"grad_norm": 1.442205786705017,
"learning_rate": 5.730536142745102e-09,
"loss": 1.8069,
"step": 870
},
{
"epoch": 1.9931350114416477,
"grad_norm": 1.4293149709701538,
"learning_rate": 3.2234535192798843e-09,
"loss": 1.7146,
"step": 871
},
{
"epoch": 1.9954233409610984,
"grad_norm": 1.4986752271652222,
"learning_rate": 1.432654560679092e-09,
"loss": 1.8256,
"step": 872
},
{
"epoch": 1.997711670480549,
"grad_norm": 1.3538659811019897,
"learning_rate": 3.5816492299223237e-10,
"loss": 1.8013,
"step": 873
},
{
"epoch": 2.0,
"grad_norm": 1.5758063793182373,
"learning_rate": 0.0,
"loss": 1.8155,
"step": 874
}
],
"logging_steps": 1,
"max_steps": 874,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 437,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4794397096450458e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}