{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.750816104461371,
"eval_steps": 230,
"global_step": 690,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001088139281828074,
"grad_norm": 0.04828796908259392,
"learning_rate": 2e-05,
"loss": 0.0762,
"step": 1
},
{
"epoch": 0.002176278563656148,
"grad_norm": 0.06207922473549843,
"learning_rate": 4e-05,
"loss": 0.0973,
"step": 2
},
{
"epoch": 0.003264417845484222,
"grad_norm": 0.06030188128352165,
"learning_rate": 6e-05,
"loss": 0.1003,
"step": 3
},
{
"epoch": 0.004352557127312296,
"grad_norm": 0.05758730694651604,
"learning_rate": 8e-05,
"loss": 0.085,
"step": 4
},
{
"epoch": 0.00544069640914037,
"grad_norm": 0.07111983001232147,
"learning_rate": 0.0001,
"loss": 0.1174,
"step": 5
},
{
"epoch": 0.006528835690968444,
"grad_norm": 0.09522929042577744,
"learning_rate": 0.00012,
"loss": 0.1404,
"step": 6
},
{
"epoch": 0.007616974972796518,
"grad_norm": 0.08600069582462311,
"learning_rate": 0.00014,
"loss": 0.1187,
"step": 7
},
{
"epoch": 0.008705114254624592,
"grad_norm": 0.0808354914188385,
"learning_rate": 0.00016,
"loss": 0.1235,
"step": 8
},
{
"epoch": 0.009793253536452665,
"grad_norm": 0.10051260888576508,
"learning_rate": 0.00018,
"loss": 0.135,
"step": 9
},
{
"epoch": 0.01088139281828074,
"grad_norm": 0.08877796679735184,
"learning_rate": 0.0002,
"loss": 0.0803,
"step": 10
},
{
"epoch": 0.011969532100108813,
"grad_norm": 0.10384293645620346,
"learning_rate": 0.00019999940277008808,
"loss": 0.1206,
"step": 11
},
{
"epoch": 0.013057671381936888,
"grad_norm": 0.1314290463924408,
"learning_rate": 0.00019999761108748597,
"loss": 0.1249,
"step": 12
},
{
"epoch": 0.014145810663764961,
"grad_norm": 0.12873488664627075,
"learning_rate": 0.00019999462497359466,
"loss": 0.0847,
"step": 13
},
{
"epoch": 0.015233949945593036,
"grad_norm": 0.12493155896663666,
"learning_rate": 0.000199990444464082,
"loss": 0.0858,
"step": 14
},
{
"epoch": 0.01632208922742111,
"grad_norm": 0.1131022647023201,
"learning_rate": 0.00019998506960888256,
"loss": 0.0737,
"step": 15
},
{
"epoch": 0.017410228509249184,
"grad_norm": 0.1214890405535698,
"learning_rate": 0.0001999785004721968,
"loss": 0.1151,
"step": 16
},
{
"epoch": 0.018498367791077257,
"grad_norm": 0.12649253010749817,
"learning_rate": 0.0001999707371324904,
"loss": 0.0913,
"step": 17
},
{
"epoch": 0.01958650707290533,
"grad_norm": 0.12470237910747528,
"learning_rate": 0.00019996177968249334,
"loss": 0.1043,
"step": 18
},
{
"epoch": 0.020674646354733407,
"grad_norm": 0.15574277937412262,
"learning_rate": 0.00019995162822919883,
"loss": 0.1092,
"step": 19
},
{
"epoch": 0.02176278563656148,
"grad_norm": 0.1281992644071579,
"learning_rate": 0.0001999402828938618,
"loss": 0.0892,
"step": 20
},
{
"epoch": 0.022850924918389554,
"grad_norm": 0.16906976699829102,
"learning_rate": 0.00019992774381199778,
"loss": 0.1633,
"step": 21
},
{
"epoch": 0.023939064200217627,
"grad_norm": 0.12387573719024658,
"learning_rate": 0.00019991401113338104,
"loss": 0.0872,
"step": 22
},
{
"epoch": 0.025027203482045703,
"grad_norm": 0.12343913316726685,
"learning_rate": 0.00019989908502204292,
"loss": 0.1084,
"step": 23
},
{
"epoch": 0.026115342763873776,
"grad_norm": 0.11399204283952713,
"learning_rate": 0.00019988296565626987,
"loss": 0.0824,
"step": 24
},
{
"epoch": 0.02720348204570185,
"grad_norm": 0.1402149796485901,
"learning_rate": 0.00019986565322860115,
"loss": 0.1289,
"step": 25
},
{
"epoch": 0.028291621327529923,
"grad_norm": 0.13711000978946686,
"learning_rate": 0.00019984714794582683,
"loss": 0.097,
"step": 26
},
{
"epoch": 0.029379760609358,
"grad_norm": 0.18544617295265198,
"learning_rate": 0.000199827450028985,
"loss": 0.1281,
"step": 27
},
{
"epoch": 0.030467899891186073,
"grad_norm": 0.15856046974658966,
"learning_rate": 0.00019980655971335945,
"loss": 0.1027,
"step": 28
},
{
"epoch": 0.031556039173014146,
"grad_norm": 0.12590578198432922,
"learning_rate": 0.00019978447724847652,
"loss": 0.0863,
"step": 29
},
{
"epoch": 0.03264417845484222,
"grad_norm": 0.14784540235996246,
"learning_rate": 0.00019976120289810247,
"loss": 0.1125,
"step": 30
},
{
"epoch": 0.03373231773667029,
"grad_norm": 0.19753308594226837,
"learning_rate": 0.00019973673694024,
"loss": 0.1389,
"step": 31
},
{
"epoch": 0.03482045701849837,
"grad_norm": 0.17247086763381958,
"learning_rate": 0.00019971107966712518,
"loss": 0.0901,
"step": 32
},
{
"epoch": 0.035908596300326445,
"grad_norm": 0.23573236167430878,
"learning_rate": 0.0001996842313852238,
"loss": 0.1141,
"step": 33
},
{
"epoch": 0.036996735582154515,
"grad_norm": 0.1349520981311798,
"learning_rate": 0.0001996561924152278,
"loss": 0.077,
"step": 34
},
{
"epoch": 0.03808487486398259,
"grad_norm": 0.2573637068271637,
"learning_rate": 0.00019962696309205148,
"loss": 0.0965,
"step": 35
},
{
"epoch": 0.03917301414581066,
"grad_norm": 0.2688570022583008,
"learning_rate": 0.0001995965437648273,
"loss": 0.1754,
"step": 36
},
{
"epoch": 0.04026115342763874,
"grad_norm": 0.2946501076221466,
"learning_rate": 0.0001995649347969019,
"loss": 0.1893,
"step": 37
},
{
"epoch": 0.041349292709466814,
"grad_norm": 0.2942318320274353,
"learning_rate": 0.00019953213656583168,
"loss": 0.1461,
"step": 38
},
{
"epoch": 0.042437431991294884,
"grad_norm": 0.3316934406757355,
"learning_rate": 0.00019949814946337838,
"loss": 0.1072,
"step": 39
},
{
"epoch": 0.04352557127312296,
"grad_norm": 0.3863315284252167,
"learning_rate": 0.00019946297389550433,
"loss": 0.1517,
"step": 40
},
{
"epoch": 0.04461371055495103,
"grad_norm": 0.3102675974369049,
"learning_rate": 0.00019942661028236745,
"loss": 0.1421,
"step": 41
},
{
"epoch": 0.04570184983677911,
"grad_norm": 0.3270488679409027,
"learning_rate": 0.00019938905905831654,
"loss": 0.085,
"step": 42
},
{
"epoch": 0.046789989118607184,
"grad_norm": 0.3799861669540405,
"learning_rate": 0.0001993503206718859,
"loss": 0.1333,
"step": 43
},
{
"epoch": 0.04787812840043525,
"grad_norm": 0.45475858449935913,
"learning_rate": 0.00019931039558578997,
"loss": 0.1431,
"step": 44
},
{
"epoch": 0.04896626768226333,
"grad_norm": 0.3995339870452881,
"learning_rate": 0.00019926928427691786,
"loss": 0.18,
"step": 45
},
{
"epoch": 0.05005440696409141,
"grad_norm": 0.4547290503978729,
"learning_rate": 0.00019922698723632767,
"loss": 0.1578,
"step": 46
},
{
"epoch": 0.051142546245919476,
"grad_norm": 0.44180116057395935,
"learning_rate": 0.0001991835049692405,
"loss": 0.2327,
"step": 47
},
{
"epoch": 0.05223068552774755,
"grad_norm": 0.8059853911399841,
"learning_rate": 0.0001991388379950346,
"loss": 0.3089,
"step": 48
},
{
"epoch": 0.05331882480957562,
"grad_norm": 0.6336686015129089,
"learning_rate": 0.00019909298684723904,
"loss": 0.2055,
"step": 49
},
{
"epoch": 0.0544069640914037,
"grad_norm": 0.8279074430465698,
"learning_rate": 0.00019904595207352737,
"loss": 0.2626,
"step": 50
},
{
"epoch": 0.055495103373231776,
"grad_norm": 0.07426032423973083,
"learning_rate": 0.000198997734235711,
"loss": 0.0632,
"step": 51
},
{
"epoch": 0.056583242655059846,
"grad_norm": 0.09094005823135376,
"learning_rate": 0.00019894833390973266,
"loss": 0.0734,
"step": 52
},
{
"epoch": 0.05767138193688792,
"grad_norm": 0.09561450034379959,
"learning_rate": 0.00019889775168565943,
"loss": 0.0972,
"step": 53
},
{
"epoch": 0.058759521218716,
"grad_norm": 0.09174304455518723,
"learning_rate": 0.00019884598816767563,
"loss": 0.082,
"step": 54
},
{
"epoch": 0.05984766050054407,
"grad_norm": 0.0911480113863945,
"learning_rate": 0.0001987930439740757,
"loss": 0.0712,
"step": 55
},
{
"epoch": 0.060935799782372145,
"grad_norm": 0.1090071052312851,
"learning_rate": 0.0001987389197372567,
"loss": 0.09,
"step": 56
},
{
"epoch": 0.062023939064200215,
"grad_norm": 0.09118974953889847,
"learning_rate": 0.00019868361610371097,
"loss": 0.0946,
"step": 57
},
{
"epoch": 0.06311207834602829,
"grad_norm": 0.09903446584939957,
"learning_rate": 0.0001986271337340182,
"loss": 0.1074,
"step": 58
},
{
"epoch": 0.06420021762785637,
"grad_norm": 0.08208192884922028,
"learning_rate": 0.00019856947330283752,
"loss": 0.0847,
"step": 59
},
{
"epoch": 0.06528835690968444,
"grad_norm": 0.08504832535982132,
"learning_rate": 0.0001985106354988997,
"loss": 0.0852,
"step": 60
},
{
"epoch": 0.06637649619151251,
"grad_norm": 0.07276565581560135,
"learning_rate": 0.0001984506210249986,
"loss": 0.061,
"step": 61
},
{
"epoch": 0.06746463547334058,
"grad_norm": 0.08346979320049286,
"learning_rate": 0.00019838943059798304,
"loss": 0.0717,
"step": 62
},
{
"epoch": 0.06855277475516866,
"grad_norm": 0.09144837409257889,
"learning_rate": 0.0001983270649487481,
"loss": 0.0817,
"step": 63
},
{
"epoch": 0.06964091403699674,
"grad_norm": 0.09562050551176071,
"learning_rate": 0.00019826352482222638,
"loss": 0.0809,
"step": 64
},
{
"epoch": 0.07072905331882481,
"grad_norm": 0.10410594195127487,
"learning_rate": 0.00019819881097737915,
"loss": 0.0801,
"step": 65
},
{
"epoch": 0.07181719260065289,
"grad_norm": 0.0836932361125946,
"learning_rate": 0.00019813292418718732,
"loss": 0.0775,
"step": 66
},
{
"epoch": 0.07290533188248095,
"grad_norm": 0.09397463500499725,
"learning_rate": 0.0001980658652386421,
"loss": 0.065,
"step": 67
},
{
"epoch": 0.07399347116430903,
"grad_norm": 0.108340784907341,
"learning_rate": 0.0001979976349327357,
"loss": 0.0981,
"step": 68
},
{
"epoch": 0.0750816104461371,
"grad_norm": 0.08717814087867737,
"learning_rate": 0.00019792823408445174,
"loss": 0.0798,
"step": 69
},
{
"epoch": 0.07616974972796518,
"grad_norm": 0.1279960721731186,
"learning_rate": 0.00019785766352275542,
"loss": 0.0993,
"step": 70
},
{
"epoch": 0.07725788900979326,
"grad_norm": 0.13422514498233795,
"learning_rate": 0.00019778592409058378,
"loss": 0.1023,
"step": 71
},
{
"epoch": 0.07834602829162132,
"grad_norm": 0.10113417357206345,
"learning_rate": 0.0001977130166448355,
"loss": 0.0742,
"step": 72
},
{
"epoch": 0.0794341675734494,
"grad_norm": 0.1026310920715332,
"learning_rate": 0.00019763894205636072,
"loss": 0.0988,
"step": 73
},
{
"epoch": 0.08052230685527748,
"grad_norm": 0.09779265522956848,
"learning_rate": 0.00019756370120995066,
"loss": 0.0738,
"step": 74
},
{
"epoch": 0.08161044613710555,
"grad_norm": 0.14643464982509613,
"learning_rate": 0.000197487295004327,
"loss": 0.09,
"step": 75
},
{
"epoch": 0.08269858541893363,
"grad_norm": 0.11976644396781921,
"learning_rate": 0.00019740972435213115,
"loss": 0.0904,
"step": 76
},
{
"epoch": 0.08378672470076169,
"grad_norm": 0.10904321819543839,
"learning_rate": 0.00019733099017991341,
"loss": 0.0766,
"step": 77
},
{
"epoch": 0.08487486398258977,
"grad_norm": 0.1651339828968048,
"learning_rate": 0.0001972510934281218,
"loss": 0.1186,
"step": 78
},
{
"epoch": 0.08596300326441784,
"grad_norm": 0.11781762540340424,
"learning_rate": 0.00019717003505109095,
"loss": 0.0833,
"step": 79
},
{
"epoch": 0.08705114254624592,
"grad_norm": 0.15122370421886444,
"learning_rate": 0.00019708781601703065,
"loss": 0.1166,
"step": 80
},
{
"epoch": 0.088139281828074,
"grad_norm": 0.1798838973045349,
"learning_rate": 0.00019700443730801413,
"loss": 0.1109,
"step": 81
},
{
"epoch": 0.08922742110990206,
"grad_norm": 0.18025629222393036,
"learning_rate": 0.00019691989991996663,
"loss": 0.1243,
"step": 82
},
{
"epoch": 0.09031556039173014,
"grad_norm": 0.1731874644756317,
"learning_rate": 0.00019683420486265327,
"loss": 0.1189,
"step": 83
},
{
"epoch": 0.09140369967355821,
"grad_norm": 0.2220824509859085,
"learning_rate": 0.0001967473531596671,
"loss": 0.1451,
"step": 84
},
{
"epoch": 0.09249183895538629,
"grad_norm": 0.1664338856935501,
"learning_rate": 0.00019665934584841682,
"loss": 0.0897,
"step": 85
},
{
"epoch": 0.09357997823721437,
"grad_norm": 0.17619486153125763,
"learning_rate": 0.00019657018398011434,
"loss": 0.0935,
"step": 86
},
{
"epoch": 0.09466811751904244,
"grad_norm": 0.24987219274044037,
"learning_rate": 0.00019647986861976246,
"loss": 0.1955,
"step": 87
},
{
"epoch": 0.0957562568008705,
"grad_norm": 0.21318784356117249,
"learning_rate": 0.00019638840084614182,
"loss": 0.1131,
"step": 88
},
{
"epoch": 0.09684439608269858,
"grad_norm": 0.3128167390823364,
"learning_rate": 0.0001962957817517982,
"loss": 0.2011,
"step": 89
},
{
"epoch": 0.09793253536452666,
"grad_norm": 0.2833835184574127,
"learning_rate": 0.00019620201244302952,
"loss": 0.1592,
"step": 90
},
{
"epoch": 0.09902067464635474,
"grad_norm": 0.3286789357662201,
"learning_rate": 0.00019610709403987246,
"loss": 0.1602,
"step": 91
},
{
"epoch": 0.10010881392818281,
"grad_norm": 0.44117358326911926,
"learning_rate": 0.00019601102767608923,
"loss": 0.2323,
"step": 92
},
{
"epoch": 0.10119695321001088,
"grad_norm": 0.3795579671859741,
"learning_rate": 0.00019591381449915397,
"loss": 0.1867,
"step": 93
},
{
"epoch": 0.10228509249183895,
"grad_norm": 0.5780506730079651,
"learning_rate": 0.000195815455670239,
"loss": 0.1884,
"step": 94
},
{
"epoch": 0.10337323177366703,
"grad_norm": 0.5124024748802185,
"learning_rate": 0.00019571595236420102,
"loss": 0.221,
"step": 95
},
{
"epoch": 0.1044613710554951,
"grad_norm": 0.4628782272338867,
"learning_rate": 0.00019561530576956703,
"loss": 0.208,
"step": 96
},
{
"epoch": 0.10554951033732318,
"grad_norm": 0.3904087543487549,
"learning_rate": 0.0001955135170885202,
"loss": 0.2029,
"step": 97
},
{
"epoch": 0.10663764961915125,
"grad_norm": 0.513387143611908,
"learning_rate": 0.00019541058753688538,
"loss": 0.2248,
"step": 98
},
{
"epoch": 0.10772578890097932,
"grad_norm": 0.6133597493171692,
"learning_rate": 0.00019530651834411474,
"loss": 0.3751,
"step": 99
},
{
"epoch": 0.1088139281828074,
"grad_norm": 0.6515563726425171,
"learning_rate": 0.00019520131075327298,
"loss": 0.2096,
"step": 100
},
{
"epoch": 0.10990206746463548,
"grad_norm": 0.07718291878700256,
"learning_rate": 0.00019509496602102252,
"loss": 0.0631,
"step": 101
},
{
"epoch": 0.11099020674646355,
"grad_norm": 0.07896394282579422,
"learning_rate": 0.00019498748541760846,
"loss": 0.095,
"step": 102
},
{
"epoch": 0.11207834602829161,
"grad_norm": 0.07955438643693924,
"learning_rate": 0.00019487887022684336,
"loss": 0.067,
"step": 103
},
{
"epoch": 0.11316648531011969,
"grad_norm": 0.08391376584768295,
"learning_rate": 0.0001947691217460921,
"loss": 0.0637,
"step": 104
},
{
"epoch": 0.11425462459194777,
"grad_norm": 0.07990088313817978,
"learning_rate": 0.00019465824128625617,
"loss": 0.0569,
"step": 105
},
{
"epoch": 0.11534276387377584,
"grad_norm": 0.09000790864229202,
"learning_rate": 0.00019454623017175812,
"loss": 0.0639,
"step": 106
},
{
"epoch": 0.11643090315560392,
"grad_norm": 0.0831453874707222,
"learning_rate": 0.0001944330897405257,
"loss": 0.0766,
"step": 107
},
{
"epoch": 0.117519042437432,
"grad_norm": 0.08180610835552216,
"learning_rate": 0.00019431882134397598,
"loss": 0.0806,
"step": 108
},
{
"epoch": 0.11860718171926006,
"grad_norm": 0.07601706683635712,
"learning_rate": 0.0001942034263469989,
"loss": 0.0727,
"step": 109
},
{
"epoch": 0.11969532100108814,
"grad_norm": 0.08873546868562698,
"learning_rate": 0.00019408690612794148,
"loss": 0.0878,
"step": 110
},
{
"epoch": 0.12078346028291621,
"grad_norm": 0.10206414759159088,
"learning_rate": 0.00019396926207859084,
"loss": 0.1171,
"step": 111
},
{
"epoch": 0.12187159956474429,
"grad_norm": 0.07465587556362152,
"learning_rate": 0.00019385049560415794,
"loss": 0.0603,
"step": 112
},
{
"epoch": 0.12295973884657237,
"grad_norm": 0.09952360391616821,
"learning_rate": 0.00019373060812326052,
"loss": 0.0923,
"step": 113
},
{
"epoch": 0.12404787812840043,
"grad_norm": 0.09894778579473495,
"learning_rate": 0.00019360960106790643,
"loss": 0.0795,
"step": 114
},
{
"epoch": 0.1251360174102285,
"grad_norm": 0.09721358120441437,
"learning_rate": 0.00019348747588347637,
"loss": 0.1103,
"step": 115
},
{
"epoch": 0.12622415669205658,
"grad_norm": 0.10310002416372299,
"learning_rate": 0.00019336423402870653,
"loss": 0.1037,
"step": 116
},
{
"epoch": 0.12731229597388466,
"grad_norm": 0.10904382914304733,
"learning_rate": 0.0001932398769756714,
"loss": 0.1063,
"step": 117
},
{
"epoch": 0.12840043525571274,
"grad_norm": 0.11545544862747192,
"learning_rate": 0.00019311440620976597,
"loss": 0.096,
"step": 118
},
{
"epoch": 0.1294885745375408,
"grad_norm": 0.08674886077642441,
"learning_rate": 0.00019298782322968815,
"loss": 0.0779,
"step": 119
},
{
"epoch": 0.1305767138193689,
"grad_norm": 0.09437372535467148,
"learning_rate": 0.0001928601295474208,
"loss": 0.0975,
"step": 120
},
{
"epoch": 0.13166485310119697,
"grad_norm": 0.1258208006620407,
"learning_rate": 0.00019273132668821364,
"loss": 0.1277,
"step": 121
},
{
"epoch": 0.13275299238302501,
"grad_norm": 0.09919868409633636,
"learning_rate": 0.00019260141619056507,
"loss": 0.0993,
"step": 122
},
{
"epoch": 0.1338411316648531,
"grad_norm": 0.12028370052576065,
"learning_rate": 0.0001924703996062038,
"loss": 0.0751,
"step": 123
},
{
"epoch": 0.13492927094668117,
"grad_norm": 0.10702817142009735,
"learning_rate": 0.00019233827850007027,
"loss": 0.0949,
"step": 124
},
{
"epoch": 0.13601741022850924,
"grad_norm": 0.10939855128526688,
"learning_rate": 0.000192205054450298,
"loss": 0.0977,
"step": 125
},
{
"epoch": 0.13710554951033732,
"grad_norm": 0.11803679168224335,
"learning_rate": 0.00019207072904819486,
"loss": 0.0877,
"step": 126
},
{
"epoch": 0.1381936887921654,
"grad_norm": 0.1382649838924408,
"learning_rate": 0.00019193530389822363,
"loss": 0.1017,
"step": 127
},
{
"epoch": 0.13928182807399347,
"grad_norm": 0.1433139145374298,
"learning_rate": 0.00019179878061798347,
"loss": 0.0748,
"step": 128
},
{
"epoch": 0.14036996735582155,
"grad_norm": 0.14679527282714844,
"learning_rate": 0.00019166116083819002,
"loss": 0.1164,
"step": 129
},
{
"epoch": 0.14145810663764963,
"grad_norm": 0.13680118322372437,
"learning_rate": 0.0001915224462026563,
"loss": 0.1149,
"step": 130
},
{
"epoch": 0.1425462459194777,
"grad_norm": 0.16263848543167114,
"learning_rate": 0.00019138263836827288,
"loss": 0.1192,
"step": 131
},
{
"epoch": 0.14363438520130578,
"grad_norm": 0.16534928977489471,
"learning_rate": 0.00019124173900498818,
"loss": 0.1138,
"step": 132
},
{
"epoch": 0.14472252448313383,
"grad_norm": 0.21276706457138062,
"learning_rate": 0.0001910997497957885,
"loss": 0.1461,
"step": 133
},
{
"epoch": 0.1458106637649619,
"grad_norm": 0.2375650703907013,
"learning_rate": 0.0001909566724366779,
"loss": 0.2031,
"step": 134
},
{
"epoch": 0.14689880304678998,
"grad_norm": 0.20974405109882355,
"learning_rate": 0.00019081250863665794,
"loss": 0.1285,
"step": 135
},
{
"epoch": 0.14798694232861806,
"grad_norm": 0.308624267578125,
"learning_rate": 0.00019066726011770726,
"loss": 0.1717,
"step": 136
},
{
"epoch": 0.14907508161044614,
"grad_norm": 0.21192695200443268,
"learning_rate": 0.0001905209286147611,
"loss": 0.1064,
"step": 137
},
{
"epoch": 0.1501632208922742,
"grad_norm": 0.20596542954444885,
"learning_rate": 0.0001903735158756905,
"loss": 0.0975,
"step": 138
},
{
"epoch": 0.1512513601741023,
"grad_norm": 0.21547934412956238,
"learning_rate": 0.00019022502366128135,
"loss": 0.1255,
"step": 139
},
{
"epoch": 0.15233949945593037,
"grad_norm": 0.276815801858902,
"learning_rate": 0.00019007545374521355,
"loss": 0.0868,
"step": 140
},
{
"epoch": 0.15342763873775844,
"grad_norm": 0.37251031398773193,
"learning_rate": 0.00018992480791403958,
"loss": 0.1078,
"step": 141
},
{
"epoch": 0.15451577801958652,
"grad_norm": 0.328265517950058,
"learning_rate": 0.0001897730879671634,
"loss": 0.1842,
"step": 142
},
{
"epoch": 0.15560391730141457,
"grad_norm": 0.4400005340576172,
"learning_rate": 0.00018962029571681886,
"loss": 0.1857,
"step": 143
},
{
"epoch": 0.15669205658324264,
"grad_norm": 0.28378888964653015,
"learning_rate": 0.00018946643298804793,
"loss": 0.0955,
"step": 144
},
{
"epoch": 0.15778019586507072,
"grad_norm": 0.609008252620697,
"learning_rate": 0.00018931150161867916,
"loss": 0.3827,
"step": 145
},
{
"epoch": 0.1588683351468988,
"grad_norm": 0.3973180055618286,
"learning_rate": 0.0001891555034593055,
"loss": 0.1844,
"step": 146
},
{
"epoch": 0.15995647442872687,
"grad_norm": 0.36245423555374146,
"learning_rate": 0.00018899844037326225,
"loss": 0.2005,
"step": 147
},
{
"epoch": 0.16104461371055495,
"grad_norm": 0.5730637311935425,
"learning_rate": 0.0001888403142366049,
"loss": 0.2742,
"step": 148
},
{
"epoch": 0.16213275299238303,
"grad_norm": 0.5383718013763428,
"learning_rate": 0.00018868112693808665,
"loss": 0.2249,
"step": 149
},
{
"epoch": 0.1632208922742111,
"grad_norm": 0.9835379123687744,
"learning_rate": 0.00018852088037913577,
"loss": 0.344,
"step": 150
},
{
"epoch": 0.16430903155603918,
"grad_norm": 0.09142426401376724,
"learning_rate": 0.00018835957647383303,
"loss": 0.0876,
"step": 151
},
{
"epoch": 0.16539717083786726,
"grad_norm": 0.09199874103069305,
"learning_rate": 0.00018819721714888877,
"loss": 0.0798,
"step": 152
},
{
"epoch": 0.16648531011969533,
"grad_norm": 0.08299195021390915,
"learning_rate": 0.00018803380434362,
"loss": 0.0746,
"step": 153
},
{
"epoch": 0.16757344940152338,
"grad_norm": 0.10273315012454987,
"learning_rate": 0.00018786934000992688,
"loss": 0.101,
"step": 154
},
{
"epoch": 0.16866158868335146,
"grad_norm": 0.08524151146411896,
"learning_rate": 0.00018770382611226987,
"loss": 0.0684,
"step": 155
},
{
"epoch": 0.16974972796517954,
"grad_norm": 0.10515481233596802,
"learning_rate": 0.000187537264627646,
"loss": 0.0809,
"step": 156
},
{
"epoch": 0.1708378672470076,
"grad_norm": 0.09009183198213577,
"learning_rate": 0.00018736965754556528,
"loss": 0.0955,
"step": 157
},
{
"epoch": 0.1719260065288357,
"grad_norm": 0.10571747273206711,
"learning_rate": 0.00018720100686802694,
"loss": 0.0847,
"step": 158
},
{
"epoch": 0.17301414581066377,
"grad_norm": 0.08275768160820007,
"learning_rate": 0.00018703131460949554,
"loss": 0.0799,
"step": 159
},
{
"epoch": 0.17410228509249184,
"grad_norm": 0.08440782129764557,
"learning_rate": 0.00018686058279687698,
"loss": 0.0672,
"step": 160
},
{
"epoch": 0.17519042437431992,
"grad_norm": 0.10261505097150803,
"learning_rate": 0.00018668881346949417,
"loss": 0.1004,
"step": 161
},
{
"epoch": 0.176278563656148,
"grad_norm": 0.08730655908584595,
"learning_rate": 0.00018651600867906272,
"loss": 0.0711,
"step": 162
},
{
"epoch": 0.17736670293797607,
"grad_norm": 0.10591359436511993,
"learning_rate": 0.00018634217048966637,
"loss": 0.0919,
"step": 163
},
{
"epoch": 0.17845484221980412,
"grad_norm": 0.09251653403043747,
"learning_rate": 0.0001861673009777325,
"loss": 0.078,
"step": 164
},
{
"epoch": 0.1795429815016322,
"grad_norm": 0.106822170317173,
"learning_rate": 0.00018599140223200716,
"loss": 0.0895,
"step": 165
},
{
"epoch": 0.18063112078346028,
"grad_norm": 0.11222364753484726,
"learning_rate": 0.0001858144763535302,
"loss": 0.0918,
"step": 166
},
{
"epoch": 0.18171926006528835,
"grad_norm": 0.1363314390182495,
"learning_rate": 0.00018563652545561013,
"loss": 0.1126,
"step": 167
},
{
"epoch": 0.18280739934711643,
"grad_norm": 0.09316174685955048,
"learning_rate": 0.000185457551663799,
"loss": 0.0799,
"step": 168
},
{
"epoch": 0.1838955386289445,
"grad_norm": 0.13098089396953583,
"learning_rate": 0.00018527755711586678,
"loss": 0.0994,
"step": 169
},
{
"epoch": 0.18498367791077258,
"grad_norm": 0.11433115601539612,
"learning_rate": 0.00018509654396177609,
"loss": 0.1072,
"step": 170
},
{
"epoch": 0.18607181719260066,
"grad_norm": 0.11261814087629318,
"learning_rate": 0.00018491451436365627,
"loss": 0.1011,
"step": 171
},
{
"epoch": 0.18715995647442873,
"grad_norm": 0.1038559302687645,
"learning_rate": 0.00018473147049577774,
"loss": 0.0746,
"step": 172
},
{
"epoch": 0.1882480957562568,
"grad_norm": 0.11395396292209625,
"learning_rate": 0.00018454741454452603,
"loss": 0.0792,
"step": 173
},
{
"epoch": 0.1893362350380849,
"grad_norm": 0.13332821428775787,
"learning_rate": 0.00018436234870837547,
"loss": 0.1041,
"step": 174
},
{
"epoch": 0.19042437431991294,
"grad_norm": 0.12438289821147919,
"learning_rate": 0.00018417627519786315,
"loss": 0.1066,
"step": 175
},
{
"epoch": 0.191512513601741,
"grad_norm": 0.1353287398815155,
"learning_rate": 0.00018398919623556238,
"loss": 0.1193,
"step": 176
},
{
"epoch": 0.1926006528835691,
"grad_norm": 0.13928581774234772,
"learning_rate": 0.0001838011140560562,
"loss": 0.1228,
"step": 177
},
{
"epoch": 0.19368879216539717,
"grad_norm": 0.1474994421005249,
"learning_rate": 0.00018361203090591071,
"loss": 0.0812,
"step": 178
},
{
"epoch": 0.19477693144722524,
"grad_norm": 0.1910678595304489,
"learning_rate": 0.00018342194904364813,
"loss": 0.1435,
"step": 179
},
{
"epoch": 0.19586507072905332,
"grad_norm": 0.16526034474372864,
"learning_rate": 0.00018323087073971993,
"loss": 0.1136,
"step": 180
},
{
"epoch": 0.1969532100108814,
"grad_norm": 0.1933068335056305,
"learning_rate": 0.00018303879827647975,
"loss": 0.1498,
"step": 181
},
{
"epoch": 0.19804134929270947,
"grad_norm": 0.1647845059633255,
"learning_rate": 0.00018284573394815597,
"loss": 0.0764,
"step": 182
},
{
"epoch": 0.19912948857453755,
"grad_norm": 0.19414739310741425,
"learning_rate": 0.00018265168006082437,
"loss": 0.1142,
"step": 183
},
{
"epoch": 0.20021762785636563,
"grad_norm": 0.1872360110282898,
"learning_rate": 0.00018245663893238075,
"loss": 0.1169,
"step": 184
},
{
"epoch": 0.20130576713819368,
"grad_norm": 0.19919492304325104,
"learning_rate": 0.00018226061289251298,
"loss": 0.0854,
"step": 185
},
{
"epoch": 0.20239390642002175,
"grad_norm": 0.2233375757932663,
"learning_rate": 0.00018206360428267332,
"loss": 0.1271,
"step": 186
},
{
"epoch": 0.20348204570184983,
"grad_norm": 0.22116345167160034,
"learning_rate": 0.00018186561545605054,
"loss": 0.1402,
"step": 187
},
{
"epoch": 0.2045701849836779,
"grad_norm": 0.253864049911499,
"learning_rate": 0.0001816666487775416,
"loss": 0.1431,
"step": 188
},
{
"epoch": 0.20565832426550598,
"grad_norm": 0.2945636212825775,
"learning_rate": 0.00018146670662372354,
"loss": 0.1284,
"step": 189
},
{
"epoch": 0.20674646354733406,
"grad_norm": 0.24834126234054565,
"learning_rate": 0.00018126579138282503,
"loss": 0.098,
"step": 190
},
{
"epoch": 0.20783460282916214,
"grad_norm": 0.26815730333328247,
"learning_rate": 0.00018106390545469795,
"loss": 0.0877,
"step": 191
},
{
"epoch": 0.2089227421109902,
"grad_norm": 0.375293493270874,
"learning_rate": 0.00018086105125078857,
"loss": 0.1985,
"step": 192
},
{
"epoch": 0.2100108813928183,
"grad_norm": 0.4025906026363373,
"learning_rate": 0.00018065723119410884,
"loss": 0.2082,
"step": 193
},
{
"epoch": 0.21109902067464636,
"grad_norm": 0.3551553785800934,
"learning_rate": 0.0001804524477192075,
"loss": 0.2305,
"step": 194
},
{
"epoch": 0.21218715995647444,
"grad_norm": 0.594780445098877,
"learning_rate": 0.00018024670327214084,
"loss": 0.2713,
"step": 195
},
{
"epoch": 0.2132752992383025,
"grad_norm": 0.3940027356147766,
"learning_rate": 0.0001800400003104436,
"loss": 0.1623,
"step": 196
},
{
"epoch": 0.21436343852013057,
"grad_norm": 0.51041579246521,
"learning_rate": 0.00017983234130309968,
"loss": 0.2236,
"step": 197
},
{
"epoch": 0.21545157780195864,
"grad_norm": 0.6203753352165222,
"learning_rate": 0.00017962372873051252,
"loss": 0.2654,
"step": 198
},
{
"epoch": 0.21653971708378672,
"grad_norm": 0.7527713179588318,
"learning_rate": 0.00017941416508447536,
"loss": 0.2088,
"step": 199
},
{
"epoch": 0.2176278563656148,
"grad_norm": 1.1047406196594238,
"learning_rate": 0.00017920365286814183,
"loss": 0.3097,
"step": 200
},
{
"epoch": 0.21871599564744287,
"grad_norm": 0.0492124930024147,
"learning_rate": 0.0001789921945959958,
"loss": 0.0344,
"step": 201
},
{
"epoch": 0.21980413492927095,
"grad_norm": 0.07087790220975876,
"learning_rate": 0.00017877979279382135,
"loss": 0.0582,
"step": 202
},
{
"epoch": 0.22089227421109903,
"grad_norm": 0.07622935622930527,
"learning_rate": 0.00017856644999867264,
"loss": 0.062,
"step": 203
},
{
"epoch": 0.2219804134929271,
"grad_norm": 0.08792652189731598,
"learning_rate": 0.00017835216875884368,
"loss": 0.0511,
"step": 204
},
{
"epoch": 0.22306855277475518,
"grad_norm": 0.08028998970985413,
"learning_rate": 0.0001781369516338378,
"loss": 0.0665,
"step": 205
},
{
"epoch": 0.22415669205658323,
"grad_norm": 0.08997032046318054,
"learning_rate": 0.0001779208011943371,
"loss": 0.069,
"step": 206
},
{
"epoch": 0.2252448313384113,
"grad_norm": 0.08684886246919632,
"learning_rate": 0.00017770372002217172,
"loss": 0.077,
"step": 207
},
{
"epoch": 0.22633297062023938,
"grad_norm": 0.0965440422296524,
"learning_rate": 0.000177485710710289,
"loss": 0.0782,
"step": 208
},
{
"epoch": 0.22742110990206746,
"grad_norm": 0.09060367196798325,
"learning_rate": 0.00017726677586272263,
"loss": 0.066,
"step": 209
},
{
"epoch": 0.22850924918389554,
"grad_norm": 0.0900409147143364,
"learning_rate": 0.00017704691809456143,
"loss": 0.0707,
"step": 210
},
{
"epoch": 0.2295973884657236,
"grad_norm": 0.10733999311923981,
"learning_rate": 0.00017682614003191807,
"loss": 0.0916,
"step": 211
},
{
"epoch": 0.2306855277475517,
"grad_norm": 0.09372083842754364,
"learning_rate": 0.0001766044443118978,
"loss": 0.0872,
"step": 212
},
{
"epoch": 0.23177366702937977,
"grad_norm": 0.10344577580690384,
"learning_rate": 0.00017638183358256696,
"loss": 0.0903,
"step": 213
},
{
"epoch": 0.23286180631120784,
"grad_norm": 0.1084800437092781,
"learning_rate": 0.0001761583105029213,
"loss": 0.0926,
"step": 214
},
{
"epoch": 0.23394994559303592,
"grad_norm": 0.08565113693475723,
"learning_rate": 0.00017593387774285412,
"loss": 0.0758,
"step": 215
},
{
"epoch": 0.235038084874864,
"grad_norm": 0.11589045077562332,
"learning_rate": 0.0001757085379831246,
"loss": 0.0925,
"step": 216
},
{
"epoch": 0.23612622415669204,
"grad_norm": 0.12087468057870865,
"learning_rate": 0.00017548229391532572,
"loss": 0.1012,
"step": 217
},
{
"epoch": 0.23721436343852012,
"grad_norm": 0.1125798150897026,
"learning_rate": 0.00017525514824185185,
"loss": 0.109,
"step": 218
},
{
"epoch": 0.2383025027203482,
"grad_norm": 0.12492644041776657,
"learning_rate": 0.00017502710367586687,
"loss": 0.1048,
"step": 219
},
{
"epoch": 0.23939064200217627,
"grad_norm": 0.09837982058525085,
"learning_rate": 0.00017479816294127152,
"loss": 0.0803,
"step": 220
},
{
"epoch": 0.24047878128400435,
"grad_norm": 0.099558524787426,
"learning_rate": 0.00017456832877267084,
"loss": 0.0552,
"step": 221
},
{
"epoch": 0.24156692056583243,
"grad_norm": 0.095551498234272,
"learning_rate": 0.00017433760391534167,
"loss": 0.0905,
"step": 222
},
{
"epoch": 0.2426550598476605,
"grad_norm": 0.11664412170648575,
"learning_rate": 0.0001741059911251997,
"loss": 0.1005,
"step": 223
},
{
"epoch": 0.24374319912948858,
"grad_norm": 0.1248706802725792,
"learning_rate": 0.00017387349316876666,
"loss": 0.1135,
"step": 224
},
{
"epoch": 0.24483133841131666,
"grad_norm": 0.13133874535560608,
"learning_rate": 0.0001736401128231373,
"loss": 0.121,
"step": 225
},
{
"epoch": 0.24591947769314473,
"grad_norm": 0.12476039677858353,
"learning_rate": 0.00017340585287594604,
"loss": 0.1025,
"step": 226
},
{
"epoch": 0.2470076169749728,
"grad_norm": 0.1645650863647461,
"learning_rate": 0.0001731707161253338,
"loss": 0.1313,
"step": 227
},
{
"epoch": 0.24809575625680086,
"grad_norm": 0.1172671690583229,
"learning_rate": 0.00017293470537991463,
"loss": 0.0801,
"step": 228
},
{
"epoch": 0.24918389553862894,
"grad_norm": 0.17031441628932953,
"learning_rate": 0.00017269782345874203,
"loss": 0.154,
"step": 229
},
{
"epoch": 0.250272034820457,
"grad_norm": 0.16571593284606934,
"learning_rate": 0.00017246007319127545,
"loss": 0.1209,
"step": 230
},
{
"epoch": 0.250272034820457,
"eval_loss": 0.12318640202283859,
"eval_runtime": 24.4163,
"eval_samples_per_second": 15.85,
"eval_steps_per_second": 7.945,
"step": 230
},
{
"epoch": 0.2513601741022851,
"grad_norm": 0.14655253291130066,
"learning_rate": 0.00017222145741734626,
"loss": 0.0879,
"step": 231
},
{
"epoch": 0.25244831338411317,
"grad_norm": 0.17367680370807648,
"learning_rate": 0.00017198197898712404,
"loss": 0.1261,
"step": 232
},
{
"epoch": 0.2535364526659412,
"grad_norm": 0.14948749542236328,
"learning_rate": 0.0001717416407610824,
"loss": 0.0874,
"step": 233
},
{
"epoch": 0.2546245919477693,
"grad_norm": 0.19695116579532623,
"learning_rate": 0.00017150044560996488,
"loss": 0.1119,
"step": 234
},
{
"epoch": 0.25571273122959737,
"grad_norm": 0.2416209876537323,
"learning_rate": 0.00017125839641475072,
"loss": 0.1495,
"step": 235
},
{
"epoch": 0.25680087051142547,
"grad_norm": 0.23595106601715088,
"learning_rate": 0.00017101549606662024,
"loss": 0.092,
"step": 236
},
{
"epoch": 0.2578890097932535,
"grad_norm": 0.3377005457878113,
"learning_rate": 0.00017077174746692056,
"loss": 0.1537,
"step": 237
},
{
"epoch": 0.2589771490750816,
"grad_norm": 0.31011515855789185,
"learning_rate": 0.00017052715352713075,
"loss": 0.2351,
"step": 238
},
{
"epoch": 0.2600652883569097,
"grad_norm": 0.2296973615884781,
"learning_rate": 0.00017028171716882714,
"loss": 0.1034,
"step": 239
},
{
"epoch": 0.2611534276387378,
"grad_norm": 0.33184927701950073,
"learning_rate": 0.00017003544132364846,
"loss": 0.1518,
"step": 240
},
{
"epoch": 0.2622415669205658,
"grad_norm": 0.333794504404068,
"learning_rate": 0.00016978832893326074,
"loss": 0.1167,
"step": 241
},
{
"epoch": 0.26332970620239393,
"grad_norm": 0.33567357063293457,
"learning_rate": 0.00016954038294932216,
"loss": 0.1672,
"step": 242
},
{
"epoch": 0.264417845484222,
"grad_norm": 0.3648099899291992,
"learning_rate": 0.0001692916063334479,
"loss": 0.1562,
"step": 243
},
{
"epoch": 0.26550598476605003,
"grad_norm": 0.3762454092502594,
"learning_rate": 0.0001690420020571747,
"loss": 0.1495,
"step": 244
},
{
"epoch": 0.26659412404787813,
"grad_norm": 0.42424383759498596,
"learning_rate": 0.00016879157310192535,
"loss": 0.1763,
"step": 245
},
{
"epoch": 0.2676822633297062,
"grad_norm": 0.4968826472759247,
"learning_rate": 0.00016854032245897308,
"loss": 0.2473,
"step": 246
},
{
"epoch": 0.2687704026115343,
"grad_norm": 0.5231485366821289,
"learning_rate": 0.00016828825312940592,
"loss": 0.2924,
"step": 247
},
{
"epoch": 0.26985854189336234,
"grad_norm": 0.5466935634613037,
"learning_rate": 0.00016803536812409075,
"loss": 0.2519,
"step": 248
},
{
"epoch": 0.27094668117519044,
"grad_norm": 0.6696439981460571,
"learning_rate": 0.00016778167046363734,
"loss": 0.2106,
"step": 249
},
{
"epoch": 0.2720348204570185,
"grad_norm": 0.7066907286643982,
"learning_rate": 0.00016752716317836229,
"loss": 0.2733,
"step": 250
},
{
"epoch": 0.2731229597388466,
"grad_norm": 0.058309707790613174,
"learning_rate": 0.00016727184930825288,
"loss": 0.0459,
"step": 251
},
{
"epoch": 0.27421109902067464,
"grad_norm": 0.06278934329748154,
"learning_rate": 0.00016701573190293077,
"loss": 0.049,
"step": 252
},
{
"epoch": 0.27529923830250275,
"grad_norm": 0.07942797988653183,
"learning_rate": 0.00016675881402161536,
"loss": 0.0757,
"step": 253
},
{
"epoch": 0.2763873775843308,
"grad_norm": 0.0874176099896431,
"learning_rate": 0.00016650109873308765,
"loss": 0.0952,
"step": 254
},
{
"epoch": 0.27747551686615884,
"grad_norm": 0.0788157656788826,
"learning_rate": 0.0001662425891156531,
"loss": 0.0655,
"step": 255
},
{
"epoch": 0.27856365614798695,
"grad_norm": 0.08784733712673187,
"learning_rate": 0.00016598328825710533,
"loss": 0.0778,
"step": 256
},
{
"epoch": 0.279651795429815,
"grad_norm": 0.09089700132608414,
"learning_rate": 0.00016572319925468892,
"loss": 0.0767,
"step": 257
},
{
"epoch": 0.2807399347116431,
"grad_norm": 0.07957662642002106,
"learning_rate": 0.0001654623252150624,
"loss": 0.0623,
"step": 258
},
{
"epoch": 0.28182807399347115,
"grad_norm": 0.08320681005716324,
"learning_rate": 0.00016520066925426144,
"loss": 0.0812,
"step": 259
},
{
"epoch": 0.28291621327529926,
"grad_norm": 0.10143834352493286,
"learning_rate": 0.00016493823449766136,
"loss": 0.0953,
"step": 260
},
{
"epoch": 0.2840043525571273,
"grad_norm": 0.1192905604839325,
"learning_rate": 0.00016467502407993992,
"loss": 0.1163,
"step": 261
},
{
"epoch": 0.2850924918389554,
"grad_norm": 0.11428846418857574,
"learning_rate": 0.0001644110411450398,
"loss": 0.1028,
"step": 262
},
{
"epoch": 0.28618063112078346,
"grad_norm": 0.11233223229646683,
"learning_rate": 0.00016414628884613107,
"loss": 0.091,
"step": 263
},
{
"epoch": 0.28726877040261156,
"grad_norm": 0.10367966443300247,
"learning_rate": 0.00016388077034557355,
"loss": 0.0797,
"step": 264
},
{
"epoch": 0.2883569096844396,
"grad_norm": 0.11604032665491104,
"learning_rate": 0.00016361448881487914,
"loss": 0.0919,
"step": 265
},
{
"epoch": 0.28944504896626766,
"grad_norm": 0.10309276729822159,
"learning_rate": 0.00016334744743467364,
"loss": 0.1065,
"step": 266
},
{
"epoch": 0.29053318824809576,
"grad_norm": 0.11475658416748047,
"learning_rate": 0.00016307964939465914,
"loss": 0.0959,
"step": 267
},
{
"epoch": 0.2916213275299238,
"grad_norm": 0.1230575293302536,
"learning_rate": 0.0001628110978935756,
"loss": 0.1031,
"step": 268
},
{
"epoch": 0.2927094668117519,
"grad_norm": 0.1267620474100113,
"learning_rate": 0.00016254179613916278,
"loss": 0.1219,
"step": 269
},
{
"epoch": 0.29379760609357997,
"grad_norm": 0.1032036617398262,
"learning_rate": 0.000162271747348122,
"loss": 0.0792,
"step": 270
},
{
"epoch": 0.29488574537540807,
"grad_norm": 0.10867134481668472,
"learning_rate": 0.00016200095474607753,
"loss": 0.0964,
"step": 271
},
{
"epoch": 0.2959738846572361,
"grad_norm": 0.13934585452079773,
"learning_rate": 0.0001617294215675382,
"loss": 0.1493,
"step": 272
},
{
"epoch": 0.2970620239390642,
"grad_norm": 0.1254916936159134,
"learning_rate": 0.0001614571510558588,
"loss": 0.1035,
"step": 273
},
{
"epoch": 0.2981501632208923,
"grad_norm": 0.10226383805274963,
"learning_rate": 0.0001611841464632011,
"loss": 0.0777,
"step": 274
},
{
"epoch": 0.2992383025027203,
"grad_norm": 0.11369970440864563,
"learning_rate": 0.0001609104110504954,
"loss": 0.0823,
"step": 275
},
{
"epoch": 0.3003264417845484,
"grad_norm": 0.11098276078701019,
"learning_rate": 0.00016063594808740113,
"loss": 0.0976,
"step": 276
},
{
"epoch": 0.3014145810663765,
"grad_norm": 0.13366885483264923,
"learning_rate": 0.00016036076085226814,
"loss": 0.1378,
"step": 277
},
{
"epoch": 0.3025027203482046,
"grad_norm": 0.11494230479001999,
"learning_rate": 0.00016008485263209742,
"loss": 0.072,
"step": 278
},
{
"epoch": 0.30359085963003263,
"grad_norm": 0.11145862936973572,
"learning_rate": 0.0001598082267225018,
"loss": 0.066,
"step": 279
},
{
"epoch": 0.30467899891186073,
"grad_norm": 0.1483200490474701,
"learning_rate": 0.0001595308864276666,
"loss": 0.1023,
"step": 280
},
{
"epoch": 0.3057671381936888,
"grad_norm": 0.12836772203445435,
"learning_rate": 0.0001592528350603103,
"loss": 0.0682,
"step": 281
},
{
"epoch": 0.3068552774755169,
"grad_norm": 0.16118410229682922,
"learning_rate": 0.00015897407594164467,
"loss": 0.1025,
"step": 282
},
{
"epoch": 0.30794341675734493,
"grad_norm": 0.22559022903442383,
"learning_rate": 0.0001586946124013354,
"loss": 0.1228,
"step": 283
},
{
"epoch": 0.30903155603917304,
"grad_norm": 0.233434796333313,
"learning_rate": 0.0001584144477774623,
"loss": 0.1928,
"step": 284
},
{
"epoch": 0.3101196953210011,
"grad_norm": 0.21861650049686432,
"learning_rate": 0.00015813358541647915,
"loss": 0.1054,
"step": 285
},
{
"epoch": 0.31120783460282914,
"grad_norm": 0.2723356783390045,
"learning_rate": 0.00015785202867317407,
"loss": 0.1411,
"step": 286
},
{
"epoch": 0.31229597388465724,
"grad_norm": 0.3065739870071411,
"learning_rate": 0.0001575697809106292,
"loss": 0.1785,
"step": 287
},
{
"epoch": 0.3133841131664853,
"grad_norm": 0.2983495593070984,
"learning_rate": 0.00015728684550018064,
"loss": 0.1402,
"step": 288
},
{
"epoch": 0.3144722524483134,
"grad_norm": 0.3250825107097626,
"learning_rate": 0.00015700322582137827,
"loss": 0.1929,
"step": 289
},
{
"epoch": 0.31556039173014144,
"grad_norm": 0.35388973355293274,
"learning_rate": 0.00015671892526194516,
"loss": 0.1791,
"step": 290
},
{
"epoch": 0.31664853101196955,
"grad_norm": 0.32610148191452026,
"learning_rate": 0.0001564339472177373,
"loss": 0.1289,
"step": 291
},
{
"epoch": 0.3177366702937976,
"grad_norm": 0.4028049409389496,
"learning_rate": 0.0001561482950927029,
"loss": 0.2026,
"step": 292
},
{
"epoch": 0.3188248095756257,
"grad_norm": 0.2420492172241211,
"learning_rate": 0.00015586197229884184,
"loss": 0.098,
"step": 293
},
{
"epoch": 0.31991294885745375,
"grad_norm": 0.3512971103191376,
"learning_rate": 0.00015557498225616487,
"loss": 0.205,
"step": 294
},
{
"epoch": 0.32100108813928185,
"grad_norm": 0.39271989464759827,
"learning_rate": 0.00015528732839265272,
"loss": 0.1473,
"step": 295
},
{
"epoch": 0.3220892274211099,
"grad_norm": 0.3802226185798645,
"learning_rate": 0.0001549990141442153,
"loss": 0.136,
"step": 296
},
{
"epoch": 0.32317736670293795,
"grad_norm": 0.5737869739532471,
"learning_rate": 0.00015471004295465035,
"loss": 0.3053,
"step": 297
},
{
"epoch": 0.32426550598476606,
"grad_norm": 0.45224013924598694,
"learning_rate": 0.00015442041827560274,
"loss": 0.222,
"step": 298
},
{
"epoch": 0.3253536452665941,
"grad_norm": 0.522432267665863,
"learning_rate": 0.00015413014356652286,
"loss": 0.1809,
"step": 299
},
{
"epoch": 0.3264417845484222,
"grad_norm": 0.6229780316352844,
"learning_rate": 0.00015383922229462549,
"loss": 0.2081,
"step": 300
},
{
"epoch": 0.32752992383025026,
"grad_norm": 0.053111448884010315,
"learning_rate": 0.00015354765793484834,
"loss": 0.0414,
"step": 301
},
{
"epoch": 0.32861806311207836,
"grad_norm": 0.07464036345481873,
"learning_rate": 0.0001532554539698105,
"loss": 0.0639,
"step": 302
},
{
"epoch": 0.3297062023939064,
"grad_norm": 0.08635352551937103,
"learning_rate": 0.00015296261388977108,
"loss": 0.074,
"step": 303
},
{
"epoch": 0.3307943416757345,
"grad_norm": 0.0818236917257309,
"learning_rate": 0.000152669141192587,
"loss": 0.0843,
"step": 304
},
{
"epoch": 0.33188248095756256,
"grad_norm": 0.08959626406431198,
"learning_rate": 0.00015237503938367186,
"loss": 0.0752,
"step": 305
},
{
"epoch": 0.33297062023939067,
"grad_norm": 0.087018683552742,
"learning_rate": 0.00015208031197595356,
"loss": 0.074,
"step": 306
},
{
"epoch": 0.3340587595212187,
"grad_norm": 0.10946961492300034,
"learning_rate": 0.00015178496248983254,
"loss": 0.0907,
"step": 307
},
{
"epoch": 0.33514689880304677,
"grad_norm": 0.09914237260818481,
"learning_rate": 0.00015148899445313981,
"loss": 0.0939,
"step": 308
},
{
"epoch": 0.33623503808487487,
"grad_norm": 0.07641992717981339,
"learning_rate": 0.00015119241140109467,
"loss": 0.0587,
"step": 309
},
{
"epoch": 0.3373231773667029,
"grad_norm": 0.10857319831848145,
"learning_rate": 0.00015089521687626243,
"loss": 0.1052,
"step": 310
},
{
"epoch": 0.338411316648531,
"grad_norm": 0.0868939459323883,
"learning_rate": 0.0001505974144285124,
"loss": 0.0723,
"step": 311
},
{
"epoch": 0.3394994559303591,
"grad_norm": 0.11470666527748108,
"learning_rate": 0.00015029900761497506,
"loss": 0.0972,
"step": 312
},
{
"epoch": 0.3405875952121872,
"grad_norm": 0.09828225523233414,
"learning_rate": 0.00015000000000000001,
"loss": 0.0904,
"step": 313
},
{
"epoch": 0.3416757344940152,
"grad_norm": 0.09422045201063156,
"learning_rate": 0.00014970039515511304,
"loss": 0.0736,
"step": 314
},
{
"epoch": 0.34276387377584333,
"grad_norm": 0.09876245260238647,
"learning_rate": 0.0001494001966589736,
"loss": 0.0849,
"step": 315
},
{
"epoch": 0.3438520130576714,
"grad_norm": 0.1073005348443985,
"learning_rate": 0.00014909940809733222,
"loss": 0.0842,
"step": 316
},
{
"epoch": 0.34494015233949943,
"grad_norm": 0.11519600450992584,
"learning_rate": 0.00014879803306298736,
"loss": 0.0901,
"step": 317
},
{
"epoch": 0.34602829162132753,
"grad_norm": 0.10380937904119492,
"learning_rate": 0.00014849607515574276,
"loss": 0.0688,
"step": 318
},
{
"epoch": 0.3471164309031556,
"grad_norm": 0.10230353474617004,
"learning_rate": 0.00014819353798236427,
"loss": 0.064,
"step": 319
},
{
"epoch": 0.3482045701849837,
"grad_norm": 0.10846245288848877,
"learning_rate": 0.00014789042515653687,
"loss": 0.0815,
"step": 320
},
{
"epoch": 0.34929270946681173,
"grad_norm": 0.11520566791296005,
"learning_rate": 0.00014758674029882152,
"loss": 0.0846,
"step": 321
},
{
"epoch": 0.35038084874863984,
"grad_norm": 0.16834412515163422,
"learning_rate": 0.00014728248703661182,
"loss": 0.1249,
"step": 322
},
{
"epoch": 0.3514689880304679,
"grad_norm": 0.11053828150033951,
"learning_rate": 0.00014697766900409074,
"loss": 0.073,
"step": 323
},
{
"epoch": 0.352557127312296,
"grad_norm": 0.12219499796628952,
"learning_rate": 0.0001466722898421873,
"loss": 0.0943,
"step": 324
},
{
"epoch": 0.35364526659412404,
"grad_norm": 0.1294214427471161,
"learning_rate": 0.00014636635319853275,
"loss": 0.0761,
"step": 325
},
{
"epoch": 0.35473340587595215,
"grad_norm": 0.13043484091758728,
"learning_rate": 0.00014605986272741748,
"loss": 0.1065,
"step": 326
},
{
"epoch": 0.3558215451577802,
"grad_norm": 0.1078469529747963,
"learning_rate": 0.00014575282208974702,
"loss": 0.0718,
"step": 327
},
{
"epoch": 0.35690968443960824,
"grad_norm": 0.17083537578582764,
"learning_rate": 0.00014544523495299842,
"loss": 0.1035,
"step": 328
},
{
"epoch": 0.35799782372143635,
"grad_norm": 0.1370207518339157,
"learning_rate": 0.00014513710499117647,
"loss": 0.089,
"step": 329
},
{
"epoch": 0.3590859630032644,
"grad_norm": 0.1698474884033203,
"learning_rate": 0.00014482843588476974,
"loss": 0.1172,
"step": 330
},
{
"epoch": 0.3601741022850925,
"grad_norm": 0.1472265124320984,
"learning_rate": 0.0001445192313207067,
"loss": 0.0782,
"step": 331
},
{
"epoch": 0.36126224156692055,
"grad_norm": 0.153669074177742,
"learning_rate": 0.00014420949499231172,
"loss": 0.0844,
"step": 332
},
{
"epoch": 0.36235038084874865,
"grad_norm": 0.2612091600894928,
"learning_rate": 0.00014389923059926062,
"loss": 0.1256,
"step": 333
},
{
"epoch": 0.3634385201305767,
"grad_norm": 0.18867933750152588,
"learning_rate": 0.00014358844184753712,
"loss": 0.0822,
"step": 334
},
{
"epoch": 0.3645266594124048,
"grad_norm": 0.29405227303504944,
"learning_rate": 0.0001432771324493879,
"loss": 0.1562,
"step": 335
},
{
"epoch": 0.36561479869423286,
"grad_norm": 0.299411803483963,
"learning_rate": 0.00014296530612327863,
"loss": 0.151,
"step": 336
},
{
"epoch": 0.36670293797606096,
"grad_norm": 0.4020368754863739,
"learning_rate": 0.00014265296659384956,
"loss": 0.2564,
"step": 337
},
{
"epoch": 0.367791077257889,
"grad_norm": 0.27561965584754944,
"learning_rate": 0.00014234011759187083,
"loss": 0.1193,
"step": 338
},
{
"epoch": 0.36887921653971706,
"grad_norm": 0.36899837851524353,
"learning_rate": 0.00014202676285419812,
"loss": 0.1844,
"step": 339
},
{
"epoch": 0.36996735582154516,
"grad_norm": 0.3305605351924896,
"learning_rate": 0.0001417129061237278,
"loss": 0.0825,
"step": 340
},
{
"epoch": 0.3710554951033732,
"grad_norm": 0.45063266158103943,
"learning_rate": 0.00014139855114935252,
"loss": 0.2383,
"step": 341
},
{
"epoch": 0.3721436343852013,
"grad_norm": 0.319297730922699,
"learning_rate": 0.0001410837016859161,
"loss": 0.1054,
"step": 342
},
{
"epoch": 0.37323177366702937,
"grad_norm": 0.3711492121219635,
"learning_rate": 0.00014076836149416887,
"loss": 0.1265,
"step": 343
},
{
"epoch": 0.37431991294885747,
"grad_norm": 0.4362325966358185,
"learning_rate": 0.0001404525343407228,
"loss": 0.1818,
"step": 344
},
{
"epoch": 0.3754080522306855,
"grad_norm": 0.39839836955070496,
"learning_rate": 0.00014013622399800627,
"loss": 0.173,
"step": 345
},
{
"epoch": 0.3764961915125136,
"grad_norm": 0.4215060770511627,
"learning_rate": 0.00013981943424421932,
"loss": 0.1225,
"step": 346
},
{
"epoch": 0.37758433079434167,
"grad_norm": 0.4466668963432312,
"learning_rate": 0.0001395021688632882,
"loss": 0.1336,
"step": 347
},
{
"epoch": 0.3786724700761698,
"grad_norm": 0.490313321352005,
"learning_rate": 0.00013918443164482046,
"loss": 0.1364,
"step": 348
},
{
"epoch": 0.3797606093579978,
"grad_norm": 0.6865617036819458,
"learning_rate": 0.00013886622638405952,
"loss": 0.2864,
"step": 349
},
{
"epoch": 0.3808487486398259,
"grad_norm": 0.7716324925422668,
"learning_rate": 0.0001385475568818394,
"loss": 0.2994,
"step": 350
},
{
"epoch": 0.381936887921654,
"grad_norm": 0.04748038947582245,
"learning_rate": 0.00013822842694453924,
"loss": 0.0425,
"step": 351
},
{
"epoch": 0.383025027203482,
"grad_norm": 0.06222306191921234,
"learning_rate": 0.00013790884038403795,
"loss": 0.0519,
"step": 352
},
{
"epoch": 0.38411316648531013,
"grad_norm": 0.07450418174266815,
"learning_rate": 0.0001375888010176686,
"loss": 0.0629,
"step": 353
},
{
"epoch": 0.3852013057671382,
"grad_norm": 0.08733393251895905,
"learning_rate": 0.00013726831266817278,
"loss": 0.0701,
"step": 354
},
{
"epoch": 0.3862894450489663,
"grad_norm": 0.09622704982757568,
"learning_rate": 0.00013694737916365517,
"loss": 0.0909,
"step": 355
},
{
"epoch": 0.38737758433079433,
"grad_norm": 0.08062370121479034,
"learning_rate": 0.00013662600433753745,
"loss": 0.0722,
"step": 356
},
{
"epoch": 0.38846572361262244,
"grad_norm": 0.09811591356992722,
"learning_rate": 0.00013630419202851284,
"loss": 0.0894,
"step": 357
},
{
"epoch": 0.3895538628944505,
"grad_norm": 0.0917980894446373,
"learning_rate": 0.0001359819460805001,
"loss": 0.0816,
"step": 358
},
{
"epoch": 0.3906420021762786,
"grad_norm": 0.08292034268379211,
"learning_rate": 0.0001356592703425976,
"loss": 0.0832,
"step": 359
},
{
"epoch": 0.39173014145810664,
"grad_norm": 0.0940559059381485,
"learning_rate": 0.00013533616866903735,
"loss": 0.078,
"step": 360
},
{
"epoch": 0.3928182807399347,
"grad_norm": 0.09960496425628662,
"learning_rate": 0.00013501264491913906,
"loss": 0.0899,
"step": 361
},
{
"epoch": 0.3939064200217628,
"grad_norm": 0.1174091249704361,
"learning_rate": 0.00013468870295726398,
"loss": 0.0983,
"step": 362
},
{
"epoch": 0.39499455930359084,
"grad_norm": 0.1083730086684227,
"learning_rate": 0.00013436434665276865,
"loss": 0.1004,
"step": 363
},
{
"epoch": 0.39608269858541895,
"grad_norm": 0.09829343855381012,
"learning_rate": 0.00013403957987995882,
"loss": 0.0851,
"step": 364
},
{
"epoch": 0.397170837867247,
"grad_norm": 0.1172933354973793,
"learning_rate": 0.00013371440651804313,
"loss": 0.1033,
"step": 365
},
{
"epoch": 0.3982589771490751,
"grad_norm": 0.11004797369241714,
"learning_rate": 0.00013338883045108674,
"loss": 0.0852,
"step": 366
},
{
"epoch": 0.39934711643090315,
"grad_norm": 0.10466606914997101,
"learning_rate": 0.00013306285556796495,
"loss": 0.0893,
"step": 367
},
{
"epoch": 0.40043525571273125,
"grad_norm": 0.121376633644104,
"learning_rate": 0.0001327364857623168,
"loss": 0.1037,
"step": 368
},
{
"epoch": 0.4015233949945593,
"grad_norm": 0.10333437472581863,
"learning_rate": 0.00013240972493249847,
"loss": 0.0981,
"step": 369
},
{
"epoch": 0.40261153427638735,
"grad_norm": 0.1174560934305191,
"learning_rate": 0.00013208257698153677,
"loss": 0.1187,
"step": 370
},
{
"epoch": 0.40369967355821545,
"grad_norm": 0.09671124815940857,
"learning_rate": 0.0001317550458170826,
"loss": 0.0771,
"step": 371
},
{
"epoch": 0.4047878128400435,
"grad_norm": 0.11311496794223785,
"learning_rate": 0.00013142713535136414,
"loss": 0.0915,
"step": 372
},
{
"epoch": 0.4058759521218716,
"grad_norm": 0.11149045825004578,
"learning_rate": 0.00013109884950114007,
"loss": 0.078,
"step": 373
},
{
"epoch": 0.40696409140369966,
"grad_norm": 0.15049664676189423,
"learning_rate": 0.00013077019218765305,
"loss": 0.1008,
"step": 374
},
{
"epoch": 0.40805223068552776,
"grad_norm": 0.13566477596759796,
"learning_rate": 0.0001304411673365826,
"loss": 0.1116,
"step": 375
},
{
"epoch": 0.4091403699673558,
"grad_norm": 0.1317652463912964,
"learning_rate": 0.00013011177887799845,
"loss": 0.1068,
"step": 376
},
{
"epoch": 0.4102285092491839,
"grad_norm": 0.12117652595043182,
"learning_rate": 0.00012978203074631334,
"loss": 0.0926,
"step": 377
},
{
"epoch": 0.41131664853101196,
"grad_norm": 0.13246335089206696,
"learning_rate": 0.00012945192688023624,
"loss": 0.0867,
"step": 378
},
{
"epoch": 0.41240478781284007,
"grad_norm": 0.1427900642156601,
"learning_rate": 0.00012912147122272523,
"loss": 0.1054,
"step": 379
},
{
"epoch": 0.4134929270946681,
"grad_norm": 0.13975268602371216,
"learning_rate": 0.0001287906677209403,
"loss": 0.0993,
"step": 380
},
{
"epoch": 0.41458106637649617,
"grad_norm": 0.16829046607017517,
"learning_rate": 0.0001284595203261965,
"loss": 0.0986,
"step": 381
},
{
"epoch": 0.41566920565832427,
"grad_norm": 0.18288354575634003,
"learning_rate": 0.00012812803299391628,
"loss": 0.1164,
"step": 382
},
{
"epoch": 0.4167573449401523,
"grad_norm": 0.20097504556179047,
"learning_rate": 0.00012779620968358273,
"loss": 0.1273,
"step": 383
},
{
"epoch": 0.4178454842219804,
"grad_norm": 0.1646791398525238,
"learning_rate": 0.00012746405435869198,
"loss": 0.0833,
"step": 384
},
{
"epoch": 0.41893362350380847,
"grad_norm": 0.1997787058353424,
"learning_rate": 0.0001271315709867059,
"loss": 0.1495,
"step": 385
},
{
"epoch": 0.4200217627856366,
"grad_norm": 0.1489897519350052,
"learning_rate": 0.00012679876353900482,
"loss": 0.0756,
"step": 386
},
{
"epoch": 0.4211099020674646,
"grad_norm": 0.22502455115318298,
"learning_rate": 0.00012646563599083996,
"loss": 0.1427,
"step": 387
},
{
"epoch": 0.42219804134929273,
"grad_norm": 0.19359458982944489,
"learning_rate": 0.00012613219232128608,
"loss": 0.121,
"step": 388
},
{
"epoch": 0.4232861806311208,
"grad_norm": 0.244260773062706,
"learning_rate": 0.0001257984365131938,
"loss": 0.1469,
"step": 389
},
{
"epoch": 0.4243743199129489,
"grad_norm": 0.18485282361507416,
"learning_rate": 0.00012546437255314222,
"loss": 0.0892,
"step": 390
},
{
"epoch": 0.42546245919477693,
"grad_norm": 0.3717290461063385,
"learning_rate": 0.00012513000443139112,
"loss": 0.2099,
"step": 391
},
{
"epoch": 0.426550598476605,
"grad_norm": 0.28721094131469727,
"learning_rate": 0.00012479533614183334,
"loss": 0.1193,
"step": 392
},
{
"epoch": 0.4276387377584331,
"grad_norm": 0.2697299122810364,
"learning_rate": 0.00012446037168194714,
"loss": 0.0965,
"step": 393
},
{
"epoch": 0.42872687704026113,
"grad_norm": 0.32627496123313904,
"learning_rate": 0.00012412511505274844,
"loss": 0.1832,
"step": 394
},
{
"epoch": 0.42981501632208924,
"grad_norm": 0.37745073437690735,
"learning_rate": 0.000123789570258743,
"loss": 0.1572,
"step": 395
},
{
"epoch": 0.4309031556039173,
"grad_norm": 0.4901193082332611,
"learning_rate": 0.00012345374130787854,
"loss": 0.1873,
"step": 396
},
{
"epoch": 0.4319912948857454,
"grad_norm": 0.44663485884666443,
"learning_rate": 0.000123117632211497,
"loss": 0.3353,
"step": 397
},
{
"epoch": 0.43307943416757344,
"grad_norm": 0.34345391392707825,
"learning_rate": 0.0001227812469842864,
"loss": 0.1924,
"step": 398
},
{
"epoch": 0.43416757344940154,
"grad_norm": 0.5725805759429932,
"learning_rate": 0.00012244458964423327,
"loss": 0.2352,
"step": 399
},
{
"epoch": 0.4352557127312296,
"grad_norm": 0.5519152879714966,
"learning_rate": 0.0001221076642125742,
"loss": 0.167,
"step": 400
},
{
"epoch": 0.4363438520130577,
"grad_norm": 0.055198315531015396,
"learning_rate": 0.00012177047471374807,
"loss": 0.0472,
"step": 401
},
{
"epoch": 0.43743199129488575,
"grad_norm": 0.09932799637317657,
"learning_rate": 0.0001214330251753481,
"loss": 0.0783,
"step": 402
},
{
"epoch": 0.4385201305767138,
"grad_norm": 0.08226185292005539,
"learning_rate": 0.00012109531962807332,
"loss": 0.0656,
"step": 403
},
{
"epoch": 0.4396082698585419,
"grad_norm": 0.0858379453420639,
"learning_rate": 0.0001207573621056809,
"loss": 0.0741,
"step": 404
},
{
"epoch": 0.44069640914036995,
"grad_norm": 0.07838830351829529,
"learning_rate": 0.00012041915664493761,
"loss": 0.066,
"step": 405
},
{
"epoch": 0.44178454842219805,
"grad_norm": 0.08843716233968735,
"learning_rate": 0.00012008070728557186,
"loss": 0.0817,
"step": 406
},
{
"epoch": 0.4428726877040261,
"grad_norm": 0.09485173225402832,
"learning_rate": 0.00011974201807022525,
"loss": 0.0719,
"step": 407
},
{
"epoch": 0.4439608269858542,
"grad_norm": 0.12550269067287445,
"learning_rate": 0.00011940309304440433,
"loss": 0.1025,
"step": 408
},
{
"epoch": 0.44504896626768226,
"grad_norm": 0.10056477040052414,
"learning_rate": 0.00011906393625643244,
"loss": 0.0822,
"step": 409
},
{
"epoch": 0.44613710554951036,
"grad_norm": 0.08779609948396683,
"learning_rate": 0.00011872455175740112,
"loss": 0.0731,
"step": 410
},
{
"epoch": 0.4472252448313384,
"grad_norm": 0.08771763741970062,
"learning_rate": 0.00011838494360112185,
"loss": 0.0686,
"step": 411
},
{
"epoch": 0.44831338411316646,
"grad_norm": 0.09602241218090057,
"learning_rate": 0.00011804511584407763,
"loss": 0.0826,
"step": 412
},
{
"epoch": 0.44940152339499456,
"grad_norm": 0.10052221268415451,
"learning_rate": 0.00011770507254537453,
"loss": 0.0711,
"step": 413
},
{
"epoch": 0.4504896626768226,
"grad_norm": 0.08452215045690536,
"learning_rate": 0.00011736481776669306,
"loss": 0.061,
"step": 414
},
{
"epoch": 0.4515778019586507,
"grad_norm": 0.09362675249576569,
"learning_rate": 0.00011702435557223987,
"loss": 0.0852,
"step": 415
},
{
"epoch": 0.45266594124047876,
"grad_norm": 0.10676004737615585,
"learning_rate": 0.00011668369002869912,
"loss": 0.101,
"step": 416
},
{
"epoch": 0.45375408052230687,
"grad_norm": 0.10523767024278641,
"learning_rate": 0.00011634282520518383,
"loss": 0.0892,
"step": 417
},
{
"epoch": 0.4548422198041349,
"grad_norm": 0.09733587503433228,
"learning_rate": 0.00011600176517318741,
"loss": 0.0714,
"step": 418
},
{
"epoch": 0.455930359085963,
"grad_norm": 0.13115546107292175,
"learning_rate": 0.00011566051400653486,
"loss": 0.1079,
"step": 419
},
{
"epoch": 0.45701849836779107,
"grad_norm": 0.10536440461874008,
"learning_rate": 0.00011531907578133429,
"loss": 0.0807,
"step": 420
},
{
"epoch": 0.4581066376496192,
"grad_norm": 0.10071249306201935,
"learning_rate": 0.00011497745457592816,
"loss": 0.0647,
"step": 421
},
{
"epoch": 0.4591947769314472,
"grad_norm": 0.12494815140962601,
"learning_rate": 0.00011463565447084445,
"loss": 0.0969,
"step": 422
},
{
"epoch": 0.4602829162132753,
"grad_norm": 0.10858377069234848,
"learning_rate": 0.00011429367954874819,
"loss": 0.0709,
"step": 423
},
{
"epoch": 0.4613710554951034,
"grad_norm": 0.10477497428655624,
"learning_rate": 0.00011395153389439233,
"loss": 0.0875,
"step": 424
},
{
"epoch": 0.4624591947769314,
"grad_norm": 0.12716920673847198,
"learning_rate": 0.00011360922159456928,
"loss": 0.0864,
"step": 425
},
{
"epoch": 0.46354733405875953,
"grad_norm": 0.13803425431251526,
"learning_rate": 0.00011326674673806195,
"loss": 0.1028,
"step": 426
},
{
"epoch": 0.4646354733405876,
"grad_norm": 0.1662827581167221,
"learning_rate": 0.0001129241134155949,
"loss": 0.1053,
"step": 427
},
{
"epoch": 0.4657236126224157,
"grad_norm": 0.13029906153678894,
"learning_rate": 0.00011258132571978555,
"loss": 0.0797,
"step": 428
},
{
"epoch": 0.46681175190424373,
"grad_norm": 0.18869560956954956,
"learning_rate": 0.00011223838774509514,
"loss": 0.1292,
"step": 429
},
{
"epoch": 0.46789989118607184,
"grad_norm": 0.14279034733772278,
"learning_rate": 0.00011189530358778005,
"loss": 0.0951,
"step": 430
},
{
"epoch": 0.4689880304678999,
"grad_norm": 0.11428643018007278,
"learning_rate": 0.00011155207734584263,
"loss": 0.0664,
"step": 431
},
{
"epoch": 0.470076169749728,
"grad_norm": 0.20008297264575958,
"learning_rate": 0.00011120871311898254,
"loss": 0.1027,
"step": 432
},
{
"epoch": 0.47116430903155604,
"grad_norm": 0.22173888981342316,
"learning_rate": 0.00011086521500854745,
"loss": 0.1356,
"step": 433
},
{
"epoch": 0.4722524483133841,
"grad_norm": 0.2382795661687851,
"learning_rate": 0.00011052158711748434,
"loss": 0.1516,
"step": 434
},
{
"epoch": 0.4733405875952122,
"grad_norm": 0.2854343354701996,
"learning_rate": 0.00011017783355029026,
"loss": 0.1116,
"step": 435
},
{
"epoch": 0.47442872687704024,
"grad_norm": 0.23063793778419495,
"learning_rate": 0.00010983395841296348,
"loss": 0.107,
"step": 436
},
{
"epoch": 0.47551686615886835,
"grad_norm": 0.19402769207954407,
"learning_rate": 0.00010948996581295436,
"loss": 0.0883,
"step": 437
},
{
"epoch": 0.4766050054406964,
"grad_norm": 0.2664678692817688,
"learning_rate": 0.00010914585985911632,
"loss": 0.1161,
"step": 438
},
{
"epoch": 0.4776931447225245,
"grad_norm": 0.29061347246170044,
"learning_rate": 0.00010880164466165674,
"loss": 0.1833,
"step": 439
},
{
"epoch": 0.47878128400435255,
"grad_norm": 0.33060985803604126,
"learning_rate": 0.00010845732433208779,
"loss": 0.1521,
"step": 440
},
{
"epoch": 0.47986942328618065,
"grad_norm": 0.28285855054855347,
"learning_rate": 0.00010811290298317755,
"loss": 0.1248,
"step": 441
},
{
"epoch": 0.4809575625680087,
"grad_norm": 0.49815383553504944,
"learning_rate": 0.00010776838472890065,
"loss": 0.2238,
"step": 442
},
{
"epoch": 0.4820457018498368,
"grad_norm": 0.367214173078537,
"learning_rate": 0.00010742377368438914,
"loss": 0.2344,
"step": 443
},
{
"epoch": 0.48313384113166485,
"grad_norm": 0.3444245159626007,
"learning_rate": 0.00010707907396588361,
"loss": 0.1254,
"step": 444
},
{
"epoch": 0.4842219804134929,
"grad_norm": 0.31096217036247253,
"learning_rate": 0.00010673428969068364,
"loss": 0.1313,
"step": 445
},
{
"epoch": 0.485310119695321,
"grad_norm": 0.5377318263053894,
"learning_rate": 0.0001063894249770989,
"loss": 0.2526,
"step": 446
},
{
"epoch": 0.48639825897714906,
"grad_norm": 0.4121945798397064,
"learning_rate": 0.00010604448394439983,
"loss": 0.1663,
"step": 447
},
{
"epoch": 0.48748639825897716,
"grad_norm": 0.48366662859916687,
"learning_rate": 0.00010569947071276847,
"loss": 0.2457,
"step": 448
},
{
"epoch": 0.4885745375408052,
"grad_norm": 0.6081061959266663,
"learning_rate": 0.0001053543894032493,
"loss": 0.2678,
"step": 449
},
{
"epoch": 0.4896626768226333,
"grad_norm": 1.0334888696670532,
"learning_rate": 0.00010500924413769988,
"loss": 0.3153,
"step": 450
},
{
"epoch": 0.49075081610446136,
"grad_norm": 0.0451701320707798,
"learning_rate": 0.00010466403903874176,
"loss": 0.0455,
"step": 451
},
{
"epoch": 0.49183895538628947,
"grad_norm": 0.05214075744152069,
"learning_rate": 0.00010431877822971117,
"loss": 0.0519,
"step": 452
},
{
"epoch": 0.4929270946681175,
"grad_norm": 0.06553710252046585,
"learning_rate": 0.00010397346583460971,
"loss": 0.0557,
"step": 453
},
{
"epoch": 0.4940152339499456,
"grad_norm": 0.06424305588006973,
"learning_rate": 0.00010362810597805526,
"loss": 0.0672,
"step": 454
},
{
"epoch": 0.49510337323177367,
"grad_norm": 0.08962132036685944,
"learning_rate": 0.00010328270278523256,
"loss": 0.0957,
"step": 455
},
{
"epoch": 0.4961915125136017,
"grad_norm": 0.08780992031097412,
"learning_rate": 0.00010293726038184393,
"loss": 0.0894,
"step": 456
},
{
"epoch": 0.4972796517954298,
"grad_norm": 0.08018220961093903,
"learning_rate": 0.00010259178289406011,
"loss": 0.0663,
"step": 457
},
{
"epoch": 0.49836779107725787,
"grad_norm": 0.07880765199661255,
"learning_rate": 0.0001022462744484709,
"loss": 0.0832,
"step": 458
},
{
"epoch": 0.499455930359086,
"grad_norm": 0.07101229578256607,
"learning_rate": 0.00010190073917203589,
"loss": 0.054,
"step": 459
},
{
"epoch": 0.500544069640914,
"grad_norm": 0.07901884615421295,
"learning_rate": 0.0001015551811920351,
"loss": 0.0611,
"step": 460
},
{
"epoch": 0.500544069640914,
"eval_loss": 0.11412899941205978,
"eval_runtime": 24.4441,
"eval_samples_per_second": 15.832,
"eval_steps_per_second": 7.936,
"step": 460
},
{
"epoch": 0.5016322089227421,
"grad_norm": 0.08245149999856949,
"learning_rate": 0.00010120960463601976,
"loss": 0.0758,
"step": 461
},
{
"epoch": 0.5027203482045702,
"grad_norm": 0.07407300174236298,
"learning_rate": 0.00010086401363176305,
"loss": 0.0567,
"step": 462
},
{
"epoch": 0.5038084874863983,
"grad_norm": 0.08621735125780106,
"learning_rate": 0.00010051841230721065,
"loss": 0.0853,
"step": 463
},
{
"epoch": 0.5048966267682263,
"grad_norm": 0.08985213935375214,
"learning_rate": 0.00010017280479043147,
"loss": 0.0806,
"step": 464
},
{
"epoch": 0.5059847660500544,
"grad_norm": 0.10007097572088242,
"learning_rate": 9.982719520956855e-05,
"loss": 0.0943,
"step": 465
},
{
"epoch": 0.5070729053318824,
"grad_norm": 0.08833252638578415,
"learning_rate": 9.948158769278939e-05,
"loss": 0.0736,
"step": 466
},
{
"epoch": 0.5081610446137106,
"grad_norm": 0.08999834209680557,
"learning_rate": 9.913598636823693e-05,
"loss": 0.0711,
"step": 467
},
{
"epoch": 0.5092491838955386,
"grad_norm": 0.09894470125436783,
"learning_rate": 9.879039536398024e-05,
"loss": 0.0854,
"step": 468
},
{
"epoch": 0.5103373231773667,
"grad_norm": 0.10664485394954681,
"learning_rate": 9.844481880796491e-05,
"loss": 0.0817,
"step": 469
},
{
"epoch": 0.5114254624591947,
"grad_norm": 0.08572715520858765,
"learning_rate": 9.809926082796415e-05,
"loss": 0.0685,
"step": 470
},
{
"epoch": 0.5125136017410229,
"grad_norm": 0.1364021748304367,
"learning_rate": 9.775372555152912e-05,
"loss": 0.1289,
"step": 471
},
{
"epoch": 0.5136017410228509,
"grad_norm": 0.0975506454706192,
"learning_rate": 9.740821710593989e-05,
"loss": 0.0816,
"step": 472
},
{
"epoch": 0.514689880304679,
"grad_norm": 0.10148054361343384,
"learning_rate": 9.70627396181561e-05,
"loss": 0.0685,
"step": 473
},
{
"epoch": 0.515778019586507,
"grad_norm": 0.10723249614238739,
"learning_rate": 9.671729721476746e-05,
"loss": 0.1108,
"step": 474
},
{
"epoch": 0.5168661588683352,
"grad_norm": 0.1059638112783432,
"learning_rate": 9.637189402194476e-05,
"loss": 0.0797,
"step": 475
},
{
"epoch": 0.5179542981501633,
"grad_norm": 0.09612549841403961,
"learning_rate": 9.602653416539031e-05,
"loss": 0.0756,
"step": 476
},
{
"epoch": 0.5190424374319913,
"grad_norm": 0.12177007645368576,
"learning_rate": 9.568122177028884e-05,
"loss": 0.072,
"step": 477
},
{
"epoch": 0.5201305767138193,
"grad_norm": 0.10441110283136368,
"learning_rate": 9.533596096125825e-05,
"loss": 0.082,
"step": 478
},
{
"epoch": 0.5212187159956474,
"grad_norm": 0.1342850774526596,
"learning_rate": 9.499075586230013e-05,
"loss": 0.0877,
"step": 479
},
{
"epoch": 0.5223068552774756,
"grad_norm": 0.14177678525447845,
"learning_rate": 9.464561059675073e-05,
"loss": 0.1105,
"step": 480
},
{
"epoch": 0.5233949945593036,
"grad_norm": 0.14493241906166077,
"learning_rate": 9.430052928723153e-05,
"loss": 0.0975,
"step": 481
},
{
"epoch": 0.5244831338411317,
"grad_norm": 0.22142328321933746,
"learning_rate": 9.395551605560018e-05,
"loss": 0.1441,
"step": 482
},
{
"epoch": 0.5255712731229597,
"grad_norm": 0.13570967316627502,
"learning_rate": 9.361057502290113e-05,
"loss": 0.0952,
"step": 483
},
{
"epoch": 0.5266594124047879,
"grad_norm": 0.16124001145362854,
"learning_rate": 9.326571030931637e-05,
"loss": 0.0976,
"step": 484
},
{
"epoch": 0.5277475516866159,
"grad_norm": 0.17145387828350067,
"learning_rate": 9.292092603411641e-05,
"loss": 0.1024,
"step": 485
},
{
"epoch": 0.528835690968444,
"grad_norm": 0.23006115853786469,
"learning_rate": 9.257622631561085e-05,
"loss": 0.147,
"step": 486
},
{
"epoch": 0.529923830250272,
"grad_norm": 0.2926236391067505,
"learning_rate": 9.223161527109937e-05,
"loss": 0.1706,
"step": 487
},
{
"epoch": 0.5310119695321001,
"grad_norm": 0.2122851312160492,
"learning_rate": 9.188709701682247e-05,
"loss": 0.0793,
"step": 488
},
{
"epoch": 0.5321001088139282,
"grad_norm": 0.30783331394195557,
"learning_rate": 9.154267566791223e-05,
"loss": 0.1446,
"step": 489
},
{
"epoch": 0.5331882480957563,
"grad_norm": 0.26017701625823975,
"learning_rate": 9.119835533834331e-05,
"loss": 0.0925,
"step": 490
},
{
"epoch": 0.5342763873775843,
"grad_norm": 0.3646100163459778,
"learning_rate": 9.085414014088369e-05,
"loss": 0.1893,
"step": 491
},
{
"epoch": 0.5353645266594124,
"grad_norm": 0.3202396035194397,
"learning_rate": 9.051003418704565e-05,
"loss": 0.1389,
"step": 492
},
{
"epoch": 0.5364526659412405,
"grad_norm": 0.420622318983078,
"learning_rate": 9.016604158703654e-05,
"loss": 0.2336,
"step": 493
},
{
"epoch": 0.5375408052230686,
"grad_norm": 0.41953420639038086,
"learning_rate": 8.982216644970979e-05,
"loss": 0.2328,
"step": 494
},
{
"epoch": 0.5386289445048966,
"grad_norm": 0.39154770970344543,
"learning_rate": 8.947841288251568e-05,
"loss": 0.1791,
"step": 495
},
{
"epoch": 0.5397170837867247,
"grad_norm": 0.3094623386859894,
"learning_rate": 8.913478499145254e-05,
"loss": 0.1163,
"step": 496
},
{
"epoch": 0.5408052230685527,
"grad_norm": 0.47343355417251587,
"learning_rate": 8.879128688101749e-05,
"loss": 0.2118,
"step": 497
},
{
"epoch": 0.5418933623503809,
"grad_norm": 0.5772989988327026,
"learning_rate": 8.844792265415738e-05,
"loss": 0.3241,
"step": 498
},
{
"epoch": 0.5429815016322089,
"grad_norm": 0.574259877204895,
"learning_rate": 8.810469641222001e-05,
"loss": 0.1832,
"step": 499
},
{
"epoch": 0.544069640914037,
"grad_norm": 0.7393798828125,
"learning_rate": 8.776161225490489e-05,
"loss": 0.3208,
"step": 500
},
{
"epoch": 0.545157780195865,
"grad_norm": 0.08375679701566696,
"learning_rate": 8.741867428021446e-05,
"loss": 0.0647,
"step": 501
},
{
"epoch": 0.5462459194776932,
"grad_norm": 0.06409952789545059,
"learning_rate": 8.707588658440511e-05,
"loss": 0.0669,
"step": 502
},
{
"epoch": 0.5473340587595212,
"grad_norm": 0.07277576625347137,
"learning_rate": 8.673325326193806e-05,
"loss": 0.069,
"step": 503
},
{
"epoch": 0.5484221980413493,
"grad_norm": 0.0738697499036789,
"learning_rate": 8.639077840543077e-05,
"loss": 0.0729,
"step": 504
},
{
"epoch": 0.5495103373231773,
"grad_norm": 0.07032415270805359,
"learning_rate": 8.604846610560771e-05,
"loss": 0.0619,
"step": 505
},
{
"epoch": 0.5505984766050055,
"grad_norm": 0.08152731508016586,
"learning_rate": 8.570632045125185e-05,
"loss": 0.0984,
"step": 506
},
{
"epoch": 0.5516866158868335,
"grad_norm": 0.08058314025402069,
"learning_rate": 8.536434552915556e-05,
"loss": 0.0727,
"step": 507
},
{
"epoch": 0.5527747551686616,
"grad_norm": 0.08897782117128372,
"learning_rate": 8.502254542407186e-05,
"loss": 0.0844,
"step": 508
},
{
"epoch": 0.5538628944504896,
"grad_norm": 0.07130607962608337,
"learning_rate": 8.468092421866573e-05,
"loss": 0.0612,
"step": 509
},
{
"epoch": 0.5549510337323177,
"grad_norm": 0.09470459073781967,
"learning_rate": 8.433948599346516e-05,
"loss": 0.0983,
"step": 510
},
{
"epoch": 0.5560391730141458,
"grad_norm": 0.10217074304819107,
"learning_rate": 8.399823482681262e-05,
"loss": 0.1066,
"step": 511
},
{
"epoch": 0.5571273122959739,
"grad_norm": 0.08955902606248856,
"learning_rate": 8.36571747948162e-05,
"loss": 0.0754,
"step": 512
},
{
"epoch": 0.558215451577802,
"grad_norm": 0.08967861533164978,
"learning_rate": 8.33163099713009e-05,
"loss": 0.0745,
"step": 513
},
{
"epoch": 0.55930359085963,
"grad_norm": 0.09490852057933807,
"learning_rate": 8.297564442776014e-05,
"loss": 0.0956,
"step": 514
},
{
"epoch": 0.5603917301414582,
"grad_norm": 0.1147465854883194,
"learning_rate": 8.263518223330697e-05,
"loss": 0.0918,
"step": 515
},
{
"epoch": 0.5614798694232862,
"grad_norm": 0.10192292928695679,
"learning_rate": 8.22949274546255e-05,
"loss": 0.0865,
"step": 516
},
{
"epoch": 0.5625680087051143,
"grad_norm": 0.09790289402008057,
"learning_rate": 8.195488415592238e-05,
"loss": 0.0884,
"step": 517
},
{
"epoch": 0.5636561479869423,
"grad_norm": 0.10624364018440247,
"learning_rate": 8.161505639887817e-05,
"loss": 0.0743,
"step": 518
},
{
"epoch": 0.5647442872687704,
"grad_norm": 0.10007826238870621,
"learning_rate": 8.127544824259889e-05,
"loss": 0.077,
"step": 519
},
{
"epoch": 0.5658324265505985,
"grad_norm": 0.10816752165555954,
"learning_rate": 8.093606374356759e-05,
"loss": 0.0967,
"step": 520
},
{
"epoch": 0.5669205658324266,
"grad_norm": 0.09151450544595718,
"learning_rate": 8.059690695559568e-05,
"loss": 0.0559,
"step": 521
},
{
"epoch": 0.5680087051142546,
"grad_norm": 0.09337753057479858,
"learning_rate": 8.025798192977481e-05,
"loss": 0.0666,
"step": 522
},
{
"epoch": 0.5690968443960827,
"grad_norm": 0.07552886009216309,
"learning_rate": 7.991929271442817e-05,
"loss": 0.0427,
"step": 523
},
{
"epoch": 0.5701849836779108,
"grad_norm": 0.125149667263031,
"learning_rate": 7.958084335506239e-05,
"loss": 0.0886,
"step": 524
},
{
"epoch": 0.5712731229597389,
"grad_norm": 0.1259811669588089,
"learning_rate": 7.924263789431912e-05,
"loss": 0.0836,
"step": 525
},
{
"epoch": 0.5723612622415669,
"grad_norm": 0.11561718583106995,
"learning_rate": 7.89046803719267e-05,
"loss": 0.0767,
"step": 526
},
{
"epoch": 0.573449401523395,
"grad_norm": 0.15881969034671783,
"learning_rate": 7.856697482465196e-05,
"loss": 0.132,
"step": 527
},
{
"epoch": 0.5745375408052231,
"grad_norm": 0.11481433361768723,
"learning_rate": 7.822952528625191e-05,
"loss": 0.0772,
"step": 528
},
{
"epoch": 0.5756256800870512,
"grad_norm": 0.133880615234375,
"learning_rate": 7.789233578742582e-05,
"loss": 0.0851,
"step": 529
},
{
"epoch": 0.5767138193688792,
"grad_norm": 0.1513182371854782,
"learning_rate": 7.755541035576677e-05,
"loss": 0.1258,
"step": 530
},
{
"epoch": 0.5778019586507073,
"grad_norm": 0.1432909518480301,
"learning_rate": 7.721875301571359e-05,
"loss": 0.1227,
"step": 531
},
{
"epoch": 0.5788900979325353,
"grad_norm": 0.11642878502607346,
"learning_rate": 7.688236778850306e-05,
"loss": 0.0686,
"step": 532
},
{
"epoch": 0.5799782372143635,
"grad_norm": 0.16764387488365173,
"learning_rate": 7.654625869212146e-05,
"loss": 0.1262,
"step": 533
},
{
"epoch": 0.5810663764961915,
"grad_norm": 0.11564290523529053,
"learning_rate": 7.6210429741257e-05,
"loss": 0.0572,
"step": 534
},
{
"epoch": 0.5821545157780196,
"grad_norm": 0.2554946541786194,
"learning_rate": 7.587488494725157e-05,
"loss": 0.1812,
"step": 535
},
{
"epoch": 0.5832426550598476,
"grad_norm": 0.1616709679365158,
"learning_rate": 7.55396283180529e-05,
"loss": 0.0818,
"step": 536
},
{
"epoch": 0.5843307943416758,
"grad_norm": 0.321591317653656,
"learning_rate": 7.520466385816671e-05,
"loss": 0.2124,
"step": 537
},
{
"epoch": 0.5854189336235038,
"grad_norm": 0.21219317615032196,
"learning_rate": 7.48699955686089e-05,
"loss": 0.1097,
"step": 538
},
{
"epoch": 0.5865070729053319,
"grad_norm": 0.19242917001247406,
"learning_rate": 7.453562744685778e-05,
"loss": 0.0769,
"step": 539
},
{
"epoch": 0.5875952121871599,
"grad_norm": 0.27252575755119324,
"learning_rate": 7.42015634868062e-05,
"loss": 0.123,
"step": 540
},
{
"epoch": 0.588683351468988,
"grad_norm": 0.3768553137779236,
"learning_rate": 7.386780767871397e-05,
"loss": 0.1882,
"step": 541
},
{
"epoch": 0.5897714907508161,
"grad_norm": 0.2910580337047577,
"learning_rate": 7.353436400916004e-05,
"loss": 0.11,
"step": 542
},
{
"epoch": 0.5908596300326442,
"grad_norm": 0.5426351428031921,
"learning_rate": 7.320123646099519e-05,
"loss": 0.2963,
"step": 543
},
{
"epoch": 0.5919477693144722,
"grad_norm": 0.47717493772506714,
"learning_rate": 7.286842901329412e-05,
"loss": 0.1808,
"step": 544
},
{
"epoch": 0.5930359085963003,
"grad_norm": 0.3526507019996643,
"learning_rate": 7.253594564130804e-05,
"loss": 0.1664,
"step": 545
},
{
"epoch": 0.5941240478781284,
"grad_norm": 0.4166988134384155,
"learning_rate": 7.22037903164173e-05,
"loss": 0.199,
"step": 546
},
{
"epoch": 0.5952121871599565,
"grad_norm": 0.3085844814777374,
"learning_rate": 7.187196700608373e-05,
"loss": 0.1008,
"step": 547
},
{
"epoch": 0.5963003264417845,
"grad_norm": 0.6430099606513977,
"learning_rate": 7.154047967380354e-05,
"loss": 0.2083,
"step": 548
},
{
"epoch": 0.5973884657236126,
"grad_norm": 0.5886674523353577,
"learning_rate": 7.12093322790597e-05,
"loss": 0.2135,
"step": 549
},
{
"epoch": 0.5984766050054406,
"grad_norm": 0.6182297468185425,
"learning_rate": 7.087852877727481e-05,
"loss": 0.1715,
"step": 550
},
{
"epoch": 0.5995647442872688,
"grad_norm": 0.052968356758356094,
"learning_rate": 7.054807311976379e-05,
"loss": 0.0482,
"step": 551
},
{
"epoch": 0.6006528835690969,
"grad_norm": 0.05972611904144287,
"learning_rate": 7.021796925368667e-05,
"loss": 0.0518,
"step": 552
},
{
"epoch": 0.6017410228509249,
"grad_norm": 0.06913693249225616,
"learning_rate": 6.988822112200156e-05,
"loss": 0.067,
"step": 553
},
{
"epoch": 0.602829162132753,
"grad_norm": 0.07957234978675842,
"learning_rate": 6.955883266341741e-05,
"loss": 0.0881,
"step": 554
},
{
"epoch": 0.6039173014145811,
"grad_norm": 0.07048328220844269,
"learning_rate": 6.922980781234699e-05,
"loss": 0.073,
"step": 555
},
{
"epoch": 0.6050054406964092,
"grad_norm": 0.07528451830148697,
"learning_rate": 6.890115049885994e-05,
"loss": 0.0777,
"step": 556
},
{
"epoch": 0.6060935799782372,
"grad_norm": 0.07747096568346024,
"learning_rate": 6.85728646486359e-05,
"loss": 0.0758,
"step": 557
},
{
"epoch": 0.6071817192600653,
"grad_norm": 0.10643380880355835,
"learning_rate": 6.82449541829174e-05,
"loss": 0.1274,
"step": 558
},
{
"epoch": 0.6082698585418934,
"grad_norm": 0.08175136148929596,
"learning_rate": 6.791742301846326e-05,
"loss": 0.0838,
"step": 559
},
{
"epoch": 0.6093579978237215,
"grad_norm": 0.09339357912540436,
"learning_rate": 6.759027506750158e-05,
"loss": 0.0767,
"step": 560
},
{
"epoch": 0.6104461371055495,
"grad_norm": 0.07980689406394958,
"learning_rate": 6.726351423768322e-05,
"loss": 0.0645,
"step": 561
},
{
"epoch": 0.6115342763873776,
"grad_norm": 0.09120305627584457,
"learning_rate": 6.693714443203507e-05,
"loss": 0.0763,
"step": 562
},
{
"epoch": 0.6126224156692056,
"grad_norm": 0.09666004031896591,
"learning_rate": 6.661116954891328e-05,
"loss": 0.0749,
"step": 563
},
{
"epoch": 0.6137105549510338,
"grad_norm": 0.09731943160295486,
"learning_rate": 6.62855934819569e-05,
"loss": 0.0891,
"step": 564
},
{
"epoch": 0.6147986942328618,
"grad_norm": 0.09505044668912888,
"learning_rate": 6.59604201200412e-05,
"loss": 0.0734,
"step": 565
},
{
"epoch": 0.6158868335146899,
"grad_norm": 0.09885645657777786,
"learning_rate": 6.563565334723134e-05,
"loss": 0.0726,
"step": 566
},
{
"epoch": 0.6169749727965179,
"grad_norm": 0.10182873159646988,
"learning_rate": 6.531129704273604e-05,
"loss": 0.0907,
"step": 567
},
{
"epoch": 0.6180631120783461,
"grad_norm": 0.10983593761920929,
"learning_rate": 6.498735508086093e-05,
"loss": 0.0774,
"step": 568
},
{
"epoch": 0.6191512513601741,
"grad_norm": 0.13069598376750946,
"learning_rate": 6.466383133096267e-05,
"loss": 0.1334,
"step": 569
},
{
"epoch": 0.6202393906420022,
"grad_norm": 0.10764119029045105,
"learning_rate": 6.434072965740242e-05,
"loss": 0.0921,
"step": 570
},
{
"epoch": 0.6213275299238302,
"grad_norm": 0.10900213569402695,
"learning_rate": 6.40180539194999e-05,
"loss": 0.0842,
"step": 571
},
{
"epoch": 0.6224156692056583,
"grad_norm": 0.13261805474758148,
"learning_rate": 6.369580797148718e-05,
"loss": 0.1045,
"step": 572
},
{
"epoch": 0.6235038084874864,
"grad_norm": 0.13680216670036316,
"learning_rate": 6.337399566246257e-05,
"loss": 0.1367,
"step": 573
},
{
"epoch": 0.6245919477693145,
"grad_norm": 0.13827918469905853,
"learning_rate": 6.305262083634488e-05,
"loss": 0.1063,
"step": 574
},
{
"epoch": 0.6256800870511425,
"grad_norm": 0.0938330814242363,
"learning_rate": 6.273168733182722e-05,
"loss": 0.0686,
"step": 575
},
{
"epoch": 0.6267682263329706,
"grad_norm": 0.14945276081562042,
"learning_rate": 6.241119898233144e-05,
"loss": 0.1373,
"step": 576
},
{
"epoch": 0.6278563656147987,
"grad_norm": 0.10897387564182281,
"learning_rate": 6.209115961596208e-05,
"loss": 0.068,
"step": 577
},
{
"epoch": 0.6289445048966268,
"grad_norm": 0.11899581551551819,
"learning_rate": 6.177157305546078e-05,
"loss": 0.0812,
"step": 578
},
{
"epoch": 0.6300326441784548,
"grad_norm": 0.12964366376399994,
"learning_rate": 6.145244311816063e-05,
"loss": 0.0919,
"step": 579
},
{
"epoch": 0.6311207834602829,
"grad_norm": 0.14867788553237915,
"learning_rate": 6.113377361594049e-05,
"loss": 0.1079,
"step": 580
},
{
"epoch": 0.6322089227421109,
"grad_norm": 0.13811950385570526,
"learning_rate": 6.0815568355179556e-05,
"loss": 0.0871,
"step": 581
},
{
"epoch": 0.6332970620239391,
"grad_norm": 0.14532406628131866,
"learning_rate": 6.0497831136711836e-05,
"loss": 0.0912,
"step": 582
},
{
"epoch": 0.6343852013057671,
"grad_norm": 0.19129644334316254,
"learning_rate": 6.018056575578075e-05,
"loss": 0.1137,
"step": 583
},
{
"epoch": 0.6354733405875952,
"grad_norm": 0.16450344026088715,
"learning_rate": 5.986377600199371e-05,
"loss": 0.072,
"step": 584
},
{
"epoch": 0.6365614798694232,
"grad_norm": 0.1428171992301941,
"learning_rate": 5.9547465659277215e-05,
"loss": 0.0671,
"step": 585
},
{
"epoch": 0.6376496191512514,
"grad_norm": 0.18160395324230194,
"learning_rate": 5.923163850583113e-05,
"loss": 0.1049,
"step": 586
},
{
"epoch": 0.6387377584330794,
"grad_norm": 0.18718522787094116,
"learning_rate": 5.8916298314083915e-05,
"loss": 0.1037,
"step": 587
},
{
"epoch": 0.6398258977149075,
"grad_norm": 0.3233761489391327,
"learning_rate": 5.860144885064751e-05,
"loss": 0.1401,
"step": 588
},
{
"epoch": 0.6409140369967355,
"grad_norm": 0.27942174673080444,
"learning_rate": 5.828709387627218e-05,
"loss": 0.1464,
"step": 589
},
{
"epoch": 0.6420021762785637,
"grad_norm": 0.26235440373420715,
"learning_rate": 5.797323714580192e-05,
"loss": 0.1238,
"step": 590
},
{
"epoch": 0.6430903155603918,
"grad_norm": 0.3004370927810669,
"learning_rate": 5.765988240812921e-05,
"loss": 0.1483,
"step": 591
},
{
"epoch": 0.6441784548422198,
"grad_norm": 0.33811435103416443,
"learning_rate": 5.73470334061505e-05,
"loss": 0.1895,
"step": 592
},
{
"epoch": 0.6452665941240479,
"grad_norm": 0.352417528629303,
"learning_rate": 5.7034693876721376e-05,
"loss": 0.1148,
"step": 593
},
{
"epoch": 0.6463547334058759,
"grad_norm": 0.296340674161911,
"learning_rate": 5.6722867550612116e-05,
"loss": 0.1647,
"step": 594
},
{
"epoch": 0.6474428726877041,
"grad_norm": 0.47691333293914795,
"learning_rate": 5.6411558152462894e-05,
"loss": 0.2066,
"step": 595
},
{
"epoch": 0.6485310119695321,
"grad_norm": 0.5979732275009155,
"learning_rate": 5.6100769400739383e-05,
"loss": 0.291,
"step": 596
},
{
"epoch": 0.6496191512513602,
"grad_norm": 0.4951854944229126,
"learning_rate": 5.579050500768836e-05,
"loss": 0.2417,
"step": 597
},
{
"epoch": 0.6507072905331882,
"grad_norm": 0.5365352630615234,
"learning_rate": 5.54807686792933e-05,
"loss": 0.2438,
"step": 598
},
{
"epoch": 0.6517954298150164,
"grad_norm": 0.6378306150436401,
"learning_rate": 5.5171564115230254e-05,
"loss": 0.2436,
"step": 599
},
{
"epoch": 0.6528835690968444,
"grad_norm": 0.5462427735328674,
"learning_rate": 5.486289500882355e-05,
"loss": 0.1536,
"step": 600
},
{
"epoch": 0.6539717083786725,
"grad_norm": 0.0600992813706398,
"learning_rate": 5.4554765047001613e-05,
"loss": 0.0662,
"step": 601
},
{
"epoch": 0.6550598476605005,
"grad_norm": 0.06883436441421509,
"learning_rate": 5.424717791025302e-05,
"loss": 0.0791,
"step": 602
},
{
"epoch": 0.6561479869423286,
"grad_norm": 0.07644320279359818,
"learning_rate": 5.394013727258254e-05,
"loss": 0.0863,
"step": 603
},
{
"epoch": 0.6572361262241567,
"grad_norm": 0.07377026975154877,
"learning_rate": 5.363364680146725e-05,
"loss": 0.0669,
"step": 604
},
{
"epoch": 0.6583242655059848,
"grad_norm": 0.06373579055070877,
"learning_rate": 5.332771015781275e-05,
"loss": 0.0537,
"step": 605
},
{
"epoch": 0.6594124047878128,
"grad_norm": 0.05605285242199898,
"learning_rate": 5.302233099590928e-05,
"loss": 0.0505,
"step": 606
},
{
"epoch": 0.6605005440696409,
"grad_norm": 0.0761309266090393,
"learning_rate": 5.271751296338823e-05,
"loss": 0.079,
"step": 607
},
{
"epoch": 0.661588683351469,
"grad_norm": 0.08843579143285751,
"learning_rate": 5.2413259701178505e-05,
"loss": 0.0662,
"step": 608
},
{
"epoch": 0.6626768226332971,
"grad_norm": 0.07351501286029816,
"learning_rate": 5.210957484346314e-05,
"loss": 0.0533,
"step": 609
},
{
"epoch": 0.6637649619151251,
"grad_norm": 0.08167769014835358,
"learning_rate": 5.180646201763577e-05,
"loss": 0.0796,
"step": 610
},
{
"epoch": 0.6648531011969532,
"grad_norm": 0.09149360656738281,
"learning_rate": 5.150392484425728e-05,
"loss": 0.1115,
"step": 611
},
{
"epoch": 0.6659412404787813,
"grad_norm": 0.08288878947496414,
"learning_rate": 5.120196693701267e-05,
"loss": 0.0761,
"step": 612
},
{
"epoch": 0.6670293797606094,
"grad_norm": 0.07389149814844131,
"learning_rate": 5.090059190266779e-05,
"loss": 0.0645,
"step": 613
},
{
"epoch": 0.6681175190424374,
"grad_norm": 0.08279106020927429,
"learning_rate": 5.059980334102637e-05,
"loss": 0.0772,
"step": 614
},
{
"epoch": 0.6692056583242655,
"grad_norm": 0.09307517111301422,
"learning_rate": 5.0299604844886985e-05,
"loss": 0.0823,
"step": 615
},
{
"epoch": 0.6702937976060935,
"grad_norm": 0.09981580078601837,
"learning_rate": 5.000000000000002e-05,
"loss": 0.0991,
"step": 616
},
{
"epoch": 0.6713819368879217,
"grad_norm": 0.11640693992376328,
"learning_rate": 4.9700992385024934e-05,
"loss": 0.1061,
"step": 617
},
{
"epoch": 0.6724700761697497,
"grad_norm": 0.10669746994972229,
"learning_rate": 4.940258557148765e-05,
"loss": 0.0897,
"step": 618
},
{
"epoch": 0.6735582154515778,
"grad_norm": 0.1135135293006897,
"learning_rate": 4.9104783123737566e-05,
"loss": 0.1121,
"step": 619
},
{
"epoch": 0.6746463547334058,
"grad_norm": 0.08616163581609726,
"learning_rate": 4.880758859890536e-05,
"loss": 0.0684,
"step": 620
},
{
"epoch": 0.675734494015234,
"grad_norm": 0.10301216691732407,
"learning_rate": 4.851100554686021e-05,
"loss": 0.0894,
"step": 621
},
{
"epoch": 0.676822633297062,
"grad_norm": 0.10738655179738998,
"learning_rate": 4.821503751016746e-05,
"loss": 0.0919,
"step": 622
},
{
"epoch": 0.6779107725788901,
"grad_norm": 0.10355032980442047,
"learning_rate": 4.791968802404648e-05,
"loss": 0.0855,
"step": 623
},
{
"epoch": 0.6789989118607181,
"grad_norm": 0.10595700144767761,
"learning_rate": 4.762496061632814e-05,
"loss": 0.0903,
"step": 624
},
{
"epoch": 0.6800870511425462,
"grad_norm": 0.12492071092128754,
"learning_rate": 4.733085880741301e-05,
"loss": 0.1232,
"step": 625
},
{
"epoch": 0.6811751904243744,
"grad_norm": 0.1137222945690155,
"learning_rate": 4.7037386110228985e-05,
"loss": 0.0958,
"step": 626
},
{
"epoch": 0.6822633297062024,
"grad_norm": 0.13245636224746704,
"learning_rate": 4.6744546030189486e-05,
"loss": 0.0981,
"step": 627
},
{
"epoch": 0.6833514689880305,
"grad_norm": 0.12368131428956985,
"learning_rate": 4.645234206515171e-05,
"loss": 0.0811,
"step": 628
},
{
"epoch": 0.6844396082698585,
"grad_norm": 0.10623182356357574,
"learning_rate": 4.6160777705374524e-05,
"loss": 0.0723,
"step": 629
},
{
"epoch": 0.6855277475516867,
"grad_norm": 0.13129866123199463,
"learning_rate": 4.586985643347717e-05,
"loss": 0.1147,
"step": 630
},
{
"epoch": 0.6866158868335147,
"grad_norm": 0.11502306908369064,
"learning_rate": 4.5579581724397255e-05,
"loss": 0.0715,
"step": 631
},
{
"epoch": 0.6877040261153428,
"grad_norm": 0.1408235728740692,
"learning_rate": 4.5289957045349653e-05,
"loss": 0.0919,
"step": 632
},
{
"epoch": 0.6887921653971708,
"grad_norm": 0.1587967872619629,
"learning_rate": 4.5000985855784746e-05,
"loss": 0.109,
"step": 633
},
{
"epoch": 0.6898803046789989,
"grad_norm": 0.19859641790390015,
"learning_rate": 4.471267160734731e-05,
"loss": 0.0959,
"step": 634
},
{
"epoch": 0.690968443960827,
"grad_norm": 0.16058838367462158,
"learning_rate": 4.442501774383515e-05,
"loss": 0.1012,
"step": 635
},
{
"epoch": 0.6920565832426551,
"grad_norm": 0.2084600329399109,
"learning_rate": 4.413802770115816e-05,
"loss": 0.1343,
"step": 636
},
{
"epoch": 0.6931447225244831,
"grad_norm": 0.23033252358436584,
"learning_rate": 4.385170490729712e-05,
"loss": 0.1517,
"step": 637
},
{
"epoch": 0.6942328618063112,
"grad_norm": 0.2856910228729248,
"learning_rate": 4.3566052782262735e-05,
"loss": 0.1669,
"step": 638
},
{
"epoch": 0.6953210010881393,
"grad_norm": 0.19062520563602448,
"learning_rate": 4.328107473805487e-05,
"loss": 0.0654,
"step": 639
},
{
"epoch": 0.6964091403699674,
"grad_norm": 0.3172631561756134,
"learning_rate": 4.2996774178621736e-05,
"loss": 0.1788,
"step": 640
},
{
"epoch": 0.6974972796517954,
"grad_norm": 0.4395265281200409,
"learning_rate": 4.271315449981934e-05,
"loss": 0.1254,
"step": 641
},
{
"epoch": 0.6985854189336235,
"grad_norm": 0.3653918206691742,
"learning_rate": 4.2430219089370823e-05,
"loss": 0.2148,
"step": 642
},
{
"epoch": 0.6996735582154516,
"grad_norm": 0.45207682251930237,
"learning_rate": 4.2147971326825966e-05,
"loss": 0.2424,
"step": 643
},
{
"epoch": 0.7007616974972797,
"grad_norm": 0.33358728885650635,
"learning_rate": 4.1866414583520877e-05,
"loss": 0.1578,
"step": 644
},
{
"epoch": 0.7018498367791077,
"grad_norm": 0.2964087426662445,
"learning_rate": 4.158555222253771e-05,
"loss": 0.0704,
"step": 645
},
{
"epoch": 0.7029379760609358,
"grad_norm": 0.3286013603210449,
"learning_rate": 4.130538759866457e-05,
"loss": 0.2085,
"step": 646
},
{
"epoch": 0.7040261153427638,
"grad_norm": 0.3953593671321869,
"learning_rate": 4.102592405835536e-05,
"loss": 0.2185,
"step": 647
},
{
"epoch": 0.705114254624592,
"grad_norm": 0.46906566619873047,
"learning_rate": 4.074716493968975e-05,
"loss": 0.2676,
"step": 648
},
{
"epoch": 0.70620239390642,
"grad_norm": 0.545005738735199,
"learning_rate": 4.046911357233343e-05,
"loss": 0.2487,
"step": 649
},
{
"epoch": 0.7072905331882481,
"grad_norm": 0.5767453908920288,
"learning_rate": 4.019177327749822e-05,
"loss": 0.2432,
"step": 650
},
{
"epoch": 0.7083786724700761,
"grad_norm": 0.058534830808639526,
"learning_rate": 3.991514736790258e-05,
"loss": 0.0652,
"step": 651
},
{
"epoch": 0.7094668117519043,
"grad_norm": 0.06634719669818878,
"learning_rate": 3.963923914773187e-05,
"loss": 0.0712,
"step": 652
},
{
"epoch": 0.7105549510337323,
"grad_norm": 0.0771845281124115,
"learning_rate": 3.936405191259891e-05,
"loss": 0.0702,
"step": 653
},
{
"epoch": 0.7116430903155604,
"grad_norm": 0.06693675369024277,
"learning_rate": 3.9089588949504655e-05,
"loss": 0.0785,
"step": 654
},
{
"epoch": 0.7127312295973884,
"grad_norm": 0.07800418138504028,
"learning_rate": 3.8815853536798904e-05,
"loss": 0.093,
"step": 655
},
{
"epoch": 0.7138193688792165,
"grad_norm": 0.06311319023370743,
"learning_rate": 3.854284894414122e-05,
"loss": 0.0619,
"step": 656
},
{
"epoch": 0.7149075081610446,
"grad_norm": 0.08463241159915924,
"learning_rate": 3.82705784324618e-05,
"loss": 0.0799,
"step": 657
},
{
"epoch": 0.7159956474428727,
"grad_norm": 0.06584708392620087,
"learning_rate": 3.79990452539225e-05,
"loss": 0.0508,
"step": 658
},
{
"epoch": 0.7170837867247007,
"grad_norm": 0.07503578066825867,
"learning_rate": 3.772825265187802e-05,
"loss": 0.0651,
"step": 659
},
{
"epoch": 0.7181719260065288,
"grad_norm": 0.09440509229898453,
"learning_rate": 3.7458203860837234e-05,
"loss": 0.0824,
"step": 660
},
{
"epoch": 0.719260065288357,
"grad_norm": 0.0794801339507103,
"learning_rate": 3.7188902106424416e-05,
"loss": 0.0752,
"step": 661
},
{
"epoch": 0.720348204570185,
"grad_norm": 0.08709513396024704,
"learning_rate": 3.692035060534088e-05,
"loss": 0.0858,
"step": 662
},
{
"epoch": 0.721436343852013,
"grad_norm": 0.08292333036661148,
"learning_rate": 3.665255256532638e-05,
"loss": 0.0747,
"step": 663
},
{
"epoch": 0.7225244831338411,
"grad_norm": 0.08945606648921967,
"learning_rate": 3.638551118512089e-05,
"loss": 0.0875,
"step": 664
},
{
"epoch": 0.7236126224156693,
"grad_norm": 0.09742715954780579,
"learning_rate": 3.611922965442648e-05,
"loss": 0.0894,
"step": 665
},
{
"epoch": 0.7247007616974973,
"grad_norm": 0.09512501209974289,
"learning_rate": 3.5853711153868965e-05,
"loss": 0.0831,
"step": 666
},
{
"epoch": 0.7257889009793254,
"grad_norm": 0.12309622764587402,
"learning_rate": 3.558895885496023e-05,
"loss": 0.1218,
"step": 667
},
{
"epoch": 0.7268770402611534,
"grad_norm": 0.09456098824739456,
"learning_rate": 3.53249759200601e-05,
"loss": 0.0957,
"step": 668
},
{
"epoch": 0.7279651795429815,
"grad_norm": 0.10691989958286285,
"learning_rate": 3.506176550233863e-05,
"loss": 0.1067,
"step": 669
},
{
"epoch": 0.7290533188248096,
"grad_norm": 0.11746580898761749,
"learning_rate": 3.479933074573858e-05,
"loss": 0.1002,
"step": 670
},
{
"epoch": 0.7301414581066377,
"grad_norm": 0.0943666398525238,
"learning_rate": 3.4537674784937614e-05,
"loss": 0.0842,
"step": 671
},
{
"epoch": 0.7312295973884657,
"grad_norm": 0.0884823352098465,
"learning_rate": 3.427680074531113e-05,
"loss": 0.0642,
"step": 672
},
{
"epoch": 0.7323177366702938,
"grad_norm": 0.10647040605545044,
"learning_rate": 3.401671174289469e-05,
"loss": 0.0805,
"step": 673
},
{
"epoch": 0.7334058759521219,
"grad_norm": 0.11635477840900421,
"learning_rate": 3.3757410884346894e-05,
"loss": 0.1075,
"step": 674
},
{
"epoch": 0.73449401523395,
"grad_norm": 0.10123847424983978,
"learning_rate": 3.3498901266912396e-05,
"loss": 0.0783,
"step": 675
},
{
"epoch": 0.735582154515778,
"grad_norm": 0.1347595453262329,
"learning_rate": 3.324118597838464e-05,
"loss": 0.1216,
"step": 676
},
{
"epoch": 0.7366702937976061,
"grad_norm": 0.11851736158132553,
"learning_rate": 3.298426809706928e-05,
"loss": 0.0891,
"step": 677
},
{
"epoch": 0.7377584330794341,
"grad_norm": 0.1312292367219925,
"learning_rate": 3.2728150691747115e-05,
"loss": 0.1008,
"step": 678
},
{
"epoch": 0.7388465723612623,
"grad_norm": 0.12228815257549286,
"learning_rate": 3.2472836821637744e-05,
"loss": 0.0696,
"step": 679
},
{
"epoch": 0.7399347116430903,
"grad_norm": 0.12540924549102783,
"learning_rate": 3.2218329536362704e-05,
"loss": 0.0801,
"step": 680
},
{
"epoch": 0.7410228509249184,
"grad_norm": 0.12973323464393616,
"learning_rate": 3.196463187590929e-05,
"loss": 0.0712,
"step": 681
},
{
"epoch": 0.7421109902067464,
"grad_norm": 0.15368451178073883,
"learning_rate": 3.1711746870594086e-05,
"loss": 0.1033,
"step": 682
},
{
"epoch": 0.7431991294885746,
"grad_norm": 0.21176296472549438,
"learning_rate": 3.145967754102691e-05,
"loss": 0.1367,
"step": 683
},
{
"epoch": 0.7442872687704026,
"grad_norm": 0.18676644563674927,
"learning_rate": 3.120842689807468e-05,
"loss": 0.1081,
"step": 684
},
{
"epoch": 0.7453754080522307,
"grad_norm": 0.1671050786972046,
"learning_rate": 3.0957997942825336e-05,
"loss": 0.0898,
"step": 685
},
{
"epoch": 0.7464635473340587,
"grad_norm": 0.187219500541687,
"learning_rate": 3.070839366655215e-05,
"loss": 0.1019,
"step": 686
},
{
"epoch": 0.7475516866158868,
"grad_norm": 0.17203396558761597,
"learning_rate": 3.0459617050677868e-05,
"loss": 0.0865,
"step": 687
},
{
"epoch": 0.7486398258977149,
"grad_norm": 0.23022624850273132,
"learning_rate": 3.021167106673928e-05,
"loss": 0.1252,
"step": 688
},
{
"epoch": 0.749727965179543,
"grad_norm": 0.2407008409500122,
"learning_rate": 2.996455867635155e-05,
"loss": 0.1142,
"step": 689
},
{
"epoch": 0.750816104461371,
"grad_norm": 0.32173025608062744,
"learning_rate": 2.9718282831172883e-05,
"loss": 0.1342,
"step": 690
},
{
"epoch": 0.750816104461371,
"eval_loss": 0.10566242039203644,
"eval_runtime": 24.3899,
"eval_samples_per_second": 15.867,
"eval_steps_per_second": 7.954,
"step": 690
}
],
"logging_steps": 1,
"max_steps": 919,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 230,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.744651772592128e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}