{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 382,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002617801047120419,
"grad_norm": 4.880457401275635,
"learning_rate": 2.564102564102564e-06,
"loss": 6.8614,
"step": 1
},
{
"epoch": 0.005235602094240838,
"grad_norm": 5.4761762619018555,
"learning_rate": 5.128205128205128e-06,
"loss": 7.3812,
"step": 2
},
{
"epoch": 0.007853403141361256,
"grad_norm": 4.928334712982178,
"learning_rate": 7.692307692307694e-06,
"loss": 7.2286,
"step": 3
},
{
"epoch": 0.010471204188481676,
"grad_norm": 5.0216498374938965,
"learning_rate": 1.0256410256410256e-05,
"loss": 7.3511,
"step": 4
},
{
"epoch": 0.013089005235602094,
"grad_norm": 4.557470798492432,
"learning_rate": 1.282051282051282e-05,
"loss": 6.6464,
"step": 5
},
{
"epoch": 0.015706806282722512,
"grad_norm": 4.309239864349365,
"learning_rate": 1.5384615384615387e-05,
"loss": 6.9958,
"step": 6
},
{
"epoch": 0.01832460732984293,
"grad_norm": 4.001352787017822,
"learning_rate": 1.794871794871795e-05,
"loss": 6.5667,
"step": 7
},
{
"epoch": 0.020942408376963352,
"grad_norm": 3.922077178955078,
"learning_rate": 2.0512820512820512e-05,
"loss": 6.491,
"step": 8
},
{
"epoch": 0.02356020942408377,
"grad_norm": 3.411221742630005,
"learning_rate": 2.307692307692308e-05,
"loss": 6.0273,
"step": 9
},
{
"epoch": 0.02617801047120419,
"grad_norm": 3.9540348052978516,
"learning_rate": 2.564102564102564e-05,
"loss": 6.6163,
"step": 10
},
{
"epoch": 0.028795811518324606,
"grad_norm": 3.2421910762786865,
"learning_rate": 2.8205128205128207e-05,
"loss": 6.2189,
"step": 11
},
{
"epoch": 0.031413612565445025,
"grad_norm": 3.0702974796295166,
"learning_rate": 3.0769230769230774e-05,
"loss": 5.5681,
"step": 12
},
{
"epoch": 0.034031413612565446,
"grad_norm": 4.09296178817749,
"learning_rate": 3.3333333333333335e-05,
"loss": 6.302,
"step": 13
},
{
"epoch": 0.03664921465968586,
"grad_norm": 2.9595351219177246,
"learning_rate": 3.58974358974359e-05,
"loss": 5.4689,
"step": 14
},
{
"epoch": 0.03926701570680628,
"grad_norm": 3.2727208137512207,
"learning_rate": 3.846153846153846e-05,
"loss": 5.6809,
"step": 15
},
{
"epoch": 0.041884816753926704,
"grad_norm": 3.616870880126953,
"learning_rate": 4.1025641025641023e-05,
"loss": 5.7648,
"step": 16
},
{
"epoch": 0.04450261780104712,
"grad_norm": 3.6780197620391846,
"learning_rate": 4.358974358974359e-05,
"loss": 5.4097,
"step": 17
},
{
"epoch": 0.04712041884816754,
"grad_norm": 3.512361526489258,
"learning_rate": 4.615384615384616e-05,
"loss": 5.7418,
"step": 18
},
{
"epoch": 0.049738219895287955,
"grad_norm": 4.385097503662109,
"learning_rate": 4.871794871794872e-05,
"loss": 5.4156,
"step": 19
},
{
"epoch": 0.05235602094240838,
"grad_norm": 4.452427864074707,
"learning_rate": 5.128205128205128e-05,
"loss": 5.3937,
"step": 20
},
{
"epoch": 0.0549738219895288,
"grad_norm": 3.246995210647583,
"learning_rate": 5.384615384615385e-05,
"loss": 5.0434,
"step": 21
},
{
"epoch": 0.05759162303664921,
"grad_norm": 3.3380537033081055,
"learning_rate": 5.6410256410256414e-05,
"loss": 4.999,
"step": 22
},
{
"epoch": 0.060209424083769635,
"grad_norm": 3.313646078109741,
"learning_rate": 5.897435897435898e-05,
"loss": 5.1477,
"step": 23
},
{
"epoch": 0.06282722513089005,
"grad_norm": 2.918846368789673,
"learning_rate": 6.153846153846155e-05,
"loss": 4.6583,
"step": 24
},
{
"epoch": 0.06544502617801047,
"grad_norm": 3.070826768875122,
"learning_rate": 6.410256410256412e-05,
"loss": 4.5479,
"step": 25
},
{
"epoch": 0.06806282722513089,
"grad_norm": 3.04087233543396,
"learning_rate": 6.666666666666667e-05,
"loss": 4.592,
"step": 26
},
{
"epoch": 0.07068062827225131,
"grad_norm": 2.8428642749786377,
"learning_rate": 6.923076923076924e-05,
"loss": 4.3682,
"step": 27
},
{
"epoch": 0.07329842931937172,
"grad_norm": 3.0716347694396973,
"learning_rate": 7.17948717948718e-05,
"loss": 4.5837,
"step": 28
},
{
"epoch": 0.07591623036649214,
"grad_norm": 2.571244478225708,
"learning_rate": 7.435897435897436e-05,
"loss": 3.992,
"step": 29
},
{
"epoch": 0.07853403141361257,
"grad_norm": 3.7117347717285156,
"learning_rate": 7.692307692307693e-05,
"loss": 4.5136,
"step": 30
},
{
"epoch": 0.08115183246073299,
"grad_norm": 2.827247381210327,
"learning_rate": 7.948717948717948e-05,
"loss": 4.4797,
"step": 31
},
{
"epoch": 0.08376963350785341,
"grad_norm": 2.7113707065582275,
"learning_rate": 8.205128205128205e-05,
"loss": 4.1275,
"step": 32
},
{
"epoch": 0.08638743455497382,
"grad_norm": 2.837117910385132,
"learning_rate": 8.461538461538461e-05,
"loss": 4.3012,
"step": 33
},
{
"epoch": 0.08900523560209424,
"grad_norm": 3.1402807235717773,
"learning_rate": 8.717948717948718e-05,
"loss": 4.3305,
"step": 34
},
{
"epoch": 0.09162303664921466,
"grad_norm": 2.8632307052612305,
"learning_rate": 8.974358974358975e-05,
"loss": 4.0145,
"step": 35
},
{
"epoch": 0.09424083769633508,
"grad_norm": 2.7232565879821777,
"learning_rate": 9.230769230769232e-05,
"loss": 4.2267,
"step": 36
},
{
"epoch": 0.0968586387434555,
"grad_norm": 2.762054443359375,
"learning_rate": 9.487179487179487e-05,
"loss": 4.0502,
"step": 37
},
{
"epoch": 0.09947643979057591,
"grad_norm": 18.91065788269043,
"learning_rate": 9.743589743589744e-05,
"loss": 4.1297,
"step": 38
},
{
"epoch": 0.10209424083769633,
"grad_norm": 3.6653544902801514,
"learning_rate": 0.0001,
"loss": 4.2813,
"step": 39
},
{
"epoch": 0.10471204188481675,
"grad_norm": 2.7435433864593506,
"learning_rate": 9.999953057840867e-05,
"loss": 3.7026,
"step": 40
},
{
"epoch": 0.10732984293193717,
"grad_norm": 3.3734006881713867,
"learning_rate": 9.999812232244895e-05,
"loss": 4.1651,
"step": 41
},
{
"epoch": 0.1099476439790576,
"grad_norm": 2.5766139030456543,
"learning_rate": 9.999577525856345e-05,
"loss": 4.0622,
"step": 42
},
{
"epoch": 0.112565445026178,
"grad_norm": 3.926454544067383,
"learning_rate": 9.99924894308227e-05,
"loss": 4.2311,
"step": 43
},
{
"epoch": 0.11518324607329843,
"grad_norm": 3.0207953453063965,
"learning_rate": 9.998826490092421e-05,
"loss": 3.7921,
"step": 44
},
{
"epoch": 0.11780104712041885,
"grad_norm": 2.495757579803467,
"learning_rate": 9.998310174819142e-05,
"loss": 3.7601,
"step": 45
},
{
"epoch": 0.12041884816753927,
"grad_norm": 2.8352251052856445,
"learning_rate": 9.997700006957214e-05,
"loss": 3.6366,
"step": 46
},
{
"epoch": 0.12303664921465969,
"grad_norm": 2.970708131790161,
"learning_rate": 9.996995997963675e-05,
"loss": 3.9135,
"step": 47
},
{
"epoch": 0.1256544502617801,
"grad_norm": 2.4032399654388428,
"learning_rate": 9.996198161057607e-05,
"loss": 3.8009,
"step": 48
},
{
"epoch": 0.12827225130890052,
"grad_norm": 3.217522144317627,
"learning_rate": 9.995306511219885e-05,
"loss": 3.9169,
"step": 49
},
{
"epoch": 0.13089005235602094,
"grad_norm": 2.8503968715667725,
"learning_rate": 9.994321065192894e-05,
"loss": 3.9316,
"step": 50
},
{
"epoch": 0.13350785340314136,
"grad_norm": 2.826801061630249,
"learning_rate": 9.993241841480223e-05,
"loss": 3.9417,
"step": 51
},
{
"epoch": 0.13612565445026178,
"grad_norm": 2.5175540447235107,
"learning_rate": 9.992068860346306e-05,
"loss": 3.5643,
"step": 52
},
{
"epoch": 0.1387434554973822,
"grad_norm": 2.7539591789245605,
"learning_rate": 9.990802143816051e-05,
"loss": 3.5607,
"step": 53
},
{
"epoch": 0.14136125654450263,
"grad_norm": 2.4108636379241943,
"learning_rate": 9.989441715674422e-05,
"loss": 3.4459,
"step": 54
},
{
"epoch": 0.14397905759162305,
"grad_norm": 3.2774994373321533,
"learning_rate": 9.987987601465991e-05,
"loss": 3.9545,
"step": 55
},
{
"epoch": 0.14659685863874344,
"grad_norm": 2.3104467391967773,
"learning_rate": 9.986439828494465e-05,
"loss": 3.6954,
"step": 56
},
{
"epoch": 0.14921465968586387,
"grad_norm": 2.5438120365142822,
"learning_rate": 9.984798425822163e-05,
"loss": 3.5669,
"step": 57
},
{
"epoch": 0.1518324607329843,
"grad_norm": 2.4583828449249268,
"learning_rate": 9.98306342426948e-05,
"loss": 3.8843,
"step": 58
},
{
"epoch": 0.1544502617801047,
"grad_norm": 2.34213924407959,
"learning_rate": 9.981234856414307e-05,
"loss": 3.7361,
"step": 59
},
{
"epoch": 0.15706806282722513,
"grad_norm": 2.6676721572875977,
"learning_rate": 9.979312756591407e-05,
"loss": 3.7573,
"step": 60
},
{
"epoch": 0.15968586387434555,
"grad_norm": 2.4997060298919678,
"learning_rate": 9.977297160891792e-05,
"loss": 3.7876,
"step": 61
},
{
"epoch": 0.16230366492146597,
"grad_norm": 2.709794044494629,
"learning_rate": 9.975188107162026e-05,
"loss": 3.614,
"step": 62
},
{
"epoch": 0.1649214659685864,
"grad_norm": 2.355778932571411,
"learning_rate": 9.972985635003522e-05,
"loss": 3.4262,
"step": 63
},
{
"epoch": 0.16753926701570682,
"grad_norm": 2.5319371223449707,
"learning_rate": 9.970689785771798e-05,
"loss": 3.5963,
"step": 64
},
{
"epoch": 0.17015706806282724,
"grad_norm": 2.559286117553711,
"learning_rate": 9.968300602575707e-05,
"loss": 3.6722,
"step": 65
},
{
"epoch": 0.17277486910994763,
"grad_norm": 2.4441914558410645,
"learning_rate": 9.965818130276612e-05,
"loss": 3.4972,
"step": 66
},
{
"epoch": 0.17539267015706805,
"grad_norm": 2.2148141860961914,
"learning_rate": 9.963242415487557e-05,
"loss": 3.6571,
"step": 67
},
{
"epoch": 0.17801047120418848,
"grad_norm": 2.3289103507995605,
"learning_rate": 9.96057350657239e-05,
"loss": 3.6085,
"step": 68
},
{
"epoch": 0.1806282722513089,
"grad_norm": 2.6062214374542236,
"learning_rate": 9.957811453644847e-05,
"loss": 3.4849,
"step": 69
},
{
"epoch": 0.18324607329842932,
"grad_norm": 2.347382068634033,
"learning_rate": 9.954956308567622e-05,
"loss": 3.3534,
"step": 70
},
{
"epoch": 0.18586387434554974,
"grad_norm": 4.254333972930908,
"learning_rate": 9.952008124951381e-05,
"loss": 3.6956,
"step": 71
},
{
"epoch": 0.18848167539267016,
"grad_norm": 2.4889872074127197,
"learning_rate": 9.948966958153771e-05,
"loss": 3.5671,
"step": 72
},
{
"epoch": 0.19109947643979058,
"grad_norm": 34.48236083984375,
"learning_rate": 9.945832865278363e-05,
"loss": 3.5257,
"step": 73
},
{
"epoch": 0.193717277486911,
"grad_norm": 2.4782514572143555,
"learning_rate": 9.942605905173592e-05,
"loss": 3.5874,
"step": 74
},
{
"epoch": 0.19633507853403143,
"grad_norm": 2.386019706726074,
"learning_rate": 9.939286138431647e-05,
"loss": 3.5946,
"step": 75
},
{
"epoch": 0.19895287958115182,
"grad_norm": 2.1345767974853516,
"learning_rate": 9.935873627387336e-05,
"loss": 3.3744,
"step": 76
},
{
"epoch": 0.20157068062827224,
"grad_norm": 2.562124013900757,
"learning_rate": 9.932368436116915e-05,
"loss": 3.4642,
"step": 77
},
{
"epoch": 0.20418848167539266,
"grad_norm": 2.3154282569885254,
"learning_rate": 9.92877063043688e-05,
"loss": 3.4095,
"step": 78
},
{
"epoch": 0.20680628272251309,
"grad_norm": 2.225123643875122,
"learning_rate": 9.925080277902743e-05,
"loss": 3.496,
"step": 79
},
{
"epoch": 0.2094240837696335,
"grad_norm": 2.0953874588012695,
"learning_rate": 9.921297447807744e-05,
"loss": 3.4574,
"step": 80
},
{
"epoch": 0.21204188481675393,
"grad_norm": 2.2577614784240723,
"learning_rate": 9.917422211181571e-05,
"loss": 3.5562,
"step": 81
},
{
"epoch": 0.21465968586387435,
"grad_norm": 2.2811625003814697,
"learning_rate": 9.913454640789013e-05,
"loss": 3.3894,
"step": 82
},
{
"epoch": 0.21727748691099477,
"grad_norm": 2.139005661010742,
"learning_rate": 9.909394811128598e-05,
"loss": 3.3137,
"step": 83
},
{
"epoch": 0.2198952879581152,
"grad_norm": 2.058781147003174,
"learning_rate": 9.905242798431196e-05,
"loss": 3.4472,
"step": 84
},
{
"epoch": 0.22251308900523561,
"grad_norm": 14.535298347473145,
"learning_rate": 9.900998680658581e-05,
"loss": 3.3115,
"step": 85
},
{
"epoch": 0.225130890052356,
"grad_norm": 2.935969114303589,
"learning_rate": 9.896662537501976e-05,
"loss": 3.3982,
"step": 86
},
{
"epoch": 0.22774869109947643,
"grad_norm": 2.613450288772583,
"learning_rate": 9.892234450380547e-05,
"loss": 3.3868,
"step": 87
},
{
"epoch": 0.23036649214659685,
"grad_norm": 2.750882387161255,
"learning_rate": 9.887714502439884e-05,
"loss": 3.2625,
"step": 88
},
{
"epoch": 0.23298429319371727,
"grad_norm": 2.1589114665985107,
"learning_rate": 9.883102778550434e-05,
"loss": 3.1647,
"step": 89
},
{
"epoch": 0.2356020942408377,
"grad_norm": 2.105313777923584,
"learning_rate": 9.878399365305906e-05,
"loss": 3.1404,
"step": 90
},
{
"epoch": 0.23821989528795812,
"grad_norm": 3.4829330444335938,
"learning_rate": 9.873604351021648e-05,
"loss": 3.3671,
"step": 91
},
{
"epoch": 0.24083769633507854,
"grad_norm": 3.3065099716186523,
"learning_rate": 9.868717825732994e-05,
"loss": 3.4199,
"step": 92
},
{
"epoch": 0.24345549738219896,
"grad_norm": 5.105870723724365,
"learning_rate": 9.863739881193558e-05,
"loss": 3.4172,
"step": 93
},
{
"epoch": 0.24607329842931938,
"grad_norm": 3.03316593170166,
"learning_rate": 9.858670610873528e-05,
"loss": 3.2284,
"step": 94
},
{
"epoch": 0.2486910994764398,
"grad_norm": 2.4118738174438477,
"learning_rate": 9.853510109957903e-05,
"loss": 3.4009,
"step": 95
},
{
"epoch": 0.2513089005235602,
"grad_norm": 5.970595359802246,
"learning_rate": 9.848258475344702e-05,
"loss": 3.387,
"step": 96
},
{
"epoch": 0.25392670157068065,
"grad_norm": 2.43656325340271,
"learning_rate": 9.842915805643155e-05,
"loss": 3.3451,
"step": 97
},
{
"epoch": 0.25654450261780104,
"grad_norm": 3.862488031387329,
"learning_rate": 9.837482201171842e-05,
"loss": 3.2579,
"step": 98
},
{
"epoch": 0.2591623036649215,
"grad_norm": 2.120502471923828,
"learning_rate": 9.831957763956813e-05,
"loss": 3.3365,
"step": 99
},
{
"epoch": 0.2617801047120419,
"grad_norm": 2.438363552093506,
"learning_rate": 9.826342597729672e-05,
"loss": 3.1251,
"step": 100
},
{
"epoch": 0.2643979057591623,
"grad_norm": 2.0495247840881348,
"learning_rate": 9.820636807925628e-05,
"loss": 3.3333,
"step": 101
},
{
"epoch": 0.2670157068062827,
"grad_norm": 2.2188355922698975,
"learning_rate": 9.814840501681522e-05,
"loss": 3.1956,
"step": 102
},
{
"epoch": 0.2696335078534031,
"grad_norm": 2.230815887451172,
"learning_rate": 9.808953787833801e-05,
"loss": 3.1073,
"step": 103
},
{
"epoch": 0.27225130890052357,
"grad_norm": 2.1100032329559326,
"learning_rate": 9.802976776916494e-05,
"loss": 3.184,
"step": 104
},
{
"epoch": 0.27486910994764396,
"grad_norm": 2.1523215770721436,
"learning_rate": 9.796909581159116e-05,
"loss": 3.3009,
"step": 105
},
{
"epoch": 0.2774869109947644,
"grad_norm": 2.2816920280456543,
"learning_rate": 9.790752314484577e-05,
"loss": 3.1125,
"step": 106
},
{
"epoch": 0.2801047120418848,
"grad_norm": 2.134192705154419,
"learning_rate": 9.784505092507031e-05,
"loss": 3.0986,
"step": 107
},
{
"epoch": 0.28272251308900526,
"grad_norm": 2.156048536300659,
"learning_rate": 9.778168032529716e-05,
"loss": 3.3802,
"step": 108
},
{
"epoch": 0.28534031413612565,
"grad_norm": 2.2478604316711426,
"learning_rate": 9.771741253542741e-05,
"loss": 3.2685,
"step": 109
},
{
"epoch": 0.2879581151832461,
"grad_norm": 2.222820997238159,
"learning_rate": 9.765224876220859e-05,
"loss": 3.1221,
"step": 110
},
{
"epoch": 0.2905759162303665,
"grad_norm": 2.3353006839752197,
"learning_rate": 9.758619022921202e-05,
"loss": 3.1653,
"step": 111
},
{
"epoch": 0.2931937172774869,
"grad_norm": 14.295672416687012,
"learning_rate": 9.751923817680972e-05,
"loss": 3.146,
"step": 112
},
{
"epoch": 0.29581151832460734,
"grad_norm": 2.505723714828491,
"learning_rate": 9.745139386215128e-05,
"loss": 3.2138,
"step": 113
},
{
"epoch": 0.29842931937172773,
"grad_norm": 2.1925199031829834,
"learning_rate": 9.738265855914013e-05,
"loss": 3.1778,
"step": 114
},
{
"epoch": 0.3010471204188482,
"grad_norm": 2.1371121406555176,
"learning_rate": 9.731303355840968e-05,
"loss": 3.0539,
"step": 115
},
{
"epoch": 0.3036649214659686,
"grad_norm": 2.133293628692627,
"learning_rate": 9.724252016729909e-05,
"loss": 3.0521,
"step": 116
},
{
"epoch": 0.306282722513089,
"grad_norm": 2.2210733890533447,
"learning_rate": 9.717111970982869e-05,
"loss": 3.1743,
"step": 117
},
{
"epoch": 0.3089005235602094,
"grad_norm": 2.2624800205230713,
"learning_rate": 9.709883352667513e-05,
"loss": 3.2274,
"step": 118
},
{
"epoch": 0.31151832460732987,
"grad_norm": 2.1522083282470703,
"learning_rate": 9.70256629751462e-05,
"loss": 3.1238,
"step": 119
},
{
"epoch": 0.31413612565445026,
"grad_norm": 1.954613447189331,
"learning_rate": 9.69516094291554e-05,
"loss": 3.0268,
"step": 120
},
{
"epoch": 0.31675392670157065,
"grad_norm": 1.9362231492996216,
"learning_rate": 9.687667427919605e-05,
"loss": 3.148,
"step": 121
},
{
"epoch": 0.3193717277486911,
"grad_norm": 2.171438455581665,
"learning_rate": 9.680085893231521e-05,
"loss": 3.069,
"step": 122
},
{
"epoch": 0.3219895287958115,
"grad_norm": 2.280571699142456,
"learning_rate": 9.672416481208738e-05,
"loss": 3.3274,
"step": 123
},
{
"epoch": 0.32460732984293195,
"grad_norm": 2.0850417613983154,
"learning_rate": 9.664659335858755e-05,
"loss": 3.2012,
"step": 124
},
{
"epoch": 0.32722513089005234,
"grad_norm": 2.408496141433716,
"learning_rate": 9.656814602836434e-05,
"loss": 3.2839,
"step": 125
},
{
"epoch": 0.3298429319371728,
"grad_norm": 120.96231842041016,
"learning_rate": 9.648882429441257e-05,
"loss": 3.1548,
"step": 126
},
{
"epoch": 0.3324607329842932,
"grad_norm": 2.1636528968811035,
"learning_rate": 9.640862964614564e-05,
"loss": 3.0179,
"step": 127
},
{
"epoch": 0.33507853403141363,
"grad_norm": 2.2096540927886963,
"learning_rate": 9.632756358936749e-05,
"loss": 3.261,
"step": 128
},
{
"epoch": 0.337696335078534,
"grad_norm": 2.2390010356903076,
"learning_rate": 9.624562764624445e-05,
"loss": 3.1954,
"step": 129
},
{
"epoch": 0.3403141361256545,
"grad_norm": 1.8933037519454956,
"learning_rate": 9.616282335527653e-05,
"loss": 3.2055,
"step": 130
},
{
"epoch": 0.34293193717277487,
"grad_norm": 6.112705230712891,
"learning_rate": 9.607915227126862e-05,
"loss": 3.2929,
"step": 131
},
{
"epoch": 0.34554973821989526,
"grad_norm": 4.289401054382324,
"learning_rate": 9.599461596530127e-05,
"loss": 3.1634,
"step": 132
},
{
"epoch": 0.3481675392670157,
"grad_norm": 2.0408754348754883,
"learning_rate": 9.590921602470116e-05,
"loss": 3.1964,
"step": 133
},
{
"epoch": 0.3507853403141361,
"grad_norm": 2.1104061603546143,
"learning_rate": 9.582295405301131e-05,
"loss": 3.0235,
"step": 134
},
{
"epoch": 0.35340314136125656,
"grad_norm": 1.9149036407470703,
"learning_rate": 9.573583166996103e-05,
"loss": 3.2854,
"step": 135
},
{
"epoch": 0.35602094240837695,
"grad_norm": 2.161102056503296,
"learning_rate": 9.564785051143541e-05,
"loss": 3.1318,
"step": 136
},
{
"epoch": 0.3586387434554974,
"grad_norm": 2.186587333679199,
"learning_rate": 9.555901222944468e-05,
"loss": 3.2125,
"step": 137
},
{
"epoch": 0.3612565445026178,
"grad_norm": 1.8297839164733887,
"learning_rate": 9.546931849209314e-05,
"loss": 2.96,
"step": 138
},
{
"epoch": 0.36387434554973824,
"grad_norm": 2.2810025215148926,
"learning_rate": 9.537877098354786e-05,
"loss": 3.0554,
"step": 139
},
{
"epoch": 0.36649214659685864,
"grad_norm": 2.006052017211914,
"learning_rate": 9.528737140400707e-05,
"loss": 3.1757,
"step": 140
},
{
"epoch": 0.36910994764397903,
"grad_norm": 2.3248322010040283,
"learning_rate": 9.519512146966823e-05,
"loss": 3.2321,
"step": 141
},
{
"epoch": 0.3717277486910995,
"grad_norm": 2.1551225185394287,
"learning_rate": 9.510202291269576e-05,
"loss": 3.0333,
"step": 142
},
{
"epoch": 0.3743455497382199,
"grad_norm": 2.613978147506714,
"learning_rate": 9.500807748118856e-05,
"loss": 3.3177,
"step": 143
},
{
"epoch": 0.3769633507853403,
"grad_norm": 2.302820920944214,
"learning_rate": 9.491328693914722e-05,
"loss": 3.069,
"step": 144
},
{
"epoch": 0.3795811518324607,
"grad_norm": 2.0801970958709717,
"learning_rate": 9.48176530664408e-05,
"loss": 3.1207,
"step": 145
},
{
"epoch": 0.38219895287958117,
"grad_norm": 1.8670393228530884,
"learning_rate": 9.472117765877349e-05,
"loss": 2.988,
"step": 146
},
{
"epoch": 0.38481675392670156,
"grad_norm": 2.123073101043701,
"learning_rate": 9.462386252765087e-05,
"loss": 3.1316,
"step": 147
},
{
"epoch": 0.387434554973822,
"grad_norm": 2.0116608142852783,
"learning_rate": 9.452570950034589e-05,
"loss": 3.1323,
"step": 148
},
{
"epoch": 0.3900523560209424,
"grad_norm": 1.8206120729446411,
"learning_rate": 9.442672041986457e-05,
"loss": 2.8792,
"step": 149
},
{
"epoch": 0.39267015706806285,
"grad_norm": 1.9137699604034424,
"learning_rate": 9.432689714491136e-05,
"loss": 3.0191,
"step": 150
},
{
"epoch": 0.39528795811518325,
"grad_norm": 1.9897340536117554,
"learning_rate": 9.422624154985427e-05,
"loss": 3.1578,
"step": 151
},
{
"epoch": 0.39790575916230364,
"grad_norm": 1.8997538089752197,
"learning_rate": 9.412475552468974e-05,
"loss": 3.0389,
"step": 152
},
{
"epoch": 0.4005235602094241,
"grad_norm": 2.249044895172119,
"learning_rate": 9.402244097500696e-05,
"loss": 3.3441,
"step": 153
},
{
"epoch": 0.4031413612565445,
"grad_norm": 1.9846928119659424,
"learning_rate": 9.391929982195232e-05,
"loss": 3.2146,
"step": 154
},
{
"epoch": 0.40575916230366493,
"grad_norm": 2.240514039993286,
"learning_rate": 9.381533400219318e-05,
"loss": 3.2069,
"step": 155
},
{
"epoch": 0.4083769633507853,
"grad_norm": 2.115003824234009,
"learning_rate": 9.371054546788157e-05,
"loss": 2.9426,
"step": 156
},
{
"epoch": 0.4109947643979058,
"grad_norm": 1.9737358093261719,
"learning_rate": 9.36049361866175e-05,
"loss": 3.1503,
"step": 157
},
{
"epoch": 0.41361256544502617,
"grad_norm": 2.096264600753784,
"learning_rate": 9.349850814141204e-05,
"loss": 3.0736,
"step": 158
},
{
"epoch": 0.4162303664921466,
"grad_norm": 2.3380467891693115,
"learning_rate": 9.339126333065007e-05,
"loss": 3.2136,
"step": 159
},
{
"epoch": 0.418848167539267,
"grad_norm": 2.1407546997070312,
"learning_rate": 9.328320376805281e-05,
"loss": 3.1564,
"step": 160
},
{
"epoch": 0.4214659685863874,
"grad_norm": 1.9080549478530884,
"learning_rate": 9.317433148263995e-05,
"loss": 2.9173,
"step": 161
},
{
"epoch": 0.42408376963350786,
"grad_norm": 4.025334358215332,
"learning_rate": 9.30646485186915e-05,
"loss": 2.8801,
"step": 162
},
{
"epoch": 0.42670157068062825,
"grad_norm": 2.0670013427734375,
"learning_rate": 9.295415693570955e-05,
"loss": 2.9837,
"step": 163
},
{
"epoch": 0.4293193717277487,
"grad_norm": 2.1820228099823,
"learning_rate": 9.284285880837946e-05,
"loss": 3.1169,
"step": 164
},
{
"epoch": 0.4319371727748691,
"grad_norm": 1.9107846021652222,
"learning_rate": 9.273075622653102e-05,
"loss": 3.1854,
"step": 165
},
{
"epoch": 0.43455497382198954,
"grad_norm": 2.0229499340057373,
"learning_rate": 9.261785129509914e-05,
"loss": 2.9287,
"step": 166
},
{
"epoch": 0.43717277486910994,
"grad_norm": 1.9189234972000122,
"learning_rate": 9.250414613408427e-05,
"loss": 3.0001,
"step": 167
},
{
"epoch": 0.4397905759162304,
"grad_norm": 2.0910720825195312,
"learning_rate": 9.238964287851275e-05,
"loss": 2.9357,
"step": 168
},
{
"epoch": 0.4424083769633508,
"grad_norm": 1.9052855968475342,
"learning_rate": 9.22743436783966e-05,
"loss": 2.9641,
"step": 169
},
{
"epoch": 0.44502617801047123,
"grad_norm": 1.9111841917037964,
"learning_rate": 9.215825069869316e-05,
"loss": 3.0879,
"step": 170
},
{
"epoch": 0.4476439790575916,
"grad_norm": 1.898743987083435,
"learning_rate": 9.20413661192645e-05,
"loss": 2.9687,
"step": 171
},
{
"epoch": 0.450261780104712,
"grad_norm": 1.9681291580200195,
"learning_rate": 9.192369213483642e-05,
"loss": 3.0313,
"step": 172
},
{
"epoch": 0.45287958115183247,
"grad_norm": 1.937829613685608,
"learning_rate": 9.180523095495727e-05,
"loss": 2.9251,
"step": 173
},
{
"epoch": 0.45549738219895286,
"grad_norm": 2.15289568901062,
"learning_rate": 9.168598480395651e-05,
"loss": 3.0487,
"step": 174
},
{
"epoch": 0.4581151832460733,
"grad_norm": 2.0502867698669434,
"learning_rate": 9.156595592090284e-05,
"loss": 2.8817,
"step": 175
},
{
"epoch": 0.4607329842931937,
"grad_norm": 2.0720651149749756,
"learning_rate": 9.14451465595622e-05,
"loss": 3.009,
"step": 176
},
{
"epoch": 0.46335078534031415,
"grad_norm": 2.0610387325286865,
"learning_rate": 9.132355898835556e-05,
"loss": 3.0353,
"step": 177
},
{
"epoch": 0.46596858638743455,
"grad_norm": 2.160541534423828,
"learning_rate": 9.12011954903161e-05,
"loss": 3.1546,
"step": 178
},
{
"epoch": 0.468586387434555,
"grad_norm": 2.1311235427856445,
"learning_rate": 9.107805836304658e-05,
"loss": 3.0889,
"step": 179
},
{
"epoch": 0.4712041884816754,
"grad_norm": 1.902626633644104,
"learning_rate": 9.095414991867604e-05,
"loss": 2.9551,
"step": 180
},
{
"epoch": 0.4738219895287958,
"grad_norm": 1.9568802118301392,
"learning_rate": 9.082947248381643e-05,
"loss": 2.9683,
"step": 181
},
{
"epoch": 0.47643979057591623,
"grad_norm": 2.024705171585083,
"learning_rate": 9.070402839951897e-05,
"loss": 3.0207,
"step": 182
},
{
"epoch": 0.4790575916230366,
"grad_norm": 1.8830217123031616,
"learning_rate": 9.057782002123012e-05,
"loss": 2.8518,
"step": 183
},
{
"epoch": 0.4816753926701571,
"grad_norm": 2.031515598297119,
"learning_rate": 9.045084971874738e-05,
"loss": 2.9262,
"step": 184
},
{
"epoch": 0.48429319371727747,
"grad_norm": 1.9783765077590942,
"learning_rate": 9.03231198761748e-05,
"loss": 2.9476,
"step": 185
},
{
"epoch": 0.4869109947643979,
"grad_norm": 1.9431626796722412,
"learning_rate": 9.019463289187827e-05,
"loss": 2.8431,
"step": 186
},
{
"epoch": 0.4895287958115183,
"grad_norm": 1.8779281377792358,
"learning_rate": 9.00653911784403e-05,
"loss": 3.0361,
"step": 187
},
{
"epoch": 0.49214659685863876,
"grad_norm": 1.810121774673462,
"learning_rate": 8.993539716261498e-05,
"loss": 2.847,
"step": 188
},
{
"epoch": 0.49476439790575916,
"grad_norm": 2.5064663887023926,
"learning_rate": 8.980465328528219e-05,
"loss": 2.9118,
"step": 189
},
{
"epoch": 0.4973821989528796,
"grad_norm": 1.9613292217254639,
"learning_rate": 8.96731620014019e-05,
"loss": 2.8882,
"step": 190
},
{
"epoch": 0.5,
"grad_norm": 2.058799982070923,
"learning_rate": 8.954092577996803e-05,
"loss": 3.0066,
"step": 191
},
{
"epoch": 0.5026178010471204,
"grad_norm": 6.452920436859131,
"learning_rate": 8.940794710396205e-05,
"loss": 3.005,
"step": 192
},
{
"epoch": 0.5052356020942408,
"grad_norm": 2.1949808597564697,
"learning_rate": 8.927422847030646e-05,
"loss": 2.9602,
"step": 193
},
{
"epoch": 0.5078534031413613,
"grad_norm": 2.0062756538391113,
"learning_rate": 8.913977238981778e-05,
"loss": 2.9961,
"step": 194
},
{
"epoch": 0.5104712041884817,
"grad_norm": 1.8738442659378052,
"learning_rate": 8.900458138715954e-05,
"loss": 3.0411,
"step": 195
},
{
"epoch": 0.5130890052356021,
"grad_norm": 1.8305824995040894,
"learning_rate": 8.886865800079474e-05,
"loss": 3.0892,
"step": 196
},
{
"epoch": 0.5157068062827225,
"grad_norm": 1.8090300559997559,
"learning_rate": 8.873200478293826e-05,
"loss": 2.7575,
"step": 197
},
{
"epoch": 0.518324607329843,
"grad_norm": 1.870323657989502,
"learning_rate": 8.859462429950897e-05,
"loss": 3.054,
"step": 198
},
{
"epoch": 0.5209424083769634,
"grad_norm": 1.965692162513733,
"learning_rate": 8.845651913008145e-05,
"loss": 2.8143,
"step": 199
},
{
"epoch": 0.5235602094240838,
"grad_norm": 1.88938570022583,
"learning_rate": 8.831769186783765e-05,
"loss": 2.9259,
"step": 200
},
{
"epoch": 0.5261780104712042,
"grad_norm": 1.8629285097122192,
"learning_rate": 8.817814511951814e-05,
"loss": 2.8782,
"step": 201
},
{
"epoch": 0.5287958115183246,
"grad_norm": 1.873532772064209,
"learning_rate": 8.80378815053732e-05,
"loss": 2.8025,
"step": 202
},
{
"epoch": 0.5314136125654451,
"grad_norm": 2.1665189266204834,
"learning_rate": 8.789690365911356e-05,
"loss": 3.0567,
"step": 203
},
{
"epoch": 0.5340314136125655,
"grad_norm": 1.980928659439087,
"learning_rate": 8.775521422786104e-05,
"loss": 2.9795,
"step": 204
},
{
"epoch": 0.5366492146596858,
"grad_norm": 2.0913095474243164,
"learning_rate": 8.761281587209876e-05,
"loss": 3.0988,
"step": 205
},
{
"epoch": 0.5392670157068062,
"grad_norm": 1.8205846548080444,
"learning_rate": 8.746971126562124e-05,
"loss": 2.9828,
"step": 206
},
{
"epoch": 0.5418848167539267,
"grad_norm": 2.1041507720947266,
"learning_rate": 8.732590309548416e-05,
"loss": 3.0541,
"step": 207
},
{
"epoch": 0.5445026178010471,
"grad_norm": 2.023577928543091,
"learning_rate": 8.718139406195393e-05,
"loss": 3.0061,
"step": 208
},
{
"epoch": 0.5471204188481675,
"grad_norm": 1.8006081581115723,
"learning_rate": 8.703618687845696e-05,
"loss": 2.789,
"step": 209
},
{
"epoch": 0.5497382198952879,
"grad_norm": 1.840972900390625,
"learning_rate": 8.689028427152874e-05,
"loss": 2.9437,
"step": 210
},
{
"epoch": 0.5523560209424084,
"grad_norm": 1.9612823724746704,
"learning_rate": 8.674368898076261e-05,
"loss": 3.1407,
"step": 211
},
{
"epoch": 0.5549738219895288,
"grad_norm": 2.0491254329681396,
"learning_rate": 8.65964037587584e-05,
"loss": 2.9557,
"step": 212
},
{
"epoch": 0.5575916230366492,
"grad_norm": 2.0597126483917236,
"learning_rate": 8.644843137107059e-05,
"loss": 2.9996,
"step": 213
},
{
"epoch": 0.5602094240837696,
"grad_norm": 1.7746310234069824,
"learning_rate": 8.629977459615655e-05,
"loss": 3.0459,
"step": 214
},
{
"epoch": 0.56282722513089,
"grad_norm": 1.892563819885254,
"learning_rate": 8.615043622532429e-05,
"loss": 2.9831,
"step": 215
},
{
"epoch": 0.5654450261780105,
"grad_norm": 1.7289716005325317,
"learning_rate": 8.600041906268e-05,
"loss": 2.7416,
"step": 216
},
{
"epoch": 0.5680628272251309,
"grad_norm": 1.929553508758545,
"learning_rate": 8.584972592507553e-05,
"loss": 2.9513,
"step": 217
},
{
"epoch": 0.5706806282722513,
"grad_norm": 1.957163691520691,
"learning_rate": 8.569835964205536e-05,
"loss": 2.7647,
"step": 218
},
{
"epoch": 0.5732984293193717,
"grad_norm": 1.8348171710968018,
"learning_rate": 8.554632305580354e-05,
"loss": 2.9897,
"step": 219
},
{
"epoch": 0.5759162303664922,
"grad_norm": 1.811667561531067,
"learning_rate": 8.539361902109033e-05,
"loss": 2.8322,
"step": 220
},
{
"epoch": 0.5785340314136126,
"grad_norm": 2.0632617473602295,
"learning_rate": 8.524025040521856e-05,
"loss": 2.9788,
"step": 221
},
{
"epoch": 0.581151832460733,
"grad_norm": 1.847805142402649,
"learning_rate": 8.508622008796985e-05,
"loss": 2.8168,
"step": 222
},
{
"epoch": 0.5837696335078534,
"grad_norm": 1.7859346866607666,
"learning_rate": 8.493153096155042e-05,
"loss": 2.819,
"step": 223
},
{
"epoch": 0.5863874345549738,
"grad_norm": 1.756142020225525,
"learning_rate": 8.477618593053693e-05,
"loss": 2.9083,
"step": 224
},
{
"epoch": 0.5890052356020943,
"grad_norm": 1.7891409397125244,
"learning_rate": 8.462018791182184e-05,
"loss": 2.911,
"step": 225
},
{
"epoch": 0.5916230366492147,
"grad_norm": 1.923041582107544,
"learning_rate": 8.44635398345587e-05,
"loss": 2.7662,
"step": 226
},
{
"epoch": 0.5942408376963351,
"grad_norm": 2.131024122238159,
"learning_rate": 8.430624464010706e-05,
"loss": 2.8831,
"step": 227
},
{
"epoch": 0.5968586387434555,
"grad_norm": 1.9196547269821167,
"learning_rate": 8.414830528197737e-05,
"loss": 2.875,
"step": 228
},
{
"epoch": 0.599476439790576,
"grad_norm": 1.9374908208847046,
"learning_rate": 8.39897247257754e-05,
"loss": 3.0421,
"step": 229
},
{
"epoch": 0.6020942408376964,
"grad_norm": 1.9606311321258545,
"learning_rate": 8.383050594914665e-05,
"loss": 2.9347,
"step": 230
},
{
"epoch": 0.6047120418848168,
"grad_norm": 1.8933106660842896,
"learning_rate": 8.367065194172037e-05,
"loss": 2.9623,
"step": 231
},
{
"epoch": 0.6073298429319371,
"grad_norm": 2.1029906272888184,
"learning_rate": 8.351016570505347e-05,
"loss": 2.8704,
"step": 232
},
{
"epoch": 0.6099476439790575,
"grad_norm": 1.86416494846344,
"learning_rate": 8.334905025257413e-05,
"loss": 3.0935,
"step": 233
},
{
"epoch": 0.612565445026178,
"grad_norm": 1.8532062768936157,
"learning_rate": 8.318730860952522e-05,
"loss": 2.7635,
"step": 234
},
{
"epoch": 0.6151832460732984,
"grad_norm": 2.0105836391448975,
"learning_rate": 8.302494381290756e-05,
"loss": 3.0954,
"step": 235
},
{
"epoch": 0.6178010471204188,
"grad_norm": 2.164985418319702,
"learning_rate": 8.286195891142274e-05,
"loss": 2.9424,
"step": 236
},
{
"epoch": 0.6204188481675392,
"grad_norm": 1.9459847211837769,
"learning_rate": 8.269835696541607e-05,
"loss": 2.8438,
"step": 237
},
{
"epoch": 0.6230366492146597,
"grad_norm": 3.4676437377929688,
"learning_rate": 8.253414104681898e-05,
"loss": 3.0494,
"step": 238
},
{
"epoch": 0.6256544502617801,
"grad_norm": 1.8192616701126099,
"learning_rate": 8.236931423909138e-05,
"loss": 2.7965,
"step": 239
},
{
"epoch": 0.6282722513089005,
"grad_norm": 1.8246678113937378,
"learning_rate": 8.220387963716377e-05,
"loss": 2.8788,
"step": 240
},
{
"epoch": 0.6308900523560209,
"grad_norm": 1.8622642755508423,
"learning_rate": 8.20378403473791e-05,
"loss": 3.0934,
"step": 241
},
{
"epoch": 0.6335078534031413,
"grad_norm": 1.927139163017273,
"learning_rate": 8.18711994874345e-05,
"loss": 2.9704,
"step": 242
},
{
"epoch": 0.6361256544502618,
"grad_norm": 1.7952053546905518,
"learning_rate": 8.170396018632264e-05,
"loss": 2.8313,
"step": 243
},
{
"epoch": 0.6387434554973822,
"grad_norm": 1.8302192687988281,
"learning_rate": 8.153612558427311e-05,
"loss": 2.8198,
"step": 244
},
{
"epoch": 0.6413612565445026,
"grad_norm": 1.809410810470581,
"learning_rate": 8.13676988326933e-05,
"loss": 2.6532,
"step": 245
},
{
"epoch": 0.643979057591623,
"grad_norm": 1.8222423791885376,
"learning_rate": 8.119868309410943e-05,
"loss": 2.9867,
"step": 246
},
{
"epoch": 0.6465968586387435,
"grad_norm": 7.688499927520752,
"learning_rate": 8.102908154210693e-05,
"loss": 2.8391,
"step": 247
},
{
"epoch": 0.6492146596858639,
"grad_norm": 1.8286641836166382,
"learning_rate": 8.085889736127103e-05,
"loss": 2.9778,
"step": 248
},
{
"epoch": 0.6518324607329843,
"grad_norm": 1.816272258758545,
"learning_rate": 8.068813374712688e-05,
"loss": 2.8969,
"step": 249
},
{
"epoch": 0.6544502617801047,
"grad_norm": 2.080170154571533,
"learning_rate": 8.05167939060796e-05,
"loss": 2.9861,
"step": 250
},
{
"epoch": 0.6570680628272252,
"grad_norm": 1.8223365545272827,
"learning_rate": 8.0344881055354e-05,
"loss": 2.9545,
"step": 251
},
{
"epoch": 0.6596858638743456,
"grad_norm": 1.891790509223938,
"learning_rate": 8.017239842293427e-05,
"loss": 2.8586,
"step": 252
},
{
"epoch": 0.662303664921466,
"grad_norm": 1.8906011581420898,
"learning_rate": 7.999934924750325e-05,
"loss": 2.94,
"step": 253
},
{
"epoch": 0.6649214659685864,
"grad_norm": 1.8247802257537842,
"learning_rate": 7.982573677838172e-05,
"loss": 2.6747,
"step": 254
},
{
"epoch": 0.6675392670157068,
"grad_norm": 1.8509819507598877,
"learning_rate": 7.965156427546735e-05,
"loss": 2.8795,
"step": 255
},
{
"epoch": 0.6701570680628273,
"grad_norm": 1.796658992767334,
"learning_rate": 7.947683500917347e-05,
"loss": 2.8549,
"step": 256
},
{
"epoch": 0.6727748691099477,
"grad_norm": 1.771183729171753,
"learning_rate": 7.93015522603677e-05,
"loss": 2.9275,
"step": 257
},
{
"epoch": 0.675392670157068,
"grad_norm": 1.8382376432418823,
"learning_rate": 7.91257193203103e-05,
"loss": 3.0237,
"step": 258
},
{
"epoch": 0.6780104712041884,
"grad_norm": 1.8723816871643066,
"learning_rate": 7.894933949059245e-05,
"loss": 2.6623,
"step": 259
},
{
"epoch": 0.680628272251309,
"grad_norm": 1.9213804006576538,
"learning_rate": 7.877241608307411e-05,
"loss": 2.8994,
"step": 260
},
{
"epoch": 0.6832460732984293,
"grad_norm": 1.7478219270706177,
"learning_rate": 7.8594952419822e-05,
"loss": 2.7949,
"step": 261
},
{
"epoch": 0.6858638743455497,
"grad_norm": 1.8188650608062744,
"learning_rate": 7.841695183304713e-05,
"loss": 2.8178,
"step": 262
},
{
"epoch": 0.6884816753926701,
"grad_norm": 1.8901054859161377,
"learning_rate": 7.823841766504227e-05,
"loss": 2.9314,
"step": 263
},
{
"epoch": 0.6910994764397905,
"grad_norm": 1.9486525058746338,
"learning_rate": 7.805935326811912e-05,
"loss": 2.7373,
"step": 264
},
{
"epoch": 0.693717277486911,
"grad_norm": 1.7464019060134888,
"learning_rate": 7.787976200454546e-05,
"loss": 2.8273,
"step": 265
},
{
"epoch": 0.6963350785340314,
"grad_norm": 1.8585435152053833,
"learning_rate": 7.769964724648196e-05,
"loss": 2.8128,
"step": 266
},
{
"epoch": 0.6989528795811518,
"grad_norm": 1.7956280708312988,
"learning_rate": 7.751901237591887e-05,
"loss": 2.7562,
"step": 267
},
{
"epoch": 0.7015706806282722,
"grad_norm": 2.0384879112243652,
"learning_rate": 7.733786078461252e-05,
"loss": 2.9731,
"step": 268
},
{
"epoch": 0.7041884816753927,
"grad_norm": 1.7548547983169556,
"learning_rate": 7.715619587402164e-05,
"loss": 2.834,
"step": 269
},
{
"epoch": 0.7068062827225131,
"grad_norm": 1.651876449584961,
"learning_rate": 7.697402105524351e-05,
"loss": 2.7768,
"step": 270
},
{
"epoch": 0.7094240837696335,
"grad_norm": 1.8058605194091797,
"learning_rate": 7.679133974894983e-05,
"loss": 2.7503,
"step": 271
},
{
"epoch": 0.7120418848167539,
"grad_norm": 1.921723484992981,
"learning_rate": 7.66081553853226e-05,
"loss": 2.7922,
"step": 272
},
{
"epoch": 0.7146596858638743,
"grad_norm": 1.990212321281433,
"learning_rate": 7.642447140398965e-05,
"loss": 2.8968,
"step": 273
},
{
"epoch": 0.7172774869109948,
"grad_norm": 1.8311606645584106,
"learning_rate": 7.624029125396004e-05,
"loss": 2.9963,
"step": 274
},
{
"epoch": 0.7198952879581152,
"grad_norm": 2.0072896480560303,
"learning_rate": 7.605561839355933e-05,
"loss": 2.875,
"step": 275
},
{
"epoch": 0.7225130890052356,
"grad_norm": 1.9953948259353638,
"learning_rate": 7.587045629036463e-05,
"loss": 2.9325,
"step": 276
},
{
"epoch": 0.725130890052356,
"grad_norm": 1.8839069604873657,
"learning_rate": 7.568480842113952e-05,
"loss": 2.682,
"step": 277
},
{
"epoch": 0.7277486910994765,
"grad_norm": 1.6751598119735718,
"learning_rate": 7.549867827176873e-05,
"loss": 2.7488,
"step": 278
},
{
"epoch": 0.7303664921465969,
"grad_norm": 1.7167940139770508,
"learning_rate": 7.53120693371927e-05,
"loss": 2.9112,
"step": 279
},
{
"epoch": 0.7329842931937173,
"grad_norm": 1.9378173351287842,
"learning_rate": 7.512498512134194e-05,
"loss": 2.9304,
"step": 280
},
{
"epoch": 0.7356020942408377,
"grad_norm": 1.7746607065200806,
"learning_rate": 7.493742913707127e-05,
"loss": 2.9898,
"step": 281
},
{
"epoch": 0.7382198952879581,
"grad_norm": 1.9426668882369995,
"learning_rate": 7.474940490609383e-05,
"loss": 2.9828,
"step": 282
},
{
"epoch": 0.7408376963350786,
"grad_norm": 1.8619779348373413,
"learning_rate": 7.456091595891498e-05,
"loss": 2.7965,
"step": 283
},
{
"epoch": 0.743455497382199,
"grad_norm": 1.691998839378357,
"learning_rate": 7.437196583476596e-05,
"loss": 2.8217,
"step": 284
},
{
"epoch": 0.7460732984293194,
"grad_norm": 1.8032022714614868,
"learning_rate": 7.41825580815375e-05,
"loss": 2.7456,
"step": 285
},
{
"epoch": 0.7486910994764397,
"grad_norm": 1.8445478677749634,
"learning_rate": 7.399269625571316e-05,
"loss": 2.855,
"step": 286
},
{
"epoch": 0.7513089005235603,
"grad_norm": 1.69001305103302,
"learning_rate": 7.380238392230257e-05,
"loss": 2.6786,
"step": 287
},
{
"epoch": 0.7539267015706806,
"grad_norm": 1.8098856210708618,
"learning_rate": 7.361162465477442e-05,
"loss": 2.8392,
"step": 288
},
{
"epoch": 0.756544502617801,
"grad_norm": 2.023061752319336,
"learning_rate": 7.342042203498951e-05,
"loss": 3.1742,
"step": 289
},
{
"epoch": 0.7591623036649214,
"grad_norm": 2.027721405029297,
"learning_rate": 7.322877965313335e-05,
"loss": 2.707,
"step": 290
},
{
"epoch": 0.7617801047120419,
"grad_norm": 2.092510938644409,
"learning_rate": 7.303670110764881e-05,
"loss": 2.9764,
"step": 291
},
{
"epoch": 0.7643979057591623,
"grad_norm": 1.794023871421814,
"learning_rate": 7.284419000516855e-05,
"loss": 2.8471,
"step": 292
},
{
"epoch": 0.7670157068062827,
"grad_norm": 1.998582124710083,
"learning_rate": 7.26512499604473e-05,
"loss": 2.8464,
"step": 293
},
{
"epoch": 0.7696335078534031,
"grad_norm": 1.9297953844070435,
"learning_rate": 7.245788459629396e-05,
"loss": 2.6567,
"step": 294
},
{
"epoch": 0.7722513089005235,
"grad_norm": 1.889047622680664,
"learning_rate": 7.226409754350361e-05,
"loss": 2.8468,
"step": 295
},
{
"epoch": 0.774869109947644,
"grad_norm": 1.806944489479065,
"learning_rate": 7.206989244078934e-05,
"loss": 2.7286,
"step": 296
},
{
"epoch": 0.7774869109947644,
"grad_norm": 1.7581151723861694,
"learning_rate": 7.187527293471385e-05,
"loss": 2.7098,
"step": 297
},
{
"epoch": 0.7801047120418848,
"grad_norm": 1.8031219244003296,
"learning_rate": 7.168024267962111e-05,
"loss": 2.746,
"step": 298
},
{
"epoch": 0.7827225130890052,
"grad_norm": 1.743693470954895,
"learning_rate": 7.14848053375676e-05,
"loss": 2.9825,
"step": 299
},
{
"epoch": 0.7853403141361257,
"grad_norm": 1.8501001596450806,
"learning_rate": 7.128896457825364e-05,
"loss": 2.7572,
"step": 300
},
{
"epoch": 0.7879581151832461,
"grad_norm": 1.8102787733078003,
"learning_rate": 7.109272407895449e-05,
"loss": 2.7863,
"step": 301
},
{
"epoch": 0.7905759162303665,
"grad_norm": 1.8646475076675415,
"learning_rate": 7.089608752445121e-05,
"loss": 2.8223,
"step": 302
},
{
"epoch": 0.7931937172774869,
"grad_norm": 1.9499090909957886,
"learning_rate": 7.069905860696162e-05,
"loss": 2.7525,
"step": 303
},
{
"epoch": 0.7958115183246073,
"grad_norm": 2.4195919036865234,
"learning_rate": 7.05016410260708e-05,
"loss": 2.7437,
"step": 304
},
{
"epoch": 0.7984293193717278,
"grad_norm": 2.0008833408355713,
"learning_rate": 7.030383848866177e-05,
"loss": 2.8387,
"step": 305
},
{
"epoch": 0.8010471204188482,
"grad_norm": 2.05761456489563,
"learning_rate": 7.010565470884582e-05,
"loss": 2.8233,
"step": 306
},
{
"epoch": 0.8036649214659686,
"grad_norm": 1.608881950378418,
"learning_rate": 6.990709340789273e-05,
"loss": 2.8514,
"step": 307
},
{
"epoch": 0.806282722513089,
"grad_norm": 1.8444130420684814,
"learning_rate": 6.970815831416099e-05,
"loss": 2.784,
"step": 308
},
{
"epoch": 0.8089005235602095,
"grad_norm": 1.946343183517456,
"learning_rate": 6.950885316302773e-05,
"loss": 2.6383,
"step": 309
},
{
"epoch": 0.8115183246073299,
"grad_norm": 1.7848812341690063,
"learning_rate": 6.93091816968186e-05,
"loss": 2.7524,
"step": 310
},
{
"epoch": 0.8141361256544503,
"grad_norm": 1.8199461698532104,
"learning_rate": 6.910914766473749e-05,
"loss": 2.7292,
"step": 311
},
{
"epoch": 0.8167539267015707,
"grad_norm": 1.8317044973373413,
"learning_rate": 6.890875482279614e-05,
"loss": 2.7381,
"step": 312
},
{
"epoch": 0.819371727748691,
"grad_norm": 1.7864313125610352,
"learning_rate": 6.870800693374364e-05,
"loss": 2.7642,
"step": 313
},
{
"epoch": 0.8219895287958116,
"grad_norm": 1.8122559785842896,
"learning_rate": 6.850690776699573e-05,
"loss": 2.6585,
"step": 314
},
{
"epoch": 0.824607329842932,
"grad_norm": 1.8325432538986206,
"learning_rate": 6.830546109856401e-05,
"loss": 2.7378,
"step": 315
},
{
"epoch": 0.8272251308900523,
"grad_norm": 1.7728067636489868,
"learning_rate": 6.810367071098516e-05,
"loss": 2.8454,
"step": 316
},
{
"epoch": 0.8298429319371727,
"grad_norm": 1.841179609298706,
"learning_rate": 6.790154039324975e-05,
"loss": 2.6204,
"step": 317
},
{
"epoch": 0.8324607329842932,
"grad_norm": 1.6985840797424316,
"learning_rate": 6.769907394073117e-05,
"loss": 2.905,
"step": 318
},
{
"epoch": 0.8350785340314136,
"grad_norm": 1.8315932750701904,
"learning_rate": 6.749627515511442e-05,
"loss": 2.7492,
"step": 319
},
{
"epoch": 0.837696335078534,
"grad_norm": 1.8084025382995605,
"learning_rate": 6.729314784432465e-05,
"loss": 2.752,
"step": 320
},
{
"epoch": 0.8403141361256544,
"grad_norm": 1.8621736764907837,
"learning_rate": 6.708969582245568e-05,
"loss": 2.6648,
"step": 321
},
{
"epoch": 0.8429319371727748,
"grad_norm": 1.7286403179168701,
"learning_rate": 6.688592290969837e-05,
"loss": 2.5931,
"step": 322
},
{
"epoch": 0.8455497382198953,
"grad_norm": 1.67673921585083,
"learning_rate": 6.668183293226891e-05,
"loss": 2.8364,
"step": 323
},
{
"epoch": 0.8481675392670157,
"grad_norm": 1.9970136880874634,
"learning_rate": 6.647742972233703e-05,
"loss": 2.888,
"step": 324
},
{
"epoch": 0.8507853403141361,
"grad_norm": 1.8752323389053345,
"learning_rate": 6.627271711795386e-05,
"loss": 3.0591,
"step": 325
},
{
"epoch": 0.8534031413612565,
"grad_norm": 1.7339110374450684,
"learning_rate": 6.606769896298014e-05,
"loss": 2.8724,
"step": 326
},
{
"epoch": 0.856020942408377,
"grad_norm": 1.8849775791168213,
"learning_rate": 6.586237910701374e-05,
"loss": 2.8541,
"step": 327
},
{
"epoch": 0.8586387434554974,
"grad_norm": 1.8441506624221802,
"learning_rate": 6.565676140531764e-05,
"loss": 2.9447,
"step": 328
},
{
"epoch": 0.8612565445026178,
"grad_norm": 1.7712169885635376,
"learning_rate": 6.545084971874738e-05,
"loss": 2.6989,
"step": 329
},
{
"epoch": 0.8638743455497382,
"grad_norm": 1.7494961023330688,
"learning_rate": 6.524464791367861e-05,
"loss": 2.7702,
"step": 330
},
{
"epoch": 0.8664921465968587,
"grad_norm": 2.063343048095703,
"learning_rate": 6.503815986193456e-05,
"loss": 2.746,
"step": 331
},
{
"epoch": 0.8691099476439791,
"grad_norm": 1.89371919631958,
"learning_rate": 6.483138944071316e-05,
"loss": 2.7176,
"step": 332
},
{
"epoch": 0.8717277486910995,
"grad_norm": 2.864008665084839,
"learning_rate": 6.462434053251446e-05,
"loss": 2.6897,
"step": 333
},
{
"epoch": 0.8743455497382199,
"grad_norm": 1.761522650718689,
"learning_rate": 6.441701702506754e-05,
"loss": 2.5764,
"step": 334
},
{
"epoch": 0.8769633507853403,
"grad_norm": 1.726992130279541,
"learning_rate": 6.420942281125765e-05,
"loss": 2.6313,
"step": 335
},
{
"epoch": 0.8795811518324608,
"grad_norm": 1.711841106414795,
"learning_rate": 6.400156178905308e-05,
"loss": 2.7067,
"step": 336
},
{
"epoch": 0.8821989528795812,
"grad_norm": 1.8135813474655151,
"learning_rate": 6.379343786143184e-05,
"loss": 2.78,
"step": 337
},
{
"epoch": 0.8848167539267016,
"grad_norm": 1.6920337677001953,
"learning_rate": 6.358505493630858e-05,
"loss": 2.8199,
"step": 338
},
{
"epoch": 0.887434554973822,
"grad_norm": 1.7479579448699951,
"learning_rate": 6.337641692646106e-05,
"loss": 2.8199,
"step": 339
},
{
"epoch": 0.8900523560209425,
"grad_norm": 1.778996467590332,
"learning_rate": 6.316752774945673e-05,
"loss": 2.6521,
"step": 340
},
{
"epoch": 0.8926701570680629,
"grad_norm": 1.964356541633606,
"learning_rate": 6.295839132757919e-05,
"loss": 2.7588,
"step": 341
},
{
"epoch": 0.8952879581151832,
"grad_norm": 1.6494593620300293,
"learning_rate": 6.274901158775454e-05,
"loss": 2.7191,
"step": 342
},
{
"epoch": 0.8979057591623036,
"grad_norm": 1.7057850360870361,
"learning_rate": 6.25393924614776e-05,
"loss": 2.5695,
"step": 343
},
{
"epoch": 0.900523560209424,
"grad_norm": 1.83281672000885,
"learning_rate": 6.232953788473811e-05,
"loss": 2.7329,
"step": 344
},
{
"epoch": 0.9031413612565445,
"grad_norm": 1.7066670656204224,
"learning_rate": 6.211945179794684e-05,
"loss": 2.6925,
"step": 345
},
{
"epoch": 0.9057591623036649,
"grad_norm": 1.7664719820022583,
"learning_rate": 6.190913814586162e-05,
"loss": 2.8158,
"step": 346
},
{
"epoch": 0.9083769633507853,
"grad_norm": 1.8208638429641724,
"learning_rate": 6.169860087751321e-05,
"loss": 2.6722,
"step": 347
},
{
"epoch": 0.9109947643979057,
"grad_norm": 1.891035556793213,
"learning_rate": 6.148784394613119e-05,
"loss": 2.7744,
"step": 348
},
{
"epoch": 0.9136125654450262,
"grad_norm": 1.7416268587112427,
"learning_rate": 6.127687130906972e-05,
"loss": 2.5742,
"step": 349
},
{
"epoch": 0.9162303664921466,
"grad_norm": 2.1573400497436523,
"learning_rate": 6.106568692773324e-05,
"loss": 2.7032,
"step": 350
},
{
"epoch": 0.918848167539267,
"grad_norm": 1.9572385549545288,
"learning_rate": 6.0854294767502084e-05,
"loss": 2.7508,
"step": 351
},
{
"epoch": 0.9214659685863874,
"grad_norm": 1.7982288599014282,
"learning_rate": 6.064269879765805e-05,
"loss": 2.8771,
"step": 352
},
{
"epoch": 0.9240837696335078,
"grad_norm": 1.6815402507781982,
"learning_rate": 6.043090299130978e-05,
"loss": 2.7996,
"step": 353
},
{
"epoch": 0.9267015706806283,
"grad_norm": 1.8650509119033813,
"learning_rate": 6.021891132531825e-05,
"loss": 2.7655,
"step": 354
},
{
"epoch": 0.9293193717277487,
"grad_norm": 1.7183104753494263,
"learning_rate": 6.000672778022208e-05,
"loss": 2.6123,
"step": 355
},
{
"epoch": 0.9319371727748691,
"grad_norm": 1.7947955131530762,
"learning_rate": 5.979435634016277e-05,
"loss": 2.6827,
"step": 356
},
{
"epoch": 0.9345549738219895,
"grad_norm": 1.6206470727920532,
"learning_rate": 5.95818009928099e-05,
"loss": 2.5481,
"step": 357
},
{
"epoch": 0.93717277486911,
"grad_norm": 1.8061578273773193,
"learning_rate": 5.9369065729286245e-05,
"loss": 2.7013,
"step": 358
},
{
"epoch": 0.9397905759162304,
"grad_norm": 1.7426854372024536,
"learning_rate": 5.9156154544092815e-05,
"loss": 2.8558,
"step": 359
},
{
"epoch": 0.9424083769633508,
"grad_norm": 1.7318534851074219,
"learning_rate": 5.894307143503393e-05,
"loss": 2.8222,
"step": 360
},
{
"epoch": 0.9450261780104712,
"grad_norm": 1.6571924686431885,
"learning_rate": 5.8729820403142054e-05,
"loss": 2.7089,
"step": 361
},
{
"epoch": 0.9476439790575916,
"grad_norm": 1.6273107528686523,
"learning_rate": 5.851640545260276e-05,
"loss": 2.5964,
"step": 362
},
{
"epoch": 0.9502617801047121,
"grad_norm": 1.7294092178344727,
"learning_rate": 5.830283059067947e-05,
"loss": 2.6222,
"step": 363
},
{
"epoch": 0.9528795811518325,
"grad_norm": 1.631488561630249,
"learning_rate": 5.808909982763825e-05,
"loss": 2.711,
"step": 364
},
{
"epoch": 0.9554973821989529,
"grad_norm": 1.7980823516845703,
"learning_rate": 5.787521717667247e-05,
"loss": 2.5482,
"step": 365
},
{
"epoch": 0.9581151832460733,
"grad_norm": 1.5978546142578125,
"learning_rate": 5.7661186653827535e-05,
"loss": 2.6563,
"step": 366
},
{
"epoch": 0.9607329842931938,
"grad_norm": 1.7057075500488281,
"learning_rate": 5.744701227792538e-05,
"loss": 2.7849,
"step": 367
},
{
"epoch": 0.9633507853403142,
"grad_norm": 1.9055167436599731,
"learning_rate": 5.7232698070489065e-05,
"loss": 2.8311,
"step": 368
},
{
"epoch": 0.9659685863874345,
"grad_norm": 1.6838874816894531,
"learning_rate": 5.701824805566722e-05,
"loss": 2.6986,
"step": 369
},
{
"epoch": 0.9685863874345549,
"grad_norm": 1.7164732217788696,
"learning_rate": 5.680366626015855e-05,
"loss": 2.7211,
"step": 370
},
{
"epoch": 0.9712041884816754,
"grad_norm": 4.532098770141602,
"learning_rate": 5.658895671313619e-05,
"loss": 2.6758,
"step": 371
},
{
"epoch": 0.9738219895287958,
"grad_norm": 1.914410948753357,
"learning_rate": 5.6374123446172e-05,
"loss": 2.8492,
"step": 372
},
{
"epoch": 0.9764397905759162,
"grad_norm": 1.6964002847671509,
"learning_rate": 5.615917049316095e-05,
"loss": 2.6859,
"step": 373
},
{
"epoch": 0.9790575916230366,
"grad_norm": 1.863756537437439,
"learning_rate": 5.5944101890245324e-05,
"loss": 2.6823,
"step": 374
},
{
"epoch": 0.981675392670157,
"grad_norm": 1.729614496231079,
"learning_rate": 5.5728921675738964e-05,
"loss": 2.6016,
"step": 375
},
{
"epoch": 0.9842931937172775,
"grad_norm": 1.7206858396530151,
"learning_rate": 5.551363389005144e-05,
"loss": 2.6316,
"step": 376
},
{
"epoch": 0.9869109947643979,
"grad_norm": 1.6375540494918823,
"learning_rate": 5.529824257561212e-05,
"loss": 2.5566,
"step": 377
},
{
"epoch": 0.9895287958115183,
"grad_norm": 1.7328511476516724,
"learning_rate": 5.508275177679436e-05,
"loss": 2.7835,
"step": 378
},
{
"epoch": 0.9921465968586387,
"grad_norm": 1.7834824323654175,
"learning_rate": 5.486716553983951e-05,
"loss": 2.6773,
"step": 379
},
{
"epoch": 0.9947643979057592,
"grad_norm": 1.8649531602859497,
"learning_rate": 5.4651487912780906e-05,
"loss": 2.8563,
"step": 380
},
{
"epoch": 0.9973821989528796,
"grad_norm": 1.7680840492248535,
"learning_rate": 5.443572294536801e-05,
"loss": 2.6923,
"step": 381
},
{
"epoch": 1.0,
"grad_norm": 2.223090648651123,
"learning_rate": 5.4219874688990146e-05,
"loss": 2.8072,
"step": 382
}
],
"logging_steps": 1,
"max_steps": 764,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 382,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.459853308847718e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}