Samuel J. Huskey
add: model files
0f68ee2
{
"best_metric": 0.9499685377695699,
"best_model_checkpoint": "./distilbert-finetuned/checkpoint-30520",
"epoch": 20.0,
"eval_steps": 500,
"global_step": 30520,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0655307994757536,
"grad_norm": 1.8972834348678589,
"learning_rate": 4.983617300131062e-05,
"loss": 8.0585,
"step": 100
},
{
"epoch": 0.1310615989515072,
"grad_norm": 1.7784675359725952,
"learning_rate": 4.9672346002621236e-05,
"loss": 8.0566,
"step": 200
},
{
"epoch": 0.1965923984272608,
"grad_norm": 2.920842409133911,
"learning_rate": 4.950851900393185e-05,
"loss": 8.0383,
"step": 300
},
{
"epoch": 0.2621231979030144,
"grad_norm": 3.6251978874206543,
"learning_rate": 4.934469200524246e-05,
"loss": 7.9945,
"step": 400
},
{
"epoch": 0.32765399737876805,
"grad_norm": 4.245726585388184,
"learning_rate": 4.918086500655309e-05,
"loss": 7.8561,
"step": 500
},
{
"epoch": 0.3931847968545216,
"grad_norm": 4.707456111907959,
"learning_rate": 4.90170380078637e-05,
"loss": 7.7312,
"step": 600
},
{
"epoch": 0.45871559633027525,
"grad_norm": 5.121094703674316,
"learning_rate": 4.8853211009174314e-05,
"loss": 7.5316,
"step": 700
},
{
"epoch": 0.5242463958060288,
"grad_norm": 5.974259376525879,
"learning_rate": 4.868938401048493e-05,
"loss": 7.3433,
"step": 800
},
{
"epoch": 0.5897771952817824,
"grad_norm": 5.776278972625732,
"learning_rate": 4.852555701179555e-05,
"loss": 7.0775,
"step": 900
},
{
"epoch": 0.6553079947575361,
"grad_norm": 6.829719543457031,
"learning_rate": 4.836173001310616e-05,
"loss": 6.8544,
"step": 1000
},
{
"epoch": 0.7208387942332897,
"grad_norm": 7.138682842254639,
"learning_rate": 4.819790301441678e-05,
"loss": 6.6293,
"step": 1100
},
{
"epoch": 0.7863695937090432,
"grad_norm": 6.803562641143799,
"learning_rate": 4.803407601572739e-05,
"loss": 6.3618,
"step": 1200
},
{
"epoch": 0.8519003931847968,
"grad_norm": 7.9476752281188965,
"learning_rate": 4.787024901703801e-05,
"loss": 6.0285,
"step": 1300
},
{
"epoch": 0.9174311926605505,
"grad_norm": 9.059676170349121,
"learning_rate": 4.7706422018348626e-05,
"loss": 5.7603,
"step": 1400
},
{
"epoch": 0.9829619921363041,
"grad_norm": 8.36684513092041,
"learning_rate": 4.754259501965924e-05,
"loss": 5.4373,
"step": 1500
},
{
"epoch": 1.0,
"eval_accuracy": 0.3351490825688073,
"eval_f1": 0.2757507443104033,
"eval_loss": 4.97554874420166,
"eval_runtime": 0.9115,
"eval_samples_per_second": 3826.458,
"eval_steps_per_second": 60.337,
"step": 1526
},
{
"epoch": 1.0484927916120577,
"grad_norm": 9.069089889526367,
"learning_rate": 4.737876802096986e-05,
"loss": 4.6364,
"step": 1600
},
{
"epoch": 1.1140235910878113,
"grad_norm": 9.420051574707031,
"learning_rate": 4.7214941022280476e-05,
"loss": 4.1063,
"step": 1700
},
{
"epoch": 1.1795543905635648,
"grad_norm": 9.825067520141602,
"learning_rate": 4.705111402359109e-05,
"loss": 3.7956,
"step": 1800
},
{
"epoch": 1.2450851900393185,
"grad_norm": 11.169500350952148,
"learning_rate": 4.68872870249017e-05,
"loss": 3.5422,
"step": 1900
},
{
"epoch": 1.310615989515072,
"grad_norm": 10.43234920501709,
"learning_rate": 4.672346002621232e-05,
"loss": 3.2258,
"step": 2000
},
{
"epoch": 1.3761467889908257,
"grad_norm": 9.745192527770996,
"learning_rate": 4.655963302752294e-05,
"loss": 2.9775,
"step": 2100
},
{
"epoch": 1.4416775884665793,
"grad_norm": 11.521934509277344,
"learning_rate": 4.6395806028833554e-05,
"loss": 2.8164,
"step": 2200
},
{
"epoch": 1.5072083879423328,
"grad_norm": 10.573694229125977,
"learning_rate": 4.623197903014417e-05,
"loss": 2.5647,
"step": 2300
},
{
"epoch": 1.5727391874180865,
"grad_norm": 11.856453895568848,
"learning_rate": 4.606815203145479e-05,
"loss": 2.3067,
"step": 2400
},
{
"epoch": 1.6382699868938402,
"grad_norm": 13.598255157470703,
"learning_rate": 4.59043250327654e-05,
"loss": 2.2179,
"step": 2500
},
{
"epoch": 1.7038007863695936,
"grad_norm": 9.973114013671875,
"learning_rate": 4.5740498034076015e-05,
"loss": 1.9873,
"step": 2600
},
{
"epoch": 1.7693315858453473,
"grad_norm": 11.870305061340332,
"learning_rate": 4.557667103538664e-05,
"loss": 1.844,
"step": 2700
},
{
"epoch": 1.834862385321101,
"grad_norm": 12.496658325195312,
"learning_rate": 4.541284403669725e-05,
"loss": 1.7231,
"step": 2800
},
{
"epoch": 1.9003931847968545,
"grad_norm": 9.312653541564941,
"learning_rate": 4.5249017038007866e-05,
"loss": 1.5158,
"step": 2900
},
{
"epoch": 1.9659239842726082,
"grad_norm": 12.817119598388672,
"learning_rate": 4.508519003931848e-05,
"loss": 1.4242,
"step": 3000
},
{
"epoch": 2.0,
"eval_accuracy": 0.7577408256880734,
"eval_f1": 0.7301710983333689,
"eval_loss": 1.4731484651565552,
"eval_runtime": 0.8691,
"eval_samples_per_second": 4013.239,
"eval_steps_per_second": 63.282,
"step": 3052
},
{
"epoch": 2.031454783748362,
"grad_norm": 10.233814239501953,
"learning_rate": 4.49213630406291e-05,
"loss": 1.1832,
"step": 3100
},
{
"epoch": 2.0969855832241153,
"grad_norm": 12.700181007385254,
"learning_rate": 4.475753604193971e-05,
"loss": 0.941,
"step": 3200
},
{
"epoch": 2.162516382699869,
"grad_norm": 12.155367851257324,
"learning_rate": 4.459370904325033e-05,
"loss": 0.9038,
"step": 3300
},
{
"epoch": 2.2280471821756227,
"grad_norm": 12.641743659973145,
"learning_rate": 4.4429882044560943e-05,
"loss": 0.8846,
"step": 3400
},
{
"epoch": 2.293577981651376,
"grad_norm": 11.405875205993652,
"learning_rate": 4.426605504587156e-05,
"loss": 0.8371,
"step": 3500
},
{
"epoch": 2.3591087811271296,
"grad_norm": 9.20864200592041,
"learning_rate": 4.410222804718218e-05,
"loss": 0.737,
"step": 3600
},
{
"epoch": 2.4246395806028835,
"grad_norm": 15.105806350708008,
"learning_rate": 4.3938401048492794e-05,
"loss": 0.7328,
"step": 3700
},
{
"epoch": 2.490170380078637,
"grad_norm": 7.7599310874938965,
"learning_rate": 4.3774574049803404e-05,
"loss": 0.7012,
"step": 3800
},
{
"epoch": 2.5557011795543905,
"grad_norm": 10.58204460144043,
"learning_rate": 4.361074705111403e-05,
"loss": 0.6169,
"step": 3900
},
{
"epoch": 2.621231979030144,
"grad_norm": 9.051236152648926,
"learning_rate": 4.344692005242464e-05,
"loss": 0.6348,
"step": 4000
},
{
"epoch": 2.686762778505898,
"grad_norm": 5.441799640655518,
"learning_rate": 4.3283093053735255e-05,
"loss": 0.5538,
"step": 4100
},
{
"epoch": 2.7522935779816513,
"grad_norm": 9.519750595092773,
"learning_rate": 4.311926605504588e-05,
"loss": 0.5626,
"step": 4200
},
{
"epoch": 2.8178243774574048,
"grad_norm": 6.30112886428833,
"learning_rate": 4.295543905635649e-05,
"loss": 0.5072,
"step": 4300
},
{
"epoch": 2.8833551769331587,
"grad_norm": 11.238988876342773,
"learning_rate": 4.2791612057667106e-05,
"loss": 0.482,
"step": 4400
},
{
"epoch": 2.948885976408912,
"grad_norm": 8.047210693359375,
"learning_rate": 4.262778505897772e-05,
"loss": 0.5018,
"step": 4500
},
{
"epoch": 3.0,
"eval_accuracy": 0.8847477064220184,
"eval_f1": 0.8750008035415145,
"eval_loss": 0.6360189318656921,
"eval_runtime": 0.9139,
"eval_samples_per_second": 3816.692,
"eval_steps_per_second": 60.183,
"step": 4578
},
{
"epoch": 3.0144167758846656,
"grad_norm": 5.651986598968506,
"learning_rate": 4.246395806028834e-05,
"loss": 0.4096,
"step": 4600
},
{
"epoch": 3.0799475753604195,
"grad_norm": 4.4963274002075195,
"learning_rate": 4.230013106159895e-05,
"loss": 0.2835,
"step": 4700
},
{
"epoch": 3.145478374836173,
"grad_norm": 3.222943067550659,
"learning_rate": 4.2136304062909573e-05,
"loss": 0.2747,
"step": 4800
},
{
"epoch": 3.2110091743119265,
"grad_norm": 9.179097175598145,
"learning_rate": 4.1972477064220184e-05,
"loss": 0.2935,
"step": 4900
},
{
"epoch": 3.2765399737876804,
"grad_norm": 6.673375606536865,
"learning_rate": 4.18086500655308e-05,
"loss": 0.2708,
"step": 5000
},
{
"epoch": 3.342070773263434,
"grad_norm": 6.3241400718688965,
"learning_rate": 4.164482306684142e-05,
"loss": 0.2805,
"step": 5100
},
{
"epoch": 3.4076015727391873,
"grad_norm": 3.614450216293335,
"learning_rate": 4.1480996068152034e-05,
"loss": 0.2383,
"step": 5200
},
{
"epoch": 3.473132372214941,
"grad_norm": 1.6470447778701782,
"learning_rate": 4.1317169069462644e-05,
"loss": 0.205,
"step": 5300
},
{
"epoch": 3.5386631716906947,
"grad_norm": 3.095306873321533,
"learning_rate": 4.115334207077327e-05,
"loss": 0.2245,
"step": 5400
},
{
"epoch": 3.604193971166448,
"grad_norm": 2.1946816444396973,
"learning_rate": 4.0989515072083885e-05,
"loss": 0.2159,
"step": 5500
},
{
"epoch": 3.669724770642202,
"grad_norm": 4.0140886306762695,
"learning_rate": 4.0825688073394495e-05,
"loss": 0.2264,
"step": 5600
},
{
"epoch": 3.7352555701179555,
"grad_norm": 4.582081317901611,
"learning_rate": 4.066186107470511e-05,
"loss": 0.2393,
"step": 5700
},
{
"epoch": 3.800786369593709,
"grad_norm": 3.2640044689178467,
"learning_rate": 4.049803407601573e-05,
"loss": 0.1792,
"step": 5800
},
{
"epoch": 3.866317169069463,
"grad_norm": 4.081344127655029,
"learning_rate": 4.0334207077326346e-05,
"loss": 0.1883,
"step": 5900
},
{
"epoch": 3.9318479685452163,
"grad_norm": 7.787130355834961,
"learning_rate": 4.017038007863696e-05,
"loss": 0.2032,
"step": 6000
},
{
"epoch": 3.99737876802097,
"grad_norm": 10.889232635498047,
"learning_rate": 4.000655307994758e-05,
"loss": 0.1863,
"step": 6100
},
{
"epoch": 4.0,
"eval_accuracy": 0.9208715596330275,
"eval_f1": 0.91669118025196,
"eval_loss": 0.4279778301715851,
"eval_runtime": 0.9015,
"eval_samples_per_second": 3869.286,
"eval_steps_per_second": 61.012,
"step": 6104
},
{
"epoch": 4.062909567496724,
"grad_norm": 3.0653154850006104,
"learning_rate": 3.984272608125819e-05,
"loss": 0.1164,
"step": 6200
},
{
"epoch": 4.128440366972477,
"grad_norm": 4.659291744232178,
"learning_rate": 3.967889908256881e-05,
"loss": 0.1092,
"step": 6300
},
{
"epoch": 4.193971166448231,
"grad_norm": 1.27858567237854,
"learning_rate": 3.9515072083879424e-05,
"loss": 0.1057,
"step": 6400
},
{
"epoch": 4.259501965923985,
"grad_norm": 1.6424704790115356,
"learning_rate": 3.935124508519004e-05,
"loss": 0.1048,
"step": 6500
},
{
"epoch": 4.325032765399738,
"grad_norm": 2.527622938156128,
"learning_rate": 3.918741808650066e-05,
"loss": 0.1116,
"step": 6600
},
{
"epoch": 4.3905635648754915,
"grad_norm": 2.611750364303589,
"learning_rate": 3.9023591087811274e-05,
"loss": 0.1106,
"step": 6700
},
{
"epoch": 4.456094364351245,
"grad_norm": 3.4234442710876465,
"learning_rate": 3.885976408912189e-05,
"loss": 0.0898,
"step": 6800
},
{
"epoch": 4.521625163826998,
"grad_norm": 0.4667866826057434,
"learning_rate": 3.86959370904325e-05,
"loss": 0.0793,
"step": 6900
},
{
"epoch": 4.587155963302752,
"grad_norm": 4.575076580047607,
"learning_rate": 3.8532110091743125e-05,
"loss": 0.0811,
"step": 7000
},
{
"epoch": 4.652686762778506,
"grad_norm": 0.7901601195335388,
"learning_rate": 3.8368283093053735e-05,
"loss": 0.0871,
"step": 7100
},
{
"epoch": 4.718217562254259,
"grad_norm": 2.668879270553589,
"learning_rate": 3.820445609436435e-05,
"loss": 0.0834,
"step": 7200
},
{
"epoch": 4.783748361730013,
"grad_norm": 6.539068698883057,
"learning_rate": 3.804062909567497e-05,
"loss": 0.0755,
"step": 7300
},
{
"epoch": 4.849279161205767,
"grad_norm": 2.785691738128662,
"learning_rate": 3.7876802096985586e-05,
"loss": 0.0869,
"step": 7400
},
{
"epoch": 4.91480996068152,
"grad_norm": 1.0723165273666382,
"learning_rate": 3.7712975098296196e-05,
"loss": 0.1016,
"step": 7500
},
{
"epoch": 4.980340760157274,
"grad_norm": 7.655533790588379,
"learning_rate": 3.754914809960682e-05,
"loss": 0.082,
"step": 7600
},
{
"epoch": 5.0,
"eval_accuracy": 0.9323394495412844,
"eval_f1": 0.9303857696202881,
"eval_loss": 0.3835083246231079,
"eval_runtime": 0.8868,
"eval_samples_per_second": 3933.101,
"eval_steps_per_second": 62.019,
"step": 7630
},
{
"epoch": 5.045871559633028,
"grad_norm": 6.8434882164001465,
"learning_rate": 3.738532110091743e-05,
"loss": 0.0531,
"step": 7700
},
{
"epoch": 5.111402359108781,
"grad_norm": 11.0233736038208,
"learning_rate": 3.722149410222805e-05,
"loss": 0.0401,
"step": 7800
},
{
"epoch": 5.176933158584535,
"grad_norm": 1.0623722076416016,
"learning_rate": 3.7057667103538664e-05,
"loss": 0.0497,
"step": 7900
},
{
"epoch": 5.242463958060289,
"grad_norm": 0.9341715574264526,
"learning_rate": 3.689384010484928e-05,
"loss": 0.0384,
"step": 8000
},
{
"epoch": 5.307994757536042,
"grad_norm": 0.4840922951698303,
"learning_rate": 3.67300131061599e-05,
"loss": 0.0419,
"step": 8100
},
{
"epoch": 5.373525557011796,
"grad_norm": 1.7515119314193726,
"learning_rate": 3.6566186107470514e-05,
"loss": 0.0467,
"step": 8200
},
{
"epoch": 5.43905635648755,
"grad_norm": 0.6603133082389832,
"learning_rate": 3.640235910878113e-05,
"loss": 0.0437,
"step": 8300
},
{
"epoch": 5.504587155963303,
"grad_norm": 3.0891926288604736,
"learning_rate": 3.623853211009174e-05,
"loss": 0.048,
"step": 8400
},
{
"epoch": 5.5701179554390565,
"grad_norm": 0.4996233582496643,
"learning_rate": 3.6074705111402365e-05,
"loss": 0.0444,
"step": 8500
},
{
"epoch": 5.6356487549148095,
"grad_norm": 2.793330430984497,
"learning_rate": 3.5910878112712975e-05,
"loss": 0.0458,
"step": 8600
},
{
"epoch": 5.7011795543905635,
"grad_norm": 0.8790336847305298,
"learning_rate": 3.574705111402359e-05,
"loss": 0.0491,
"step": 8700
},
{
"epoch": 5.766710353866317,
"grad_norm": 0.37869125604629517,
"learning_rate": 3.558322411533421e-05,
"loss": 0.0322,
"step": 8800
},
{
"epoch": 5.832241153342071,
"grad_norm": 0.6503167152404785,
"learning_rate": 3.5419397116644826e-05,
"loss": 0.051,
"step": 8900
},
{
"epoch": 5.897771952817824,
"grad_norm": 0.16301073133945465,
"learning_rate": 3.5255570117955436e-05,
"loss": 0.0492,
"step": 9000
},
{
"epoch": 5.963302752293578,
"grad_norm": 0.3980591893196106,
"learning_rate": 3.509174311926606e-05,
"loss": 0.038,
"step": 9100
},
{
"epoch": 6.0,
"eval_accuracy": 0.9352064220183486,
"eval_f1": 0.9317561038604617,
"eval_loss": 0.3840370178222656,
"eval_runtime": 0.8822,
"eval_samples_per_second": 3953.595,
"eval_steps_per_second": 62.342,
"step": 9156
},
{
"epoch": 6.028833551769331,
"grad_norm": 0.24182792007923126,
"learning_rate": 3.492791612057667e-05,
"loss": 0.0335,
"step": 9200
},
{
"epoch": 6.094364351245085,
"grad_norm": 0.41973385214805603,
"learning_rate": 3.476408912188729e-05,
"loss": 0.0226,
"step": 9300
},
{
"epoch": 6.159895150720839,
"grad_norm": 0.933502197265625,
"learning_rate": 3.460026212319791e-05,
"loss": 0.0268,
"step": 9400
},
{
"epoch": 6.225425950196592,
"grad_norm": 2.3950750827789307,
"learning_rate": 3.443643512450852e-05,
"loss": 0.0253,
"step": 9500
},
{
"epoch": 6.290956749672346,
"grad_norm": 0.6362214088439941,
"learning_rate": 3.427260812581914e-05,
"loss": 0.0205,
"step": 9600
},
{
"epoch": 6.3564875491481,
"grad_norm": 0.22217431664466858,
"learning_rate": 3.4108781127129755e-05,
"loss": 0.0214,
"step": 9700
},
{
"epoch": 6.422018348623853,
"grad_norm": 0.2135070413351059,
"learning_rate": 3.394495412844037e-05,
"loss": 0.0268,
"step": 9800
},
{
"epoch": 6.487549148099607,
"grad_norm": 5.583222389221191,
"learning_rate": 3.378112712975098e-05,
"loss": 0.0236,
"step": 9900
},
{
"epoch": 6.553079947575361,
"grad_norm": 0.9507617354393005,
"learning_rate": 3.3617300131061605e-05,
"loss": 0.0285,
"step": 10000
},
{
"epoch": 6.618610747051114,
"grad_norm": 3.5924887657165527,
"learning_rate": 3.3453473132372215e-05,
"loss": 0.0196,
"step": 10100
},
{
"epoch": 6.684141546526868,
"grad_norm": 0.3054388463497162,
"learning_rate": 3.328964613368283e-05,
"loss": 0.0162,
"step": 10200
},
{
"epoch": 6.749672346002622,
"grad_norm": 0.09917047619819641,
"learning_rate": 3.312581913499345e-05,
"loss": 0.0203,
"step": 10300
},
{
"epoch": 6.815203145478375,
"grad_norm": 10.647476196289062,
"learning_rate": 3.2961992136304066e-05,
"loss": 0.0293,
"step": 10400
},
{
"epoch": 6.8807339449541285,
"grad_norm": 0.5372545123100281,
"learning_rate": 3.2798165137614676e-05,
"loss": 0.0157,
"step": 10500
},
{
"epoch": 6.946264744429882,
"grad_norm": 0.11427264660596848,
"learning_rate": 3.26343381389253e-05,
"loss": 0.0254,
"step": 10600
},
{
"epoch": 7.0,
"eval_accuracy": 0.9386467889908257,
"eval_f1": 0.9363697162292346,
"eval_loss": 0.3672682046890259,
"eval_runtime": 0.8719,
"eval_samples_per_second": 4000.361,
"eval_steps_per_second": 63.079,
"step": 10682
},
{
"epoch": 7.011795543905635,
"grad_norm": 5.333648204803467,
"learning_rate": 3.247051114023591e-05,
"loss": 0.0202,
"step": 10700
},
{
"epoch": 7.077326343381389,
"grad_norm": 0.7541437149047852,
"learning_rate": 3.230668414154653e-05,
"loss": 0.022,
"step": 10800
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.06409142166376114,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.0134,
"step": 10900
},
{
"epoch": 7.208387942332896,
"grad_norm": 6.461215972900391,
"learning_rate": 3.197903014416776e-05,
"loss": 0.0163,
"step": 11000
},
{
"epoch": 7.27391874180865,
"grad_norm": 0.5002973675727844,
"learning_rate": 3.181520314547838e-05,
"loss": 0.0131,
"step": 11100
},
{
"epoch": 7.339449541284404,
"grad_norm": 0.16848881542682648,
"learning_rate": 3.1651376146788995e-05,
"loss": 0.0139,
"step": 11200
},
{
"epoch": 7.404980340760157,
"grad_norm": 0.32861247658729553,
"learning_rate": 3.148754914809961e-05,
"loss": 0.019,
"step": 11300
},
{
"epoch": 7.470511140235911,
"grad_norm": 1.0504356622695923,
"learning_rate": 3.132372214941022e-05,
"loss": 0.0112,
"step": 11400
},
{
"epoch": 7.536041939711664,
"grad_norm": 0.2850879430770874,
"learning_rate": 3.115989515072084e-05,
"loss": 0.0164,
"step": 11500
},
{
"epoch": 7.601572739187418,
"grad_norm": 0.8835840225219727,
"learning_rate": 3.0996068152031456e-05,
"loss": 0.0195,
"step": 11600
},
{
"epoch": 7.667103538663172,
"grad_norm": 0.16748446226119995,
"learning_rate": 3.083224115334207e-05,
"loss": 0.0143,
"step": 11700
},
{
"epoch": 7.732634338138926,
"grad_norm": 0.5115922689437866,
"learning_rate": 3.066841415465269e-05,
"loss": 0.0195,
"step": 11800
},
{
"epoch": 7.798165137614679,
"grad_norm": 0.17508633434772491,
"learning_rate": 3.0504587155963303e-05,
"loss": 0.0124,
"step": 11900
},
{
"epoch": 7.863695937090433,
"grad_norm": 0.06938499212265015,
"learning_rate": 3.0340760157273916e-05,
"loss": 0.0151,
"step": 12000
},
{
"epoch": 7.929226736566186,
"grad_norm": 1.1097829341888428,
"learning_rate": 3.0176933158584537e-05,
"loss": 0.0162,
"step": 12100
},
{
"epoch": 7.99475753604194,
"grad_norm": 0.474563866853714,
"learning_rate": 3.0013106159895154e-05,
"loss": 0.0142,
"step": 12200
},
{
"epoch": 8.0,
"eval_accuracy": 0.9369266055045872,
"eval_f1": 0.9333123982923296,
"eval_loss": 0.39479872584342957,
"eval_runtime": 0.8765,
"eval_samples_per_second": 3979.645,
"eval_steps_per_second": 62.752,
"step": 12208
},
{
"epoch": 8.060288335517694,
"grad_norm": 2.1137709617614746,
"learning_rate": 2.9849279161205767e-05,
"loss": 0.0087,
"step": 12300
},
{
"epoch": 8.125819134993447,
"grad_norm": 0.14595501124858856,
"learning_rate": 2.9685452162516387e-05,
"loss": 0.0097,
"step": 12400
},
{
"epoch": 8.191349934469201,
"grad_norm": 0.038920313119888306,
"learning_rate": 2.9521625163826998e-05,
"loss": 0.0132,
"step": 12500
},
{
"epoch": 8.256880733944953,
"grad_norm": 0.03425636142492294,
"learning_rate": 2.9357798165137618e-05,
"loss": 0.0073,
"step": 12600
},
{
"epoch": 8.322411533420707,
"grad_norm": 0.23988936841487885,
"learning_rate": 2.919397116644823e-05,
"loss": 0.0091,
"step": 12700
},
{
"epoch": 8.387942332896461,
"grad_norm": 0.10584782809019089,
"learning_rate": 2.9030144167758848e-05,
"loss": 0.0083,
"step": 12800
},
{
"epoch": 8.453473132372215,
"grad_norm": 0.09316133707761765,
"learning_rate": 2.8866317169069462e-05,
"loss": 0.0107,
"step": 12900
},
{
"epoch": 8.51900393184797,
"grad_norm": 0.6492702960968018,
"learning_rate": 2.8702490170380082e-05,
"loss": 0.012,
"step": 13000
},
{
"epoch": 8.584534731323721,
"grad_norm": 0.19327221810817719,
"learning_rate": 2.8538663171690692e-05,
"loss": 0.0087,
"step": 13100
},
{
"epoch": 8.650065530799475,
"grad_norm": 0.13705046474933624,
"learning_rate": 2.8374836173001313e-05,
"loss": 0.0082,
"step": 13200
},
{
"epoch": 8.715596330275229,
"grad_norm": 0.16649670898914337,
"learning_rate": 2.8211009174311926e-05,
"loss": 0.0083,
"step": 13300
},
{
"epoch": 8.781127129750983,
"grad_norm": 0.4147738218307495,
"learning_rate": 2.8047182175622543e-05,
"loss": 0.0099,
"step": 13400
},
{
"epoch": 8.846657929226737,
"grad_norm": 0.2398168295621872,
"learning_rate": 2.7883355176933163e-05,
"loss": 0.0039,
"step": 13500
},
{
"epoch": 8.91218872870249,
"grad_norm": 2.4962239265441895,
"learning_rate": 2.7719528178243777e-05,
"loss": 0.0173,
"step": 13600
},
{
"epoch": 8.977719528178245,
"grad_norm": 0.03992962837219238,
"learning_rate": 2.7555701179554394e-05,
"loss": 0.0103,
"step": 13700
},
{
"epoch": 9.0,
"eval_accuracy": 0.9409403669724771,
"eval_f1": 0.939672650552811,
"eval_loss": 0.3894253671169281,
"eval_runtime": 0.8797,
"eval_samples_per_second": 3965.161,
"eval_steps_per_second": 62.524,
"step": 13734
},
{
"epoch": 9.043250327653997,
"grad_norm": 0.19869303703308105,
"learning_rate": 2.7391874180865007e-05,
"loss": 0.0061,
"step": 13800
},
{
"epoch": 9.10878112712975,
"grad_norm": 0.08110935240983963,
"learning_rate": 2.7228047182175624e-05,
"loss": 0.0086,
"step": 13900
},
{
"epoch": 9.174311926605505,
"grad_norm": 3.7491238117218018,
"learning_rate": 2.7064220183486238e-05,
"loss": 0.0092,
"step": 14000
},
{
"epoch": 9.239842726081259,
"grad_norm": 0.04541350528597832,
"learning_rate": 2.6900393184796858e-05,
"loss": 0.0125,
"step": 14100
},
{
"epoch": 9.305373525557012,
"grad_norm": 0.02239099144935608,
"learning_rate": 2.673656618610747e-05,
"loss": 0.007,
"step": 14200
},
{
"epoch": 9.370904325032765,
"grad_norm": 0.5650951862335205,
"learning_rate": 2.657273918741809e-05,
"loss": 0.0047,
"step": 14300
},
{
"epoch": 9.436435124508519,
"grad_norm": 0.012850129045546055,
"learning_rate": 2.6408912188728702e-05,
"loss": 0.0107,
"step": 14400
},
{
"epoch": 9.501965923984272,
"grad_norm": 0.015726063400506973,
"learning_rate": 2.624508519003932e-05,
"loss": 0.0156,
"step": 14500
},
{
"epoch": 9.567496723460026,
"grad_norm": 0.0906534269452095,
"learning_rate": 2.6081258191349932e-05,
"loss": 0.0079,
"step": 14600
},
{
"epoch": 9.63302752293578,
"grad_norm": 0.23419497907161713,
"learning_rate": 2.5917431192660553e-05,
"loss": 0.0051,
"step": 14700
},
{
"epoch": 9.698558322411534,
"grad_norm": 0.0450860969722271,
"learning_rate": 2.575360419397117e-05,
"loss": 0.008,
"step": 14800
},
{
"epoch": 9.764089121887286,
"grad_norm": 3.0917110443115234,
"learning_rate": 2.5589777195281783e-05,
"loss": 0.0044,
"step": 14900
},
{
"epoch": 9.82961992136304,
"grad_norm": 1.0102367401123047,
"learning_rate": 2.5425950196592403e-05,
"loss": 0.0081,
"step": 15000
},
{
"epoch": 9.895150720838794,
"grad_norm": 0.39359351992607117,
"learning_rate": 2.5262123197903013e-05,
"loss": 0.0061,
"step": 15100
},
{
"epoch": 9.960681520314548,
"grad_norm": 0.03703628107905388,
"learning_rate": 2.5098296199213634e-05,
"loss": 0.0074,
"step": 15200
},
{
"epoch": 10.0,
"eval_accuracy": 0.9423738532110092,
"eval_f1": 0.9400918780505937,
"eval_loss": 0.39878183603286743,
"eval_runtime": 0.8893,
"eval_samples_per_second": 3922.19,
"eval_steps_per_second": 61.846,
"step": 15260
},
{
"epoch": 10.026212319790302,
"grad_norm": 0.055755846202373505,
"learning_rate": 2.4934469200524247e-05,
"loss": 0.0079,
"step": 15300
},
{
"epoch": 10.091743119266056,
"grad_norm": 0.21388879418373108,
"learning_rate": 2.4770642201834864e-05,
"loss": 0.0015,
"step": 15400
},
{
"epoch": 10.157273918741808,
"grad_norm": 0.05140744522213936,
"learning_rate": 2.460681520314548e-05,
"loss": 0.0081,
"step": 15500
},
{
"epoch": 10.222804718217562,
"grad_norm": 0.1071576178073883,
"learning_rate": 2.4442988204456098e-05,
"loss": 0.0109,
"step": 15600
},
{
"epoch": 10.288335517693316,
"grad_norm": 0.037079449743032455,
"learning_rate": 2.427916120576671e-05,
"loss": 0.0025,
"step": 15700
},
{
"epoch": 10.35386631716907,
"grad_norm": 0.08620253950357437,
"learning_rate": 2.411533420707733e-05,
"loss": 0.0067,
"step": 15800
},
{
"epoch": 10.419397116644824,
"grad_norm": 0.053181178867816925,
"learning_rate": 2.3951507208387945e-05,
"loss": 0.003,
"step": 15900
},
{
"epoch": 10.484927916120578,
"grad_norm": 0.019558211788535118,
"learning_rate": 2.378768020969856e-05,
"loss": 0.0015,
"step": 16000
},
{
"epoch": 10.55045871559633,
"grad_norm": 0.5422232151031494,
"learning_rate": 2.3623853211009176e-05,
"loss": 0.0182,
"step": 16100
},
{
"epoch": 10.615989515072084,
"grad_norm": 0.15415391325950623,
"learning_rate": 2.3460026212319793e-05,
"loss": 0.0043,
"step": 16200
},
{
"epoch": 10.681520314547837,
"grad_norm": 0.33526667952537537,
"learning_rate": 2.3296199213630406e-05,
"loss": 0.0015,
"step": 16300
},
{
"epoch": 10.747051114023591,
"grad_norm": 0.5435523986816406,
"learning_rate": 2.3132372214941023e-05,
"loss": 0.002,
"step": 16400
},
{
"epoch": 10.812581913499345,
"grad_norm": 0.03789573162794113,
"learning_rate": 2.296854521625164e-05,
"loss": 0.0017,
"step": 16500
},
{
"epoch": 10.8781127129751,
"grad_norm": 0.46037229895591736,
"learning_rate": 2.2804718217562254e-05,
"loss": 0.0047,
"step": 16600
},
{
"epoch": 10.943643512450851,
"grad_norm": 0.018136654049158096,
"learning_rate": 2.264089121887287e-05,
"loss": 0.0026,
"step": 16700
},
{
"epoch": 11.0,
"eval_accuracy": 0.9412270642201835,
"eval_f1": 0.9393162647749803,
"eval_loss": 0.3977106511592865,
"eval_runtime": 0.8728,
"eval_samples_per_second": 3996.231,
"eval_steps_per_second": 63.014,
"step": 16786
},
{
"epoch": 11.009174311926605,
"grad_norm": 0.029348287731409073,
"learning_rate": 2.2477064220183487e-05,
"loss": 0.0071,
"step": 16800
},
{
"epoch": 11.07470511140236,
"grad_norm": 0.030079521238803864,
"learning_rate": 2.2313237221494104e-05,
"loss": 0.0051,
"step": 16900
},
{
"epoch": 11.140235910878113,
"grad_norm": 0.014014150016009808,
"learning_rate": 2.214941022280472e-05,
"loss": 0.0047,
"step": 17000
},
{
"epoch": 11.205766710353867,
"grad_norm": 0.028272485360503197,
"learning_rate": 2.1985583224115335e-05,
"loss": 0.0027,
"step": 17100
},
{
"epoch": 11.271297509829619,
"grad_norm": 0.008005212992429733,
"learning_rate": 2.182175622542595e-05,
"loss": 0.0021,
"step": 17200
},
{
"epoch": 11.336828309305373,
"grad_norm": 0.18947385251522064,
"learning_rate": 2.165792922673657e-05,
"loss": 0.0021,
"step": 17300
},
{
"epoch": 11.402359108781127,
"grad_norm": 0.021595077589154243,
"learning_rate": 2.1494102228047182e-05,
"loss": 0.0112,
"step": 17400
},
{
"epoch": 11.46788990825688,
"grad_norm": 0.10984991490840912,
"learning_rate": 2.13302752293578e-05,
"loss": 0.0023,
"step": 17500
},
{
"epoch": 11.533420707732635,
"grad_norm": 0.04003112018108368,
"learning_rate": 2.1166448230668416e-05,
"loss": 0.0069,
"step": 17600
},
{
"epoch": 11.598951507208389,
"grad_norm": 0.03961130604147911,
"learning_rate": 2.100262123197903e-05,
"loss": 0.0045,
"step": 17700
},
{
"epoch": 11.66448230668414,
"grad_norm": 1.8393652439117432,
"learning_rate": 2.0838794233289646e-05,
"loss": 0.0074,
"step": 17800
},
{
"epoch": 11.730013106159895,
"grad_norm": 0.05228583887219429,
"learning_rate": 2.0674967234600263e-05,
"loss": 0.0074,
"step": 17900
},
{
"epoch": 11.795543905635649,
"grad_norm": 0.06190050393342972,
"learning_rate": 2.0511140235910877e-05,
"loss": 0.0017,
"step": 18000
},
{
"epoch": 11.861074705111402,
"grad_norm": 0.06853855401277542,
"learning_rate": 2.0347313237221497e-05,
"loss": 0.0065,
"step": 18100
},
{
"epoch": 11.926605504587156,
"grad_norm": 0.13988357782363892,
"learning_rate": 2.018348623853211e-05,
"loss": 0.0027,
"step": 18200
},
{
"epoch": 11.99213630406291,
"grad_norm": 0.13300713896751404,
"learning_rate": 2.0019659239842727e-05,
"loss": 0.0006,
"step": 18300
},
{
"epoch": 12.0,
"eval_accuracy": 0.9463876146788991,
"eval_f1": 0.9448165287184782,
"eval_loss": 0.37657901644706726,
"eval_runtime": 0.8735,
"eval_samples_per_second": 3992.99,
"eval_steps_per_second": 62.963,
"step": 18312
},
{
"epoch": 12.057667103538662,
"grad_norm": 0.021236807107925415,
"learning_rate": 1.9855832241153344e-05,
"loss": 0.0056,
"step": 18400
},
{
"epoch": 12.123197903014416,
"grad_norm": 0.03615666553378105,
"learning_rate": 1.9692005242463958e-05,
"loss": 0.0061,
"step": 18500
},
{
"epoch": 12.18872870249017,
"grad_norm": 0.045921873301267624,
"learning_rate": 1.9528178243774575e-05,
"loss": 0.0007,
"step": 18600
},
{
"epoch": 12.254259501965924,
"grad_norm": 0.023187097162008286,
"learning_rate": 1.9364351245085192e-05,
"loss": 0.0006,
"step": 18700
},
{
"epoch": 12.319790301441678,
"grad_norm": 0.007863562554121017,
"learning_rate": 1.9200524246395805e-05,
"loss": 0.0027,
"step": 18800
},
{
"epoch": 12.385321100917432,
"grad_norm": 0.028795627877116203,
"learning_rate": 1.9036697247706422e-05,
"loss": 0.0076,
"step": 18900
},
{
"epoch": 12.450851900393184,
"grad_norm": 0.030297929421067238,
"learning_rate": 1.887287024901704e-05,
"loss": 0.0009,
"step": 19000
},
{
"epoch": 12.516382699868938,
"grad_norm": 1.041812777519226,
"learning_rate": 1.8709043250327653e-05,
"loss": 0.0009,
"step": 19100
},
{
"epoch": 12.581913499344692,
"grad_norm": 0.04054298996925354,
"learning_rate": 1.854521625163827e-05,
"loss": 0.0084,
"step": 19200
},
{
"epoch": 12.647444298820446,
"grad_norm": 0.019086740911006927,
"learning_rate": 1.8381389252948886e-05,
"loss": 0.0011,
"step": 19300
},
{
"epoch": 12.7129750982962,
"grad_norm": 0.04095865413546562,
"learning_rate": 1.82175622542595e-05,
"loss": 0.0009,
"step": 19400
},
{
"epoch": 12.778505897771954,
"grad_norm": 0.021935787051916122,
"learning_rate": 1.805373525557012e-05,
"loss": 0.0005,
"step": 19500
},
{
"epoch": 12.844036697247706,
"grad_norm": 0.1897253543138504,
"learning_rate": 1.7889908256880737e-05,
"loss": 0.0019,
"step": 19600
},
{
"epoch": 12.90956749672346,
"grad_norm": 0.07480347901582718,
"learning_rate": 1.772608125819135e-05,
"loss": 0.0013,
"step": 19700
},
{
"epoch": 12.975098296199214,
"grad_norm": 0.00786515325307846,
"learning_rate": 1.7562254259501968e-05,
"loss": 0.0005,
"step": 19800
},
{
"epoch": 13.0,
"eval_accuracy": 0.9463876146788991,
"eval_f1": 0.9448447433682525,
"eval_loss": 0.40437009930610657,
"eval_runtime": 0.8642,
"eval_samples_per_second": 4036.236,
"eval_steps_per_second": 63.645,
"step": 19838
},
{
"epoch": 13.040629095674968,
"grad_norm": 0.26656442880630493,
"learning_rate": 1.7398427260812584e-05,
"loss": 0.0039,
"step": 19900
},
{
"epoch": 13.106159895150721,
"grad_norm": 0.015545975416898727,
"learning_rate": 1.7234600262123198e-05,
"loss": 0.0006,
"step": 20000
},
{
"epoch": 13.171690694626474,
"grad_norm": 0.012939069420099258,
"learning_rate": 1.7070773263433815e-05,
"loss": 0.0009,
"step": 20100
},
{
"epoch": 13.237221494102227,
"grad_norm": 0.013902663253247738,
"learning_rate": 1.6906946264744432e-05,
"loss": 0.001,
"step": 20200
},
{
"epoch": 13.302752293577981,
"grad_norm": 0.12842603027820587,
"learning_rate": 1.6743119266055045e-05,
"loss": 0.0007,
"step": 20300
},
{
"epoch": 13.368283093053735,
"grad_norm": 0.01566697470843792,
"learning_rate": 1.6579292267365662e-05,
"loss": 0.0017,
"step": 20400
},
{
"epoch": 13.43381389252949,
"grad_norm": 0.028622334823012352,
"learning_rate": 1.641546526867628e-05,
"loss": 0.0005,
"step": 20500
},
{
"epoch": 13.499344692005243,
"grad_norm": 0.01282609160989523,
"learning_rate": 1.6251638269986893e-05,
"loss": 0.0007,
"step": 20600
},
{
"epoch": 13.564875491480995,
"grad_norm": 0.0726955458521843,
"learning_rate": 1.608781127129751e-05,
"loss": 0.0004,
"step": 20700
},
{
"epoch": 13.63040629095675,
"grad_norm": 0.0037081395275890827,
"learning_rate": 1.5923984272608126e-05,
"loss": 0.0035,
"step": 20800
},
{
"epoch": 13.695937090432503,
"grad_norm": 0.11256258934736252,
"learning_rate": 1.5760157273918743e-05,
"loss": 0.0009,
"step": 20900
},
{
"epoch": 13.761467889908257,
"grad_norm": 0.012089048512279987,
"learning_rate": 1.559633027522936e-05,
"loss": 0.0004,
"step": 21000
},
{
"epoch": 13.82699868938401,
"grad_norm": 0.008633548393845558,
"learning_rate": 1.5432503276539974e-05,
"loss": 0.0049,
"step": 21100
},
{
"epoch": 13.892529488859765,
"grad_norm": 0.012773215770721436,
"learning_rate": 1.526867627785059e-05,
"loss": 0.0051,
"step": 21200
},
{
"epoch": 13.958060288335517,
"grad_norm": 0.5564557909965515,
"learning_rate": 1.5104849279161206e-05,
"loss": 0.0026,
"step": 21300
},
{
"epoch": 14.0,
"eval_accuracy": 0.9475344036697247,
"eval_f1": 0.9462485272118298,
"eval_loss": 0.3972223997116089,
"eval_runtime": 0.8884,
"eval_samples_per_second": 3926.295,
"eval_steps_per_second": 61.911,
"step": 21364
},
{
"epoch": 14.02359108781127,
"grad_norm": 0.014650699682533741,
"learning_rate": 1.4941022280471823e-05,
"loss": 0.0004,
"step": 21400
},
{
"epoch": 14.089121887287025,
"grad_norm": 0.7793611288070679,
"learning_rate": 1.4777195281782438e-05,
"loss": 0.0004,
"step": 21500
},
{
"epoch": 14.154652686762779,
"grad_norm": 0.27260562777519226,
"learning_rate": 1.4613368283093053e-05,
"loss": 0.0003,
"step": 21600
},
{
"epoch": 14.220183486238533,
"grad_norm": 0.0209233146160841,
"learning_rate": 1.444954128440367e-05,
"loss": 0.0004,
"step": 21700
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.009809763170778751,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.0009,
"step": 21800
},
{
"epoch": 14.351245085190039,
"grad_norm": 0.004338541068136692,
"learning_rate": 1.41218872870249e-05,
"loss": 0.001,
"step": 21900
},
{
"epoch": 14.416775884665793,
"grad_norm": 0.05535097420215607,
"learning_rate": 1.3958060288335518e-05,
"loss": 0.0003,
"step": 22000
},
{
"epoch": 14.482306684141546,
"grad_norm": 0.013605128042399883,
"learning_rate": 1.3794233289646136e-05,
"loss": 0.0007,
"step": 22100
},
{
"epoch": 14.5478374836173,
"grad_norm": 0.05343281850218773,
"learning_rate": 1.3630406290956751e-05,
"loss": 0.0076,
"step": 22200
},
{
"epoch": 14.613368283093054,
"grad_norm": 0.03259427472949028,
"learning_rate": 1.3466579292267367e-05,
"loss": 0.0006,
"step": 22300
},
{
"epoch": 14.678899082568808,
"grad_norm": 0.01491202600300312,
"learning_rate": 1.3302752293577984e-05,
"loss": 0.0003,
"step": 22400
},
{
"epoch": 14.74442988204456,
"grad_norm": 0.04238777980208397,
"learning_rate": 1.3138925294888599e-05,
"loss": 0.0048,
"step": 22500
},
{
"epoch": 14.809960681520314,
"grad_norm": 0.07502109557390213,
"learning_rate": 1.2975098296199214e-05,
"loss": 0.0003,
"step": 22600
},
{
"epoch": 14.875491480996068,
"grad_norm": 0.013529137708246708,
"learning_rate": 1.281127129750983e-05,
"loss": 0.0003,
"step": 22700
},
{
"epoch": 14.941022280471822,
"grad_norm": 0.005839935038238764,
"learning_rate": 1.2647444298820446e-05,
"loss": 0.0046,
"step": 22800
},
{
"epoch": 15.0,
"eval_accuracy": 0.9489678899082569,
"eval_f1": 0.947361701117435,
"eval_loss": 0.3973062038421631,
"eval_runtime": 0.8841,
"eval_samples_per_second": 3945.29,
"eval_steps_per_second": 62.211,
"step": 22890
},
{
"epoch": 15.006553079947576,
"grad_norm": 0.017636075615882874,
"learning_rate": 1.2483617300131061e-05,
"loss": 0.0024,
"step": 22900
},
{
"epoch": 15.07208387942333,
"grad_norm": 0.022421281784772873,
"learning_rate": 1.2319790301441678e-05,
"loss": 0.0031,
"step": 23000
},
{
"epoch": 15.137614678899082,
"grad_norm": 0.01283260341733694,
"learning_rate": 1.2155963302752295e-05,
"loss": 0.0003,
"step": 23100
},
{
"epoch": 15.203145478374836,
"grad_norm": 0.007434241008013487,
"learning_rate": 1.199213630406291e-05,
"loss": 0.0061,
"step": 23200
},
{
"epoch": 15.26867627785059,
"grad_norm": 0.02019626460969448,
"learning_rate": 1.1828309305373527e-05,
"loss": 0.0003,
"step": 23300
},
{
"epoch": 15.334207077326344,
"grad_norm": 0.010890827514231205,
"learning_rate": 1.1664482306684142e-05,
"loss": 0.0002,
"step": 23400
},
{
"epoch": 15.399737876802098,
"grad_norm": 0.016969241201877594,
"learning_rate": 1.1500655307994758e-05,
"loss": 0.0075,
"step": 23500
},
{
"epoch": 15.46526867627785,
"grad_norm": 0.02037014067173004,
"learning_rate": 1.1336828309305373e-05,
"loss": 0.0007,
"step": 23600
},
{
"epoch": 15.530799475753604,
"grad_norm": 0.00850609689950943,
"learning_rate": 1.117300131061599e-05,
"loss": 0.0002,
"step": 23700
},
{
"epoch": 15.596330275229358,
"grad_norm": 0.003405163995921612,
"learning_rate": 1.1009174311926607e-05,
"loss": 0.0024,
"step": 23800
},
{
"epoch": 15.661861074705111,
"grad_norm": 0.013705270364880562,
"learning_rate": 1.0845347313237222e-05,
"loss": 0.0002,
"step": 23900
},
{
"epoch": 15.727391874180865,
"grad_norm": 0.00401644641533494,
"learning_rate": 1.0681520314547839e-05,
"loss": 0.0002,
"step": 24000
},
{
"epoch": 15.79292267365662,
"grad_norm": 0.06715580821037292,
"learning_rate": 1.0517693315858454e-05,
"loss": 0.0068,
"step": 24100
},
{
"epoch": 15.858453473132371,
"grad_norm": 0.007226752582937479,
"learning_rate": 1.035386631716907e-05,
"loss": 0.0003,
"step": 24200
},
{
"epoch": 15.923984272608125,
"grad_norm": 0.07869122177362442,
"learning_rate": 1.0190039318479686e-05,
"loss": 0.0016,
"step": 24300
},
{
"epoch": 15.98951507208388,
"grad_norm": 0.010308779776096344,
"learning_rate": 1.0026212319790301e-05,
"loss": 0.0001,
"step": 24400
},
{
"epoch": 16.0,
"eval_accuracy": 0.9492545871559633,
"eval_f1": 0.9480250779419357,
"eval_loss": 0.40250906348228455,
"eval_runtime": 0.885,
"eval_samples_per_second": 3941.411,
"eval_steps_per_second": 62.15,
"step": 24416
},
{
"epoch": 16.05504587155963,
"grad_norm": 0.011048965156078339,
"learning_rate": 9.862385321100918e-06,
"loss": 0.0002,
"step": 24500
},
{
"epoch": 16.120576671035387,
"grad_norm": 0.005700926296412945,
"learning_rate": 9.698558322411533e-06,
"loss": 0.001,
"step": 24600
},
{
"epoch": 16.18610747051114,
"grad_norm": 0.00868783425539732,
"learning_rate": 9.53473132372215e-06,
"loss": 0.0002,
"step": 24700
},
{
"epoch": 16.251638269986895,
"grad_norm": 0.003008009400218725,
"learning_rate": 9.370904325032766e-06,
"loss": 0.0002,
"step": 24800
},
{
"epoch": 16.317169069462647,
"grad_norm": 0.039118170738220215,
"learning_rate": 9.20707732634338e-06,
"loss": 0.0017,
"step": 24900
},
{
"epoch": 16.382699868938403,
"grad_norm": 0.003417972009629011,
"learning_rate": 9.043250327653998e-06,
"loss": 0.0002,
"step": 25000
},
{
"epoch": 16.448230668414155,
"grad_norm": 0.012208909727633,
"learning_rate": 8.879423328964615e-06,
"loss": 0.0002,
"step": 25100
},
{
"epoch": 16.513761467889907,
"grad_norm": 0.015429310500621796,
"learning_rate": 8.71559633027523e-06,
"loss": 0.0001,
"step": 25200
},
{
"epoch": 16.579292267365663,
"grad_norm": 0.007409450598061085,
"learning_rate": 8.551769331585847e-06,
"loss": 0.0004,
"step": 25300
},
{
"epoch": 16.644823066841415,
"grad_norm": 0.009294740855693817,
"learning_rate": 8.387942332896462e-06,
"loss": 0.0002,
"step": 25400
},
{
"epoch": 16.71035386631717,
"grad_norm": 0.0043761348351836205,
"learning_rate": 8.224115334207077e-06,
"loss": 0.0033,
"step": 25500
},
{
"epoch": 16.775884665792923,
"grad_norm": 0.017104586586356163,
"learning_rate": 8.060288335517694e-06,
"loss": 0.0002,
"step": 25600
},
{
"epoch": 16.841415465268675,
"grad_norm": 0.0103053729981184,
"learning_rate": 7.89646133682831e-06,
"loss": 0.0002,
"step": 25700
},
{
"epoch": 16.90694626474443,
"grad_norm": 0.008107037283480167,
"learning_rate": 7.732634338138926e-06,
"loss": 0.0002,
"step": 25800
},
{
"epoch": 16.972477064220183,
"grad_norm": 0.025965586304664612,
"learning_rate": 7.568807339449542e-06,
"loss": 0.0003,
"step": 25900
},
{
"epoch": 17.0,
"eval_accuracy": 0.950401376146789,
"eval_f1": 0.949057083016372,
"eval_loss": 0.3941075801849365,
"eval_runtime": 0.8787,
"eval_samples_per_second": 3969.396,
"eval_steps_per_second": 62.591,
"step": 25942
},
{
"epoch": 17.03800786369594,
"grad_norm": 0.0071102771908044815,
"learning_rate": 7.4049803407601575e-06,
"loss": 0.0001,
"step": 26000
},
{
"epoch": 17.10353866317169,
"grad_norm": 0.0024903868325054646,
"learning_rate": 7.241153342070774e-06,
"loss": 0.0003,
"step": 26100
},
{
"epoch": 17.169069462647446,
"grad_norm": 0.00496539194136858,
"learning_rate": 7.07732634338139e-06,
"loss": 0.0001,
"step": 26200
},
{
"epoch": 17.234600262123198,
"grad_norm": 0.005121257156133652,
"learning_rate": 6.913499344692005e-06,
"loss": 0.0001,
"step": 26300
},
{
"epoch": 17.30013106159895,
"grad_norm": 0.0038872575387358665,
"learning_rate": 6.749672346002621e-06,
"loss": 0.0001,
"step": 26400
},
{
"epoch": 17.365661861074706,
"grad_norm": 0.0026088629383593798,
"learning_rate": 6.585845347313238e-06,
"loss": 0.0001,
"step": 26500
},
{
"epoch": 17.431192660550458,
"grad_norm": 0.0027847271412611008,
"learning_rate": 6.422018348623854e-06,
"loss": 0.0004,
"step": 26600
},
{
"epoch": 17.496723460026214,
"grad_norm": 0.010283850133419037,
"learning_rate": 6.25819134993447e-06,
"loss": 0.0001,
"step": 26700
},
{
"epoch": 17.562254259501966,
"grad_norm": 0.0012891000369563699,
"learning_rate": 6.094364351245085e-06,
"loss": 0.0004,
"step": 26800
},
{
"epoch": 17.627785058977718,
"grad_norm": 0.0068209609016776085,
"learning_rate": 5.930537352555701e-06,
"loss": 0.0001,
"step": 26900
},
{
"epoch": 17.693315858453474,
"grad_norm": 0.012583351694047451,
"learning_rate": 5.766710353866317e-06,
"loss": 0.0001,
"step": 27000
},
{
"epoch": 17.758846657929226,
"grad_norm": 0.014517087489366531,
"learning_rate": 5.602883355176933e-06,
"loss": 0.0053,
"step": 27100
},
{
"epoch": 17.82437745740498,
"grad_norm": 0.003594920039176941,
"learning_rate": 5.4390563564875494e-06,
"loss": 0.0001,
"step": 27200
},
{
"epoch": 17.889908256880734,
"grad_norm": 0.014988411217927933,
"learning_rate": 5.2752293577981655e-06,
"loss": 0.0003,
"step": 27300
},
{
"epoch": 17.955439056356486,
"grad_norm": 0.00935112964361906,
"learning_rate": 5.1114023591087816e-06,
"loss": 0.0001,
"step": 27400
},
{
"epoch": 18.0,
"eval_accuracy": 0.9501146788990825,
"eval_f1": 0.948460314214328,
"eval_loss": 0.40203723311424255,
"eval_runtime": 0.9031,
"eval_samples_per_second": 3862.373,
"eval_steps_per_second": 60.903,
"step": 27468
},
{
"epoch": 18.02096985583224,
"grad_norm": 0.0007160278619267046,
"learning_rate": 4.947575360419398e-06,
"loss": 0.0001,
"step": 27500
},
{
"epoch": 18.086500655307994,
"grad_norm": 0.0028267614543437958,
"learning_rate": 4.783748361730013e-06,
"loss": 0.0001,
"step": 27600
},
{
"epoch": 18.15203145478375,
"grad_norm": 0.010204290971159935,
"learning_rate": 4.61992136304063e-06,
"loss": 0.0001,
"step": 27700
},
{
"epoch": 18.2175622542595,
"grad_norm": 0.005847644526511431,
"learning_rate": 4.456094364351245e-06,
"loss": 0.0001,
"step": 27800
},
{
"epoch": 18.283093053735257,
"grad_norm": 0.018655648455023766,
"learning_rate": 4.292267365661861e-06,
"loss": 0.0001,
"step": 27900
},
{
"epoch": 18.34862385321101,
"grad_norm": 0.0011456008069217205,
"learning_rate": 4.128440366972477e-06,
"loss": 0.0006,
"step": 28000
},
{
"epoch": 18.41415465268676,
"grad_norm": 0.0034626726992428303,
"learning_rate": 3.964613368283093e-06,
"loss": 0.0001,
"step": 28100
},
{
"epoch": 18.479685452162517,
"grad_norm": 0.015080388635396957,
"learning_rate": 3.800786369593709e-06,
"loss": 0.0001,
"step": 28200
},
{
"epoch": 18.54521625163827,
"grad_norm": 0.020128346979618073,
"learning_rate": 3.6369593709043257e-06,
"loss": 0.0031,
"step": 28300
},
{
"epoch": 18.610747051114025,
"grad_norm": 0.011367076076567173,
"learning_rate": 3.4731323722149413e-06,
"loss": 0.0001,
"step": 28400
},
{
"epoch": 18.676277850589777,
"grad_norm": 0.0023978736717253923,
"learning_rate": 3.309305373525557e-06,
"loss": 0.0001,
"step": 28500
},
{
"epoch": 18.74180865006553,
"grad_norm": 0.0030761794187128544,
"learning_rate": 3.145478374836173e-06,
"loss": 0.0025,
"step": 28600
},
{
"epoch": 18.807339449541285,
"grad_norm": 0.001883818069472909,
"learning_rate": 2.981651376146789e-06,
"loss": 0.0001,
"step": 28700
},
{
"epoch": 18.872870249017037,
"grad_norm": 0.00347805954515934,
"learning_rate": 2.817824377457405e-06,
"loss": 0.0001,
"step": 28800
},
{
"epoch": 18.938401048492793,
"grad_norm": 0.004366457927972078,
"learning_rate": 2.6539973787680212e-06,
"loss": 0.0001,
"step": 28900
},
{
"epoch": 19.0,
"eval_accuracy": 0.9509747706422018,
"eval_f1": 0.9494461116737494,
"eval_loss": 0.3973633944988251,
"eval_runtime": 0.8752,
"eval_samples_per_second": 3985.248,
"eval_steps_per_second": 62.841,
"step": 28994
},
{
"epoch": 19.003931847968545,
"grad_norm": 0.004239593632519245,
"learning_rate": 2.490170380078637e-06,
"loss": 0.0001,
"step": 29000
},
{
"epoch": 19.0694626474443,
"grad_norm": 0.051923561841249466,
"learning_rate": 2.326343381389253e-06,
"loss": 0.0001,
"step": 29100
},
{
"epoch": 19.134993446920053,
"grad_norm": 0.01663641817867756,
"learning_rate": 2.1625163826998694e-06,
"loss": 0.001,
"step": 29200
},
{
"epoch": 19.200524246395805,
"grad_norm": 0.006036526523530483,
"learning_rate": 1.998689384010485e-06,
"loss": 0.0001,
"step": 29300
},
{
"epoch": 19.26605504587156,
"grad_norm": 0.004666994791477919,
"learning_rate": 1.8348623853211011e-06,
"loss": 0.0001,
"step": 29400
},
{
"epoch": 19.331585845347313,
"grad_norm": 0.0010127611458301544,
"learning_rate": 1.671035386631717e-06,
"loss": 0.0001,
"step": 29500
},
{
"epoch": 19.39711664482307,
"grad_norm": 0.0074880653992295265,
"learning_rate": 1.507208387942333e-06,
"loss": 0.0001,
"step": 29600
},
{
"epoch": 19.46264744429882,
"grad_norm": 0.0026892530731856823,
"learning_rate": 1.3433813892529489e-06,
"loss": 0.0014,
"step": 29700
},
{
"epoch": 19.528178243774573,
"grad_norm": 0.0020644895266741514,
"learning_rate": 1.179554390563565e-06,
"loss": 0.0001,
"step": 29800
},
{
"epoch": 19.59370904325033,
"grad_norm": 0.0029051878955215216,
"learning_rate": 1.0157273918741808e-06,
"loss": 0.0001,
"step": 29900
},
{
"epoch": 19.65923984272608,
"grad_norm": 0.005995690356940031,
"learning_rate": 8.51900393184797e-07,
"loss": 0.0001,
"step": 30000
},
{
"epoch": 19.724770642201836,
"grad_norm": 0.006156248040497303,
"learning_rate": 6.880733944954129e-07,
"loss": 0.0001,
"step": 30100
},
{
"epoch": 19.790301441677588,
"grad_norm": 0.0009661901276558638,
"learning_rate": 5.242463958060289e-07,
"loss": 0.0001,
"step": 30200
},
{
"epoch": 19.855832241153344,
"grad_norm": 0.007489080540835857,
"learning_rate": 3.6041939711664483e-07,
"loss": 0.0001,
"step": 30300
},
{
"epoch": 19.921363040629096,
"grad_norm": 0.006394806317985058,
"learning_rate": 1.9659239842726081e-07,
"loss": 0.0004,
"step": 30400
},
{
"epoch": 19.986893840104848,
"grad_norm": 0.007664592005312443,
"learning_rate": 3.2765399737876805e-08,
"loss": 0.0001,
"step": 30500
},
{
"epoch": 20.0,
"eval_accuracy": 0.9515481651376146,
"eval_f1": 0.9499685377695699,
"eval_loss": 0.398087739944458,
"eval_runtime": 0.9372,
"eval_samples_per_second": 3721.91,
"eval_steps_per_second": 58.688,
"step": 30520
}
],
"logging_steps": 100,
"max_steps": 30520,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2534266005283680.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}