{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 10608,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.002828054298642534, "grad_norm": 2.258908748626709, "learning_rate": 1.9981146304675718e-05, "loss": 1.5648, "step": 10},
    {"epoch": 0.005656108597285068, "grad_norm": 2.3489885330200195, "learning_rate": 1.9962292609351435e-05, "loss": 1.2587, "step": 20},
    {"epoch": 0.008484162895927601, "grad_norm": 3.1505980491638184, "learning_rate": 1.994343891402715e-05, "loss": 0.996, "step": 30},
    {"epoch": 0.011312217194570135, "grad_norm": 2.7015912532806396, "learning_rate": 1.9924585218702868e-05, "loss": 0.991, "step": 40},
    {"epoch": 0.01414027149321267, "grad_norm": 2.6314280033111572, "learning_rate": 1.9905731523378585e-05, "loss": 0.934, "step": 50},
    {"epoch": 0.016968325791855202, "grad_norm": 2.055577516555786, "learning_rate": 1.98868778280543e-05, "loss": 0.8245, "step": 60},
    {"epoch": 0.019796380090497737, "grad_norm": 2.4679293632507324, "learning_rate": 1.9868024132730018e-05, "loss": 0.7501, "step": 70},
    {"epoch": 0.02262443438914027, "grad_norm": 4.737666130065918, "learning_rate": 1.9849170437405735e-05, "loss": 0.7572, "step": 80},
    {"epoch": 0.025452488687782805, "grad_norm": 3.135712146759033, "learning_rate": 1.983031674208145e-05, "loss": 0.6785, "step": 90},
    {"epoch": 0.02828054298642534, "grad_norm": 3.1648664474487305, "learning_rate": 1.9811463046757168e-05, "loss": 0.6764, "step": 100},
    {"epoch": 0.031108597285067874, "grad_norm": 2.4548099040985107, "learning_rate": 1.9792609351432884e-05, "loss": 0.6322, "step": 110},
    {"epoch": 0.033936651583710405, "grad_norm": 2.9620838165283203, "learning_rate": 1.97737556561086e-05, "loss": 0.6756, "step": 120},
    {"epoch": 0.03676470588235294, "grad_norm": 6.274707794189453, "learning_rate": 1.9754901960784318e-05, "loss": 0.6566, "step": 130},
    {"epoch": 0.03959276018099547, "grad_norm": 4.661096096038818, "learning_rate": 1.9736048265460034e-05, "loss": 0.6179, "step": 140},
    {"epoch": 0.04242081447963801, "grad_norm": 3.97485613822937, "learning_rate": 1.971719457013575e-05, "loss": 0.5623, "step": 150},
    {"epoch": 0.04524886877828054, "grad_norm": 5.157923698425293, "learning_rate": 1.9698340874811464e-05, "loss": 0.5132, "step": 160},
    {"epoch": 0.04807692307692308, "grad_norm": 4.4768385887146, "learning_rate": 1.967948717948718e-05, "loss": 0.5081, "step": 170},
    {"epoch": 0.05090497737556561, "grad_norm": 4.282762050628662, "learning_rate": 1.9660633484162897e-05, "loss": 0.5104, "step": 180},
    {"epoch": 0.05373303167420815, "grad_norm": 5.214530944824219, "learning_rate": 1.9641779788838614e-05, "loss": 0.6005, "step": 190},
    {"epoch": 0.05656108597285068, "grad_norm": 2.7484629154205322, "learning_rate": 1.962292609351433e-05, "loss": 0.5388, "step": 200},
    {"epoch": 0.05938914027149321, "grad_norm": 3.6074981689453125, "learning_rate": 1.9604072398190047e-05, "loss": 0.512, "step": 210},
    {"epoch": 0.06221719457013575, "grad_norm": 5.487575054168701, "learning_rate": 1.9585218702865764e-05, "loss": 0.5233, "step": 220},
    {"epoch": 0.06504524886877829, "grad_norm": 3.193223714828491, "learning_rate": 1.956636500754148e-05, "loss": 0.5091, "step": 230},
    {"epoch": 0.06787330316742081, "grad_norm": 8.64730167388916, "learning_rate": 1.9547511312217197e-05, "loss": 0.472, "step": 240},
    {"epoch": 0.07070135746606335, "grad_norm": 3.8944942951202393, "learning_rate": 1.9528657616892914e-05, "loss": 0.5551, "step": 250},
    {"epoch": 0.07352941176470588, "grad_norm": 3.6671645641326904, "learning_rate": 1.950980392156863e-05, "loss": 0.5575, "step": 260},
    {"epoch": 0.07635746606334842, "grad_norm": 5.698751449584961, "learning_rate": 1.9490950226244343e-05, "loss": 0.4495, "step": 270},
    {"epoch": 0.07918552036199095, "grad_norm": 6.565758228302002, "learning_rate": 1.947209653092006e-05, "loss": 0.5017, "step": 280},
    {"epoch": 0.08201357466063348, "grad_norm": 4.61959171295166, "learning_rate": 1.9453242835595777e-05, "loss": 0.4639, "step": 290},
    {"epoch": 0.08484162895927602, "grad_norm": 4.656176567077637, "learning_rate": 1.9434389140271493e-05, "loss": 0.4566, "step": 300},
    {"epoch": 0.08766968325791855, "grad_norm": 3.0015969276428223, "learning_rate": 1.941553544494721e-05, "loss": 0.5226, "step": 310},
    {"epoch": 0.09049773755656108, "grad_norm": 3.7588982582092285, "learning_rate": 1.9396681749622927e-05, "loss": 0.4415, "step": 320},
    {"epoch": 0.09332579185520362, "grad_norm": 8.415599822998047, "learning_rate": 1.9377828054298643e-05, "loss": 0.4992, "step": 330},
    {"epoch": 0.09615384615384616, "grad_norm": 2.917985677719116, "learning_rate": 1.935897435897436e-05, "loss": 0.392, "step": 340},
    {"epoch": 0.09898190045248868, "grad_norm": 3.867098093032837, "learning_rate": 1.9340120663650076e-05, "loss": 0.4382, "step": 350},
    {"epoch": 0.10180995475113122, "grad_norm": 3.6737847328186035, "learning_rate": 1.9321266968325793e-05, "loss": 0.4458, "step": 360},
    {"epoch": 0.10463800904977376, "grad_norm": 3.9890048503875732, "learning_rate": 1.930241327300151e-05, "loss": 0.4038, "step": 370},
    {"epoch": 0.1074660633484163, "grad_norm": 7.732723236083984, "learning_rate": 1.9283559577677226e-05, "loss": 0.4541, "step": 380},
    {"epoch": 0.11029411764705882, "grad_norm": 7.815601348876953, "learning_rate": 1.9264705882352943e-05, "loss": 0.5302, "step": 390},
    {"epoch": 0.11312217194570136, "grad_norm": 4.053082466125488, "learning_rate": 1.924585218702866e-05, "loss": 0.4203, "step": 400},
    {"epoch": 0.1159502262443439, "grad_norm": 5.627740859985352, "learning_rate": 1.9226998491704376e-05, "loss": 0.4473, "step": 410},
    {"epoch": 0.11877828054298642, "grad_norm": 5.07185697555542, "learning_rate": 1.9208144796380093e-05, "loss": 0.4345, "step": 420},
    {"epoch": 0.12160633484162896, "grad_norm": 3.9544167518615723, "learning_rate": 1.918929110105581e-05, "loss": 0.4849, "step": 430},
    {"epoch": 0.1244343891402715, "grad_norm": 6.9721293449401855, "learning_rate": 1.9170437405731526e-05, "loss": 0.4029, "step": 440},
    {"epoch": 0.12726244343891402, "grad_norm": 4.0150556564331055, "learning_rate": 1.9151583710407243e-05, "loss": 0.5236, "step": 450},
    {"epoch": 0.13009049773755657, "grad_norm": 6.681572914123535, "learning_rate": 1.913273001508296e-05, "loss": 0.4059, "step": 460},
    {"epoch": 0.1329185520361991, "grad_norm": 5.1267876625061035, "learning_rate": 1.9113876319758676e-05, "loss": 0.467, "step": 470},
    {"epoch": 0.13574660633484162, "grad_norm": 3.66207218170166, "learning_rate": 1.9095022624434392e-05, "loss": 0.3689, "step": 480},
    {"epoch": 0.13857466063348417, "grad_norm": 4.016237735748291, "learning_rate": 1.907616892911011e-05, "loss": 0.4502, "step": 490},
    {"epoch": 0.1414027149321267, "grad_norm": 5.003229141235352, "learning_rate": 1.9057315233785822e-05, "loss": 0.3935, "step": 500},
    {"epoch": 0.14423076923076922, "grad_norm": 7.917128562927246, "learning_rate": 1.903846153846154e-05, "loss": 0.4765, "step": 510},
    {"epoch": 0.14705882352941177, "grad_norm": 4.975776672363281, "learning_rate": 1.9019607843137255e-05, "loss": 0.4564, "step": 520},
    {"epoch": 0.1498868778280543, "grad_norm": 5.559664726257324, "learning_rate": 1.9000754147812972e-05, "loss": 0.3663, "step": 530},
    {"epoch": 0.15271493212669685, "grad_norm": 4.463156700134277, "learning_rate": 1.898190045248869e-05, "loss": 0.358, "step": 540},
    {"epoch": 0.15554298642533937, "grad_norm": 4.642306327819824, "learning_rate": 1.8963046757164405e-05, "loss": 0.4083, "step": 550},
    {"epoch": 0.1583710407239819, "grad_norm": 10.270988464355469, "learning_rate": 1.8944193061840122e-05, "loss": 0.5147, "step": 560},
    {"epoch": 0.16119909502262444, "grad_norm": 4.234387397766113, "learning_rate": 1.892533936651584e-05, "loss": 0.5115, "step": 570},
    {"epoch": 0.16402714932126697, "grad_norm": 5.710938930511475, "learning_rate": 1.8906485671191555e-05, "loss": 0.4483, "step": 580},
    {"epoch": 0.1668552036199095, "grad_norm": 8.131850242614746, "learning_rate": 1.8887631975867272e-05, "loss": 0.4282, "step": 590},
    {"epoch": 0.16968325791855204, "grad_norm": 4.962357997894287, "learning_rate": 1.886877828054299e-05, "loss": 0.4691, "step": 600},
    {"epoch": 0.17251131221719457, "grad_norm": 7.434023857116699, "learning_rate": 1.8849924585218705e-05, "loss": 0.3864, "step": 610},
    {"epoch": 0.1753393665158371, "grad_norm": 3.2809391021728516, "learning_rate": 1.883107088989442e-05, "loss": 0.3506, "step": 620},
    {"epoch": 0.17816742081447964, "grad_norm": 4.324213027954102, "learning_rate": 1.8812217194570138e-05, "loss": 0.3316, "step": 630},
    {"epoch": 0.18099547511312217, "grad_norm": 4.655824661254883, "learning_rate": 1.8793363499245855e-05, "loss": 0.4192, "step": 640},
    {"epoch": 0.18382352941176472, "grad_norm": 3.21587872505188, "learning_rate": 1.877450980392157e-05, "loss": 0.3091, "step": 650},
    {"epoch": 0.18665158371040724, "grad_norm": 5.112785339355469, "learning_rate": 1.8755656108597288e-05, "loss": 0.4218, "step": 660},
    {"epoch": 0.18947963800904977, "grad_norm": 2.902008533477783, "learning_rate": 1.8736802413273005e-05, "loss": 0.3595, "step": 670},
    {"epoch": 0.19230769230769232, "grad_norm": 4.45237398147583, "learning_rate": 1.8717948717948718e-05, "loss": 0.4378, "step": 680},
    {"epoch": 0.19513574660633484, "grad_norm": 3.8630735874176025, "learning_rate": 1.8699095022624435e-05, "loss": 0.4072, "step": 690},
    {"epoch": 0.19796380090497737, "grad_norm": 3.8529324531555176, "learning_rate": 1.868024132730015e-05, "loss": 0.4335, "step": 700},
    {"epoch": 0.20079185520361992, "grad_norm": 2.7820119857788086, "learning_rate": 1.8661387631975868e-05, "loss": 0.4457, "step": 710},
    {"epoch": 0.20361990950226244, "grad_norm": 7.398665428161621, "learning_rate": 1.8642533936651584e-05, "loss": 0.4552, "step": 720},
    {"epoch": 0.20644796380090497, "grad_norm": 5.17724609375, "learning_rate": 1.86236802413273e-05, "loss": 0.4322, "step": 730},
    {"epoch": 0.20927601809954752, "grad_norm": 7.6877851486206055, "learning_rate": 1.8604826546003018e-05, "loss": 0.3493, "step": 740},
    {"epoch": 0.21210407239819004, "grad_norm": 3.655296802520752, "learning_rate": 1.8585972850678734e-05, "loss": 0.4361, "step": 750},
    {"epoch": 0.2149321266968326, "grad_norm": 5.219052791595459, "learning_rate": 1.856711915535445e-05, "loss": 0.391, "step": 760},
    {"epoch": 0.21776018099547512, "grad_norm": 7.211420059204102, "learning_rate": 1.8548265460030168e-05, "loss": 0.3808, "step": 770},
    {"epoch": 0.22058823529411764, "grad_norm": 3.348724842071533, "learning_rate": 1.8529411764705884e-05, "loss": 0.4041, "step": 780},
    {"epoch": 0.2234162895927602, "grad_norm": 5.180337905883789, "learning_rate": 1.85105580693816e-05, "loss": 0.2996, "step": 790},
    {"epoch": 0.22624434389140272, "grad_norm": 5.770839214324951, "learning_rate": 1.8491704374057317e-05, "loss": 0.3898, "step": 800},
    {"epoch": 0.22907239819004524, "grad_norm": 2.4614834785461426, "learning_rate": 1.8472850678733034e-05, "loss": 0.3307, "step": 810},
    {"epoch": 0.2319004524886878, "grad_norm": 5.4143548011779785, "learning_rate": 1.845399698340875e-05, "loss": 0.4487, "step": 820},
    {"epoch": 0.23472850678733032, "grad_norm": 3.846611499786377, "learning_rate": 1.8435143288084464e-05, "loss": 0.4334, "step": 830},
    {"epoch": 0.23755656108597284, "grad_norm": 7.2528510093688965, "learning_rate": 1.841628959276018e-05, "loss": 0.3471, "step": 840},
    {"epoch": 0.2403846153846154, "grad_norm": 4.265413284301758, "learning_rate": 1.8397435897435897e-05, "loss": 0.3879, "step": 850},
    {"epoch": 0.24321266968325791, "grad_norm": 4.564918518066406, "learning_rate": 1.8378582202111614e-05, "loss": 0.4142, "step": 860},
    {"epoch": 0.24604072398190044, "grad_norm": 4.268716335296631, "learning_rate": 1.835972850678733e-05, "loss": 0.4358, "step": 870},
    {"epoch": 0.248868778280543, "grad_norm": 6.909433841705322, "learning_rate": 1.8340874811463047e-05, "loss": 0.405, "step": 880},
    {"epoch": 0.2516968325791855, "grad_norm": 6.801779270172119, "learning_rate": 1.8322021116138764e-05, "loss": 0.413, "step": 890},
    {"epoch": 0.25452488687782804, "grad_norm": 4.851901531219482, "learning_rate": 1.830316742081448e-05, "loss": 0.4337, "step": 900},
    {"epoch": 0.25735294117647056, "grad_norm": 2.650651693344116, "learning_rate": 1.8284313725490197e-05, "loss": 0.372, "step": 910},
    {"epoch": 0.26018099547511314, "grad_norm": 3.660430669784546, "learning_rate": 1.8265460030165913e-05, "loss": 0.4142, "step": 920},
    {"epoch": 0.26300904977375567, "grad_norm": 6.7838544845581055, "learning_rate": 1.824660633484163e-05, "loss": 0.4053, "step": 930},
    {"epoch": 0.2658371040723982, "grad_norm": 4.861571311950684, "learning_rate": 1.8227752639517347e-05, "loss": 0.4416, "step": 940},
    {"epoch": 0.2686651583710407, "grad_norm": 5.85540771484375, "learning_rate": 1.8208898944193063e-05, "loss": 0.3848, "step": 950},
    {"epoch": 0.27149321266968324, "grad_norm": 2.6535346508026123, "learning_rate": 1.819004524886878e-05, "loss": 0.4228, "step": 960},
    {"epoch": 0.2743212669683258, "grad_norm": 2.8000805377960205, "learning_rate": 1.8171191553544496e-05, "loss": 0.3476, "step": 970},
    {"epoch": 0.27714932126696834, "grad_norm": 5.302433013916016, "learning_rate": 1.8152337858220213e-05, "loss": 0.3536, "step": 980},
    {"epoch": 0.27997737556561086, "grad_norm": 4.889918327331543, "learning_rate": 1.813348416289593e-05, "loss": 0.3817, "step": 990},
    {"epoch": 0.2828054298642534, "grad_norm": 6.261002540588379, "learning_rate": 1.8114630467571646e-05, "loss": 0.3976, "step": 1000},
    {"epoch": 0.2828054298642534, "eval_accuracy": 0.8570633153038498, "eval_loss": 0.3707549571990967, "eval_runtime": 127.6312, "eval_samples_per_second": 98.503, "eval_steps_per_second": 3.079, "step": 1000},
    {"epoch": 0.2856334841628959, "grad_norm": 4.15647554397583, "learning_rate": 1.8095776772247363e-05, "loss": 0.3198, "step": 1010},
    {"epoch": 0.28846153846153844, "grad_norm": 3.652892589569092, "learning_rate": 1.807692307692308e-05, "loss": 0.376, "step": 1020},
    {"epoch": 0.291289592760181, "grad_norm": 7.219604015350342, "learning_rate": 1.8058069381598796e-05, "loss": 0.4519, "step": 1030},
    {"epoch": 0.29411764705882354, "grad_norm": 3.8920180797576904, "learning_rate": 1.8039215686274513e-05, "loss": 0.3336, "step": 1040},
    {"epoch": 0.29694570135746606, "grad_norm": 4.875617504119873, "learning_rate": 1.802036199095023e-05, "loss": 0.357, "step": 1050},
    {"epoch": 0.2997737556561086, "grad_norm": 3.1264288425445557, "learning_rate": 1.8001508295625946e-05, "loss": 0.3482, "step": 1060},
    {"epoch": 0.3026018099547511, "grad_norm": 3.4531030654907227, "learning_rate": 1.7982654600301663e-05, "loss": 0.3533, "step": 1070},
    {"epoch": 0.3054298642533937, "grad_norm": 3.0971388816833496, "learning_rate": 1.796380090497738e-05, "loss": 0.3737, "step": 1080},
    {"epoch": 0.3082579185520362, "grad_norm": 3.3527133464813232, "learning_rate": 1.7944947209653092e-05, "loss": 0.4425, "step": 1090},
    {"epoch": 0.31108597285067874, "grad_norm": 3.197056293487549, "learning_rate": 1.792609351432881e-05, "loss": 0.3657, "step": 1100},
    {"epoch": 0.31391402714932126, "grad_norm": 4.942928791046143, "learning_rate": 1.7907239819004526e-05, "loss": 0.3988, "step": 1110},
    {"epoch": 0.3167420814479638, "grad_norm": 4.839690208435059, "learning_rate": 1.7888386123680242e-05, "loss": 0.3794, "step": 1120},
    {"epoch": 0.3195701357466063, "grad_norm": 5.171438694000244, "learning_rate": 1.786953242835596e-05, "loss": 0.3795, "step": 1130},
    {"epoch": 0.3223981900452489, "grad_norm": 2.4731950759887695, "learning_rate": 1.7850678733031676e-05, "loss": 0.3396, "step": 1140},
    {"epoch": 0.3252262443438914, "grad_norm": 4.658932209014893, "learning_rate": 1.7831825037707392e-05, "loss": 0.2841, "step": 1150},
    {"epoch": 0.32805429864253394, "grad_norm": 3.5409414768218994, "learning_rate": 1.781297134238311e-05, "loss": 0.3701, "step": 1160},
    {"epoch": 0.33088235294117646, "grad_norm": 3.814213275909424, "learning_rate": 1.7794117647058825e-05, "loss": 0.3445, "step": 1170},
    {"epoch": 0.333710407239819, "grad_norm": 3.226147413253784, "learning_rate": 1.7775263951734542e-05, "loss": 0.3513, "step": 1180},
    {"epoch": 0.33653846153846156, "grad_norm": 4.451591491699219, "learning_rate": 1.775641025641026e-05, "loss": 0.3648, "step": 1190},
    {"epoch": 0.3393665158371041, "grad_norm": 4.0332818031311035, "learning_rate": 1.7737556561085972e-05, "loss": 0.4392, "step": 1200},
    {"epoch": 0.3421945701357466, "grad_norm": 3.1572704315185547, "learning_rate": 1.771870286576169e-05, "loss": 0.3602, "step": 1210},
    {"epoch": 0.34502262443438914, "grad_norm": 4.314695835113525, "learning_rate": 1.7699849170437405e-05, "loss": 0.29, "step": 1220},
    {"epoch": 0.34785067873303166, "grad_norm": 5.7975239753723145, "learning_rate": 1.768099547511312e-05, "loss": 0.3716, "step": 1230},
    {"epoch": 0.3506787330316742, "grad_norm": 5.377049446105957, "learning_rate": 1.7662141779788838e-05, "loss": 0.3566, "step": 1240},
    {"epoch": 0.35350678733031676, "grad_norm": 3.84669828414917, "learning_rate": 1.7643288084464555e-05, "loss": 0.3961, "step": 1250},
    {"epoch": 0.3563348416289593, "grad_norm": 5.146121501922607, "learning_rate": 1.762443438914027e-05, "loss": 0.4366, "step": 1260},
    {"epoch": 0.3591628959276018, "grad_norm": 3.1066689491271973, "learning_rate": 1.7605580693815988e-05, "loss": 0.3698, "step": 1270},
    {"epoch": 0.36199095022624433, "grad_norm": 4.1310296058654785, "learning_rate": 1.7586726998491705e-05, "loss": 0.3631, "step": 1280},
    {"epoch": 0.36481900452488686, "grad_norm": 3.0287930965423584, "learning_rate": 1.756787330316742e-05, "loss": 0.3151, "step": 1290},
    {"epoch": 0.36764705882352944, "grad_norm": 4.4270219802856445, "learning_rate": 1.7549019607843138e-05, "loss": 0.357, "step": 1300},
    {"epoch": 0.37047511312217196, "grad_norm": 4.785469055175781, "learning_rate": 1.7530165912518855e-05, "loss": 0.3809, "step": 1310},
    {"epoch": 0.3733031674208145, "grad_norm": 5.920436859130859, "learning_rate": 1.751131221719457e-05, "loss": 0.4003, "step": 1320},
    {"epoch": 0.376131221719457, "grad_norm": 5.400911331176758, "learning_rate": 1.7492458521870288e-05, "loss": 0.4313, "step": 1330},
    {"epoch": 0.37895927601809953, "grad_norm": 6.202630996704102, "learning_rate": 1.7473604826546004e-05, "loss": 0.3792, "step": 1340},
    {"epoch": 0.38178733031674206, "grad_norm": 3.413867473602295, "learning_rate": 1.745475113122172e-05, "loss": 0.3881, "step": 1350},
    {"epoch": 0.38461538461538464, "grad_norm": 5.005847930908203, "learning_rate": 1.7435897435897438e-05, "loss": 0.4344, "step": 1360},
    {"epoch": 0.38744343891402716, "grad_norm": 4.416658878326416, "learning_rate": 1.7417043740573154e-05, "loss": 0.332, "step": 1370},
    {"epoch": 0.3902714932126697, "grad_norm": 5.2433247566223145, "learning_rate": 1.739819004524887e-05, "loss": 0.3873, "step": 1380},
    {"epoch": 0.3930995475113122, "grad_norm": 3.740522861480713, "learning_rate": 1.7379336349924588e-05, "loss": 0.3495, "step": 1390},
    {"epoch": 0.39592760180995473, "grad_norm": 6.047609329223633, "learning_rate": 1.7360482654600304e-05, "loss": 0.3922, "step": 1400},
    {"epoch": 0.3987556561085973, "grad_norm": 3.7461910247802734, "learning_rate": 1.734162895927602e-05, "loss": 0.3556, "step": 1410},
    {"epoch": 0.40158371040723984, "grad_norm": 7.2883405685424805, "learning_rate": 1.7322775263951737e-05, "loss": 0.319, "step": 1420},
    {"epoch": 0.40441176470588236, "grad_norm": 5.338521480560303, "learning_rate": 1.7303921568627454e-05, "loss": 0.3272, "step": 1430},
    {"epoch": 0.4072398190045249, "grad_norm": 5.680319309234619, "learning_rate": 1.728506787330317e-05, "loss": 0.3336, "step": 1440},
    {"epoch": 0.4100678733031674, "grad_norm": 3.7183480262756348, "learning_rate": 1.7266214177978887e-05, "loss": 0.3218, "step": 1450},
    {"epoch": 0.41289592760180993, "grad_norm": 4.478979110717773, "learning_rate": 1.7247360482654604e-05, "loss": 0.3719, "step": 1460},
    {"epoch": 0.4157239819004525, "grad_norm": 3.1170661449432373, "learning_rate": 1.722850678733032e-05, "loss": 0.3563, "step": 1470},
    {"epoch": 0.41855203619909503, "grad_norm": 5.310198783874512, "learning_rate": 1.7209653092006037e-05, "loss": 0.3009, "step": 1480},
    {"epoch": 0.42138009049773756, "grad_norm": 4.134536266326904, "learning_rate": 1.7190799396681754e-05, "loss": 0.4116, "step": 1490},
    {"epoch": 0.4242081447963801, "grad_norm": 2.9341182708740234, "learning_rate": 1.7171945701357467e-05, "loss": 0.3547, "step": 1500},
    {"epoch": 0.4270361990950226, "grad_norm": 2.2353224754333496, "learning_rate": 1.7153092006033184e-05, "loss": 0.3652, "step": 1510},
    {"epoch": 0.4298642533936652, "grad_norm": 4.16320276260376, "learning_rate": 1.71342383107089e-05, "loss": 0.3733, "step": 1520},
    {"epoch": 0.4326923076923077, "grad_norm": 4.933135986328125, "learning_rate": 1.7115384615384617e-05, "loss": 0.4125, "step": 1530},
    {"epoch": 0.43552036199095023, "grad_norm": 5.511205673217773, "learning_rate": 1.7096530920060333e-05, "loss": 0.3102, "step": 1540},
    {"epoch": 0.43834841628959276, "grad_norm": 4.415884494781494, "learning_rate": 1.707767722473605e-05, "loss": 0.3348, "step": 1550},
    {"epoch": 0.4411764705882353, "grad_norm": 3.8917481899261475, "learning_rate": 1.7058823529411767e-05, "loss": 0.3866, "step": 1560},
    {"epoch": 0.4440045248868778, "grad_norm": 2.751532793045044, "learning_rate": 1.7039969834087483e-05, "loss": 0.3644, "step": 1570},
    {"epoch": 0.4468325791855204, "grad_norm": 5.6193413734436035, "learning_rate": 1.70211161387632e-05, "loss": 0.3566, "step": 1580},
    {"epoch": 0.4496606334841629, "grad_norm": 3.058835744857788, "learning_rate": 1.7002262443438916e-05, "loss": 0.3681, "step": 1590},
    {"epoch": 0.45248868778280543, "grad_norm": 3.9540457725524902, "learning_rate": 1.6983408748114633e-05, "loss": 0.3132, "step": 1600},
    {"epoch": 0.45531674208144796, "grad_norm": 3.8163225650787354, "learning_rate": 1.6964555052790346e-05, "loss": 0.3654, "step": 1610},
    {"epoch": 0.4581447963800905, "grad_norm": 4.724973201751709, "learning_rate": 1.6945701357466063e-05, "loss": 0.3434, "step": 1620},
    {"epoch": 0.46097285067873306, "grad_norm": 3.3608546257019043, "learning_rate": 1.692684766214178e-05, "loss": 0.3593, "step": 1630},
    {"epoch": 0.4638009049773756, "grad_norm": 4.132437705993652, "learning_rate": 1.6907993966817496e-05, "loss": 0.3552, "step": 1640},
    {"epoch": 0.4666289592760181, "grad_norm": 4.544163227081299, "learning_rate": 1.6889140271493213e-05, "loss": 0.3678, "step": 1650},
    {"epoch": 0.46945701357466063, "grad_norm": 4.244106769561768, "learning_rate": 1.687028657616893e-05, "loss": 0.3432, "step": 1660},
    {"epoch": 0.47228506787330315, "grad_norm": 3.3168179988861084, "learning_rate": 1.6851432880844646e-05, "loss": 0.313, "step": 1670},
    {"epoch": 0.4751131221719457, "grad_norm": 4.040717601776123, "learning_rate": 1.6832579185520363e-05, "loss": 0.3334, "step": 1680},
    {"epoch": 0.47794117647058826, "grad_norm": 4.582857608795166, "learning_rate": 1.681372549019608e-05, "loss": 0.3004, "step": 1690},
    {"epoch": 0.4807692307692308, "grad_norm": 6.330207347869873, "learning_rate": 1.6794871794871796e-05, "loss": 0.3486, "step": 1700},
    {"epoch": 0.4835972850678733, "grad_norm": 3.564183473587036, "learning_rate": 1.6776018099547512e-05, "loss": 0.3312, "step": 1710},
    {"epoch": 0.48642533936651583, "grad_norm": 5.753744125366211, "learning_rate": 1.675716440422323e-05, "loss": 0.4188, "step": 1720},
    {"epoch": 0.48925339366515835, "grad_norm": 2.692269802093506, "learning_rate": 1.6738310708898946e-05, "loss": 0.3149, "step": 1730},
    {"epoch": 0.4920814479638009, "grad_norm": 3.748378038406372, "learning_rate": 1.6719457013574662e-05, "loss": 0.374, "step": 1740},
    {"epoch": 0.49490950226244346, "grad_norm": 7.150949478149414, "learning_rate": 1.670060331825038e-05, "loss": 0.4427, "step": 1750},
    {"epoch": 0.497737556561086, "grad_norm": 4.332088470458984, "learning_rate": 1.6681749622926096e-05, "loss": 0.296, "step": 1760},
    {"epoch": 0.5005656108597285, "grad_norm": 3.9501969814300537, "learning_rate": 1.6662895927601812e-05, "loss": 0.3236, "step": 1770},
    {"epoch": 0.503393665158371, "grad_norm": 4.039945602416992, "learning_rate": 1.664404223227753e-05, "loss": 0.3654, "step": 1780},
    {"epoch": 0.5062217194570136, "grad_norm": 4.735800743103027, "learning_rate": 1.6625188536953245e-05, "loss": 0.3177, "step": 1790},
    {"epoch": 0.5090497737556561, "grad_norm": 3.796029806137085, "learning_rate": 1.6606334841628962e-05, "loss": 0.3302, "step": 1800},
    {"epoch": 0.5118778280542986, "grad_norm": 2.808561086654663, "learning_rate": 1.658748114630468e-05, "loss": 0.3533, "step": 1810},
    {"epoch": 0.5147058823529411, "grad_norm": 3.9006407260894775, "learning_rate": 1.6568627450980395e-05, "loss": 0.3862, "step": 1820},
    {"epoch": 0.5175339366515838, "grad_norm": 6.6023850440979, "learning_rate": 1.654977375565611e-05, "loss": 0.4115, "step": 1830},
    {"epoch": 0.5203619909502263, "grad_norm": 3.3932111263275146, "learning_rate": 1.6530920060331825e-05, "loss": 0.2936, "step": 1840},
    {"epoch": 0.5231900452488688, "grad_norm": 4.266836166381836, "learning_rate": 1.651206636500754e-05, "loss": 0.2848, "step": 1850},
    {"epoch": 0.5260180995475113, "grad_norm": 4.283823490142822, "learning_rate": 1.6493212669683258e-05, "loss": 0.4285, "step": 1860},
    {"epoch": 0.5288461538461539, "grad_norm": 3.3755383491516113, "learning_rate": 1.6474358974358975e-05, "loss": 0.3579, "step": 1870},
    {"epoch": 0.5316742081447964, "grad_norm": 5.754073143005371, "learning_rate": 1.645550527903469e-05, "loss": 0.3325, "step": 1880},
    {"epoch": 0.5345022624434389, "grad_norm": 2.890216588973999, "learning_rate": 1.6436651583710408e-05, "loss": 0.3866, "step": 1890},
    {"epoch": 0.5373303167420814, "grad_norm": 4.1960978507995605, "learning_rate": 1.6417797888386125e-05, "loss": 0.4097, "step": 1900},
    {"epoch": 0.540158371040724, "grad_norm": 4.490061283111572, "learning_rate": 1.639894419306184e-05, "loss": 0.3541, "step": 1910},
    {"epoch": 0.5429864253393665, "grad_norm": 2.911954879760742, "learning_rate": 1.6380090497737558e-05, "loss": 0.3435, "step": 1920},
    {"epoch": 0.545814479638009, "grad_norm": 2.816277027130127, "learning_rate": 1.6361236802413275e-05, "loss": 0.3168, "step": 1930},
    {"epoch": 0.5486425339366516, "grad_norm": 5.4081807136535645, "learning_rate": 1.634238310708899e-05, "loss": 0.404, "step": 1940},
    {"epoch": 0.5514705882352942, "grad_norm": 6.02499532699585, "learning_rate": 1.6323529411764708e-05, "loss": 0.4293, "step": 1950},
    {"epoch": 0.5542986425339367, "grad_norm": 5.138996124267578, "learning_rate": 1.6304675716440424e-05, "loss": 0.4194, "step": 1960},
    {"epoch": 0.5571266968325792, "grad_norm": 4.069638252258301, "learning_rate": 1.628582202111614e-05, "loss": 0.4188, "step": 1970},
    {"epoch": 0.5599547511312217, "grad_norm": 4.273077487945557, "learning_rate": 1.6266968325791858e-05, "loss": 0.3714, "step": 1980},
    {"epoch": 0.5627828054298643, "grad_norm": 3.559727430343628, "learning_rate": 1.6248114630467574e-05, "loss": 0.2804, "step": 1990},
    {"epoch": 0.5656108597285068, "grad_norm": 3.5052013397216797, "learning_rate": 1.622926093514329e-05, "loss": 0.375, "step": 2000},
    {"epoch": 0.5656108597285068, "eval_accuracy": 0.8721762647152402, "eval_loss": 0.32142505049705505, "eval_runtime": 126.2811, "eval_samples_per_second": 99.556, "eval_steps_per_second": 3.112, "step": 2000},
    {"epoch": 0.5684389140271493, "grad_norm": 3.033839464187622, "learning_rate": 1.6210407239819008e-05, "loss": 0.2655, "step": 2010},
    {"epoch": 0.5712669683257918, "grad_norm": 8.15062427520752, "learning_rate": 1.6191553544494724e-05, "loss": 0.282, "step": 2020},
    {"epoch": 0.5740950226244343, "grad_norm": 4.665267467498779, "learning_rate": 1.6172699849170437e-05, "loss": 0.3432, "step": 2030},
    {"epoch": 0.5769230769230769, "grad_norm": 5.122295379638672, "learning_rate": 1.6153846153846154e-05, "loss": 0.4143, "step": 2040},
    {"epoch": 0.5797511312217195, "grad_norm": 5.127368450164795, "learning_rate": 1.613499245852187e-05, "loss": 0.3529, "step": 2050},
    {"epoch": 0.582579185520362, "grad_norm": 4.725905418395996, "learning_rate": 1.6116138763197587e-05, "loss": 0.3102, "step": 2060},
    {"epoch": 0.5854072398190046, "grad_norm": 2.358879566192627, "learning_rate": 1.6097285067873304e-05, "loss": 0.4083, "step": 2070},
    {"epoch": 0.5882352941176471, "grad_norm": 4.624474048614502, "learning_rate": 1.607843137254902e-05, "loss": 0.3742, "step": 2080},
    {"epoch": 0.5910633484162896, "grad_norm": 3.6771047115325928, "learning_rate": 1.6059577677224737e-05, "loss": 0.3705, "step": 2090},
    {"epoch": 0.5938914027149321, "grad_norm": 3.136711359024048, "learning_rate": 1.6040723981900454e-05, "loss": 0.3365, "step": 2100},
    {"epoch": 0.5967194570135747, "grad_norm": 4.1188883781433105, "learning_rate": 1.602187028657617e-05, "loss": 0.3138, "step": 2110},
    {"epoch": 0.5995475113122172, "grad_norm": 2.472294569015503, "learning_rate": 1.6003016591251887e-05, "loss": 0.2888, "step": 2120},
    {"epoch": 0.6023755656108597, "grad_norm": 3.7209103107452393, "learning_rate": 1.5984162895927604e-05, "loss": 0.3057, "step": 2130},
    {"epoch": 0.6052036199095022, "grad_norm": 3.5798637866973877, "learning_rate": 1.596530920060332e-05, "loss": 0.3481, "step": 2140},
    {"epoch": 0.6080316742081447, "grad_norm": 3.1317641735076904, "learning_rate": 1.5946455505279037e-05, "loss": 0.2694, "step": 2150},
    {"epoch": 0.6108597285067874, "grad_norm": 3.438688278198242, "learning_rate": 1.592760180995475e-05, "loss": 0.338, "step": 2160},
    {"epoch": 0.6136877828054299, "grad_norm": 2.2631101608276367, "learning_rate": 1.5908748114630467e-05, "loss": 0.4032, "step": 2170},
    {"epoch": 0.6165158371040724, "grad_norm": 3.2705330848693848, "learning_rate": 1.5889894419306183e-05, "loss": 0.3878, "step": 2180},
    {"epoch": 0.619343891402715, "grad_norm": 5.617705821990967, "learning_rate": 1.58710407239819e-05, "loss": 0.3213, "step": 2190},
    {"epoch": 0.6221719457013575, "grad_norm": 5.0493550300598145, "learning_rate": 1.5852187028657616e-05, "loss": 0.3606, "step": 2200},
    {"epoch": 0.625, "grad_norm": 2.885690689086914, "learning_rate": 1.5833333333333333e-05, "loss": 0.3232, "step": 2210},
    {"epoch": 0.6278280542986425, "grad_norm": 2.4986419677734375, "learning_rate": 1.581447963800905e-05, "loss": 0.3318, "step": 2220},
    {"epoch": 0.630656108597285, "grad_norm": 3.8310494422912598, "learning_rate": 1.5795625942684766e-05, "loss": 0.3241, "step": 2230},
    {"epoch": 0.6334841628959276, "grad_norm": 4.589399337768555, "learning_rate": 1.5776772247360483e-05, "loss": 0.3537, "step": 2240},
    {"epoch": 0.6363122171945701, "grad_norm": 3.939833164215088, "learning_rate": 1.57579185520362e-05, "loss": 0.3665, "step": 2250},
    {"epoch": 0.6391402714932126, "grad_norm": 3.5939204692840576, "learning_rate": 1.5739064856711916e-05, "loss": 0.3462, "step": 2260},
    {"epoch": 0.6419683257918553, "grad_norm": 4.346156597137451, "learning_rate": 1.5720211161387633e-05, "loss": 0.3887, "step": 2270},
    {"epoch": 0.6447963800904978, "grad_norm": 4.5238165855407715, "learning_rate": 1.570135746606335e-05, "loss": 0.269, "step": 2280},
    {"epoch": 0.6476244343891403, "grad_norm": 4.225012302398682, "learning_rate": 1.5682503770739066e-05, "loss": 0.3346, "step": 2290},
    {"epoch": 0.6504524886877828, "grad_norm": 5.076806545257568, "learning_rate": 1.5663650075414783e-05, "loss": 0.4031, "step": 2300},
    {"epoch": 0.6532805429864253, "grad_norm": 5.921730041503906, "learning_rate": 1.56447963800905e-05, "loss": 0.3348, "step": 2310},
    {"epoch": 0.6561085972850679, "grad_norm": 5.128915309906006, "learning_rate": 1.5625942684766216e-05, "loss": 0.3626, "step": 2320},
    {"epoch": 0.6589366515837104, "grad_norm": 3.5405006408691406, "learning_rate": 1.5607088989441932e-05, "loss": 0.3396, "step": 2330},
    {"epoch": 0.6617647058823529, "grad_norm": 5.166226863861084, "learning_rate": 1.558823529411765e-05, "loss": 0.3281, "step": 2340},
    {"epoch": 0.6645927601809954, "grad_norm": 3.0114974975585938, "learning_rate": 1.5569381598793366e-05, "loss": 0.3495, "step": 2350},
    {"epoch": 0.667420814479638, "grad_norm": 4.730240345001221, "learning_rate": 1.5550527903469082e-05, "loss": 0.3775, "step": 2360},
    {"epoch": 0.6702488687782805, "grad_norm": 6.134402275085449, "learning_rate": 1.55316742081448e-05, "loss": 0.408, "step": 2370},
    {"epoch": 0.6730769230769231, "grad_norm": 8.204373359680176, "learning_rate": 1.5512820512820516e-05, "loss": 0.3215, "step": 2380},
    {"epoch": 0.6759049773755657, "grad_norm": 5.2875895500183105, "learning_rate": 1.5493966817496232e-05, "loss": 0.3004, "step": 2390},
    {"epoch": 0.6787330316742082, "grad_norm": 4.722002029418945, "learning_rate": 1.547511312217195e-05, "loss": 0.3495, "step": 2400},
    {"epoch": 0.6815610859728507, "grad_norm": 3.777385711669922, "learning_rate": 1.5456259426847665e-05, "loss": 0.314, "step": 2410},
    {"epoch": 0.6843891402714932, "grad_norm": 4.804584503173828, "learning_rate": 1.5437405731523382e-05, "loss": 0.302, "step": 2420},
    {"epoch": 0.6872171945701357, "grad_norm": 1.9814542531967163, "learning_rate": 1.54185520361991e-05, "loss": 0.32, "step": 2430},
    {"epoch": 0.6900452488687783, "grad_norm": 4.671655178070068, "learning_rate": 1.5399698340874812e-05, "loss": 0.3471, "step": 2440},
    {"epoch": 0.6928733031674208, "grad_norm": 4.3465776443481445, "learning_rate": 1.538084464555053e-05, "loss": 0.4079, "step": 2450},
    {"epoch": 0.6957013574660633, "grad_norm": 5.087115287780762, "learning_rate": 1.5361990950226245e-05, "loss": 0.2957, "step": 2460},
    {"epoch": 0.6985294117647058, "grad_norm": 4.124098777770996, "learning_rate": 1.5343137254901962e-05, "loss": 0.3573, "step": 2470},
    {"epoch": 0.7013574660633484, "grad_norm": 4.266404628753662, "learning_rate": 1.532428355957768e-05, "loss": 0.3054, "step": 2480},
    {"epoch": 0.704185520361991, "grad_norm": 3.325258731842041, "learning_rate": 1.5305429864253395e-05, "loss": 0.3675, "step": 2490},
    {"epoch": 0.7070135746606335, "grad_norm": 2.9218814373016357, "learning_rate": 1.528657616892911e-05, "loss": 0.3175, "step": 2500},
    {"epoch": 0.709841628959276, "grad_norm": 4.399160385131836, "learning_rate": 1.5267722473604828e-05, "loss": 0.3239, "step": 2510},
    {"epoch": 0.7126696832579186, "grad_norm": 4.460221290588379, "learning_rate": 1.5248868778280543e-05, "loss": 0.3827, "step": 2520},
    {"epoch": 0.7154977375565611, "grad_norm": 3.0739834308624268, "learning_rate": 1.523001508295626e-05, "loss": 0.3725, "step": 2530},
    {"epoch": 0.7183257918552036, "grad_norm": 2.8812670707702637, "learning_rate": 1.5211161387631976e-05, "loss": 0.2495, "step": 2540},
    {"epoch": 0.7211538461538461, "grad_norm": 6.949345588684082, "learning_rate": 1.5192307692307693e-05, "loss": 0.3632, "step": 2550},
    {"epoch": 0.7239819004524887, "grad_norm": 3.124908685684204, "learning_rate": 1.517345399698341e-05, "loss": 0.3681, "step": 2560},
    {"epoch": 0.7268099547511312, "grad_norm": 4.435882091522217, "learning_rate": 1.5154600301659126e-05, "loss": 0.3279, "step": 2570},
    {"epoch": 0.7296380090497737, "grad_norm": 3.6505391597747803, "learning_rate": 1.5135746606334843e-05, "loss": 0.3487, "step": 2580},
    {"epoch": 0.7324660633484162, "grad_norm": 3.057103395462036, "learning_rate": 1.511689291101056e-05, "loss": 0.3086, "step": 2590},
    {"epoch": 0.7352941176470589, "grad_norm": 2.988297462463379, "learning_rate": 1.5098039215686276e-05, "loss": 0.2901, "step": 2600},
    {"epoch": 0.7381221719457014, "grad_norm": 8.121850967407227, "learning_rate": 1.5079185520361993e-05, "loss": 0.3656, "step": 2610},
    {"epoch": 0.7409502262443439, "grad_norm": 3.4862985610961914, "learning_rate": 1.506033182503771e-05, "loss": 0.3144, "step": 2620},
    {"epoch": 0.7437782805429864, "grad_norm": 2.3046765327453613, "learning_rate": 1.5041478129713424e-05, "loss": 0.2498, "step": 2630},
    {"epoch": 0.746606334841629, "grad_norm": 3.606008529663086, "learning_rate": 1.502262443438914e-05, "loss": 0.3449, "step": 2640},
    {"epoch": 0.7494343891402715, "grad_norm": 3.494842767715454, "learning_rate": 1.5003770739064857e-05, "loss": 0.2784, "step": 2650},
    {"epoch": 0.752262443438914, "grad_norm": 5.306181907653809, "learning_rate": 1.4984917043740574e-05, "loss": 0.2783, "step": 2660},
    {"epoch": 0.7550904977375565, "grad_norm": 3.1774604320526123, "learning_rate": 1.496606334841629e-05, "loss": 0.2216, "step": 2670},
    {"epoch": 0.7579185520361991, "grad_norm": 5.226140022277832, "learning_rate": 1.4947209653092007e-05, "loss": 0.386, "step": 2680},
    {"epoch": 0.7607466063348416, "grad_norm": 3.7945973873138428, "learning_rate": 1.4928355957767724e-05, "loss": 0.3213, "step": 2690},
    {"epoch": 0.7635746606334841, "grad_norm": 3.4387052059173584, "learning_rate": 1.490950226244344e-05, "loss": 0.3386, "step": 2700},
    {"epoch": 0.7664027149321267, "grad_norm": 3.023867607116699, "learning_rate": 1.4890648567119157e-05, "loss": 0.3507, "step": 2710},
    {"epoch": 0.7692307692307693, "grad_norm": 5.2512640953063965, "learning_rate": 1.4871794871794874e-05, "loss": 0.3141, "step": 2720},
    {"epoch": 0.7720588235294118, "grad_norm": 3.79915452003479, "learning_rate": 1.485294117647059e-05, "loss": 0.3441, "step": 2730},
    {"epoch": 0.7748868778280543, "grad_norm": 1.8601824045181274, "learning_rate": 1.4834087481146307e-05, "loss": 0.3662, "step": 2740},
    {"epoch": 0.7777149321266968, "grad_norm": 2.1231563091278076, "learning_rate": 1.4815233785822024e-05, "loss": 0.3365, "step": 2750},
    {"epoch": 0.7805429864253394, "grad_norm": 5.087540149688721, "learning_rate": 1.479638009049774e-05, "loss": 0.3339, "step": 2760},
    {"epoch": 0.7833710407239819, "grad_norm": 6.3354668617248535, "learning_rate": 1.4777526395173457e-05, "loss": 0.317, "step": 2770},
    {"epoch": 0.7861990950226244, "grad_norm": 3.519740581512451, "learning_rate": 1.4758672699849172e-05, "loss": 0.3237, "step": 2780},
    {"epoch": 0.7890271493212669, "grad_norm": 2.299184560775757, "learning_rate": 1.4739819004524888e-05, "loss": 0.289, "step": 2790},
    {"epoch": 0.7918552036199095, "grad_norm": 6.4508490562438965, "learning_rate": 1.4720965309200605e-05, "loss": 0.3214, "step": 2800},
    {"epoch": 0.794683257918552, "grad_norm": 1.989512324333191, "learning_rate": 1.4702111613876322e-05, "loss": 0.2949, "step": 2810},
    {"epoch": 0.7975113122171946, "grad_norm": 5.373081684112549, "learning_rate": 1.4683257918552036e-05, "loss": 0.3365, "step": 2820},
    {"epoch": 0.8003393665158371, "grad_norm": 2.989363193511963, "learning_rate": 1.4664404223227753e-05, "loss": 0.2495, "step": 2830},
    {"epoch": 0.8031674208144797, "grad_norm": 4.9633660316467285, "learning_rate": 1.464555052790347e-05, "loss": 0.2815, "step": 2840},
    {"epoch": 0.8059954751131222, "grad_norm": 6.031944274902344, "learning_rate": 1.4626696832579186e-05, "loss": 0.2632, "step": 2850},
    {"epoch": 0.8088235294117647, "grad_norm": 3.689105987548828, "learning_rate": 1.4607843137254903e-05, "loss": 0.3192, "step": 2860},
    {"epoch": 0.8116515837104072, "grad_norm": 3.926541805267334, "learning_rate": 1.458898944193062e-05, "loss": 0.3569, "step": 2870},
    {"epoch": 0.8144796380090498, "grad_norm": 4.753978252410889, "learning_rate": 1.4570135746606336e-05, "loss": 0.3957, "step": 2880},
    {"epoch": 0.8173076923076923, "grad_norm": 5.156829833984375, "learning_rate": 1.4551282051282051e-05, "loss": 0.3271, "step": 2890},
    {"epoch": 0.8201357466063348, "grad_norm": 3.1717522144317627, "learning_rate": 1.4532428355957768e-05, "loss": 0.3337, "step": 2900},
    {"epoch": 0.8229638009049773, "grad_norm": 2.9101099967956543, "learning_rate": 1.4513574660633484e-05, "loss": 0.3331, "step": 2910},
    {"epoch": 0.8257918552036199, "grad_norm": 3.014803171157837, "learning_rate": 1.4494720965309201e-05, "loss": 0.303, "step": 2920},
    {"epoch": 0.8286199095022625, "grad_norm": 6.68400764465332, "learning_rate": 1.4475867269984918e-05, "loss": 0.3414, "step": 2930},
    {"epoch": 0.831447963800905, "grad_norm": 4.039949417114258, "learning_rate": 1.4457013574660634e-05, "loss": 0.3107, "step": 2940},
    {"epoch": 0.8342760180995475, "grad_norm": 5.4491286277771, "learning_rate": 1.443815987933635e-05, "loss": 0.3094, "step": 2950},
    {"epoch": 0.8371040723981901, "grad_norm": 4.456144332885742, "learning_rate": 1.4419306184012067e-05, "loss": 0.3137, "step": 2960},
    {"epoch": 0.8399321266968326, "grad_norm": 3.682917594909668, "learning_rate": 1.4400452488687784e-05, "loss": 0.3221, "step": 2970},
    {"epoch": 0.8427601809954751, "grad_norm": 4.826881408691406, "learning_rate": 1.43815987933635e-05, "loss": 0.3828, "step": 2980},
    {"epoch": 0.8455882352941176, "grad_norm": 4.945711135864258, "learning_rate": 1.4362745098039217e-05, "loss": 0.2887, "step": 2990},
    {"epoch": 0.8484162895927602, "grad_norm": 6.562948226928711, "learning_rate": 1.4343891402714934e-05, "loss": 0.3222, "step": 3000},
    {"epoch": 0.8484162895927602, "eval_accuracy": 0.8736875596563792, "eval_loss": 0.32356539368629456, "eval_runtime": 126.3914, "eval_samples_per_second": 99.469, "eval_steps_per_second": 3.109, "step": 3000},
    {"epoch": 0.8512443438914027, "grad_norm": 5.110568523406982, "learning_rate": 1.432503770739065e-05, "loss": 0.3145, "step": 3010},
    {"epoch": 0.8540723981900452, "grad_norm": 4.4614081382751465, "learning_rate": 1.4306184012066367e-05, "loss": 0.3098, "step": 3020},
    {"epoch": 0.8569004524886877, "grad_norm": 3.0560202598571777, "learning_rate": 1.4287330316742084e-05, "loss": 0.3271, "step": 3030},
    {"epoch": 0.8597285067873304, "grad_norm": 5.284294605255127, "learning_rate": 1.4268476621417799e-05, "loss": 0.3461, "step": 3040},
    {"epoch": 0.8625565610859729, "grad_norm": 3.1206512451171875, "learning_rate": 1.4249622926093515e-05, "loss": 0.2705, "step": 3050},
    {"epoch": 0.8653846153846154, "grad_norm": 4.308442115783691, "learning_rate": 1.4230769230769232e-05, "loss": 0.3691, "step": 3060},
    {"epoch": 0.8682126696832579, "grad_norm": 2.637321710586548, "learning_rate": 1.4211915535444948e-05, "loss": 0.3388, "step": 3070},
    {"epoch": 0.8710407239819005, "grad_norm": 4.938368797302246, "learning_rate": 1.4193061840120665e-05, "loss": 0.288, "step": 3080},
    {"epoch": 0.873868778280543, "grad_norm": 4.269132137298584, "learning_rate": 1.4174208144796382e-05, "loss": 0.3347, "step": 3090},
    {"epoch": 0.8766968325791855, "grad_norm": 4.967940807342529, "learning_rate": 1.4155354449472098e-05, "loss": 0.318, "step": 3100},
    {"epoch": 0.879524886877828, "grad_norm": 4.122420787811279, "learning_rate": 1.4136500754147815e-05, "loss": 0.3139, "step": 3110},
    {"epoch": 0.8823529411764706, "grad_norm": 3.85040545463562, "learning_rate": 1.4117647058823532e-05, "loss": 0.3544, "step": 3120},
    {"epoch": 0.8851809954751131, "grad_norm": 5.834123134613037, "learning_rate": 1.4098793363499248e-05, "loss": 0.3717, "step": 3130},
    {"epoch": 0.8880090497737556, "grad_norm": 3.2261178493499756, "learning_rate": 1.4079939668174965e-05, "loss": 0.3071, "step": 3140},
    {"epoch": 0.8908371040723982, "grad_norm": 5.519437789916992, "learning_rate": 1.4061085972850678e-05, "loss": 0.2519, "step": 3150},
    {"epoch": 0.8936651583710408, "grad_norm": 2.5292046070098877, "learning_rate": 1.4042232277526395e-05, "loss": 0.3543, "step": 3160},
    {"epoch": 0.8964932126696833, "grad_norm": 3.737870454788208, "learning_rate": 1.4023378582202111e-05, "loss": 0.3036, "step": 3170},
    {"epoch": 0.8993212669683258, "grad_norm": 4.248650550842285, "learning_rate": 1.4004524886877828e-05, "loss": 0.321, "step": 3180},
    {"epoch": 0.9021493212669683, "grad_norm": 3.5133938789367676, "learning_rate": 1.3985671191553544e-05, "loss": 0.354, "step": 3190},
    {"epoch": 0.9049773755656109, "grad_norm": 2.819633722305298, "learning_rate": 1.3966817496229261e-05, "loss": 0.2789, "step": 3200},
    {"epoch": 0.9078054298642534, "grad_norm": 3.6485252380371094, "learning_rate": 1.3947963800904978e-05, "loss": 0.3348, "step": 3210},
    {"epoch": 0.9106334841628959, "grad_norm": 4.469762325286865, "learning_rate": 1.3929110105580694e-05, "loss": 0.2611, "step": 3220},
    {"epoch": 0.9134615384615384, "grad_norm": 5.00715970993042, "learning_rate": 1.3910256410256411e-05, "loss": 0.3701, "step": 3230},
    {"epoch": 0.916289592760181, "grad_norm": 3.802788734436035, "learning_rate": 1.3891402714932128e-05, "loss": 0.4052, "step": 3240},
    {"epoch": 0.9191176470588235, "grad_norm": 3.6908090114593506, "learning_rate": 1.3872549019607844e-05, "loss": 0.3163, "step": 3250},
    {"epoch": 0.9219457013574661, "grad_norm": 4.198665142059326, "learning_rate": 1.385369532428356e-05, "loss": 0.3004, "step": 3260},
    {"epoch": 0.9247737556561086, "grad_norm": 5.080460071563721, "learning_rate": 1.3834841628959277e-05, "loss": 0.3444, "step": 3270},
    {"epoch": 0.9276018099547512, "grad_norm": 5.11644983291626, "learning_rate": 1.3815987933634994e-05, "loss": 0.3238, "step": 3280},
    {"epoch": 0.9304298642533937, "grad_norm": 7.753527641296387, "learning_rate": 1.379713423831071e-05, "loss": 0.3969, "step": 3290},
    {"epoch": 0.9332579185520362, "grad_norm": 3.2283082008361816, "learning_rate": 1.3778280542986426e-05, "loss": 0.3186, "step": 3300},
    {"epoch": 0.9360859728506787, "grad_norm": 5.4364094734191895, "learning_rate": 1.3759426847662142e-05, "loss": 0.2786, "step": 3310},
    {"epoch": 0.9389140271493213, "grad_norm": 4.061675071716309, "learning_rate": 1.3740573152337859e-05, "loss": 0.2909, "step": 3320},
    {"epoch": 0.9417420814479638, "grad_norm": 2.8919031620025635, "learning_rate": 1.3721719457013575e-05, "loss": 0.3572, "step": 3330},
    {"epoch": 0.9445701357466063, "grad_norm": 2.643793821334839, "learning_rate": 1.3702865761689292e-05, "loss": 0.2582, "step": 3340},
    {"epoch": 0.9473981900452488, "grad_norm": 2.6080071926116943, "learning_rate": 1.3684012066365009e-05, "loss": 0.3299, "step": 3350},
    {"epoch": 0.9502262443438914, "grad_norm": 3.8307015895843506, "learning_rate": 1.3665158371040725e-05, "loss": 0.3193, "step": 3360},
    {"epoch": 0.9530542986425339, "grad_norm": 5.132751941680908, "learning_rate": 1.3646304675716442e-05, "loss": 0.3029, "step": 3370},
    {"epoch": 0.9558823529411765, "grad_norm": 2.5157196521759033, "learning_rate": 1.3627450980392158e-05, "loss": 0.2851, "step": 3380},
    {"epoch": 0.958710407239819, "grad_norm": 2.9101061820983887, "learning_rate": 1.3608597285067875e-05, "loss": 0.2542, "step": 3390},
    {"epoch": 0.9615384615384616, "grad_norm": 4.939927101135254, "learning_rate": 1.3589743589743592e-05, "loss": 0.2904, "step": 3400},
    {"epoch": 0.9643665158371041, "grad_norm": 3.6113576889038086, "learning_rate": 1.3570889894419308e-05, "loss": 0.2654, "step": 3410},
    {"epoch": 0.9671945701357466, "grad_norm": 7.237005710601807, "learning_rate": 1.3552036199095025e-05, "loss": 0.2636, "step": 3420},
    {"epoch": 0.9700226244343891, "grad_norm": 4.309847354888916, "learning_rate": 1.3533182503770742e-05, "loss": 0.3095, "step": 3430},
    {"epoch": 0.9728506787330317, "grad_norm": 3.404597520828247, "learning_rate": 1.3514328808446458e-05, "loss": 0.3147, "step": 3440},
    {"epoch": 0.9756787330316742, "grad_norm": 2.2847061157226562, "learning_rate": 1.3495475113122173e-05, "loss": 0.2536, "step": 3450},
    {"epoch": 0.9785067873303167, "grad_norm": 3.670473337173462, "learning_rate": 1.347662141779789e-05, "loss": 0.4246, "step": 3460},
    {"epoch": 0.9813348416289592, "grad_norm": 3.955064296722412, "learning_rate": 1.3457767722473606e-05, "loss": 0.3578, "step": 3470},
    {"epoch": 0.9841628959276018, "grad_norm": 4.502097129821777, "learning_rate": 1.3438914027149323e-05, "loss": 0.2467, "step": 3480},
    {"epoch": 0.9869909502262444, "grad_norm": 2.4463083744049072, "learning_rate": 1.3420060331825038e-05, "loss": 0.2968, "step": 3490},
    {"epoch": 0.9898190045248869, "grad_norm": 4.400903224945068, "learning_rate": 1.3401206636500754e-05, "loss": 0.2767, "step": 3500},
    {"epoch": 0.9926470588235294, "grad_norm": 3.8190836906433105, "learning_rate": 1.3382352941176471e-05, "loss": 0.2868, "step": 3510},
    {"epoch": 0.995475113122172, "grad_norm": 6.496269702911377, "learning_rate": 1.3363499245852188e-05, "loss": 0.2562, "step": 3520},
    {"epoch": 0.9983031674208145, "grad_norm": 6.07765531539917, "learning_rate": 1.3344645550527904e-05, "loss": 0.3454, "step": 3530},
    {"epoch": 1.001131221719457, "grad_norm": 2.3242061138153076, "learning_rate": 1.3325791855203621e-05, "loss": 0.2916, "step": 3540},
    {"epoch": 1.0039592760180995, "grad_norm": 4.1568922996521, "learning_rate": 1.3306938159879338e-05, "loss": 0.2674, "step": 3550},
    {"epoch": 1.006787330316742, "grad_norm": 4.240556240081787, "learning_rate": 1.3288084464555052e-05, "loss": 0.2308, "step": 3560},
    {"epoch": 1.0096153846153846, "grad_norm": 3.1726534366607666, "learning_rate": 1.3269230769230769e-05, "loss": 0.2158, "step": 3570},
    {"epoch": 1.012443438914027, "grad_norm": 2.371945381164551, "learning_rate": 1.3250377073906486e-05, "loss": 0.2726, "step": 3580},
    {"epoch": 1.0152714932126696, "grad_norm": 4.0892744064331055, "learning_rate": 1.3231523378582202e-05, "loss": 0.2854, "step": 3590},
    {"epoch": 1.0180995475113122, "grad_norm": 4.087936878204346, "learning_rate": 1.3212669683257919e-05, "loss": 0.2819, "step": 3600},
    {"epoch": 1.0209276018099547, "grad_norm": 2.393385171890259, "learning_rate": 1.3193815987933636e-05, "loss": 0.2647, "step": 3610},
    {"epoch": 1.0237556561085972, "grad_norm": 5.088064193725586, "learning_rate": 1.3174962292609352e-05, "loss": 0.3036, "step": 3620},
    {"epoch": 1.0265837104072397, "grad_norm": 5.494114875793457, "learning_rate": 1.3156108597285069e-05, "loss": 0.2903, "step": 3630},
|
{ |
|
"epoch": 1.0294117647058822, |
|
"grad_norm": 1.7145814895629883, |
|
"learning_rate": 1.3137254901960785e-05, |
|
"loss": 0.2035, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.032239819004525, |
|
"grad_norm": 5.571091651916504, |
|
"learning_rate": 1.3118401206636502e-05, |
|
"loss": 0.2725, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.0350678733031675, |
|
"grad_norm": 2.1600940227508545, |
|
"learning_rate": 1.3099547511312219e-05, |
|
"loss": 0.2664, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.03789592760181, |
|
"grad_norm": 5.715987205505371, |
|
"learning_rate": 1.3080693815987935e-05, |
|
"loss": 0.288, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.0407239819004526, |
|
"grad_norm": 3.7473366260528564, |
|
"learning_rate": 1.3061840120663652e-05, |
|
"loss": 0.3043, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.043552036199095, |
|
"grad_norm": 6.1543121337890625, |
|
"learning_rate": 1.3042986425339369e-05, |
|
"loss": 0.3095, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.0463800904977376, |
|
"grad_norm": 2.4978766441345215, |
|
"learning_rate": 1.3024132730015085e-05, |
|
"loss": 0.2721, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.0492081447963801, |
|
"grad_norm": 6.851878643035889, |
|
"learning_rate": 1.3005279034690802e-05, |
|
"loss": 0.2912, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.0520361990950227, |
|
"grad_norm": 3.8570425510406494, |
|
"learning_rate": 1.2986425339366517e-05, |
|
"loss": 0.3127, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.0548642533936652, |
|
"grad_norm": 7.417280197143555, |
|
"learning_rate": 1.2967571644042233e-05, |
|
"loss": 0.2707, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.0576923076923077, |
|
"grad_norm": 4.451798915863037, |
|
"learning_rate": 1.294871794871795e-05, |
|
"loss": 0.2387, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.0605203619909502, |
|
"grad_norm": 3.9390320777893066, |
|
"learning_rate": 1.2929864253393667e-05, |
|
"loss": 0.2626, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.0633484162895928, |
|
"grad_norm": 1.990342617034912, |
|
"learning_rate": 1.2911010558069383e-05, |
|
"loss": 0.2694, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.0661764705882353, |
|
"grad_norm": 3.4424543380737305, |
|
"learning_rate": 1.28921568627451e-05, |
|
"loss": 0.3381, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.0690045248868778, |
|
"grad_norm": 1.7783031463623047, |
|
"learning_rate": 1.2873303167420816e-05, |
|
"loss": 0.1981, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.0718325791855203, |
|
"grad_norm": 3.8346874713897705, |
|
"learning_rate": 1.2854449472096533e-05, |
|
"loss": 0.2624, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.0746606334841629, |
|
"grad_norm": 5.832867622375488, |
|
"learning_rate": 1.283559577677225e-05, |
|
"loss": 0.2581, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.0774886877828054, |
|
"grad_norm": 4.65895414352417, |
|
"learning_rate": 1.2816742081447966e-05, |
|
"loss": 0.3619, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.080316742081448, |
|
"grad_norm": 3.93692946434021, |
|
"learning_rate": 1.279788838612368e-05, |
|
"loss": 0.3296, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.0831447963800904, |
|
"grad_norm": 2.544408082962036, |
|
"learning_rate": 1.2779034690799396e-05, |
|
"loss": 0.2733, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.085972850678733, |
|
"grad_norm": 4.070341110229492, |
|
"learning_rate": 1.2760180995475113e-05, |
|
"loss": 0.2303, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.0888009049773755, |
|
"grad_norm": 3.7344400882720947, |
|
"learning_rate": 1.274132730015083e-05, |
|
"loss": 0.2809, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.091628959276018, |
|
"grad_norm": 5.270275592803955, |
|
"learning_rate": 1.2722473604826546e-05, |
|
"loss": 0.2894, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.0944570135746607, |
|
"grad_norm": 4.697700500488281, |
|
"learning_rate": 1.2703619909502263e-05, |
|
"loss": 0.2382, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.0972850678733033, |
|
"grad_norm": 3.1016902923583984, |
|
"learning_rate": 1.2684766214177979e-05, |
|
"loss": 0.2545, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.1001131221719458, |
|
"grad_norm": 3.6058175563812256, |
|
"learning_rate": 1.2665912518853696e-05, |
|
"loss": 0.2747, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.1029411764705883, |
|
"grad_norm": 6.918750286102295, |
|
"learning_rate": 1.2647058823529412e-05, |
|
"loss": 0.2541, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.1057692307692308, |
|
"grad_norm": 4.158249855041504, |
|
"learning_rate": 1.2628205128205129e-05, |
|
"loss": 0.2514, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.1085972850678734, |
|
"grad_norm": 5.783833980560303, |
|
"learning_rate": 1.2609351432880846e-05, |
|
"loss": 0.2547, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.1114253393665159, |
|
"grad_norm": 3.524967670440674, |
|
"learning_rate": 1.2590497737556562e-05, |
|
"loss": 0.2767, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.1142533936651584, |
|
"grad_norm": 2.39933705329895, |
|
"learning_rate": 1.2571644042232279e-05, |
|
"loss": 0.2359, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.117081447963801, |
|
"grad_norm": 4.107085704803467, |
|
"learning_rate": 1.2552790346907995e-05, |
|
"loss": 0.244, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.1199095022624435, |
|
"grad_norm": 6.81174898147583, |
|
"learning_rate": 1.2533936651583712e-05, |
|
"loss": 0.2579, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.122737556561086, |
|
"grad_norm": 1.8872418403625488, |
|
"learning_rate": 1.2515082956259429e-05, |
|
"loss": 0.2101, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.1255656108597285, |
|
"grad_norm": 3.872263193130493, |
|
"learning_rate": 1.2496229260935144e-05, |
|
"loss": 0.3009, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.128393665158371, |
|
"grad_norm": 2.7092275619506836, |
|
"learning_rate": 1.247737556561086e-05, |
|
"loss": 0.3053, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.1312217194570136, |
|
"grad_norm": 6.832910537719727, |
|
"learning_rate": 1.2458521870286577e-05, |
|
"loss": 0.2339, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1312217194570136, |
|
"eval_accuracy": 0.8789373210308622, |
|
"eval_loss": 0.31826069951057434, |
|
"eval_runtime": 126.5036, |
|
"eval_samples_per_second": 99.381, |
|
"eval_steps_per_second": 3.107, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.134049773755656, |
|
"grad_norm": 4.47673225402832, |
|
"learning_rate": 1.2439668174962293e-05, |
|
"loss": 0.2778, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.1368778280542986, |
|
"grad_norm": 5.049123287200928, |
|
"learning_rate": 1.242081447963801e-05, |
|
"loss": 0.3089, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.1397058823529411, |
|
"grad_norm": 3.6429476737976074, |
|
"learning_rate": 1.2401960784313727e-05, |
|
"loss": 0.2583, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.1425339366515836, |
|
"grad_norm": 4.532712936401367, |
|
"learning_rate": 1.2383107088989443e-05, |
|
"loss": 0.276, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.1453619909502262, |
|
"grad_norm": 2.8139843940734863, |
|
"learning_rate": 1.236425339366516e-05, |
|
"loss": 0.2866, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.1481900452488687, |
|
"grad_norm": 3.5737717151641846, |
|
"learning_rate": 1.2345399698340877e-05, |
|
"loss": 0.2609, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.1510180995475112, |
|
"grad_norm": 3.4656126499176025, |
|
"learning_rate": 1.2326546003016593e-05, |
|
"loss": 0.2063, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 7.43180513381958, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.3614, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.1566742081447963, |
|
"grad_norm": 2.853827476501465, |
|
"learning_rate": 1.2288838612368026e-05, |
|
"loss": 0.3006, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.1595022624434388, |
|
"grad_norm": 4.522756099700928, |
|
"learning_rate": 1.2269984917043743e-05, |
|
"loss": 0.3122, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.1623303167420815, |
|
"grad_norm": 3.891043186187744, |
|
"learning_rate": 1.225113122171946e-05, |
|
"loss": 0.3056, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.165158371040724, |
|
"grad_norm": 2.7950618267059326, |
|
"learning_rate": 1.2232277526395176e-05, |
|
"loss": 0.3195, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.1679864253393666, |
|
"grad_norm": 3.972943067550659, |
|
"learning_rate": 1.2213423831070891e-05, |
|
"loss": 0.3315, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.170814479638009, |
|
"grad_norm": 10.891520500183105, |
|
"learning_rate": 1.2194570135746608e-05, |
|
"loss": 0.276, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.1736425339366516, |
|
"grad_norm": 4.51856803894043, |
|
"learning_rate": 1.2175716440422323e-05, |
|
"loss": 0.2355, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 2.575591564178467, |
|
"learning_rate": 1.215686274509804e-05, |
|
"loss": 0.195, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.1792986425339367, |
|
"grad_norm": 3.9654879570007324, |
|
"learning_rate": 1.2138009049773756e-05, |
|
"loss": 0.2505, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.1821266968325792, |
|
"grad_norm": 5.328989028930664, |
|
"learning_rate": 1.2119155354449473e-05, |
|
"loss": 0.2326, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.1849547511312217, |
|
"grad_norm": 3.573969841003418, |
|
"learning_rate": 1.2100301659125189e-05, |
|
"loss": 0.2302, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.1877828054298643, |
|
"grad_norm": 4.914673328399658, |
|
"learning_rate": 1.2081447963800906e-05, |
|
"loss": 0.3016, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.1906108597285068, |
|
"grad_norm": 3.4880571365356445, |
|
"learning_rate": 1.2062594268476622e-05, |
|
"loss": 0.2811, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.1934389140271493, |
|
"grad_norm": 3.827131509780884, |
|
"learning_rate": 1.2043740573152339e-05, |
|
"loss": 0.2431, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.1962669683257918, |
|
"grad_norm": 4.236039161682129, |
|
"learning_rate": 1.2024886877828056e-05, |
|
"loss": 0.2951, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.1990950226244343, |
|
"grad_norm": 4.934144973754883, |
|
"learning_rate": 1.200603318250377e-05, |
|
"loss": 0.2739, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.2019230769230769, |
|
"grad_norm": 6.265100479125977, |
|
"learning_rate": 1.1987179487179487e-05, |
|
"loss": 0.2298, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.2047511312217194, |
|
"grad_norm": 2.0881166458129883, |
|
"learning_rate": 1.1968325791855204e-05, |
|
"loss": 0.245, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.207579185520362, |
|
"grad_norm": 8.104879379272461, |
|
"learning_rate": 1.194947209653092e-05, |
|
"loss": 0.2787, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.2104072398190044, |
|
"grad_norm": 4.176904201507568, |
|
"learning_rate": 1.1930618401206637e-05, |
|
"loss": 0.2805, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.213235294117647, |
|
"grad_norm": 4.049289226531982, |
|
"learning_rate": 1.1911764705882354e-05, |
|
"loss": 0.3075, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.2160633484162897, |
|
"grad_norm": 1.6341451406478882, |
|
"learning_rate": 1.189291101055807e-05, |
|
"loss": 0.2664, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.2188914027149322, |
|
"grad_norm": 7.3927388191223145, |
|
"learning_rate": 1.1874057315233787e-05, |
|
"loss": 0.3117, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.2217194570135748, |
|
"grad_norm": 1.8302087783813477, |
|
"learning_rate": 1.1855203619909503e-05, |
|
"loss": 0.333, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.2245475113122173, |
|
"grad_norm": 2.7119219303131104, |
|
"learning_rate": 1.183634992458522e-05, |
|
"loss": 0.313, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.2273755656108598, |
|
"grad_norm": 6.975501537322998, |
|
"learning_rate": 1.1817496229260937e-05, |
|
"loss": 0.2833, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.2302036199095023, |
|
"grad_norm": 2.1703109741210938, |
|
"learning_rate": 1.1798642533936653e-05, |
|
"loss": 0.2671, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.2330316742081449, |
|
"grad_norm": 3.932482957839966, |
|
"learning_rate": 1.177978883861237e-05, |
|
"loss": 0.2358, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.2358597285067874, |
|
"grad_norm": 2.7635726928710938, |
|
"learning_rate": 1.1760935143288087e-05, |
|
"loss": 0.3196, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 1.23868778280543, |
|
"grad_norm": 2.945617914199829, |
|
"learning_rate": 1.1742081447963803e-05, |
|
"loss": 0.286, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.2415158371040724, |
|
"grad_norm": 4.623812675476074, |
|
"learning_rate": 1.1723227752639518e-05, |
|
"loss": 0.2605, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 1.244343891402715, |
|
"grad_norm": 3.3469064235687256, |
|
"learning_rate": 1.1704374057315235e-05, |
|
"loss": 0.2515, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.2471719457013575, |
|
"grad_norm": 6.414296627044678, |
|
"learning_rate": 1.1685520361990951e-05, |
|
"loss": 0.3239, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 5.76809549331665, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 0.2534, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.2528280542986425, |
|
"grad_norm": 5.739138603210449, |
|
"learning_rate": 1.1647812971342385e-05, |
|
"loss": 0.2617, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 1.255656108597285, |
|
"grad_norm": 3.76336407661438, |
|
"learning_rate": 1.1628959276018101e-05, |
|
"loss": 0.2809, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.2584841628959276, |
|
"grad_norm": 3.3274784088134766, |
|
"learning_rate": 1.1610105580693818e-05, |
|
"loss": 0.2785, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.26131221719457, |
|
"grad_norm": 3.9663026332855225, |
|
"learning_rate": 1.1591251885369534e-05, |
|
"loss": 0.2404, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.2641402714932126, |
|
"grad_norm": 3.5841290950775146, |
|
"learning_rate": 1.1572398190045251e-05, |
|
"loss": 0.2417, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 1.2669683257918551, |
|
"grad_norm": 3.4801056385040283, |
|
"learning_rate": 1.1553544494720966e-05, |
|
"loss": 0.372, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.2697963800904977, |
|
"grad_norm": 4.876957416534424, |
|
"learning_rate": 1.1534690799396683e-05, |
|
"loss": 0.2947, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 1.2726244343891402, |
|
"grad_norm": 5.119454860687256, |
|
"learning_rate": 1.1515837104072397e-05, |
|
"loss": 0.3047, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.2754524886877827, |
|
"grad_norm": 4.234288215637207, |
|
"learning_rate": 1.1496983408748114e-05, |
|
"loss": 0.3204, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 1.2782805429864252, |
|
"grad_norm": 2.9957668781280518, |
|
"learning_rate": 1.147812971342383e-05, |
|
"loss": 0.2602, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.2811085972850678, |
|
"grad_norm": 2.333770990371704, |
|
"learning_rate": 1.1459276018099547e-05, |
|
"loss": 0.1949, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 1.2839366515837103, |
|
"grad_norm": 4.577385425567627, |
|
"learning_rate": 1.1440422322775264e-05, |
|
"loss": 0.3381, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.2867647058823528, |
|
"grad_norm": 4.607064723968506, |
|
"learning_rate": 1.142156862745098e-05, |
|
"loss": 0.268, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.2895927601809956, |
|
"grad_norm": 4.690824031829834, |
|
"learning_rate": 1.1402714932126697e-05, |
|
"loss": 0.2716, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.292420814479638, |
|
"grad_norm": 4.504805564880371, |
|
"learning_rate": 1.1383861236802414e-05, |
|
"loss": 0.2324, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 1.2952488687782806, |
|
"grad_norm": 5.126098155975342, |
|
"learning_rate": 1.136500754147813e-05, |
|
"loss": 0.3124, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.2980769230769231, |
|
"grad_norm": 4.265847206115723, |
|
"learning_rate": 1.1346153846153847e-05, |
|
"loss": 0.2522, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 1.3009049773755657, |
|
"grad_norm": 6.03093957901001, |
|
"learning_rate": 1.1327300150829564e-05, |
|
"loss": 0.1997, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.3037330316742082, |
|
"grad_norm": 5.404048442840576, |
|
"learning_rate": 1.130844645550528e-05, |
|
"loss": 0.2789, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 1.3065610859728507, |
|
"grad_norm": 3.481680393218994, |
|
"learning_rate": 1.1289592760180997e-05, |
|
"loss": 0.3238, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.3093891402714932, |
|
"grad_norm": 5.523119926452637, |
|
"learning_rate": 1.1270739064856713e-05, |
|
"loss": 0.2349, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 1.3122171945701357, |
|
"grad_norm": 8.244139671325684, |
|
"learning_rate": 1.125188536953243e-05, |
|
"loss": 0.2466, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.3150452488687783, |
|
"grad_norm": 4.985035419464111, |
|
"learning_rate": 1.1233031674208145e-05, |
|
"loss": 0.2634, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.3178733031674208, |
|
"grad_norm": 5.409512996673584, |
|
"learning_rate": 1.1214177978883862e-05, |
|
"loss": 0.2667, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.3207013574660633, |
|
"grad_norm": 3.6168251037597656, |
|
"learning_rate": 1.1195324283559578e-05, |
|
"loss": 0.2507, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 1.3235294117647058, |
|
"grad_norm": 4.121711730957031, |
|
"learning_rate": 1.1176470588235295e-05, |
|
"loss": 0.3009, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.3263574660633484, |
|
"grad_norm": 5.204695224761963, |
|
"learning_rate": 1.1157616892911011e-05, |
|
"loss": 0.3055, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 1.329185520361991, |
|
"grad_norm": 3.2947821617126465, |
|
"learning_rate": 1.1138763197586728e-05, |
|
"loss": 0.2476, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.3320135746606334, |
|
"grad_norm": 4.02095365524292, |
|
"learning_rate": 1.1119909502262445e-05, |
|
"loss": 0.2667, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 1.334841628959276, |
|
"grad_norm": 4.2972025871276855, |
|
"learning_rate": 1.1101055806938161e-05, |
|
"loss": 0.2281, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.3376696832579187, |
|
"grad_norm": 3.918163537979126, |
|
"learning_rate": 1.1082202111613878e-05, |
|
"loss": 0.2363, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 1.3404977375565612, |
|
"grad_norm": 4.1806206703186035, |
|
"learning_rate": 1.1063348416289595e-05, |
|
"loss": 0.2639, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.3433257918552037, |
|
"grad_norm": 2.949676990509033, |
|
"learning_rate": 1.1044494720965311e-05, |
|
"loss": 0.2625, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.3461538461538463, |
|
"grad_norm": 5.957220554351807, |
|
"learning_rate": 1.1025641025641028e-05, |
|
"loss": 0.2416, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.3489819004524888, |
|
"grad_norm": 12.7191162109375, |
|
"learning_rate": 1.1006787330316744e-05, |
|
"loss": 0.2752, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 1.3518099547511313, |
|
"grad_norm": 4.849847793579102, |
|
"learning_rate": 1.0987933634992461e-05, |
|
"loss": 0.295, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.3546380090497738, |
|
"grad_norm": 3.8798282146453857, |
|
"learning_rate": 1.0969079939668178e-05, |
|
"loss": 0.2744, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 1.3574660633484164, |
|
"grad_norm": 3.093064546585083, |
|
"learning_rate": 1.0950226244343893e-05, |
|
"loss": 0.332, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.3602941176470589, |
|
"grad_norm": 5.489840507507324, |
|
"learning_rate": 1.0931372549019607e-05, |
|
"loss": 0.3402, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 1.3631221719457014, |
|
"grad_norm": 3.8440115451812744, |
|
"learning_rate": 1.0912518853695324e-05, |
|
"loss": 0.2283, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.365950226244344, |
|
"grad_norm": 6.518070220947266, |
|
"learning_rate": 1.089366515837104e-05, |
|
"loss": 0.2859, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 1.3687782805429864, |
|
"grad_norm": 1.7918236255645752, |
|
"learning_rate": 1.0874811463046757e-05, |
|
"loss": 0.2879, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.371606334841629, |
|
"grad_norm": 5.9217424392700195, |
|
"learning_rate": 1.0855957767722474e-05, |
|
"loss": 0.3221, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.3744343891402715, |
|
"grad_norm": 4.664400100708008, |
|
"learning_rate": 1.083710407239819e-05, |
|
"loss": 0.3259, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.377262443438914, |
|
"grad_norm": 8.561564445495605, |
|
"learning_rate": 1.0818250377073907e-05, |
|
"loss": 0.2629, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 1.3800904977375565, |
|
"grad_norm": 5.151259422302246, |
|
"learning_rate": 1.0799396681749624e-05, |
|
"loss": 0.3046, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 1.382918552036199, |
|
"grad_norm": 5.32489538192749, |
|
"learning_rate": 1.078054298642534e-05, |
|
"loss": 0.2654, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 1.3857466063348416, |
|
"grad_norm": 4.306127071380615, |
|
"learning_rate": 1.0761689291101057e-05, |
|
"loss": 0.2518, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.3885746606334841, |
|
"grad_norm": 2.7166082859039307, |
|
"learning_rate": 1.0742835595776772e-05, |
|
"loss": 0.2278, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 1.3914027149321266, |
|
"grad_norm": 5.469938278198242, |
|
"learning_rate": 1.0723981900452489e-05, |
|
"loss": 0.2938, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 1.3942307692307692, |
|
"grad_norm": 5.4974260330200195, |
|
"learning_rate": 1.0705128205128205e-05, |
|
"loss": 0.2261, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 1.3970588235294117, |
|
"grad_norm": 2.7733094692230225, |
|
"learning_rate": 1.0686274509803922e-05, |
|
"loss": 0.3328, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 1.3998868778280542, |
|
"grad_norm": 3.532456398010254, |
|
"learning_rate": 1.0667420814479638e-05, |
|
"loss": 0.323, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.4027149321266967, |
|
"grad_norm": 5.4216227531433105, |
|
"learning_rate": 1.0648567119155355e-05, |
|
"loss": 0.3178, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 1.4055429864253393, |
|
"grad_norm": 5.761581897735596, |
|
"learning_rate": 1.0629713423831072e-05, |
|
"loss": 0.2391, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 1.4083710407239818, |
|
"grad_norm": 7.104434013366699, |
|
"learning_rate": 1.0610859728506788e-05, |
|
"loss": 0.2565, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 1.4111990950226243, |
|
"grad_norm": 5.054209232330322, |
|
"learning_rate": 1.0592006033182505e-05, |
|
"loss": 0.2805, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 1.4140271493212668, |
|
"grad_norm": 7.0140228271484375, |
|
"learning_rate": 1.0573152337858221e-05, |
|
"loss": 0.3011, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4140271493212668, |
|
"eval_accuracy": 0.881880369074133, |
|
"eval_loss": 0.3024204969406128, |
|
"eval_runtime": 126.4534, |
|
"eval_samples_per_second": 99.42, |
|
"eval_steps_per_second": 3.108, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.4168552036199096, |
|
"grad_norm": 5.970915794372559, |
|
"learning_rate": 1.0554298642533938e-05, |
|
"loss": 0.2841, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 1.419683257918552, |
|
"grad_norm": 4.6934943199157715, |
|
"learning_rate": 1.0535444947209655e-05, |
|
"loss": 0.2451, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 1.4225113122171946, |
|
"grad_norm": 5.1019978523254395, |
|
"learning_rate": 1.0516591251885371e-05, |
|
"loss": 0.2622, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 1.4253393665158371, |
|
"grad_norm": 3.4515976905822754, |
|
"learning_rate": 1.0497737556561088e-05, |
|
"loss": 0.3172, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 1.4281674208144797, |
|
"grad_norm": 4.001848220825195, |
|
"learning_rate": 1.0478883861236805e-05, |
|
"loss": 0.2949, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.4309954751131222, |
|
"grad_norm": 3.414452075958252, |
|
"learning_rate": 1.046003016591252e-05, |
|
"loss": 0.2345, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 1.4338235294117647, |
|
"grad_norm": 6.0561747550964355, |
|
"learning_rate": 1.0441176470588236e-05, |
|
"loss": 0.3239, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 1.4366515837104072, |
|
"grad_norm": 2.448591470718384, |
|
"learning_rate": 1.0422322775263953e-05, |
|
"loss": 0.2031, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 1.4394796380090498, |
|
"grad_norm": 5.490105152130127, |
|
"learning_rate": 1.040346907993967e-05, |
|
"loss": 0.2607, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 1.4423076923076923, |
|
"grad_norm": 2.7472801208496094, |
|
"learning_rate": 1.0384615384615386e-05, |
|
"loss": 0.2412, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.4451357466063348, |
|
"grad_norm": 4.4468770027160645, |
|
"learning_rate": 1.0365761689291103e-05, |
|
"loss": 0.288, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 1.4479638009049773, |
|
"grad_norm": 1.942518949508667, |
|
"learning_rate": 1.0346907993966819e-05, |
|
"loss": 0.2592, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 1.4507918552036199, |
|
"grad_norm": 4.880716800689697, |
|
"learning_rate": 1.0328054298642536e-05, |
|
"loss": 0.2454, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 1.4536199095022624, |
|
"grad_norm": 3.7106387615203857, |
|
"learning_rate": 1.030920060331825e-05, |
|
"loss": 0.2863, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 1.456447963800905, |
|
"grad_norm": 5.332839488983154, |
|
"learning_rate": 1.0290346907993967e-05, |
|
"loss": 0.3325, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.4592760180995474, |
|
"grad_norm": 4.884565353393555, |
|
"learning_rate": 1.0271493212669684e-05, |
|
"loss": 0.2284, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 1.4621040723981902, |
|
"grad_norm": 4.775869846343994, |
|
"learning_rate": 1.0252639517345399e-05, |
|
"loss": 0.1897, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 1.4649321266968327, |
|
"grad_norm": 2.5493810176849365, |
|
"learning_rate": 1.0233785822021115e-05, |
|
"loss": 0.2919, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.4677601809954752, |
|
"grad_norm": 3.7652482986450195, |
|
"learning_rate": 1.0214932126696832e-05, |
|
"loss": 0.2795, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 4.398680686950684, |
|
"learning_rate": 1.0196078431372549e-05, |
|
"loss": 0.2685, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.4734162895927603, |
|
"grad_norm": 2.400367498397827, |
|
"learning_rate": 1.0177224736048265e-05, |
|
"loss": 0.251, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 1.4762443438914028, |
|
"grad_norm": 3.4146950244903564, |
|
"learning_rate": 1.0158371040723982e-05, |
|
"loss": 0.2115, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 1.4790723981900453, |
|
"grad_norm": 4.488588809967041, |
|
"learning_rate": 1.0139517345399699e-05, |
|
"loss": 0.2405, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 1.4819004524886878, |
|
"grad_norm": 6.304666996002197, |
|
"learning_rate": 1.0120663650075415e-05, |
|
"loss": 0.3394, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 1.4847285067873304, |
|
"grad_norm": 2.8380801677703857, |
|
"learning_rate": 1.0101809954751132e-05, |
|
"loss": 0.2637, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.487556561085973, |
|
"grad_norm": 4.873356819152832, |
|
"learning_rate": 1.0082956259426848e-05, |
|
"loss": 0.2652, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 1.4903846153846154, |
|
"grad_norm": 5.6608123779296875, |
|
"learning_rate": 1.0064102564102565e-05, |
|
"loss": 0.2961, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 1.493212669683258, |
|
"grad_norm": 4.332230567932129, |
|
"learning_rate": 1.0045248868778282e-05, |
|
"loss": 0.2705, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 1.4960407239819005, |
|
"grad_norm": 5.802159309387207, |
|
"learning_rate": 1.0026395173453998e-05, |
|
"loss": 0.2599, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 1.498868778280543, |
|
"grad_norm": 3.019793748855591, |
|
"learning_rate": 1.0007541478129715e-05, |
|
"loss": 0.2623, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.5016968325791855, |
|
"grad_norm": 4.762251377105713, |
|
"learning_rate": 9.988687782805431e-06, |
|
"loss": 0.2585, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 1.504524886877828, |
|
"grad_norm": 6.202815055847168, |
|
"learning_rate": 9.969834087481146e-06, |
|
"loss": 0.2778, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 1.5073529411764706, |
|
"grad_norm": 3.872309684753418, |
|
"learning_rate": 9.950980392156863e-06, |
|
"loss": 0.3034, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 1.510180995475113, |
|
"grad_norm": 4.060298919677734, |
|
"learning_rate": 9.93212669683258e-06, |
|
"loss": 0.2884, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 1.5130090497737556, |
|
"grad_norm": 2.0391085147857666, |
|
"learning_rate": 9.913273001508296e-06, |
|
"loss": 0.2867, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.5158371040723981, |
|
"grad_norm": 4.735014915466309, |
|
"learning_rate": 9.894419306184013e-06, |
|
"loss": 0.2797, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 1.5186651583710407, |
|
"grad_norm": 4.086658000946045, |
|
"learning_rate": 9.87556561085973e-06, |
|
"loss": 0.2841, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 1.5214932126696832, |
|
"grad_norm": 4.3362040519714355, |
|
"learning_rate": 9.856711915535446e-06, |
|
"loss": 0.2566, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 1.5243212669683257, |
|
"grad_norm": 3.9439034461975098, |
|
"learning_rate": 9.837858220211161e-06, |
|
"loss": 0.2506, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 1.5271493212669682, |
|
"grad_norm": 4.754290580749512, |
|
"learning_rate": 9.819004524886878e-06, |
|
"loss": 0.2271, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.5299773755656108, |
|
"grad_norm": 4.914488792419434, |
|
"learning_rate": 9.800150829562594e-06, |
|
"loss": 0.3032, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 1.5328054298642533, |
|
"grad_norm": 3.0046920776367188, |
|
"learning_rate": 9.781297134238311e-06, |
|
"loss": 0.2123, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 1.5356334841628958, |
|
"grad_norm": 4.0427985191345215, |
|
"learning_rate": 9.762443438914027e-06, |
|
"loss": 0.2621, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 6.442467212677002, |
|
"learning_rate": 9.743589743589744e-06, |
|
"loss": 0.247, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 1.5412895927601808, |
|
"grad_norm": 3.7217085361480713, |
|
"learning_rate": 9.72473604826546e-06, |
|
"loss": 0.3037, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.5441176470588234, |
|
"grad_norm": 7.558680534362793, |
|
"learning_rate": 9.705882352941177e-06, |
|
"loss": 0.3076, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 1.5469457013574661, |
|
"grad_norm": 3.152740240097046, |
|
"learning_rate": 9.687028657616894e-06, |
|
"loss": 0.2778, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 1.5497737556561086, |
|
"grad_norm": 3.996135711669922, |
|
"learning_rate": 9.66817496229261e-06, |
|
"loss": 0.3243, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 1.5526018099547512, |
|
"grad_norm": 3.837599039077759, |
|
"learning_rate": 9.649321266968327e-06, |
|
"loss": 0.2284, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 1.5554298642533937, |
|
"grad_norm": 4.957329750061035, |
|
"learning_rate": 9.630467571644044e-06, |
|
"loss": 0.2585, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.5582579185520362, |
|
"grad_norm": 4.0857133865356445, |
|
"learning_rate": 9.61161387631976e-06, |
|
"loss": 0.2947, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 1.5610859728506787, |
|
"grad_norm": 5.3217902183532715, |
|
"learning_rate": 9.592760180995477e-06, |
|
"loss": 0.3083, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 1.5639140271493213, |
|
"grad_norm": 6.3014326095581055, |
|
"learning_rate": 9.573906485671192e-06, |
|
"loss": 0.2514, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 1.5667420814479638, |
|
"grad_norm": 5.7632670402526855, |
|
"learning_rate": 9.555052790346909e-06, |
|
"loss": 0.2889, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 1.5695701357466063, |
|
"grad_norm": 3.6774861812591553, |
|
"learning_rate": 9.536199095022625e-06, |
|
"loss": 0.2933, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.5723981900452488, |
|
"grad_norm": 2.207911968231201, |
|
"learning_rate": 9.517345399698342e-06, |
|
"loss": 0.2594, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 1.5752262443438914, |
|
"grad_norm": 4.789866924285889, |
|
"learning_rate": 9.498491704374058e-06, |
|
"loss": 0.3103, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 1.5780542986425339, |
|
"grad_norm": 5.097392559051514, |
|
"learning_rate": 9.479638009049773e-06, |
|
"loss": 0.2757, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 1.5808823529411766, |
|
"grad_norm": 4.389581203460693, |
|
"learning_rate": 9.46078431372549e-06, |
|
"loss": 0.3006, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 1.5837104072398192, |
|
"grad_norm": 6.803945541381836, |
|
"learning_rate": 9.441930618401207e-06, |
|
"loss": 0.2912, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.5865384615384617, |
|
"grad_norm": 2.0034751892089844, |
|
"learning_rate": 9.423076923076923e-06, |
|
"loss": 0.2173, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 1.5893665158371042, |
|
"grad_norm": 3.0462636947631836, |
|
"learning_rate": 9.40422322775264e-06, |
|
"loss": 0.3155, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 1.5921945701357467, |
|
"grad_norm": 6.887737274169922, |
|
"learning_rate": 9.385369532428356e-06, |
|
"loss": 0.238, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 1.5950226244343892, |
|
"grad_norm": 7.331830978393555, |
|
"learning_rate": 9.366515837104073e-06, |
|
"loss": 0.3028, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 1.5978506787330318, |
|
"grad_norm": 3.274845600128174, |
|
"learning_rate": 9.34766214177979e-06, |
|
"loss": 0.2585, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 1.6006787330316743, |
|
"grad_norm": 6.801854133605957, |
|
"learning_rate": 9.328808446455506e-06, |
|
"loss": 0.256, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 1.6035067873303168, |
|
"grad_norm": 7.7837982177734375, |
|
"learning_rate": 9.309954751131223e-06, |
|
"loss": 0.2289, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 1.6063348416289593, |
|
"grad_norm": 8.501007080078125, |
|
"learning_rate": 9.29110105580694e-06, |
|
"loss": 0.2766, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 1.6091628959276019, |
|
"grad_norm": 4.016129493713379, |
|
"learning_rate": 9.272247360482656e-06, |
|
"loss": 0.2611, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 1.6119909502262444, |
|
"grad_norm": 5.062587738037109, |
|
"learning_rate": 9.253393665158373e-06, |
|
"loss": 0.2142, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.614819004524887, |
|
"grad_norm": 2.5895862579345703, |
|
"learning_rate": 9.23453996983409e-06, |
|
"loss": 0.2681, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 1.6176470588235294, |
|
"grad_norm": 5.066253662109375, |
|
"learning_rate": 9.215686274509804e-06, |
|
"loss": 0.2604, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 1.620475113122172, |
|
"grad_norm": 5.256166934967041, |
|
"learning_rate": 9.196832579185521e-06, |
|
"loss": 0.3009, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 1.6233031674208145, |
|
"grad_norm": 4.829041004180908, |
|
"learning_rate": 9.177978883861237e-06, |
|
"loss": 0.2614, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 1.626131221719457, |
|
"grad_norm": 4.902761459350586, |
|
"learning_rate": 9.159125188536954e-06, |
|
"loss": 0.2348, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 1.6289592760180995, |
|
"grad_norm": 5.516357421875, |
|
"learning_rate": 9.14027149321267e-06, |
|
"loss": 0.328, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 1.631787330316742, |
|
"grad_norm": 3.2983596324920654, |
|
"learning_rate": 9.121417797888387e-06, |
|
"loss": 0.1956, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 1.6346153846153846, |
|
"grad_norm": 7.548886775970459, |
|
"learning_rate": 9.102564102564104e-06, |
|
"loss": 0.2712, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 1.637443438914027, |
|
"grad_norm": 4.081298828125, |
|
"learning_rate": 9.083710407239819e-06, |
|
"loss": 0.2726, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 1.6402714932126696, |
|
"grad_norm": 6.161011695861816, |
|
"learning_rate": 9.064856711915535e-06, |
|
"loss": 0.2995, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.6430995475113122, |
|
"grad_norm": 4.223090171813965, |
|
"learning_rate": 9.046003016591252e-06, |
|
"loss": 0.3111, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 1.6459276018099547, |
|
"grad_norm": 7.8988728523254395, |
|
"learning_rate": 9.027149321266969e-06, |
|
"loss": 0.2875, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 1.6487556561085972, |
|
"grad_norm": 2.9701428413391113, |
|
"learning_rate": 9.008295625942685e-06, |
|
"loss": 0.2549, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 1.6515837104072397, |
|
"grad_norm": 6.37022066116333, |
|
"learning_rate": 8.989441930618402e-06, |
|
"loss": 0.3591, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 1.6544117647058822, |
|
"grad_norm": 4.708193302154541, |
|
"learning_rate": 8.970588235294119e-06, |
|
"loss": 0.2561, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 1.6572398190045248, |
|
"grad_norm": 5.106235027313232, |
|
"learning_rate": 8.951734539969835e-06, |
|
"loss": 0.2546, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 1.6600678733031673, |
|
"grad_norm": 4.135291576385498, |
|
"learning_rate": 8.932880844645552e-06, |
|
"loss": 0.2841, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 1.6628959276018098, |
|
"grad_norm": 5.418251991271973, |
|
"learning_rate": 8.914027149321268e-06, |
|
"loss": 0.2606, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 1.6657239819004523, |
|
"grad_norm": 7.133711338043213, |
|
"learning_rate": 8.895173453996983e-06, |
|
"loss": 0.3058, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 1.6685520361990949, |
|
"grad_norm": 3.556772470474243, |
|
"learning_rate": 8.8763197586727e-06, |
|
"loss": 0.271, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.6713800904977374, |
|
"grad_norm": 4.334698677062988, |
|
"learning_rate": 8.857466063348417e-06, |
|
"loss": 0.2695, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 1.6742081447963801, |
|
"grad_norm": 5.072098731994629, |
|
"learning_rate": 8.838612368024133e-06, |
|
"loss": 0.2845, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 1.6770361990950227, |
|
"grad_norm": 5.321040630340576, |
|
"learning_rate": 8.81975867269985e-06, |
|
"loss": 0.3065, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 1.6798642533936652, |
|
"grad_norm": 3.292698860168457, |
|
"learning_rate": 8.800904977375566e-06, |
|
"loss": 0.2547, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 1.6826923076923077, |
|
"grad_norm": 8.568231582641602, |
|
"learning_rate": 8.782051282051283e-06, |
|
"loss": 0.2547, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 1.6855203619909502, |
|
"grad_norm": 5.787846088409424, |
|
"learning_rate": 8.763197586727e-06, |
|
"loss": 0.3768, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 1.6883484162895928, |
|
"grad_norm": 4.789765357971191, |
|
"learning_rate": 8.744343891402716e-06, |
|
"loss": 0.2252, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 1.6911764705882353, |
|
"grad_norm": 6.947218418121338, |
|
"learning_rate": 8.725490196078433e-06, |
|
"loss": 0.2309, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 1.6940045248868778, |
|
"grad_norm": 3.733675956726074, |
|
"learning_rate": 8.70663650075415e-06, |
|
"loss": 0.2422, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 1.6968325791855203, |
|
"grad_norm": 4.800724506378174, |
|
"learning_rate": 8.687782805429864e-06, |
|
"loss": 0.2322, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.6968325791855203, |
|
"eval_accuracy": 0.8820394527521477, |
|
"eval_loss": 0.29341939091682434, |
|
"eval_runtime": 126.3457, |
|
"eval_samples_per_second": 99.505, |
|
"eval_steps_per_second": 3.111, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.6996606334841629, |
|
"grad_norm": 2.899115800857544, |
|
"learning_rate": 8.668929110105581e-06, |
|
"loss": 0.2858, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 1.7024886877828054, |
|
"grad_norm": 5.119002819061279, |
|
"learning_rate": 8.650075414781298e-06, |
|
"loss": 0.2373, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 1.7053167420814481, |
|
"grad_norm": 4.328557968139648, |
|
"learning_rate": 8.631221719457014e-06, |
|
"loss": 0.2857, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 1.7081447963800906, |
|
"grad_norm": 6.154530048370361, |
|
"learning_rate": 8.612368024132731e-06, |
|
"loss": 0.2563, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 1.7109728506787332, |
|
"grad_norm": 2.4150142669677734, |
|
"learning_rate": 8.593514328808446e-06, |
|
"loss": 0.2766, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 1.7138009049773757, |
|
"grad_norm": 5.834397315979004, |
|
"learning_rate": 8.574660633484162e-06, |
|
"loss": 0.2804, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 1.7166289592760182, |
|
"grad_norm": 5.142675876617432, |
|
"learning_rate": 8.555806938159879e-06, |
|
"loss": 0.2573, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 1.7194570135746607, |
|
"grad_norm": 4.238577842712402, |
|
"learning_rate": 8.536953242835596e-06, |
|
"loss": 0.235, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 1.7222850678733033, |
|
"grad_norm": 4.491209506988525, |
|
"learning_rate": 8.518099547511312e-06, |
|
"loss": 0.2605, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 1.7251131221719458, |
|
"grad_norm": 5.393953323364258, |
|
"learning_rate": 8.499245852187029e-06, |
|
"loss": 0.2804, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.7279411764705883, |
|
"grad_norm": 4.455014228820801, |
|
"learning_rate": 8.480392156862745e-06, |
|
"loss": 0.2453, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 1.7307692307692308, |
|
"grad_norm": 4.781386375427246, |
|
"learning_rate": 8.461538461538462e-06, |
|
"loss": 0.2194, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 1.7335972850678734, |
|
"grad_norm": 5.215591907501221, |
|
"learning_rate": 8.442684766214179e-06, |
|
"loss": 0.2602, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 1.7364253393665159, |
|
"grad_norm": 5.542301654815674, |
|
"learning_rate": 8.423831070889895e-06, |
|
"loss": 0.3245, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 1.7392533936651584, |
|
"grad_norm": 2.144392967224121, |
|
"learning_rate": 8.404977375565612e-06, |
|
"loss": 0.2445, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.742081447963801, |
|
"grad_norm": 3.160285711288452, |
|
"learning_rate": 8.386123680241329e-06, |
|
"loss": 0.2702, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 1.7449095022624435, |
|
"grad_norm": 4.129340171813965, |
|
"learning_rate": 8.367269984917045e-06, |
|
"loss": 0.2924, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 1.747737556561086, |
|
"grad_norm": 4.408333778381348, |
|
"learning_rate": 8.348416289592762e-06, |
|
"loss": 0.2364, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 1.7505656108597285, |
|
"grad_norm": 5.696101188659668, |
|
"learning_rate": 8.329562594268478e-06, |
|
"loss": 0.2445, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 1.753393665158371, |
|
"grad_norm": 4.723424434661865, |
|
"learning_rate": 8.310708898944195e-06, |
|
"loss": 0.2284, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.7562217194570136, |
|
"grad_norm": 4.272291660308838, |
|
"learning_rate": 8.29185520361991e-06, |
|
"loss": 0.3189, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 1.759049773755656, |
|
"grad_norm": 4.042122840881348, |
|
"learning_rate": 8.273001508295627e-06, |
|
"loss": 0.2649, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 1.7618778280542986, |
|
"grad_norm": 1.9126514196395874, |
|
"learning_rate": 8.254147812971343e-06, |
|
"loss": 0.255, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 11.250100135803223, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 0.302, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 1.7675339366515836, |
|
"grad_norm": 4.978902816772461, |
|
"learning_rate": 8.216440422322776e-06, |
|
"loss": 0.3068, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.7703619909502262, |
|
"grad_norm": 4.657087802886963, |
|
"learning_rate": 8.197586726998491e-06, |
|
"loss": 0.2641, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 1.7731900452488687, |
|
"grad_norm": 4.440770626068115, |
|
"learning_rate": 8.178733031674208e-06, |
|
"loss": 0.2831, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 1.7760180995475112, |
|
"grad_norm": 2.723531484603882, |
|
"learning_rate": 8.159879336349925e-06, |
|
"loss": 0.2247, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 1.7788461538461537, |
|
"grad_norm": 4.28981351852417, |
|
"learning_rate": 8.141025641025641e-06, |
|
"loss": 0.2403, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 1.7816742081447963, |
|
"grad_norm": 4.748565673828125, |
|
"learning_rate": 8.122171945701358e-06, |
|
"loss": 0.2126, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.7845022624434388, |
|
"grad_norm": 5.226318359375, |
|
"learning_rate": 8.103318250377074e-06, |
|
"loss": 0.3272, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 1.7873303167420813, |
|
"grad_norm": 2.937812089920044, |
|
"learning_rate": 8.084464555052791e-06, |
|
"loss": 0.2276, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 1.7901583710407238, |
|
"grad_norm": 3.215853452682495, |
|
"learning_rate": 8.065610859728508e-06, |
|
"loss": 0.2406, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 1.7929864253393664, |
|
"grad_norm": 6.499160289764404, |
|
"learning_rate": 8.046757164404224e-06, |
|
"loss": 0.2915, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 1.7958144796380089, |
|
"grad_norm": 3.940803289413452, |
|
"learning_rate": 8.027903469079941e-06, |
|
"loss": 0.276, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.7986425339366516, |
|
"grad_norm": 2.177950859069824, |
|
"learning_rate": 8.009049773755657e-06, |
|
"loss": 0.2558, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 1.8014705882352942, |
|
"grad_norm": 7.705915451049805, |
|
"learning_rate": 7.990196078431374e-06, |
|
"loss": 0.2799, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 1.8042986425339367, |
|
"grad_norm": 5.586729526519775, |
|
"learning_rate": 7.97134238310709e-06, |
|
"loss": 0.224, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 1.8071266968325792, |
|
"grad_norm": 2.9311821460723877, |
|
"learning_rate": 7.952488687782806e-06, |
|
"loss": 0.2316, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 1.8099547511312217, |
|
"grad_norm": 3.5633130073547363, |
|
"learning_rate": 7.933634992458522e-06, |
|
"loss": 0.2298, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.8127828054298643, |
|
"grad_norm": 3.4238994121551514, |
|
"learning_rate": 7.914781297134239e-06, |
|
"loss": 0.2647, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 1.8156108597285068, |
|
"grad_norm": 9.544416427612305, |
|
"learning_rate": 7.895927601809955e-06, |
|
"loss": 0.3275, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 1.8184389140271493, |
|
"grad_norm": 2.8148701190948486, |
|
"learning_rate": 7.877073906485672e-06, |
|
"loss": 0.2131, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 1.8212669683257918, |
|
"grad_norm": 5.6752777099609375, |
|
"learning_rate": 7.858220211161389e-06, |
|
"loss": 0.3065, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 1.8240950226244343, |
|
"grad_norm": 6.207758903503418, |
|
"learning_rate": 7.839366515837105e-06, |
|
"loss": 0.3369, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.8269230769230769, |
|
"grad_norm": 2.1755306720733643, |
|
"learning_rate": 7.820512820512822e-06, |
|
"loss": 0.24, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 1.8297511312217196, |
|
"grad_norm": 4.380761623382568, |
|
"learning_rate": 7.801659125188537e-06, |
|
"loss": 0.2621, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 1.8325791855203621, |
|
"grad_norm": 7.944891452789307, |
|
"learning_rate": 7.782805429864253e-06, |
|
"loss": 0.2421, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 1.8354072398190047, |
|
"grad_norm": 6.696594715118408, |
|
"learning_rate": 7.76395173453997e-06, |
|
"loss": 0.2179, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 1.8382352941176472, |
|
"grad_norm": 5.534007549285889, |
|
"learning_rate": 7.745098039215687e-06, |
|
"loss": 0.2465, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.8410633484162897, |
|
"grad_norm": 4.6053290367126465, |
|
"learning_rate": 7.726244343891403e-06, |
|
"loss": 0.3311, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 1.8438914027149322, |
|
"grad_norm": 3.2913260459899902, |
|
"learning_rate": 7.70739064856712e-06, |
|
"loss": 0.2535, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 1.8467194570135748, |
|
"grad_norm": 5.70173454284668, |
|
"learning_rate": 7.688536953242837e-06, |
|
"loss": 0.2283, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 1.8495475113122173, |
|
"grad_norm": 6.683012962341309, |
|
"learning_rate": 7.669683257918553e-06, |
|
"loss": 0.2293, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 1.8523755656108598, |
|
"grad_norm": 4.2895612716674805, |
|
"learning_rate": 7.650829562594268e-06, |
|
"loss": 0.2013, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.8552036199095023, |
|
"grad_norm": 2.8891239166259766, |
|
"learning_rate": 7.631975867269985e-06, |
|
"loss": 0.2482, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 1.8580316742081449, |
|
"grad_norm": 5.462761402130127, |
|
"learning_rate": 7.613122171945701e-06, |
|
"loss": 0.3063, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 1.8608597285067874, |
|
"grad_norm": 4.3543806076049805, |
|
"learning_rate": 7.594268476621418e-06, |
|
"loss": 0.2519, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 1.86368778280543, |
|
"grad_norm": 5.1229681968688965, |
|
"learning_rate": 7.5754147812971346e-06, |
|
"loss": 0.2968, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 1.8665158371040724, |
|
"grad_norm": 1.8585267066955566, |
|
"learning_rate": 7.556561085972851e-06, |
|
"loss": 0.2208, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.869343891402715, |
|
"grad_norm": 4.255302429199219, |
|
"learning_rate": 7.537707390648568e-06, |
|
"loss": 0.2968, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 1.8721719457013575, |
|
"grad_norm": 4.815881729125977, |
|
"learning_rate": 7.518853695324284e-06, |
|
"loss": 0.3433, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 6.812479496002197, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.311, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 1.8778280542986425, |
|
"grad_norm": 3.9199917316436768, |
|
"learning_rate": 7.481146304675717e-06, |
|
"loss": 0.2767, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 1.880656108597285, |
|
"grad_norm": 4.117010593414307, |
|
"learning_rate": 7.462292609351433e-06, |
|
"loss": 0.2858, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.8834841628959276, |
|
"grad_norm": 4.636374473571777, |
|
"learning_rate": 7.44343891402715e-06, |
|
"loss": 0.2043, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 1.88631221719457, |
|
"grad_norm": 5.478713512420654, |
|
"learning_rate": 7.424585218702867e-06, |
|
"loss": 0.288, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 1.8891402714932126, |
|
"grad_norm": 4.690084457397461, |
|
"learning_rate": 7.405731523378583e-06, |
|
"loss": 0.2651, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 1.8919683257918551, |
|
"grad_norm": 2.4495575428009033, |
|
"learning_rate": 7.3868778280543e-06, |
|
"loss": 0.2651, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 1.8947963800904977, |
|
"grad_norm": 5.4684672355651855, |
|
"learning_rate": 7.3680241327300165e-06, |
|
"loss": 0.2834, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.8976244343891402, |
|
"grad_norm": 1.9919039011001587, |
|
"learning_rate": 7.349170437405732e-06, |
|
"loss": 0.2021, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 1.9004524886877827, |
|
"grad_norm": 4.975834846496582, |
|
"learning_rate": 7.330316742081448e-06, |
|
"loss": 0.3194, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 1.9032805429864252, |
|
"grad_norm": 4.014176368713379, |
|
"learning_rate": 7.311463046757165e-06, |
|
"loss": 0.2251, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 1.9061085972850678, |
|
"grad_norm": 7.0189409255981445, |
|
"learning_rate": 7.292609351432881e-06, |
|
"loss": 0.3062, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 1.9089366515837103, |
|
"grad_norm": 7.0651350021362305, |
|
"learning_rate": 7.273755656108598e-06, |
|
"loss": 0.2488, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 7.110829830169678, |
|
"learning_rate": 7.2549019607843145e-06, |
|
"loss": 0.2226, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 1.9145927601809953, |
|
"grad_norm": 8.122304916381836, |
|
"learning_rate": 7.23604826546003e-06, |
|
"loss": 0.2236, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 1.9174208144796379, |
|
"grad_norm": 4.817609786987305, |
|
"learning_rate": 7.217194570135747e-06, |
|
"loss": 0.2935, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 1.9202488687782804, |
|
"grad_norm": 3.6452667713165283, |
|
"learning_rate": 7.1983408748114635e-06, |
|
"loss": 0.2711, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 5.04451847076416, |
|
"learning_rate": 7.17948717948718e-06, |
|
"loss": 0.3383, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.9259049773755657, |
|
"grad_norm": 3.0769617557525635, |
|
"learning_rate": 7.160633484162897e-06, |
|
"loss": 0.2481, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 1.9287330316742082, |
|
"grad_norm": 2.4666669368743896, |
|
"learning_rate": 7.141779788838613e-06, |
|
"loss": 0.2713, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 1.9315610859728507, |
|
"grad_norm": 6.22195291519165, |
|
"learning_rate": 7.12292609351433e-06, |
|
"loss": 0.253, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 1.9343891402714932, |
|
"grad_norm": 5.916505336761475, |
|
"learning_rate": 7.104072398190046e-06, |
|
"loss": 0.3023, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 1.9372171945701357, |
|
"grad_norm": 3.696983575820923, |
|
"learning_rate": 7.085218702865762e-06, |
|
"loss": 0.3176, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.9400452488687783, |
|
"grad_norm": 4.350560665130615, |
|
"learning_rate": 7.066365007541479e-06, |
|
"loss": 0.2488, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 1.9428733031674208, |
|
"grad_norm": 4.9616498947143555, |
|
"learning_rate": 7.047511312217196e-06, |
|
"loss": 0.2901, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 1.9457013574660633, |
|
"grad_norm": 2.2549595832824707, |
|
"learning_rate": 7.028657616892911e-06, |
|
"loss": 0.2526, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 1.9485294117647058, |
|
"grad_norm": 3.205310821533203, |
|
"learning_rate": 7.009803921568628e-06, |
|
"loss": 0.2819, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 1.9513574660633484, |
|
"grad_norm": 5.102742671966553, |
|
"learning_rate": 6.990950226244344e-06, |
|
"loss": 0.2573, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.9541855203619911, |
|
"grad_norm": 2.78604793548584, |
|
"learning_rate": 6.97209653092006e-06, |
|
"loss": 0.1702, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 1.9570135746606336, |
|
"grad_norm": 3.8111801147460938, |
|
"learning_rate": 6.953242835595777e-06, |
|
"loss": 0.2963, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 1.9598416289592762, |
|
"grad_norm": 4.204692363739014, |
|
"learning_rate": 6.934389140271494e-06, |
|
"loss": 0.2989, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 1.9626696832579187, |
|
"grad_norm": 3.3682045936584473, |
|
"learning_rate": 6.91553544494721e-06, |
|
"loss": 0.2744, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 1.9654977375565612, |
|
"grad_norm": 5.661670207977295, |
|
"learning_rate": 6.896681749622927e-06, |
|
"loss": 0.27, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.9683257918552037, |
|
"grad_norm": 3.925750494003296, |
|
"learning_rate": 6.8778280542986434e-06, |
|
"loss": 0.2711, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 1.9711538461538463, |
|
"grad_norm": 5.467376232147217, |
|
"learning_rate": 6.858974358974359e-06, |
|
"loss": 0.3182, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 1.9739819004524888, |
|
"grad_norm": 7.46327543258667, |
|
"learning_rate": 6.840120663650076e-06, |
|
"loss": 0.336, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 1.9768099547511313, |
|
"grad_norm": 4.464349269866943, |
|
"learning_rate": 6.8212669683257924e-06, |
|
"loss": 0.333, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 1.9796380090497738, |
|
"grad_norm": 5.0763421058654785, |
|
"learning_rate": 6.802413273001509e-06, |
|
"loss": 0.2332, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.9796380090497738, |
|
"eval_accuracy": 0.8868119630925867, |
|
"eval_loss": 0.2794936001300812, |
|
"eval_runtime": 126.4211, |
|
"eval_samples_per_second": 99.445, |
|
"eval_steps_per_second": 3.109, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.9824660633484164, |
|
"grad_norm": 4.514822483062744, |
|
"learning_rate": 6.783559577677226e-06, |
|
"loss": 0.3259, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 1.9852941176470589, |
|
"grad_norm": 3.9309160709381104, |
|
"learning_rate": 6.764705882352942e-06, |
|
"loss": 0.2671, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 1.9881221719457014, |
|
"grad_norm": 3.7512924671173096, |
|
"learning_rate": 6.745852187028659e-06, |
|
"loss": 0.3025, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 1.990950226244344, |
|
"grad_norm": 5.162522792816162, |
|
"learning_rate": 6.7269984917043755e-06, |
|
"loss": 0.2556, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 1.9937782805429864, |
|
"grad_norm": 5.968090534210205, |
|
"learning_rate": 6.7081447963800904e-06, |
|
"loss": 0.245, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.996606334841629, |
|
"grad_norm": 7.264348983764648, |
|
"learning_rate": 6.689291101055807e-06, |
|
"loss": 0.274, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 1.9994343891402715, |
|
"grad_norm": 4.840837478637695, |
|
"learning_rate": 6.670437405731524e-06, |
|
"loss": 0.2381, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 2.002262443438914, |
|
"grad_norm": 3.3212857246398926, |
|
"learning_rate": 6.65158371040724e-06, |
|
"loss": 0.2576, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 2.0050904977375565, |
|
"grad_norm": 6.3086419105529785, |
|
"learning_rate": 6.632730015082957e-06, |
|
"loss": 0.2471, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 2.007918552036199, |
|
"grad_norm": 2.5110299587249756, |
|
"learning_rate": 6.613876319758673e-06, |
|
"loss": 0.2414, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.0107466063348416, |
|
"grad_norm": 4.115811824798584, |
|
"learning_rate": 6.595022624434389e-06, |
|
"loss": 0.1715, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 2.013574660633484, |
|
"grad_norm": 5.045820236206055, |
|
"learning_rate": 6.576168929110106e-06, |
|
"loss": 0.2494, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 2.0164027149321266, |
|
"grad_norm": 4.6321845054626465, |
|
"learning_rate": 6.5573152337858225e-06, |
|
"loss": 0.222, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 2.019230769230769, |
|
"grad_norm": 5.135430335998535, |
|
"learning_rate": 6.538461538461539e-06, |
|
"loss": 0.2206, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 2.0220588235294117, |
|
"grad_norm": 4.786893367767334, |
|
"learning_rate": 6.519607843137256e-06, |
|
"loss": 0.229, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 2.024886877828054, |
|
"grad_norm": 3.568856716156006, |
|
"learning_rate": 6.500754147812972e-06, |
|
"loss": 0.2235, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 2.0277149321266967, |
|
"grad_norm": 6.938755989074707, |
|
"learning_rate": 6.481900452488689e-06, |
|
"loss": 0.23, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 2.0305429864253393, |
|
"grad_norm": 4.014111042022705, |
|
"learning_rate": 6.463046757164405e-06, |
|
"loss": 0.2076, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 2.033371040723982, |
|
"grad_norm": 5.143094062805176, |
|
"learning_rate": 6.444193061840121e-06, |
|
"loss": 0.3276, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 2.0361990950226243, |
|
"grad_norm": 4.8052191734313965, |
|
"learning_rate": 6.425339366515838e-06, |
|
"loss": 0.2223, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.039027149321267, |
|
"grad_norm": 6.07175874710083, |
|
"learning_rate": 6.406485671191555e-06, |
|
"loss": 0.2514, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 2.0418552036199094, |
|
"grad_norm": 3.0855891704559326, |
|
"learning_rate": 6.38763197586727e-06, |
|
"loss": 0.2043, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 2.044683257918552, |
|
"grad_norm": 5.760570049285889, |
|
"learning_rate": 6.368778280542986e-06, |
|
"loss": 0.2051, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 2.0475113122171944, |
|
"grad_norm": 5.127667427062988, |
|
"learning_rate": 6.349924585218703e-06, |
|
"loss": 0.2141, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 2.050339366515837, |
|
"grad_norm": 2.886842727661133, |
|
"learning_rate": 6.331070889894419e-06, |
|
"loss": 0.1705, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 2.0531674208144794, |
|
"grad_norm": 5.108696937561035, |
|
"learning_rate": 6.312217194570136e-06, |
|
"loss": 0.2737, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 2.055995475113122, |
|
"grad_norm": 7.453789234161377, |
|
"learning_rate": 6.293363499245853e-06, |
|
"loss": 0.288, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 2.0588235294117645, |
|
"grad_norm": 3.700695514678955, |
|
"learning_rate": 6.274509803921569e-06, |
|
"loss": 0.2087, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 2.0616515837104075, |
|
"grad_norm": 3.475170612335205, |
|
"learning_rate": 6.255656108597286e-06, |
|
"loss": 0.182, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 2.06447963800905, |
|
"grad_norm": 3.636042833328247, |
|
"learning_rate": 6.2368024132730024e-06, |
|
"loss": 0.1856, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.0673076923076925, |
|
"grad_norm": 4.326310157775879, |
|
"learning_rate": 6.217948717948718e-06, |
|
"loss": 0.2071, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 2.070135746606335, |
|
"grad_norm": 4.5239105224609375, |
|
"learning_rate": 6.199095022624435e-06, |
|
"loss": 0.2045, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 2.0729638009049776, |
|
"grad_norm": 5.962629318237305, |
|
"learning_rate": 6.1802413273001514e-06, |
|
"loss": 0.2236, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 2.07579185520362, |
|
"grad_norm": 6.830577373504639, |
|
"learning_rate": 6.161387631975868e-06, |
|
"loss": 0.2435, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 2.0786199095022626, |
|
"grad_norm": 6.650877952575684, |
|
"learning_rate": 6.142533936651585e-06, |
|
"loss": 0.2273, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 2.081447963800905, |
|
"grad_norm": 9.387392044067383, |
|
"learning_rate": 6.123680241327301e-06, |
|
"loss": 0.2265, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 2.0842760180995477, |
|
"grad_norm": 7.404173374176025, |
|
"learning_rate": 6.104826546003018e-06, |
|
"loss": 0.1513, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 2.08710407239819, |
|
"grad_norm": 3.4944663047790527, |
|
"learning_rate": 6.085972850678733e-06, |
|
"loss": 0.2339, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 2.0899321266968327, |
|
"grad_norm": 3.5213699340820312, |
|
"learning_rate": 6.0671191553544494e-06, |
|
"loss": 0.2839, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 2.0927601809954752, |
|
"grad_norm": 4.182003974914551, |
|
"learning_rate": 6.048265460030166e-06, |
|
"loss": 0.2125, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.0955882352941178, |
|
"grad_norm": 6.472683429718018, |
|
"learning_rate": 6.029411764705883e-06, |
|
"loss": 0.1934, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 2.0984162895927603, |
|
"grad_norm": 3.89056658744812, |
|
"learning_rate": 6.010558069381599e-06, |
|
"loss": 0.1829, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 2.101244343891403, |
|
"grad_norm": 6.370733261108398, |
|
"learning_rate": 5.991704374057316e-06, |
|
"loss": 0.1888, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 2.1040723981900453, |
|
"grad_norm": 6.549925327301025, |
|
"learning_rate": 5.972850678733032e-06, |
|
"loss": 0.2399, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 2.106900452488688, |
|
"grad_norm": 6.536769866943359, |
|
"learning_rate": 5.953996983408748e-06, |
|
"loss": 0.2937, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 2.1097285067873304, |
|
"grad_norm": 5.718851566314697, |
|
"learning_rate": 5.935143288084465e-06, |
|
"loss": 0.1983, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 2.112556561085973, |
|
"grad_norm": 6.838066577911377, |
|
"learning_rate": 5.9162895927601815e-06, |
|
"loss": 0.2941, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 2.1153846153846154, |
|
"grad_norm": 3.4056811332702637, |
|
"learning_rate": 5.897435897435898e-06, |
|
"loss": 0.2191, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 2.118212669683258, |
|
"grad_norm": 5.439931392669678, |
|
"learning_rate": 5.878582202111615e-06, |
|
"loss": 0.2095, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 2.1210407239819005, |
|
"grad_norm": 6.081836700439453, |
|
"learning_rate": 5.859728506787331e-06, |
|
"loss": 0.1964, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.123868778280543, |
|
"grad_norm": 2.3146896362304688, |
|
"learning_rate": 5.840874811463048e-06, |
|
"loss": 0.266, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 2.1266968325791855, |
|
"grad_norm": 2.6987674236297607, |
|
"learning_rate": 5.822021116138764e-06, |
|
"loss": 0.2508, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 2.129524886877828, |
|
"grad_norm": 4.278384208679199, |
|
"learning_rate": 5.80316742081448e-06, |
|
"loss": 0.1764, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 2.1323529411764706, |
|
"grad_norm": 6.95686674118042, |
|
"learning_rate": 5.784313725490197e-06, |
|
"loss": 0.274, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 2.135180995475113, |
|
"grad_norm": 3.3586158752441406, |
|
"learning_rate": 5.765460030165913e-06, |
|
"loss": 0.2624, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 2.1380090497737556, |
|
"grad_norm": 3.704134702682495, |
|
"learning_rate": 5.746606334841629e-06, |
|
"loss": 0.2229, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 2.140837104072398, |
|
"grad_norm": 6.012093544006348, |
|
"learning_rate": 5.727752639517345e-06, |
|
"loss": 0.2215, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 2.1436651583710407, |
|
"grad_norm": 4.300053596496582, |
|
"learning_rate": 5.708898944193062e-06, |
|
"loss": 0.199, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 2.146493212669683, |
|
"grad_norm": 7.028651714324951, |
|
"learning_rate": 5.690045248868778e-06, |
|
"loss": 0.22, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 2.1493212669683257, |
|
"grad_norm": 5.363503456115723, |
|
"learning_rate": 5.671191553544495e-06, |
|
"loss": 0.1895, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.1521493212669682, |
|
"grad_norm": 4.580994129180908, |
|
"learning_rate": 5.652337858220212e-06, |
|
"loss": 0.1713, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 2.1549773755656108, |
|
"grad_norm": 7.074058532714844, |
|
"learning_rate": 5.633484162895928e-06, |
|
"loss": 0.2861, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 2.1578054298642533, |
|
"grad_norm": 6.180254936218262, |
|
"learning_rate": 5.614630467571645e-06, |
|
"loss": 0.2316, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 2.160633484162896, |
|
"grad_norm": 9.370762825012207, |
|
"learning_rate": 5.5957767722473614e-06, |
|
"loss": 0.2717, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 2.1634615384615383, |
|
"grad_norm": 4.996572017669678, |
|
"learning_rate": 5.576923076923077e-06, |
|
"loss": 0.2513, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 2.166289592760181, |
|
"grad_norm": 6.018435478210449, |
|
"learning_rate": 5.558069381598794e-06, |
|
"loss": 0.2279, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 2.1691176470588234, |
|
"grad_norm": 4.290647983551025, |
|
"learning_rate": 5.5392156862745104e-06, |
|
"loss": 0.2459, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 2.171945701357466, |
|
"grad_norm": 3.902825117111206, |
|
"learning_rate": 5.520361990950227e-06, |
|
"loss": 0.2181, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 2.1747737556561084, |
|
"grad_norm": 2.4550859928131104, |
|
"learning_rate": 5.501508295625944e-06, |
|
"loss": 0.2309, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 2.177601809954751, |
|
"grad_norm": 3.8267788887023926, |
|
"learning_rate": 5.48265460030166e-06, |
|
"loss": 0.2444, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.1804298642533935, |
|
"grad_norm": 2.1368167400360107, |
|
"learning_rate": 5.463800904977375e-06, |
|
"loss": 0.2044, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 2.183257918552036, |
|
"grad_norm": 4.121007919311523, |
|
"learning_rate": 5.444947209653092e-06, |
|
"loss": 0.193, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 2.1860859728506785, |
|
"grad_norm": 1.0247951745986938, |
|
"learning_rate": 5.4260935143288084e-06, |
|
"loss": 0.2452, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 2.1889140271493215, |
|
"grad_norm": 6.7461323738098145, |
|
"learning_rate": 5.407239819004525e-06, |
|
"loss": 0.2341, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 2.191742081447964, |
|
"grad_norm": 3.962465286254883, |
|
"learning_rate": 5.388386123680242e-06, |
|
"loss": 0.1699, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 2.1945701357466065, |
|
"grad_norm": 3.7287843227386475, |
|
"learning_rate": 5.369532428355958e-06, |
|
"loss": 0.1768, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 2.197398190045249, |
|
"grad_norm": 3.93239426612854, |
|
"learning_rate": 5.350678733031675e-06, |
|
"loss": 0.2383, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 2.2002262443438916, |
|
"grad_norm": 5.207613468170166, |
|
"learning_rate": 5.331825037707391e-06, |
|
"loss": 0.2282, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 2.203054298642534, |
|
"grad_norm": 3.9662837982177734, |
|
"learning_rate": 5.312971342383107e-06, |
|
"loss": 0.1616, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 2.2058823529411766, |
|
"grad_norm": 4.898771286010742, |
|
"learning_rate": 5.294117647058824e-06, |
|
"loss": 0.2013, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.208710407239819, |
|
"grad_norm": 7.645010948181152, |
|
"learning_rate": 5.2752639517345405e-06, |
|
"loss": 0.2478, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 2.2115384615384617, |
|
"grad_norm": 2.4150936603546143, |
|
"learning_rate": 5.256410256410257e-06, |
|
"loss": 0.1754, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 2.214366515837104, |
|
"grad_norm": 1.881043791770935, |
|
"learning_rate": 5.237556561085974e-06, |
|
"loss": 0.264, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 2.2171945701357467, |
|
"grad_norm": 6.877952575683594, |
|
"learning_rate": 5.21870286576169e-06, |
|
"loss": 0.2879, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 2.2200226244343892, |
|
"grad_norm": 3.3370893001556396, |
|
"learning_rate": 5.199849170437406e-06, |
|
"loss": 0.2312, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 2.2228506787330318, |
|
"grad_norm": 4.1501545906066895, |
|
"learning_rate": 5.180995475113123e-06, |
|
"loss": 0.2129, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 2.2256787330316743, |
|
"grad_norm": 4.085570335388184, |
|
"learning_rate": 5.162141779788839e-06, |
|
"loss": 0.1647, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 2.228506787330317, |
|
"grad_norm": 4.05198335647583, |
|
"learning_rate": 5.143288084464555e-06, |
|
"loss": 0.2338, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 2.2313348416289593, |
|
"grad_norm": 3.9560508728027344, |
|
"learning_rate": 5.124434389140272e-06, |
|
"loss": 0.3062, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 2.234162895927602, |
|
"grad_norm": 2.1549770832061768, |
|
"learning_rate": 5.105580693815988e-06, |
|
"loss": 0.2259, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.2369909502262444, |
|
"grad_norm": 2.7982289791107178, |
|
"learning_rate": 5.086726998491704e-06, |
|
"loss": 0.1782, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 2.239819004524887, |
|
"grad_norm": 4.951447010040283, |
|
"learning_rate": 5.067873303167421e-06, |
|
"loss": 0.2604, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 2.2426470588235294, |
|
"grad_norm": 5.907583713531494, |
|
"learning_rate": 5.049019607843137e-06, |
|
"loss": 0.2447, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 2.245475113122172, |
|
"grad_norm": 5.986253261566162, |
|
"learning_rate": 5.030165912518854e-06, |
|
"loss": 0.2829, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 2.2483031674208145, |
|
"grad_norm": 4.330525875091553, |
|
"learning_rate": 5.011312217194571e-06, |
|
"loss": 0.1908, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 2.251131221719457, |
|
"grad_norm": 5.337680816650391, |
|
"learning_rate": 4.992458521870287e-06, |
|
"loss": 0.2539, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 2.2539592760180995, |
|
"grad_norm": 7.187500476837158, |
|
"learning_rate": 4.973604826546004e-06, |
|
"loss": 0.2405, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 2.256787330316742, |
|
"grad_norm": 5.105306625366211, |
|
"learning_rate": 4.95475113122172e-06, |
|
"loss": 0.2616, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 2.2596153846153846, |
|
"grad_norm": 4.068017482757568, |
|
"learning_rate": 4.935897435897436e-06, |
|
"loss": 0.2233, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 2.262443438914027, |
|
"grad_norm": 2.9654664993286133, |
|
"learning_rate": 4.917043740573153e-06, |
|
"loss": 0.2187, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.262443438914027, |
|
"eval_accuracy": 0.8858574610244989, |
|
"eval_loss": 0.29285645484924316, |
|
"eval_runtime": 126.4151, |
|
"eval_samples_per_second": 99.45, |
|
"eval_steps_per_second": 3.109, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.2652714932126696, |
|
"grad_norm": 4.892025470733643, |
|
"learning_rate": 4.898190045248869e-06, |
|
"loss": 0.223, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 2.268099547511312, |
|
"grad_norm": 6.540407657623291, |
|
"learning_rate": 4.879336349924585e-06, |
|
"loss": 0.2356, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 2.2709276018099547, |
|
"grad_norm": 4.254669666290283, |
|
"learning_rate": 4.860482654600302e-06, |
|
"loss": 0.2295, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 2.273755656108597, |
|
"grad_norm": 2.9539434909820557, |
|
"learning_rate": 4.8416289592760185e-06, |
|
"loss": 0.2617, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 2.2765837104072397, |
|
"grad_norm": 6.981826305389404, |
|
"learning_rate": 4.822775263951735e-06, |
|
"loss": 0.2911, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 2.2794117647058822, |
|
"grad_norm": 4.400992393493652, |
|
"learning_rate": 4.803921568627452e-06, |
|
"loss": 0.2384, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 2.2822398190045248, |
|
"grad_norm": 6.687214374542236, |
|
"learning_rate": 4.785067873303168e-06, |
|
"loss": 0.2139, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 2.2850678733031673, |
|
"grad_norm": 2.111176013946533, |
|
"learning_rate": 4.766214177978885e-06, |
|
"loss": 0.2223, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 2.28789592760181, |
|
"grad_norm": 7.312646389007568, |
|
"learning_rate": 4.747360482654601e-06, |
|
"loss": 0.2631, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 2.2907239819004523, |
|
"grad_norm": 5.643038749694824, |
|
"learning_rate": 4.728506787330317e-06, |
|
"loss": 0.179, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.293552036199095, |
|
"grad_norm": 8.725652694702148, |
|
"learning_rate": 4.709653092006033e-06, |
|
"loss": 0.2362, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 2.2963800904977374, |
|
"grad_norm": 6.781122207641602, |
|
"learning_rate": 4.69079939668175e-06, |
|
"loss": 0.205, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 2.29920814479638, |
|
"grad_norm": 0.9392467141151428, |
|
"learning_rate": 4.671945701357466e-06, |
|
"loss": 0.2181, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 2.3020361990950224, |
|
"grad_norm": 1.8741260766983032, |
|
"learning_rate": 4.653092006033183e-06, |
|
"loss": 0.1588, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 2.3048642533936654, |
|
"grad_norm": 5.825664520263672, |
|
"learning_rate": 4.6342383107088995e-06, |
|
"loss": 0.2214, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 4.3385701179504395, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.2024, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 2.3105203619909505, |
|
"grad_norm": 5.437368869781494, |
|
"learning_rate": 4.596530920060332e-06, |
|
"loss": 0.2341, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 2.3133484162895925, |
|
"grad_norm": 5.2032270431518555, |
|
"learning_rate": 4.5776772247360485e-06, |
|
"loss": 0.2639, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 2.3161764705882355, |
|
"grad_norm": 4.702691555023193, |
|
"learning_rate": 4.558823529411765e-06, |
|
"loss": 0.2153, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 2.3190045248868776, |
|
"grad_norm": 3.5364975929260254, |
|
"learning_rate": 4.539969834087482e-06, |
|
"loss": 0.1909, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.3218325791855206, |
|
"grad_norm": 2.7947473526000977, |
|
"learning_rate": 4.521116138763198e-06, |
|
"loss": 0.216, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 2.324660633484163, |
|
"grad_norm": 8.211967468261719, |
|
"learning_rate": 4.502262443438914e-06, |
|
"loss": 0.2122, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 2.3274886877828056, |
|
"grad_norm": 3.7828614711761475, |
|
"learning_rate": 4.483408748114631e-06, |
|
"loss": 0.2741, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 2.330316742081448, |
|
"grad_norm": 5.757340908050537, |
|
"learning_rate": 4.464555052790347e-06, |
|
"loss": 0.2854, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 2.3331447963800906, |
|
"grad_norm": 4.723744869232178, |
|
"learning_rate": 4.445701357466063e-06, |
|
"loss": 0.2508, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 2.335972850678733, |
|
"grad_norm": 4.520774841308594, |
|
"learning_rate": 4.42684766214178e-06, |
|
"loss": 0.2414, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 2.3388009049773757, |
|
"grad_norm": 4.983455181121826, |
|
"learning_rate": 4.407993966817496e-06, |
|
"loss": 0.2414, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 2.341628959276018, |
|
"grad_norm": 6.122417449951172, |
|
"learning_rate": 4.389140271493213e-06, |
|
"loss": 0.2177, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 2.3444570135746607, |
|
"grad_norm": 2.776017189025879, |
|
"learning_rate": 4.37028657616893e-06, |
|
"loss": 0.2133, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 2.3472850678733033, |
|
"grad_norm": 7.429429054260254, |
|
"learning_rate": 4.351432880844646e-06, |
|
"loss": 0.1915, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.350113122171946, |
|
"grad_norm": 7.583387851715088, |
|
"learning_rate": 4.332579185520363e-06, |
|
"loss": 0.2396, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 8.560108184814453, |
|
"learning_rate": 4.313725490196079e-06, |
|
"loss": 0.2364, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 2.355769230769231, |
|
"grad_norm": 2.898757219314575, |
|
"learning_rate": 4.294871794871795e-06, |
|
"loss": 0.2685, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 2.3585972850678734, |
|
"grad_norm": 5.2947564125061035, |
|
"learning_rate": 4.276018099547512e-06, |
|
"loss": 0.2222, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 2.361425339366516, |
|
"grad_norm": 2.573645830154419, |
|
"learning_rate": 4.257164404223228e-06, |
|
"loss": 0.2335, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 2.3642533936651584, |
|
"grad_norm": 6.62631368637085, |
|
"learning_rate": 4.238310708898944e-06, |
|
"loss": 0.2325, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 2.367081447963801, |
|
"grad_norm": 5.814454555511475, |
|
"learning_rate": 4.219457013574661e-06, |
|
"loss": 0.2538, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 2.3699095022624435, |
|
"grad_norm": 6.129361152648926, |
|
"learning_rate": 4.2006033182503775e-06, |
|
"loss": 0.2395, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 2.372737556561086, |
|
"grad_norm": 5.893956184387207, |
|
"learning_rate": 4.181749622926094e-06, |
|
"loss": 0.2651, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 2.3755656108597285, |
|
"grad_norm": 6.977567672729492, |
|
"learning_rate": 4.162895927601811e-06, |
|
"loss": 0.2575, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.378393665158371, |
|
"grad_norm": 1.8976235389709473, |
|
"learning_rate": 4.144042232277527e-06, |
|
"loss": 0.199, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 2.3812217194570136, |
|
"grad_norm": 1.1803913116455078, |
|
"learning_rate": 4.125188536953243e-06, |
|
"loss": 0.2826, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 2.384049773755656, |
|
"grad_norm": 4.858994483947754, |
|
"learning_rate": 4.10633484162896e-06, |
|
"loss": 0.1937, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 2.3868778280542986, |
|
"grad_norm": 3.6424715518951416, |
|
"learning_rate": 4.087481146304676e-06, |
|
"loss": 0.2383, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 2.389705882352941, |
|
"grad_norm": 4.879428863525391, |
|
"learning_rate": 4.068627450980392e-06, |
|
"loss": 0.2187, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 2.3925339366515836, |
|
"grad_norm": 4.588160991668701, |
|
"learning_rate": 4.049773755656109e-06, |
|
"loss": 0.2134, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 2.395361990950226, |
|
"grad_norm": 3.9123332500457764, |
|
"learning_rate": 4.030920060331825e-06, |
|
"loss": 0.1968, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 2.3981900452488687, |
|
"grad_norm": 6.140926361083984, |
|
"learning_rate": 4.012066365007542e-06, |
|
"loss": 0.2356, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 2.401018099547511, |
|
"grad_norm": 2.6923718452453613, |
|
"learning_rate": 3.9932126696832585e-06, |
|
"loss": 0.2502, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 2.4038461538461537, |
|
"grad_norm": 3.490473508834839, |
|
"learning_rate": 3.974358974358974e-06, |
|
"loss": 0.2253, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.4066742081447963, |
|
"grad_norm": 3.2556686401367188, |
|
"learning_rate": 3.955505279034691e-06, |
|
"loss": 0.2228, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 2.409502262443439, |
|
"grad_norm": 5.598496437072754, |
|
"learning_rate": 3.9366515837104075e-06, |
|
"loss": 0.234, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 2.4123303167420813, |
|
"grad_norm": 4.937731742858887, |
|
"learning_rate": 3.917797888386124e-06, |
|
"loss": 0.2064, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 2.415158371040724, |
|
"grad_norm": 2.0519907474517822, |
|
"learning_rate": 3.898944193061841e-06, |
|
"loss": 0.2148, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 2.4179864253393664, |
|
"grad_norm": 4.925931453704834, |
|
"learning_rate": 3.880090497737557e-06, |
|
"loss": 0.2406, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 2.420814479638009, |
|
"grad_norm": 3.878779172897339, |
|
"learning_rate": 3.861236802413273e-06, |
|
"loss": 0.2159, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 2.4236425339366514, |
|
"grad_norm": 5.424575328826904, |
|
"learning_rate": 3.84238310708899e-06, |
|
"loss": 0.2202, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 2.426470588235294, |
|
"grad_norm": 4.764692306518555, |
|
"learning_rate": 3.8235294117647055e-06, |
|
"loss": 0.2238, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 2.4292986425339365, |
|
"grad_norm": 6.2886881828308105, |
|
"learning_rate": 3.8046757164404226e-06, |
|
"loss": 0.2258, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 2.4321266968325794, |
|
"grad_norm": 5.105391502380371, |
|
"learning_rate": 3.7858220211161388e-06, |
|
"loss": 0.2291, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.4349547511312215, |
|
"grad_norm": 3.7577686309814453, |
|
"learning_rate": 3.7669683257918554e-06, |
|
"loss": 0.2041, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 2.4377828054298645, |
|
"grad_norm": 2.689021587371826, |
|
"learning_rate": 3.748114630467572e-06, |
|
"loss": 0.2576, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 2.4406108597285066, |
|
"grad_norm": 3.162226438522339, |
|
"learning_rate": 3.7292609351432886e-06, |
|
"loss": 0.2342, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 2.4434389140271495, |
|
"grad_norm": 4.014715671539307, |
|
"learning_rate": 3.710407239819005e-06, |
|
"loss": 0.236, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 2.446266968325792, |
|
"grad_norm": 5.3587822914123535, |
|
"learning_rate": 3.6915535444947214e-06, |
|
"loss": 0.2328, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 2.4490950226244346, |
|
"grad_norm": 7.895315647125244, |
|
"learning_rate": 3.672699849170438e-06, |
|
"loss": 0.2333, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 2.451923076923077, |
|
"grad_norm": 8.392569541931152, |
|
"learning_rate": 3.653846153846154e-06, |
|
"loss": 0.2605, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 2.4547511312217196, |
|
"grad_norm": 3.8333370685577393, |
|
"learning_rate": 3.6349924585218704e-06, |
|
"loss": 0.2289, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 2.457579185520362, |
|
"grad_norm": 7.176278114318848, |
|
"learning_rate": 3.616138763197587e-06, |
|
"loss": 0.2119, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 2.4604072398190047, |
|
"grad_norm": 8.778523445129395, |
|
"learning_rate": 3.5972850678733032e-06, |
|
"loss": 0.2363, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.463235294117647, |
|
"grad_norm": 3.1572511196136475, |
|
"learning_rate": 3.57843137254902e-06, |
|
"loss": 0.2083, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 2.4660633484162897, |
|
"grad_norm": 6.948089122772217, |
|
"learning_rate": 3.5595776772247365e-06, |
|
"loss": 0.2337, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 2.4688914027149322, |
|
"grad_norm": 7.237654209136963, |
|
"learning_rate": 3.540723981900453e-06, |
|
"loss": 0.1903, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 2.4717194570135748, |
|
"grad_norm": 3.5161070823669434, |
|
"learning_rate": 3.5218702865761693e-06, |
|
"loss": 0.2003, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 2.4745475113122173, |
|
"grad_norm": 5.7288737297058105, |
|
"learning_rate": 3.5030165912518855e-06, |
|
"loss": 0.1979, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 2.47737556561086, |
|
"grad_norm": 6.921863079071045, |
|
"learning_rate": 3.484162895927602e-06, |
|
"loss": 0.2681, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 2.4802036199095023, |
|
"grad_norm": 1.5838019847869873, |
|
"learning_rate": 3.4653092006033183e-06, |
|
"loss": 0.186, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 2.483031674208145, |
|
"grad_norm": 6.464385986328125, |
|
"learning_rate": 3.446455505279035e-06, |
|
"loss": 0.2791, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 2.4858597285067874, |
|
"grad_norm": 4.105411529541016, |
|
"learning_rate": 3.4276018099547515e-06, |
|
"loss": 0.246, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 2.48868778280543, |
|
"grad_norm": 5.3756632804870605, |
|
"learning_rate": 3.408748114630468e-06, |
|
"loss": 0.2344, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.4915158371040724, |
|
"grad_norm": 3.4841089248657227, |
|
"learning_rate": 3.3898944193061843e-06, |
|
"loss": 0.1978, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 2.494343891402715, |
|
"grad_norm": 7.188533782958984, |
|
"learning_rate": 3.371040723981901e-06, |
|
"loss": 0.2737, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 2.4971719457013575, |
|
"grad_norm": 4.090082168579102, |
|
"learning_rate": 3.3521870286576167e-06, |
|
"loss": 0.2139, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 7.417943000793457, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.2275, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 2.5028280542986425, |
|
"grad_norm": 3.605393648147583, |
|
"learning_rate": 3.31447963800905e-06, |
|
"loss": 0.2446, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 2.505656108597285, |
|
"grad_norm": 5.961788654327393, |
|
"learning_rate": 3.2956259426847666e-06, |
|
"loss": 0.2923, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 2.5084841628959276, |
|
"grad_norm": 4.26703405380249, |
|
"learning_rate": 3.2767722473604827e-06, |
|
"loss": 0.1962, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 2.51131221719457, |
|
"grad_norm": 4.207533359527588, |
|
"learning_rate": 3.2579185520361994e-06, |
|
"loss": 0.1995, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 2.5141402714932126, |
|
"grad_norm": 4.4618682861328125, |
|
"learning_rate": 3.239064856711916e-06, |
|
"loss": 0.172, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 2.516968325791855, |
|
"grad_norm": 5.302677631378174, |
|
"learning_rate": 3.2202111613876326e-06, |
|
"loss": 0.1653, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.5197963800904977, |
|
"grad_norm": 3.299323558807373, |
|
"learning_rate": 3.2013574660633484e-06, |
|
"loss": 0.2407, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 2.52262443438914, |
|
"grad_norm": 6.668271541595459, |
|
"learning_rate": 3.182503770739065e-06, |
|
"loss": 0.2283, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 2.5254524886877827, |
|
"grad_norm": 7.668635368347168, |
|
"learning_rate": 3.1636500754147816e-06, |
|
"loss": 0.2752, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 2.5282805429864252, |
|
"grad_norm": 1.711267113685608, |
|
"learning_rate": 3.1447963800904978e-06, |
|
"loss": 0.2136, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 2.5311085972850678, |
|
"grad_norm": 8.963603019714355, |
|
"learning_rate": 3.1259426847662144e-06, |
|
"loss": 0.205, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 2.5339366515837103, |
|
"grad_norm": 2.520670175552368, |
|
"learning_rate": 3.107088989441931e-06, |
|
"loss": 0.2131, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 2.536764705882353, |
|
"grad_norm": 8.796506881713867, |
|
"learning_rate": 3.0882352941176476e-06, |
|
"loss": 0.2969, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 2.5395927601809953, |
|
"grad_norm": 7.460408687591553, |
|
"learning_rate": 3.069381598793364e-06, |
|
"loss": 0.2432, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 2.542420814479638, |
|
"grad_norm": 9.012686729431152, |
|
"learning_rate": 3.0505279034690804e-06, |
|
"loss": 0.2707, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 2.5452488687782804, |
|
"grad_norm": 5.107896327972412, |
|
"learning_rate": 3.0316742081447962e-06, |
|
"loss": 0.2239, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.5452488687782804, |
|
"eval_accuracy": 0.888880050906777, |
|
"eval_loss": 0.288782000541687, |
|
"eval_runtime": 126.5084, |
|
"eval_samples_per_second": 99.377, |
|
"eval_steps_per_second": 3.107, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.5480769230769234, |
|
"grad_norm": 2.8435633182525635, |
|
"learning_rate": 3.012820512820513e-06, |
|
"loss": 0.2544, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 2.5509049773755654, |
|
"grad_norm": 4.109634876251221, |
|
"learning_rate": 2.9939668174962294e-06, |
|
"loss": 0.2508, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 2.5537330316742084, |
|
"grad_norm": 3.3078644275665283, |
|
"learning_rate": 2.975113122171946e-06, |
|
"loss": 0.2025, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 2.5565610859728505, |
|
"grad_norm": 6.037450790405273, |
|
"learning_rate": 2.9562594268476623e-06, |
|
"loss": 0.2347, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 2.5593891402714934, |
|
"grad_norm": 5.157569408416748, |
|
"learning_rate": 2.937405731523379e-06, |
|
"loss": 0.2684, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 2.5622171945701355, |
|
"grad_norm": 2.070380210876465, |
|
"learning_rate": 2.9185520361990955e-06, |
|
"loss": 0.2217, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 2.5650452488687785, |
|
"grad_norm": 2.0333659648895264, |
|
"learning_rate": 2.899698340874812e-06, |
|
"loss": 0.1901, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 2.5678733031674206, |
|
"grad_norm": 2.8762121200561523, |
|
"learning_rate": 2.880844645550528e-06, |
|
"loss": 0.2175, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 2.5707013574660635, |
|
"grad_norm": 3.8669402599334717, |
|
"learning_rate": 2.8619909502262445e-06, |
|
"loss": 0.2218, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 2.5735294117647056, |
|
"grad_norm": 5.87692403793335, |
|
"learning_rate": 2.843137254901961e-06, |
|
"loss": 0.2058, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.5763574660633486, |
|
"grad_norm": 3.9730098247528076, |
|
"learning_rate": 2.8242835595776773e-06, |
|
"loss": 0.2191, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 2.579185520361991, |
|
"grad_norm": 3.073633909225464, |
|
"learning_rate": 2.805429864253394e-06, |
|
"loss": 0.2499, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 2.5820135746606336, |
|
"grad_norm": 3.6937789916992188, |
|
"learning_rate": 2.7865761689291105e-06, |
|
"loss": 0.2499, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 2.584841628959276, |
|
"grad_norm": 4.838074207305908, |
|
"learning_rate": 2.767722473604827e-06, |
|
"loss": 0.1883, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 2.5876696832579187, |
|
"grad_norm": 6.562351226806641, |
|
"learning_rate": 2.7488687782805433e-06, |
|
"loss": 0.2019, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 2.590497737556561, |
|
"grad_norm": 3.512963056564331, |
|
"learning_rate": 2.7300150829562595e-06, |
|
"loss": 0.1934, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 2.5933257918552037, |
|
"grad_norm": 4.1841511726379395, |
|
"learning_rate": 2.7111613876319757e-06, |
|
"loss": 0.1807, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 2.5961538461538463, |
|
"grad_norm": 4.239630222320557, |
|
"learning_rate": 2.6923076923076923e-06, |
|
"loss": 0.2101, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 2.598981900452489, |
|
"grad_norm": 3.499694585800171, |
|
"learning_rate": 2.673453996983409e-06, |
|
"loss": 0.2152, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 2.6018099547511313, |
|
"grad_norm": 3.0219247341156006, |
|
"learning_rate": 2.6546003016591256e-06, |
|
"loss": 0.2311, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.604638009049774, |
|
"grad_norm": 4.168036937713623, |
|
"learning_rate": 2.6357466063348418e-06, |
|
"loss": 0.1943, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 2.6074660633484164, |
|
"grad_norm": 1.4795814752578735, |
|
"learning_rate": 2.6168929110105584e-06, |
|
"loss": 0.1786, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 2.610294117647059, |
|
"grad_norm": 1.5753957033157349, |
|
"learning_rate": 2.598039215686275e-06, |
|
"loss": 0.1892, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 2.6131221719457014, |
|
"grad_norm": 3.37406325340271, |
|
"learning_rate": 2.5791855203619916e-06, |
|
"loss": 0.1632, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 2.615950226244344, |
|
"grad_norm": 4.640278339385986, |
|
"learning_rate": 2.5603318250377074e-06, |
|
"loss": 0.239, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 2.6187782805429864, |
|
"grad_norm": 5.864749431610107, |
|
"learning_rate": 2.541478129713424e-06, |
|
"loss": 0.2349, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 2.621606334841629, |
|
"grad_norm": 4.219099521636963, |
|
"learning_rate": 2.5226244343891406e-06, |
|
"loss": 0.2298, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 2.6244343891402715, |
|
"grad_norm": 6.88966703414917, |
|
"learning_rate": 2.503770739064857e-06, |
|
"loss": 0.2096, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 2.627262443438914, |
|
"grad_norm": 3.7265114784240723, |
|
"learning_rate": 2.4849170437405734e-06, |
|
"loss": 0.1961, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 2.6300904977375565, |
|
"grad_norm": 3.687527656555176, |
|
"learning_rate": 2.46606334841629e-06, |
|
"loss": 0.2054, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.632918552036199, |
|
"grad_norm": 5.014760971069336, |
|
"learning_rate": 2.4472096530920062e-06, |
|
"loss": 0.2425, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 2.6357466063348416, |
|
"grad_norm": 8.167291641235352, |
|
"learning_rate": 2.428355957767723e-06, |
|
"loss": 0.2079, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 2.638574660633484, |
|
"grad_norm": 4.277304649353027, |
|
"learning_rate": 2.409502262443439e-06, |
|
"loss": 0.2205, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 2.6414027149321266, |
|
"grad_norm": 5.0269975662231445, |
|
"learning_rate": 2.3906485671191556e-06, |
|
"loss": 0.2586, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 2.644230769230769, |
|
"grad_norm": 4.617335796356201, |
|
"learning_rate": 2.371794871794872e-06, |
|
"loss": 0.2167, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 2.6470588235294117, |
|
"grad_norm": 3.6927714347839355, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 0.2195, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 2.649886877828054, |
|
"grad_norm": 3.20468807220459, |
|
"learning_rate": 2.334087481146305e-06, |
|
"loss": 0.2495, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 2.6527149321266967, |
|
"grad_norm": 4.111125946044922, |
|
"learning_rate": 2.3152337858220213e-06, |
|
"loss": 0.1675, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 2.6555429864253393, |
|
"grad_norm": 3.872500419616699, |
|
"learning_rate": 2.2963800904977375e-06, |
|
"loss": 0.2614, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 2.658371040723982, |
|
"grad_norm": 5.960339069366455, |
|
"learning_rate": 2.277526395173454e-06, |
|
"loss": 0.2031, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.6611990950226243, |
|
"grad_norm": 7.735962390899658, |
|
"learning_rate": 2.2586726998491707e-06, |
|
"loss": 0.2164, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 2.664027149321267, |
|
"grad_norm": 4.943899154663086, |
|
"learning_rate": 2.2398190045248873e-06, |
|
"loss": 0.2322, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 2.6668552036199094, |
|
"grad_norm": 3.7775423526763916, |
|
"learning_rate": 2.2209653092006035e-06, |
|
"loss": 0.2238, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 2.669683257918552, |
|
"grad_norm": 6.782299995422363, |
|
"learning_rate": 2.2021116138763197e-06, |
|
"loss": 0.2141, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 2.6725113122171944, |
|
"grad_norm": 2.3152804374694824, |
|
"learning_rate": 2.1832579185520363e-06, |
|
"loss": 0.1729, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 2.6753393665158374, |
|
"grad_norm": 5.257414817810059, |
|
"learning_rate": 2.164404223227753e-06, |
|
"loss": 0.1875, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 2.6781674208144794, |
|
"grad_norm": 5.083720684051514, |
|
"learning_rate": 2.145550527903469e-06, |
|
"loss": 0.2721, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 2.6809954751131224, |
|
"grad_norm": 3.5238163471221924, |
|
"learning_rate": 2.1266968325791857e-06, |
|
"loss": 0.1752, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 2.6838235294117645, |
|
"grad_norm": 9.12520694732666, |
|
"learning_rate": 2.1078431372549023e-06, |
|
"loss": 0.2184, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 2.6866515837104075, |
|
"grad_norm": 3.9677796363830566, |
|
"learning_rate": 2.0889894419306185e-06, |
|
"loss": 0.2685, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.6894796380090495, |
|
"grad_norm": 8.702911376953125, |
|
"learning_rate": 2.0701357466063347e-06, |
|
"loss": 0.2143, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 5.3467841148376465, |
|
"learning_rate": 2.0512820512820513e-06, |
|
"loss": 0.1466, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 2.6951357466063346, |
|
"grad_norm": 8.666280746459961, |
|
"learning_rate": 2.032428355957768e-06, |
|
"loss": 0.2221, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 2.6979638009049776, |
|
"grad_norm": 4.463994979858398, |
|
"learning_rate": 2.0135746606334846e-06, |
|
"loss": 0.2115, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 2.7007918552036196, |
|
"grad_norm": 8.998452186584473, |
|
"learning_rate": 1.9947209653092008e-06, |
|
"loss": 0.2286, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 2.7036199095022626, |
|
"grad_norm": 2.3983922004699707, |
|
"learning_rate": 1.975867269984917e-06, |
|
"loss": 0.1753, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 2.706447963800905, |
|
"grad_norm": 5.0769524574279785, |
|
"learning_rate": 1.9570135746606336e-06, |
|
"loss": 0.2029, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 2.7092760180995477, |
|
"grad_norm": 3.6228933334350586, |
|
"learning_rate": 1.93815987933635e-06, |
|
"loss": 0.2282, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 2.71210407239819, |
|
"grad_norm": 7.759435176849365, |
|
"learning_rate": 1.919306184012067e-06, |
|
"loss": 0.2395, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 2.7149321266968327, |
|
"grad_norm": 7.777573585510254, |
|
"learning_rate": 1.9004524886877828e-06, |
|
"loss": 0.1992, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.7177601809954752, |
|
"grad_norm": 4.795551300048828, |
|
"learning_rate": 1.8815987933634994e-06, |
|
"loss": 0.227, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 2.7205882352941178, |
|
"grad_norm": 4.623630046844482, |
|
"learning_rate": 1.8627450980392158e-06, |
|
"loss": 0.2199, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 2.7234162895927603, |
|
"grad_norm": 1.8060227632522583, |
|
"learning_rate": 1.8438914027149324e-06, |
|
"loss": 0.2662, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 2.726244343891403, |
|
"grad_norm": 4.0437798500061035, |
|
"learning_rate": 1.8250377073906486e-06, |
|
"loss": 0.2106, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 2.7290723981900453, |
|
"grad_norm": 2.629993200302124, |
|
"learning_rate": 1.806184012066365e-06, |
|
"loss": 0.2275, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 2.731900452488688, |
|
"grad_norm": 4.662147045135498, |
|
"learning_rate": 1.7873303167420816e-06, |
|
"loss": 0.1597, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 2.7347285067873304, |
|
"grad_norm": 7.3248066902160645, |
|
"learning_rate": 1.768476621417798e-06, |
|
"loss": 0.2161, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 2.737556561085973, |
|
"grad_norm": 5.798586845397949, |
|
"learning_rate": 1.7496229260935144e-06, |
|
"loss": 0.2557, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 2.7403846153846154, |
|
"grad_norm": 2.832303524017334, |
|
"learning_rate": 1.7307692307692308e-06, |
|
"loss": 0.2108, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 2.743212669683258, |
|
"grad_norm": 1.038588047027588, |
|
"learning_rate": 1.7119155354449475e-06, |
|
"loss": 0.2141, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.7460407239819005, |
|
"grad_norm": 6.463703155517578, |
|
"learning_rate": 1.6930618401206639e-06, |
|
"loss": 0.219, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 2.748868778280543, |
|
"grad_norm": 6.210083484649658, |
|
"learning_rate": 1.67420814479638e-06, |
|
"loss": 0.1784, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 2.7516968325791855, |
|
"grad_norm": 5.5614848136901855, |
|
"learning_rate": 1.6553544494720967e-06, |
|
"loss": 0.2848, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 2.754524886877828, |
|
"grad_norm": 6.321543216705322, |
|
"learning_rate": 1.636500754147813e-06, |
|
"loss": 0.1904, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 2.7573529411764706, |
|
"grad_norm": 2.9993443489074707, |
|
"learning_rate": 1.6176470588235297e-06, |
|
"loss": 0.2348, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 2.760180995475113, |
|
"grad_norm": 2.8095312118530273, |
|
"learning_rate": 1.5987933634992459e-06, |
|
"loss": 0.2194, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 2.7630090497737556, |
|
"grad_norm": 9.010799407958984, |
|
"learning_rate": 1.5799396681749623e-06, |
|
"loss": 0.274, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 2.765837104072398, |
|
"grad_norm": 4.045629501342773, |
|
"learning_rate": 1.561085972850679e-06, |
|
"loss": 0.2011, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 2.7686651583710407, |
|
"grad_norm": 7.133453845977783, |
|
"learning_rate": 1.5422322775263953e-06, |
|
"loss": 0.2241, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 2.771493212669683, |
|
"grad_norm": 4.382336616516113, |
|
"learning_rate": 1.5233785822021115e-06, |
|
"loss": 0.2694, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.7743212669683257, |
|
"grad_norm": 4.200496673583984, |
|
"learning_rate": 1.5045248868778281e-06, |
|
"loss": 0.2299, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 2.7771493212669682, |
|
"grad_norm": 3.4665303230285645, |
|
"learning_rate": 1.4856711915535445e-06, |
|
"loss": 0.1934, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 2.7799773755656108, |
|
"grad_norm": 5.625051498413086, |
|
"learning_rate": 1.4668174962292611e-06, |
|
"loss": 0.2565, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 2.7828054298642533, |
|
"grad_norm": 0.8546460866928101, |
|
"learning_rate": 1.4479638009049775e-06, |
|
"loss": 0.159, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 2.785633484162896, |
|
"grad_norm": 2.4043455123901367, |
|
"learning_rate": 1.429110105580694e-06, |
|
"loss": 0.1937, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 2.7884615384615383, |
|
"grad_norm": 5.863745212554932, |
|
"learning_rate": 1.4102564102564104e-06, |
|
"loss": 0.2213, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 2.791289592760181, |
|
"grad_norm": 4.6385722160339355, |
|
"learning_rate": 1.391402714932127e-06, |
|
"loss": 0.263, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 2.7941176470588234, |
|
"grad_norm": 6.428844928741455, |
|
"learning_rate": 1.3725490196078434e-06, |
|
"loss": 0.1866, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 2.7969457013574663, |
|
"grad_norm": 4.29943323135376, |
|
"learning_rate": 1.3536953242835596e-06, |
|
"loss": 0.2021, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 2.7997737556561084, |
|
"grad_norm": 3.2437448501586914, |
|
"learning_rate": 1.3348416289592762e-06, |
|
"loss": 0.1571, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.8026018099547514, |
|
"grad_norm": 3.756850481033325, |
|
"learning_rate": 1.3159879336349926e-06, |
|
"loss": 0.2301, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 2.8054298642533935, |
|
"grad_norm": 4.855559825897217, |
|
"learning_rate": 1.2971342383107092e-06, |
|
"loss": 0.2538, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 2.8082579185520364, |
|
"grad_norm": 4.502439498901367, |
|
"learning_rate": 1.2782805429864254e-06, |
|
"loss": 0.1862, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 2.8110859728506785, |
|
"grad_norm": 6.242438793182373, |
|
"learning_rate": 1.2594268476621418e-06, |
|
"loss": 0.145, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 2.8139140271493215, |
|
"grad_norm": 6.00593900680542, |
|
"learning_rate": 1.2405731523378584e-06, |
|
"loss": 0.2097, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 2.8167420814479636, |
|
"grad_norm": 10.398560523986816, |
|
"learning_rate": 1.2217194570135748e-06, |
|
"loss": 0.2605, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 2.8195701357466065, |
|
"grad_norm": 4.909145832061768, |
|
"learning_rate": 1.2028657616892912e-06, |
|
"loss": 0.2473, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 2.8223981900452486, |
|
"grad_norm": 4.879443168640137, |
|
"learning_rate": 1.1840120663650076e-06, |
|
"loss": 0.2071, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 2.8252262443438916, |
|
"grad_norm": 5.404385566711426, |
|
"learning_rate": 1.165158371040724e-06, |
|
"loss": 0.2138, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 2.8280542986425337, |
|
"grad_norm": 3.4741604328155518, |
|
"learning_rate": 1.1463046757164404e-06, |
|
"loss": 0.2502, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.8280542986425337, |
|
"eval_accuracy": 0.888880050906777, |
|
"eval_loss": 0.2902699112892151, |
|
"eval_runtime": 126.3428, |
|
"eval_samples_per_second": 99.507, |
|
"eval_steps_per_second": 3.111, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.8308823529411766, |
|
"grad_norm": 4.25987434387207, |
|
"learning_rate": 1.127450980392157e-06, |
|
"loss": 0.2242, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 2.833710407239819, |
|
"grad_norm": 7.45045280456543, |
|
"learning_rate": 1.1085972850678732e-06, |
|
"loss": 0.2791, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 2.8365384615384617, |
|
"grad_norm": 4.844043254852295, |
|
"learning_rate": 1.0897435897435899e-06, |
|
"loss": 0.2357, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 2.839366515837104, |
|
"grad_norm": 5.769428253173828, |
|
"learning_rate": 1.0708898944193063e-06, |
|
"loss": 0.2296, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 2.8421945701357467, |
|
"grad_norm": 7.023671627044678, |
|
"learning_rate": 1.0520361990950227e-06, |
|
"loss": 0.2318, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 2.8450226244343892, |
|
"grad_norm": 3.501164436340332, |
|
"learning_rate": 1.033182503770739e-06, |
|
"loss": 0.2456, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 2.8478506787330318, |
|
"grad_norm": 9.939863204956055, |
|
"learning_rate": 1.0143288084464557e-06, |
|
"loss": 0.2474, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 2.8506787330316743, |
|
"grad_norm": 5.502429962158203, |
|
"learning_rate": 9.954751131221719e-07, |
|
"loss": 0.2381, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 2.853506787330317, |
|
"grad_norm": 5.186315536499023, |
|
"learning_rate": 9.766214177978885e-07, |
|
"loss": 0.2141, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 2.8563348416289593, |
|
"grad_norm": 11.375553131103516, |
|
"learning_rate": 9.57767722473605e-07, |
|
"loss": 0.2459, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.859162895927602, |
|
"grad_norm": 4.658810615539551, |
|
"learning_rate": 9.389140271493213e-07, |
|
"loss": 0.1952, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 2.8619909502262444, |
|
"grad_norm": 2.7533957958221436, |
|
"learning_rate": 9.200603318250378e-07, |
|
"loss": 0.2113, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 2.864819004524887, |
|
"grad_norm": 2.1169681549072266, |
|
"learning_rate": 9.012066365007542e-07, |
|
"loss": 0.198, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 2.8676470588235294, |
|
"grad_norm": 5.239007472991943, |
|
"learning_rate": 8.823529411764707e-07, |
|
"loss": 0.1799, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 2.870475113122172, |
|
"grad_norm": 9.836233139038086, |
|
"learning_rate": 8.634992458521871e-07, |
|
"loss": 0.2345, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 2.8733031674208145, |
|
"grad_norm": 2.392709970474243, |
|
"learning_rate": 8.446455505279036e-07, |
|
"loss": 0.2205, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 2.876131221719457, |
|
"grad_norm": 2.652374267578125, |
|
"learning_rate": 8.257918552036199e-07, |
|
"loss": 0.155, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 2.8789592760180995, |
|
"grad_norm": 8.32434368133545, |
|
"learning_rate": 8.069381598793364e-07, |
|
"loss": 0.2262, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 2.881787330316742, |
|
"grad_norm": 5.847408771514893, |
|
"learning_rate": 7.880844645550528e-07, |
|
"loss": 0.2315, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"grad_norm": 2.832589864730835, |
|
"learning_rate": 7.692307692307694e-07, |
|
"loss": 0.2618, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.887443438914027, |
|
"grad_norm": 4.295781135559082, |
|
"learning_rate": 7.503770739064857e-07, |
|
"loss": 0.2084, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 2.8902714932126696, |
|
"grad_norm": 5.640412330627441, |
|
"learning_rate": 7.315233785822022e-07, |
|
"loss": 0.2891, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 2.893099547511312, |
|
"grad_norm": 7.115928649902344, |
|
"learning_rate": 7.126696832579186e-07, |
|
"loss": 0.2689, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 2.8959276018099547, |
|
"grad_norm": 4.832301139831543, |
|
"learning_rate": 6.938159879336351e-07, |
|
"loss": 0.1793, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 2.898755656108597, |
|
"grad_norm": 5.678529262542725, |
|
"learning_rate": 6.749622926093515e-07, |
|
"loss": 0.1599, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 2.9015837104072397, |
|
"grad_norm": 6.394534587860107, |
|
"learning_rate": 6.56108597285068e-07, |
|
"loss": 0.2382, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 2.9044117647058822, |
|
"grad_norm": 5.185941219329834, |
|
"learning_rate": 6.372549019607843e-07, |
|
"loss": 0.1801, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 2.9072398190045248, |
|
"grad_norm": 3.3339009284973145, |
|
"learning_rate": 6.184012066365008e-07, |
|
"loss": 0.2108, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 2.9100678733031673, |
|
"grad_norm": 4.131908416748047, |
|
"learning_rate": 5.995475113122173e-07, |
|
"loss": 0.1994, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 2.91289592760181, |
|
"grad_norm": 5.131499290466309, |
|
"learning_rate": 5.806938159879337e-07, |
|
"loss": 0.2334, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.9157239819004523, |
|
"grad_norm": 6.0886712074279785, |
|
"learning_rate": 5.618401206636501e-07, |
|
"loss": 0.1895, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 2.918552036199095, |
|
"grad_norm": 6.050991058349609, |
|
"learning_rate": 5.429864253393665e-07, |
|
"loss": 0.2218, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 2.9213800904977374, |
|
"grad_norm": 5.902265548706055, |
|
"learning_rate": 5.24132730015083e-07, |
|
"loss": 0.2539, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 2.9242081447963804, |
|
"grad_norm": 2.757305860519409, |
|
"learning_rate": 5.052790346907994e-07, |
|
"loss": 0.1548, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 2.9270361990950224, |
|
"grad_norm": 2.187263011932373, |
|
"learning_rate": 4.864253393665158e-07, |
|
"loss": 0.164, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 2.9298642533936654, |
|
"grad_norm": 6.808703899383545, |
|
"learning_rate": 4.675716440422323e-07, |
|
"loss": 0.2073, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 2.9326923076923075, |
|
"grad_norm": 7.97061014175415, |
|
"learning_rate": 4.4871794871794876e-07, |
|
"loss": 0.2322, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 2.9355203619909505, |
|
"grad_norm": 6.372758865356445, |
|
"learning_rate": 4.298642533936652e-07, |
|
"loss": 0.2257, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 2.9383484162895925, |
|
"grad_norm": 6.094609260559082, |
|
"learning_rate": 4.110105580693816e-07, |
|
"loss": 0.2583, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 2.3653512001037598, |
|
"learning_rate": 3.921568627450981e-07, |
|
"loss": 0.2137, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.9440045248868776, |
|
"grad_norm": 2.020627737045288, |
|
"learning_rate": 3.733031674208145e-07, |
|
"loss": 0.1286, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 2.9468325791855206, |
|
"grad_norm": 8.650784492492676, |
|
"learning_rate": 3.5444947209653094e-07, |
|
"loss": 0.2233, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 2.9496606334841626, |
|
"grad_norm": 4.553081512451172, |
|
"learning_rate": 3.355957767722474e-07, |
|
"loss": 0.1869, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 2.9524886877828056, |
|
"grad_norm": 3.9334750175476074, |
|
"learning_rate": 3.167420814479638e-07, |
|
"loss": 0.2109, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 2.955316742081448, |
|
"grad_norm": 10.762858390808105, |
|
"learning_rate": 2.978883861236803e-07, |
|
"loss": 0.2151, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 2.9581447963800906, |
|
"grad_norm": 6.37054967880249, |
|
"learning_rate": 2.790346907993967e-07, |
|
"loss": 0.2099, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 2.960972850678733, |
|
"grad_norm": 4.642254829406738, |
|
"learning_rate": 2.6018099547511317e-07, |
|
"loss": 0.1753, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 2.9638009049773757, |
|
"grad_norm": 4.995135307312012, |
|
"learning_rate": 2.4132730015082957e-07, |
|
"loss": 0.1708, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 2.966628959276018, |
|
"grad_norm": 5.875439643859863, |
|
"learning_rate": 2.2247360482654603e-07, |
|
"loss": 0.2445, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 2.9694570135746607, |
|
"grad_norm": 0.8066132664680481, |
|
"learning_rate": 2.0361990950226246e-07, |
|
"loss": 0.1628, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.9722850678733033, |
|
"grad_norm": 9.494747161865234, |
|
"learning_rate": 1.847662141779789e-07, |
|
"loss": 0.1322, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 2.975113122171946, |
|
"grad_norm": 6.8470001220703125, |
|
"learning_rate": 1.6591251885369535e-07, |
|
"loss": 0.2345, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 2.9779411764705883, |
|
"grad_norm": 5.916505813598633, |
|
"learning_rate": 1.4705882352941178e-07, |
|
"loss": 0.2001, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 2.980769230769231, |
|
"grad_norm": 6.173225402832031, |
|
"learning_rate": 1.282051282051282e-07, |
|
"loss": 0.2196, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 2.9835972850678734, |
|
"grad_norm": 4.780458927154541, |
|
"learning_rate": 1.0935143288084465e-07, |
|
"loss": 0.2821, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 2.986425339366516, |
|
"grad_norm": 2.4266226291656494, |
|
"learning_rate": 9.04977375565611e-08, |
|
"loss": 0.2189, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 2.9892533936651584, |
|
"grad_norm": 3.3179211616516113, |
|
"learning_rate": 7.164404223227753e-08, |
|
"loss": 0.2647, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 2.992081447963801, |
|
"grad_norm": 4.998234272003174, |
|
"learning_rate": 5.279034690799398e-08, |
|
"loss": 0.1746, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 2.9949095022624435, |
|
"grad_norm": 5.929104804992676, |
|
"learning_rate": 3.393665158371041e-08, |
|
"loss": 0.1786, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 2.997737556561086, |
|
"grad_norm": 4.823156356811523, |
|
"learning_rate": 1.5082956259426848e-08, |
|
"loss": 0.217, |
|
"step": 10600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10608, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.132054385068024e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|
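
The object above is a trainer_state.json as written by the Hugging Face transformers Trainer: loss is logged every 10 optimizer steps ("logging_steps": 10), and the learning rate decays linearly to zero over the 10608 max_steps. The following is a minimal, standard-library-only sketch of how one might inspect such a file; the path "trainer_state.json" is an assumption, adjust it to wherever the file actually lives.

import json

# Load the Trainer state (path is an assumption for this sketch).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only log entries that carry a training loss
# (eval entries, if any, would lack the "loss" key).
logs = [e for e in state["log_history"] if "loss" in e]

print(f"max_steps={state['max_steps']}, epochs={state['num_train_epochs']}")
print(f"first logged loss: {logs[0]['loss']:.4f} at step {logs[0]['step']}")
print(f"last logged loss:  {logs[-1]['loss']:.4f} at step {logs[-1]['step']}")

# Crude smoothing: average the loss over the last 10 logged points,
# since per-step values are noisy.
tail = [e["loss"] for e in logs[-10:]]
print(f"mean loss over final {len(tail)} logs: {sum(tail) / len(tail):.4f}")

On the entries above, the smoothed final training loss comes out to roughly 0.21, versus about 1.56 at the first logged step.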