{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 10608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002828054298642534, "grad_norm": 2.258908748626709, "learning_rate": 1.9981146304675718e-05, "loss": 1.5648, "step": 10 }, { "epoch": 0.005656108597285068, "grad_norm": 2.3489885330200195, "learning_rate": 1.9962292609351435e-05, "loss": 1.2587, "step": 20 }, { "epoch": 0.008484162895927601, "grad_norm": 3.1505980491638184, "learning_rate": 1.994343891402715e-05, "loss": 0.996, "step": 30 }, { "epoch": 0.011312217194570135, "grad_norm": 2.7015912532806396, "learning_rate": 1.9924585218702868e-05, "loss": 0.991, "step": 40 }, { "epoch": 0.01414027149321267, "grad_norm": 2.6314280033111572, "learning_rate": 1.9905731523378585e-05, "loss": 0.934, "step": 50 }, { "epoch": 0.016968325791855202, "grad_norm": 2.055577516555786, "learning_rate": 1.98868778280543e-05, "loss": 0.8245, "step": 60 }, { "epoch": 0.019796380090497737, "grad_norm": 2.4679293632507324, "learning_rate": 1.9868024132730018e-05, "loss": 0.7501, "step": 70 }, { "epoch": 0.02262443438914027, "grad_norm": 4.737666130065918, "learning_rate": 1.9849170437405735e-05, "loss": 0.7572, "step": 80 }, { "epoch": 0.025452488687782805, "grad_norm": 3.135712146759033, "learning_rate": 1.983031674208145e-05, "loss": 0.6785, "step": 90 }, { "epoch": 0.02828054298642534, "grad_norm": 3.1648664474487305, "learning_rate": 1.9811463046757168e-05, "loss": 0.6764, "step": 100 }, { "epoch": 0.031108597285067874, "grad_norm": 2.4548099040985107, "learning_rate": 1.9792609351432884e-05, "loss": 0.6322, "step": 110 }, { "epoch": 0.033936651583710405, "grad_norm": 2.9620838165283203, "learning_rate": 1.97737556561086e-05, "loss": 0.6756, "step": 120 }, { "epoch": 0.03676470588235294, "grad_norm": 6.274707794189453, "learning_rate": 1.9754901960784318e-05, "loss": 0.6566, "step": 130 }, { "epoch": 0.03959276018099547, "grad_norm": 4.661096096038818, "learning_rate": 1.9736048265460034e-05, "loss": 0.6179, "step": 140 }, { "epoch": 0.04242081447963801, "grad_norm": 3.97485613822937, "learning_rate": 1.971719457013575e-05, "loss": 0.5623, "step": 150 }, { "epoch": 0.04524886877828054, "grad_norm": 5.157923698425293, "learning_rate": 1.9698340874811464e-05, "loss": 0.5132, "step": 160 }, { "epoch": 0.04807692307692308, "grad_norm": 4.4768385887146, "learning_rate": 1.967948717948718e-05, "loss": 0.5081, "step": 170 }, { "epoch": 0.05090497737556561, "grad_norm": 4.282762050628662, "learning_rate": 1.9660633484162897e-05, "loss": 0.5104, "step": 180 }, { "epoch": 0.05373303167420815, "grad_norm": 5.214530944824219, "learning_rate": 1.9641779788838614e-05, "loss": 0.6005, "step": 190 }, { "epoch": 0.05656108597285068, "grad_norm": 2.7484629154205322, "learning_rate": 1.962292609351433e-05, "loss": 0.5388, "step": 200 }, { "epoch": 0.05938914027149321, "grad_norm": 3.6074981689453125, "learning_rate": 1.9604072398190047e-05, "loss": 0.512, "step": 210 }, { "epoch": 0.06221719457013575, "grad_norm": 5.487575054168701, "learning_rate": 1.9585218702865764e-05, "loss": 0.5233, "step": 220 }, { "epoch": 0.06504524886877829, "grad_norm": 3.193223714828491, "learning_rate": 1.956636500754148e-05, "loss": 0.5091, "step": 230 }, { "epoch": 0.06787330316742081, "grad_norm": 8.64730167388916, "learning_rate": 1.9547511312217197e-05, "loss": 0.472, "step": 240 }, { "epoch": 0.07070135746606335, "grad_norm": 3.8944942951202393, "learning_rate": 1.9528657616892914e-05, "loss": 0.5551, "step": 250 }, { "epoch": 0.07352941176470588, "grad_norm": 3.6671645641326904, "learning_rate": 1.950980392156863e-05, "loss": 0.5575, "step": 260 }, { "epoch": 0.07635746606334842, "grad_norm": 5.698751449584961, "learning_rate": 1.9490950226244343e-05, "loss": 0.4495, "step": 270 }, { "epoch": 0.07918552036199095, "grad_norm": 6.565758228302002, "learning_rate": 1.947209653092006e-05, "loss": 0.5017, "step": 280 }, { "epoch": 0.08201357466063348, "grad_norm": 4.61959171295166, "learning_rate": 1.9453242835595777e-05, "loss": 0.4639, "step": 290 }, { "epoch": 0.08484162895927602, "grad_norm": 4.656176567077637, "learning_rate": 1.9434389140271493e-05, "loss": 0.4566, "step": 300 }, { "epoch": 0.08766968325791855, "grad_norm": 3.0015969276428223, "learning_rate": 1.941553544494721e-05, "loss": 0.5226, "step": 310 }, { "epoch": 0.09049773755656108, "grad_norm": 3.7588982582092285, "learning_rate": 1.9396681749622927e-05, "loss": 0.4415, "step": 320 }, { "epoch": 0.09332579185520362, "grad_norm": 8.415599822998047, "learning_rate": 1.9377828054298643e-05, "loss": 0.4992, "step": 330 }, { "epoch": 0.09615384615384616, "grad_norm": 2.917985677719116, "learning_rate": 1.935897435897436e-05, "loss": 0.392, "step": 340 }, { "epoch": 0.09898190045248868, "grad_norm": 3.867098093032837, "learning_rate": 1.9340120663650076e-05, "loss": 0.4382, "step": 350 }, { "epoch": 0.10180995475113122, "grad_norm": 3.6737847328186035, "learning_rate": 1.9321266968325793e-05, "loss": 0.4458, "step": 360 }, { "epoch": 0.10463800904977376, "grad_norm": 3.9890048503875732, "learning_rate": 1.930241327300151e-05, "loss": 0.4038, "step": 370 }, { "epoch": 0.1074660633484163, "grad_norm": 7.732723236083984, "learning_rate": 1.9283559577677226e-05, "loss": 0.4541, "step": 380 }, { "epoch": 0.11029411764705882, "grad_norm": 7.815601348876953, "learning_rate": 1.9264705882352943e-05, "loss": 0.5302, "step": 390 }, { "epoch": 0.11312217194570136, "grad_norm": 4.053082466125488, "learning_rate": 1.924585218702866e-05, "loss": 0.4203, "step": 400 }, { "epoch": 0.1159502262443439, "grad_norm": 5.627740859985352, "learning_rate": 1.9226998491704376e-05, "loss": 0.4473, "step": 410 }, { "epoch": 0.11877828054298642, "grad_norm": 5.07185697555542, "learning_rate": 1.9208144796380093e-05, "loss": 0.4345, "step": 420 }, { "epoch": 0.12160633484162896, "grad_norm": 3.9544167518615723, "learning_rate": 1.918929110105581e-05, "loss": 0.4849, "step": 430 }, { "epoch": 0.1244343891402715, "grad_norm": 6.9721293449401855, "learning_rate": 1.9170437405731526e-05, "loss": 0.4029, "step": 440 }, { "epoch": 0.12726244343891402, "grad_norm": 4.0150556564331055, "learning_rate": 1.9151583710407243e-05, "loss": 0.5236, "step": 450 }, { "epoch": 0.13009049773755657, "grad_norm": 6.681572914123535, "learning_rate": 1.913273001508296e-05, "loss": 0.4059, "step": 460 }, { "epoch": 0.1329185520361991, "grad_norm": 5.1267876625061035, "learning_rate": 1.9113876319758676e-05, "loss": 0.467, "step": 470 }, { "epoch": 0.13574660633484162, "grad_norm": 3.66207218170166, "learning_rate": 1.9095022624434392e-05, "loss": 0.3689, "step": 480 }, { "epoch": 0.13857466063348417, "grad_norm": 4.016237735748291, "learning_rate": 1.907616892911011e-05, "loss": 0.4502, "step": 490 }, { "epoch": 0.1414027149321267, "grad_norm": 5.003229141235352, "learning_rate": 1.9057315233785822e-05, "loss": 0.3935, "step": 500 }, { "epoch": 0.14423076923076922, "grad_norm": 7.917128562927246, "learning_rate": 1.903846153846154e-05, "loss": 0.4765, "step": 510 }, { "epoch": 0.14705882352941177, "grad_norm": 4.975776672363281, "learning_rate": 1.9019607843137255e-05, "loss": 0.4564, "step": 520 }, { "epoch": 0.1498868778280543, "grad_norm": 5.559664726257324, "learning_rate": 1.9000754147812972e-05, "loss": 0.3663, "step": 530 }, { "epoch": 0.15271493212669685, "grad_norm": 4.463156700134277, "learning_rate": 1.898190045248869e-05, "loss": 0.358, "step": 540 }, { "epoch": 0.15554298642533937, "grad_norm": 4.642306327819824, "learning_rate": 1.8963046757164405e-05, "loss": 0.4083, "step": 550 }, { "epoch": 0.1583710407239819, "grad_norm": 10.270988464355469, "learning_rate": 1.8944193061840122e-05, "loss": 0.5147, "step": 560 }, { "epoch": 0.16119909502262444, "grad_norm": 4.234387397766113, "learning_rate": 1.892533936651584e-05, "loss": 0.5115, "step": 570 }, { "epoch": 0.16402714932126697, "grad_norm": 5.710938930511475, "learning_rate": 1.8906485671191555e-05, "loss": 0.4483, "step": 580 }, { "epoch": 0.1668552036199095, "grad_norm": 8.131850242614746, "learning_rate": 1.8887631975867272e-05, "loss": 0.4282, "step": 590 }, { "epoch": 0.16968325791855204, "grad_norm": 4.962357997894287, "learning_rate": 1.886877828054299e-05, "loss": 0.4691, "step": 600 }, { "epoch": 0.17251131221719457, "grad_norm": 7.434023857116699, "learning_rate": 1.8849924585218705e-05, "loss": 0.3864, "step": 610 }, { "epoch": 0.1753393665158371, "grad_norm": 3.2809391021728516, "learning_rate": 1.883107088989442e-05, "loss": 0.3506, "step": 620 }, { "epoch": 0.17816742081447964, "grad_norm": 4.324213027954102, "learning_rate": 1.8812217194570138e-05, "loss": 0.3316, "step": 630 }, { "epoch": 0.18099547511312217, "grad_norm": 4.655824661254883, "learning_rate": 1.8793363499245855e-05, "loss": 0.4192, "step": 640 }, { "epoch": 0.18382352941176472, "grad_norm": 3.21587872505188, "learning_rate": 1.877450980392157e-05, "loss": 0.3091, "step": 650 }, { "epoch": 0.18665158371040724, "grad_norm": 5.112785339355469, "learning_rate": 1.8755656108597288e-05, "loss": 0.4218, "step": 660 }, { "epoch": 0.18947963800904977, "grad_norm": 2.902008533477783, "learning_rate": 1.8736802413273005e-05, "loss": 0.3595, "step": 670 }, { "epoch": 0.19230769230769232, "grad_norm": 4.45237398147583, "learning_rate": 1.8717948717948718e-05, "loss": 0.4378, "step": 680 }, { "epoch": 0.19513574660633484, "grad_norm": 3.8630735874176025, "learning_rate": 1.8699095022624435e-05, "loss": 0.4072, "step": 690 }, { "epoch": 0.19796380090497737, "grad_norm": 3.8529324531555176, "learning_rate": 1.868024132730015e-05, "loss": 0.4335, "step": 700 }, { "epoch": 0.20079185520361992, "grad_norm": 2.7820119857788086, "learning_rate": 1.8661387631975868e-05, "loss": 0.4457, "step": 710 }, { "epoch": 0.20361990950226244, "grad_norm": 7.398665428161621, "learning_rate": 1.8642533936651584e-05, "loss": 0.4552, "step": 720 }, { "epoch": 0.20644796380090497, "grad_norm": 5.17724609375, "learning_rate": 1.86236802413273e-05, "loss": 0.4322, "step": 730 }, { "epoch": 0.20927601809954752, "grad_norm": 7.6877851486206055, "learning_rate": 1.8604826546003018e-05, "loss": 0.3493, "step": 740 }, { "epoch": 0.21210407239819004, "grad_norm": 3.655296802520752, "learning_rate": 1.8585972850678734e-05, "loss": 0.4361, "step": 750 }, { "epoch": 0.2149321266968326, "grad_norm": 5.219052791595459, "learning_rate": 1.856711915535445e-05, "loss": 0.391, "step": 760 }, { "epoch": 0.21776018099547512, "grad_norm": 7.211420059204102, "learning_rate": 1.8548265460030168e-05, "loss": 0.3808, "step": 770 }, { "epoch": 0.22058823529411764, "grad_norm": 3.348724842071533, "learning_rate": 1.8529411764705884e-05, "loss": 0.4041, "step": 780 }, { "epoch": 0.2234162895927602, "grad_norm": 5.180337905883789, "learning_rate": 1.85105580693816e-05, "loss": 0.2996, "step": 790 }, { "epoch": 0.22624434389140272, "grad_norm": 5.770839214324951, "learning_rate": 1.8491704374057317e-05, "loss": 0.3898, "step": 800 }, { "epoch": 0.22907239819004524, "grad_norm": 2.4614834785461426, "learning_rate": 1.8472850678733034e-05, "loss": 0.3307, "step": 810 }, { "epoch": 0.2319004524886878, "grad_norm": 5.4143548011779785, "learning_rate": 1.845399698340875e-05, "loss": 0.4487, "step": 820 }, { "epoch": 0.23472850678733032, "grad_norm": 3.846611499786377, "learning_rate": 1.8435143288084464e-05, "loss": 0.4334, "step": 830 }, { "epoch": 0.23755656108597284, "grad_norm": 7.2528510093688965, "learning_rate": 1.841628959276018e-05, "loss": 0.3471, "step": 840 }, { "epoch": 0.2403846153846154, "grad_norm": 4.265413284301758, "learning_rate": 1.8397435897435897e-05, "loss": 0.3879, "step": 850 }, { "epoch": 0.24321266968325791, "grad_norm": 4.564918518066406, "learning_rate": 1.8378582202111614e-05, "loss": 0.4142, "step": 860 }, { "epoch": 0.24604072398190044, "grad_norm": 4.268716335296631, "learning_rate": 1.835972850678733e-05, "loss": 0.4358, "step": 870 }, { "epoch": 0.248868778280543, "grad_norm": 6.909433841705322, "learning_rate": 1.8340874811463047e-05, "loss": 0.405, "step": 880 }, { "epoch": 0.2516968325791855, "grad_norm": 6.801779270172119, "learning_rate": 1.8322021116138764e-05, "loss": 0.413, "step": 890 }, { "epoch": 0.25452488687782804, "grad_norm": 4.851901531219482, "learning_rate": 1.830316742081448e-05, "loss": 0.4337, "step": 900 }, { "epoch": 0.25735294117647056, "grad_norm": 2.650651693344116, "learning_rate": 1.8284313725490197e-05, "loss": 0.372, "step": 910 }, { "epoch": 0.26018099547511314, "grad_norm": 3.660430669784546, "learning_rate": 1.8265460030165913e-05, "loss": 0.4142, "step": 920 }, { "epoch": 0.26300904977375567, "grad_norm": 6.7838544845581055, "learning_rate": 1.824660633484163e-05, "loss": 0.4053, "step": 930 }, { "epoch": 0.2658371040723982, "grad_norm": 4.861571311950684, "learning_rate": 1.8227752639517347e-05, "loss": 0.4416, "step": 940 }, { "epoch": 0.2686651583710407, "grad_norm": 5.85540771484375, "learning_rate": 1.8208898944193063e-05, "loss": 0.3848, "step": 950 }, { "epoch": 0.27149321266968324, "grad_norm": 2.6535346508026123, "learning_rate": 1.819004524886878e-05, "loss": 0.4228, "step": 960 }, { "epoch": 0.2743212669683258, "grad_norm": 2.8000805377960205, "learning_rate": 1.8171191553544496e-05, "loss": 0.3476, "step": 970 }, { "epoch": 0.27714932126696834, "grad_norm": 5.302433013916016, "learning_rate": 1.8152337858220213e-05, "loss": 0.3536, "step": 980 }, { "epoch": 0.27997737556561086, "grad_norm": 4.889918327331543, "learning_rate": 1.813348416289593e-05, "loss": 0.3817, "step": 990 }, { "epoch": 0.2828054298642534, "grad_norm": 6.261002540588379, "learning_rate": 1.8114630467571646e-05, "loss": 0.3976, "step": 1000 }, { "epoch": 0.2828054298642534, "eval_accuracy": 0.8570633153038498, "eval_loss": 0.3707549571990967, "eval_runtime": 127.6312, "eval_samples_per_second": 98.503, "eval_steps_per_second": 3.079, "step": 1000 }, { "epoch": 0.2856334841628959, "grad_norm": 4.15647554397583, "learning_rate": 1.8095776772247363e-05, "loss": 0.3198, "step": 1010 }, { "epoch": 0.28846153846153844, "grad_norm": 3.652892589569092, "learning_rate": 1.807692307692308e-05, "loss": 0.376, "step": 1020 }, { "epoch": 0.291289592760181, "grad_norm": 7.219604015350342, "learning_rate": 1.8058069381598796e-05, "loss": 0.4519, "step": 1030 }, { "epoch": 0.29411764705882354, "grad_norm": 3.8920180797576904, "learning_rate": 1.8039215686274513e-05, "loss": 0.3336, "step": 1040 }, { "epoch": 0.29694570135746606, "grad_norm": 4.875617504119873, "learning_rate": 1.802036199095023e-05, "loss": 0.357, "step": 1050 }, { "epoch": 0.2997737556561086, "grad_norm": 3.1264288425445557, "learning_rate": 1.8001508295625946e-05, "loss": 0.3482, "step": 1060 }, { "epoch": 0.3026018099547511, "grad_norm": 3.4531030654907227, "learning_rate": 1.7982654600301663e-05, "loss": 0.3533, "step": 1070 }, { "epoch": 0.3054298642533937, "grad_norm": 3.0971388816833496, "learning_rate": 1.796380090497738e-05, "loss": 0.3737, "step": 1080 }, { "epoch": 0.3082579185520362, "grad_norm": 3.3527133464813232, "learning_rate": 1.7944947209653092e-05, "loss": 0.4425, "step": 1090 }, { "epoch": 0.31108597285067874, "grad_norm": 3.197056293487549, "learning_rate": 1.792609351432881e-05, "loss": 0.3657, "step": 1100 }, { "epoch": 0.31391402714932126, "grad_norm": 4.942928791046143, "learning_rate": 1.7907239819004526e-05, "loss": 0.3988, "step": 1110 }, { "epoch": 0.3167420814479638, "grad_norm": 4.839690208435059, "learning_rate": 1.7888386123680242e-05, "loss": 0.3794, "step": 1120 }, { "epoch": 0.3195701357466063, "grad_norm": 5.171438694000244, "learning_rate": 1.786953242835596e-05, "loss": 0.3795, "step": 1130 }, { "epoch": 0.3223981900452489, "grad_norm": 2.4731950759887695, "learning_rate": 1.7850678733031676e-05, "loss": 0.3396, "step": 1140 }, { "epoch": 0.3252262443438914, "grad_norm": 4.658932209014893, "learning_rate": 1.7831825037707392e-05, "loss": 0.2841, "step": 1150 }, { "epoch": 0.32805429864253394, "grad_norm": 3.5409414768218994, "learning_rate": 1.781297134238311e-05, "loss": 0.3701, "step": 1160 }, { "epoch": 0.33088235294117646, "grad_norm": 3.814213275909424, "learning_rate": 1.7794117647058825e-05, "loss": 0.3445, "step": 1170 }, { "epoch": 0.333710407239819, "grad_norm": 3.226147413253784, "learning_rate": 1.7775263951734542e-05, "loss": 0.3513, "step": 1180 }, { "epoch": 0.33653846153846156, "grad_norm": 4.451591491699219, "learning_rate": 1.775641025641026e-05, "loss": 0.3648, "step": 1190 }, { "epoch": 0.3393665158371041, "grad_norm": 4.0332818031311035, "learning_rate": 1.7737556561085972e-05, "loss": 0.4392, "step": 1200 }, { "epoch": 0.3421945701357466, "grad_norm": 3.1572704315185547, "learning_rate": 1.771870286576169e-05, "loss": 0.3602, "step": 1210 }, { "epoch": 0.34502262443438914, "grad_norm": 4.314695835113525, "learning_rate": 1.7699849170437405e-05, "loss": 0.29, "step": 1220 }, { "epoch": 0.34785067873303166, "grad_norm": 5.7975239753723145, "learning_rate": 1.768099547511312e-05, "loss": 0.3716, "step": 1230 }, { "epoch": 0.3506787330316742, "grad_norm": 5.377049446105957, "learning_rate": 1.7662141779788838e-05, "loss": 0.3566, "step": 1240 }, { "epoch": 0.35350678733031676, "grad_norm": 3.84669828414917, "learning_rate": 1.7643288084464555e-05, "loss": 0.3961, "step": 1250 }, { "epoch": 0.3563348416289593, "grad_norm": 5.146121501922607, "learning_rate": 1.762443438914027e-05, "loss": 0.4366, "step": 1260 }, { "epoch": 0.3591628959276018, "grad_norm": 3.1066689491271973, "learning_rate": 1.7605580693815988e-05, "loss": 0.3698, "step": 1270 }, { "epoch": 0.36199095022624433, "grad_norm": 4.1310296058654785, "learning_rate": 1.7586726998491705e-05, "loss": 0.3631, "step": 1280 }, { "epoch": 0.36481900452488686, "grad_norm": 3.0287930965423584, "learning_rate": 1.756787330316742e-05, "loss": 0.3151, "step": 1290 }, { "epoch": 0.36764705882352944, "grad_norm": 4.4270219802856445, "learning_rate": 1.7549019607843138e-05, "loss": 0.357, "step": 1300 }, { "epoch": 0.37047511312217196, "grad_norm": 4.785469055175781, "learning_rate": 1.7530165912518855e-05, "loss": 0.3809, "step": 1310 }, { "epoch": 0.3733031674208145, "grad_norm": 5.920436859130859, "learning_rate": 1.751131221719457e-05, "loss": 0.4003, "step": 1320 }, { "epoch": 0.376131221719457, "grad_norm": 5.400911331176758, "learning_rate": 1.7492458521870288e-05, "loss": 0.4313, "step": 1330 }, { "epoch": 0.37895927601809953, "grad_norm": 6.202630996704102, "learning_rate": 1.7473604826546004e-05, "loss": 0.3792, "step": 1340 }, { "epoch": 0.38178733031674206, "grad_norm": 3.413867473602295, "learning_rate": 1.745475113122172e-05, "loss": 0.3881, "step": 1350 }, { "epoch": 0.38461538461538464, "grad_norm": 5.005847930908203, "learning_rate": 1.7435897435897438e-05, "loss": 0.4344, "step": 1360 }, { "epoch": 0.38744343891402716, "grad_norm": 4.416658878326416, "learning_rate": 1.7417043740573154e-05, "loss": 0.332, "step": 1370 }, { "epoch": 0.3902714932126697, "grad_norm": 5.2433247566223145, "learning_rate": 1.739819004524887e-05, "loss": 0.3873, "step": 1380 }, { "epoch": 0.3930995475113122, "grad_norm": 3.740522861480713, "learning_rate": 1.7379336349924588e-05, "loss": 0.3495, "step": 1390 }, { "epoch": 0.39592760180995473, "grad_norm": 6.047609329223633, "learning_rate": 1.7360482654600304e-05, "loss": 0.3922, "step": 1400 }, { "epoch": 0.3987556561085973, "grad_norm": 3.7461910247802734, "learning_rate": 1.734162895927602e-05, "loss": 0.3556, "step": 1410 }, { "epoch": 0.40158371040723984, "grad_norm": 7.2883405685424805, "learning_rate": 1.7322775263951737e-05, "loss": 0.319, "step": 1420 }, { "epoch": 0.40441176470588236, "grad_norm": 5.338521480560303, "learning_rate": 1.7303921568627454e-05, "loss": 0.3272, "step": 1430 }, { "epoch": 0.4072398190045249, "grad_norm": 5.680319309234619, "learning_rate": 1.728506787330317e-05, "loss": 0.3336, "step": 1440 }, { "epoch": 0.4100678733031674, "grad_norm": 3.7183480262756348, "learning_rate": 1.7266214177978887e-05, "loss": 0.3218, "step": 1450 }, { "epoch": 0.41289592760180993, "grad_norm": 4.478979110717773, "learning_rate": 1.7247360482654604e-05, "loss": 0.3719, "step": 1460 }, { "epoch": 0.4157239819004525, "grad_norm": 3.1170661449432373, "learning_rate": 1.722850678733032e-05, "loss": 0.3563, "step": 1470 }, { "epoch": 0.41855203619909503, "grad_norm": 5.310198783874512, "learning_rate": 1.7209653092006037e-05, "loss": 0.3009, "step": 1480 }, { "epoch": 0.42138009049773756, "grad_norm": 4.134536266326904, "learning_rate": 1.7190799396681754e-05, "loss": 0.4116, "step": 1490 }, { "epoch": 0.4242081447963801, "grad_norm": 2.9341182708740234, "learning_rate": 1.7171945701357467e-05, "loss": 0.3547, "step": 1500 }, { "epoch": 0.4270361990950226, "grad_norm": 2.2353224754333496, "learning_rate": 1.7153092006033184e-05, "loss": 0.3652, "step": 1510 }, { "epoch": 0.4298642533936652, "grad_norm": 4.16320276260376, "learning_rate": 1.71342383107089e-05, "loss": 0.3733, "step": 1520 }, { "epoch": 0.4326923076923077, "grad_norm": 4.933135986328125, "learning_rate": 1.7115384615384617e-05, "loss": 0.4125, "step": 1530 }, { "epoch": 0.43552036199095023, "grad_norm": 5.511205673217773, "learning_rate": 1.7096530920060333e-05, "loss": 0.3102, "step": 1540 }, { "epoch": 0.43834841628959276, "grad_norm": 4.415884494781494, "learning_rate": 1.707767722473605e-05, "loss": 0.3348, "step": 1550 }, { "epoch": 0.4411764705882353, "grad_norm": 3.8917481899261475, "learning_rate": 1.7058823529411767e-05, "loss": 0.3866, "step": 1560 }, { "epoch": 0.4440045248868778, "grad_norm": 2.751532793045044, "learning_rate": 1.7039969834087483e-05, "loss": 0.3644, "step": 1570 }, { "epoch": 0.4468325791855204, "grad_norm": 5.6193413734436035, "learning_rate": 1.70211161387632e-05, "loss": 0.3566, "step": 1580 }, { "epoch": 0.4496606334841629, "grad_norm": 3.058835744857788, "learning_rate": 1.7002262443438916e-05, "loss": 0.3681, "step": 1590 }, { "epoch": 0.45248868778280543, "grad_norm": 3.9540457725524902, "learning_rate": 1.6983408748114633e-05, "loss": 0.3132, "step": 1600 }, { "epoch": 0.45531674208144796, "grad_norm": 3.8163225650787354, "learning_rate": 1.6964555052790346e-05, "loss": 0.3654, "step": 1610 }, { "epoch": 0.4581447963800905, "grad_norm": 4.724973201751709, "learning_rate": 1.6945701357466063e-05, "loss": 0.3434, "step": 1620 }, { "epoch": 0.46097285067873306, "grad_norm": 3.3608546257019043, "learning_rate": 1.692684766214178e-05, "loss": 0.3593, "step": 1630 }, { "epoch": 0.4638009049773756, "grad_norm": 4.132437705993652, "learning_rate": 1.6907993966817496e-05, "loss": 0.3552, "step": 1640 }, { "epoch": 0.4666289592760181, "grad_norm": 4.544163227081299, "learning_rate": 1.6889140271493213e-05, "loss": 0.3678, "step": 1650 }, { "epoch": 0.46945701357466063, "grad_norm": 4.244106769561768, "learning_rate": 1.687028657616893e-05, "loss": 0.3432, "step": 1660 }, { "epoch": 0.47228506787330315, "grad_norm": 3.3168179988861084, "learning_rate": 1.6851432880844646e-05, "loss": 0.313, "step": 1670 }, { "epoch": 0.4751131221719457, "grad_norm": 4.040717601776123, "learning_rate": 1.6832579185520363e-05, "loss": 0.3334, "step": 1680 }, { "epoch": 0.47794117647058826, "grad_norm": 4.582857608795166, "learning_rate": 1.681372549019608e-05, "loss": 0.3004, "step": 1690 }, { "epoch": 0.4807692307692308, "grad_norm": 6.330207347869873, "learning_rate": 1.6794871794871796e-05, "loss": 0.3486, "step": 1700 }, { "epoch": 0.4835972850678733, "grad_norm": 3.564183473587036, "learning_rate": 1.6776018099547512e-05, "loss": 0.3312, "step": 1710 }, { "epoch": 0.48642533936651583, "grad_norm": 5.753744125366211, "learning_rate": 1.675716440422323e-05, "loss": 0.4188, "step": 1720 }, { "epoch": 0.48925339366515835, "grad_norm": 2.692269802093506, "learning_rate": 1.6738310708898946e-05, "loss": 0.3149, "step": 1730 }, { "epoch": 0.4920814479638009, "grad_norm": 3.748378038406372, "learning_rate": 1.6719457013574662e-05, "loss": 0.374, "step": 1740 }, { "epoch": 0.49490950226244346, "grad_norm": 7.150949478149414, "learning_rate": 1.670060331825038e-05, "loss": 0.4427, "step": 1750 }, { "epoch": 0.497737556561086, "grad_norm": 4.332088470458984, "learning_rate": 1.6681749622926096e-05, "loss": 0.296, "step": 1760 }, { "epoch": 0.5005656108597285, "grad_norm": 3.9501969814300537, "learning_rate": 1.6662895927601812e-05, "loss": 0.3236, "step": 1770 }, { "epoch": 0.503393665158371, "grad_norm": 4.039945602416992, "learning_rate": 1.664404223227753e-05, "loss": 0.3654, "step": 1780 }, { "epoch": 0.5062217194570136, "grad_norm": 4.735800743103027, "learning_rate": 1.6625188536953245e-05, "loss": 0.3177, "step": 1790 }, { "epoch": 0.5090497737556561, "grad_norm": 3.796029806137085, "learning_rate": 1.6606334841628962e-05, "loss": 0.3302, "step": 1800 }, { "epoch": 0.5118778280542986, "grad_norm": 2.808561086654663, "learning_rate": 1.658748114630468e-05, "loss": 0.3533, "step": 1810 }, { "epoch": 0.5147058823529411, "grad_norm": 3.9006407260894775, "learning_rate": 1.6568627450980395e-05, "loss": 0.3862, "step": 1820 }, { "epoch": 0.5175339366515838, "grad_norm": 6.6023850440979, "learning_rate": 1.654977375565611e-05, "loss": 0.4115, "step": 1830 }, { "epoch": 0.5203619909502263, "grad_norm": 3.3932111263275146, "learning_rate": 1.6530920060331825e-05, "loss": 0.2936, "step": 1840 }, { "epoch": 0.5231900452488688, "grad_norm": 4.266836166381836, "learning_rate": 1.651206636500754e-05, "loss": 0.2848, "step": 1850 }, { "epoch": 0.5260180995475113, "grad_norm": 4.283823490142822, "learning_rate": 1.6493212669683258e-05, "loss": 0.4285, "step": 1860 }, { "epoch": 0.5288461538461539, "grad_norm": 3.3755383491516113, "learning_rate": 1.6474358974358975e-05, "loss": 0.3579, "step": 1870 }, { "epoch": 0.5316742081447964, "grad_norm": 5.754073143005371, "learning_rate": 1.645550527903469e-05, "loss": 0.3325, "step": 1880 }, { "epoch": 0.5345022624434389, "grad_norm": 2.890216588973999, "learning_rate": 1.6436651583710408e-05, "loss": 0.3866, "step": 1890 }, { "epoch": 0.5373303167420814, "grad_norm": 4.1960978507995605, "learning_rate": 1.6417797888386125e-05, "loss": 0.4097, "step": 1900 }, { "epoch": 0.540158371040724, "grad_norm": 4.490061283111572, "learning_rate": 1.639894419306184e-05, "loss": 0.3541, "step": 1910 }, { "epoch": 0.5429864253393665, "grad_norm": 2.911954879760742, "learning_rate": 1.6380090497737558e-05, "loss": 0.3435, "step": 1920 }, { "epoch": 0.545814479638009, "grad_norm": 2.816277027130127, "learning_rate": 1.6361236802413275e-05, "loss": 0.3168, "step": 1930 }, { "epoch": 0.5486425339366516, "grad_norm": 5.4081807136535645, "learning_rate": 1.634238310708899e-05, "loss": 0.404, "step": 1940 }, { "epoch": 0.5514705882352942, "grad_norm": 6.02499532699585, "learning_rate": 1.6323529411764708e-05, "loss": 0.4293, "step": 1950 }, { "epoch": 0.5542986425339367, "grad_norm": 5.138996124267578, "learning_rate": 1.6304675716440424e-05, "loss": 0.4194, "step": 1960 }, { "epoch": 0.5571266968325792, "grad_norm": 4.069638252258301, "learning_rate": 1.628582202111614e-05, "loss": 0.4188, "step": 1970 }, { "epoch": 0.5599547511312217, "grad_norm": 4.273077487945557, "learning_rate": 1.6266968325791858e-05, "loss": 0.3714, "step": 1980 }, { "epoch": 0.5627828054298643, "grad_norm": 3.559727430343628, "learning_rate": 1.6248114630467574e-05, "loss": 0.2804, "step": 1990 }, { "epoch": 0.5656108597285068, "grad_norm": 3.5052013397216797, "learning_rate": 1.622926093514329e-05, "loss": 0.375, "step": 2000 }, { "epoch": 0.5656108597285068, "eval_accuracy": 0.8721762647152402, "eval_loss": 0.32142505049705505, "eval_runtime": 126.2811, "eval_samples_per_second": 99.556, "eval_steps_per_second": 3.112, "step": 2000 }, { "epoch": 0.5684389140271493, "grad_norm": 3.033839464187622, "learning_rate": 1.6210407239819008e-05, "loss": 0.2655, "step": 2010 }, { "epoch": 0.5712669683257918, "grad_norm": 8.15062427520752, "learning_rate": 1.6191553544494724e-05, "loss": 0.282, "step": 2020 }, { "epoch": 0.5740950226244343, "grad_norm": 4.665267467498779, "learning_rate": 1.6172699849170437e-05, "loss": 0.3432, "step": 2030 }, { "epoch": 0.5769230769230769, "grad_norm": 5.122295379638672, "learning_rate": 1.6153846153846154e-05, "loss": 0.4143, "step": 2040 }, { "epoch": 0.5797511312217195, "grad_norm": 5.127368450164795, "learning_rate": 1.613499245852187e-05, "loss": 0.3529, "step": 2050 }, { "epoch": 0.582579185520362, "grad_norm": 4.725905418395996, "learning_rate": 1.6116138763197587e-05, "loss": 0.3102, "step": 2060 }, { "epoch": 0.5854072398190046, "grad_norm": 2.358879566192627, "learning_rate": 1.6097285067873304e-05, "loss": 0.4083, "step": 2070 }, { "epoch": 0.5882352941176471, "grad_norm": 4.624474048614502, "learning_rate": 1.607843137254902e-05, "loss": 0.3742, "step": 2080 }, { "epoch": 0.5910633484162896, "grad_norm": 3.6771047115325928, "learning_rate": 1.6059577677224737e-05, "loss": 0.3705, "step": 2090 }, { "epoch": 0.5938914027149321, "grad_norm": 3.136711359024048, "learning_rate": 1.6040723981900454e-05, "loss": 0.3365, "step": 2100 }, { "epoch": 0.5967194570135747, "grad_norm": 4.1188883781433105, "learning_rate": 1.602187028657617e-05, "loss": 0.3138, "step": 2110 }, { "epoch": 0.5995475113122172, "grad_norm": 2.472294569015503, "learning_rate": 1.6003016591251887e-05, "loss": 0.2888, "step": 2120 }, { "epoch": 0.6023755656108597, "grad_norm": 3.7209103107452393, "learning_rate": 1.5984162895927604e-05, "loss": 0.3057, "step": 2130 }, { "epoch": 0.6052036199095022, "grad_norm": 3.5798637866973877, "learning_rate": 1.596530920060332e-05, "loss": 0.3481, "step": 2140 }, { "epoch": 0.6080316742081447, "grad_norm": 3.1317641735076904, "learning_rate": 1.5946455505279037e-05, "loss": 0.2694, "step": 2150 }, { "epoch": 0.6108597285067874, "grad_norm": 3.438688278198242, "learning_rate": 1.592760180995475e-05, "loss": 0.338, "step": 2160 }, { "epoch": 0.6136877828054299, "grad_norm": 2.2631101608276367, "learning_rate": 1.5908748114630467e-05, "loss": 0.4032, "step": 2170 }, { "epoch": 0.6165158371040724, "grad_norm": 3.2705330848693848, "learning_rate": 1.5889894419306183e-05, "loss": 0.3878, "step": 2180 }, { "epoch": 0.619343891402715, "grad_norm": 5.617705821990967, "learning_rate": 1.58710407239819e-05, "loss": 0.3213, "step": 2190 }, { "epoch": 0.6221719457013575, "grad_norm": 5.0493550300598145, "learning_rate": 1.5852187028657616e-05, "loss": 0.3606, "step": 2200 }, { "epoch": 0.625, "grad_norm": 2.885690689086914, "learning_rate": 1.5833333333333333e-05, "loss": 0.3232, "step": 2210 }, { "epoch": 0.6278280542986425, "grad_norm": 2.4986419677734375, "learning_rate": 1.581447963800905e-05, "loss": 0.3318, "step": 2220 }, { "epoch": 0.630656108597285, "grad_norm": 3.8310494422912598, "learning_rate": 1.5795625942684766e-05, "loss": 0.3241, "step": 2230 }, { "epoch": 0.6334841628959276, "grad_norm": 4.589399337768555, "learning_rate": 1.5776772247360483e-05, "loss": 0.3537, "step": 2240 }, { "epoch": 0.6363122171945701, "grad_norm": 3.939833164215088, "learning_rate": 1.57579185520362e-05, "loss": 0.3665, "step": 2250 }, { "epoch": 0.6391402714932126, "grad_norm": 3.5939204692840576, "learning_rate": 1.5739064856711916e-05, "loss": 0.3462, "step": 2260 }, { "epoch": 0.6419683257918553, "grad_norm": 4.346156597137451, "learning_rate": 1.5720211161387633e-05, "loss": 0.3887, "step": 2270 }, { "epoch": 0.6447963800904978, "grad_norm": 4.5238165855407715, "learning_rate": 1.570135746606335e-05, "loss": 0.269, "step": 2280 }, { "epoch": 0.6476244343891403, "grad_norm": 4.225012302398682, "learning_rate": 1.5682503770739066e-05, "loss": 0.3346, "step": 2290 }, { "epoch": 0.6504524886877828, "grad_norm": 5.076806545257568, "learning_rate": 1.5663650075414783e-05, "loss": 0.4031, "step": 2300 }, { "epoch": 0.6532805429864253, "grad_norm": 5.921730041503906, "learning_rate": 1.56447963800905e-05, "loss": 0.3348, "step": 2310 }, { "epoch": 0.6561085972850679, "grad_norm": 5.128915309906006, "learning_rate": 1.5625942684766216e-05, "loss": 0.3626, "step": 2320 }, { "epoch": 0.6589366515837104, "grad_norm": 3.5405006408691406, "learning_rate": 1.5607088989441932e-05, "loss": 0.3396, "step": 2330 }, { "epoch": 0.6617647058823529, "grad_norm": 5.166226863861084, "learning_rate": 1.558823529411765e-05, "loss": 0.3281, "step": 2340 }, { "epoch": 0.6645927601809954, "grad_norm": 3.0114974975585938, "learning_rate": 1.5569381598793366e-05, "loss": 0.3495, "step": 2350 }, { "epoch": 0.667420814479638, "grad_norm": 4.730240345001221, "learning_rate": 1.5550527903469082e-05, "loss": 0.3775, "step": 2360 }, { "epoch": 0.6702488687782805, "grad_norm": 6.134402275085449, "learning_rate": 1.55316742081448e-05, "loss": 0.408, "step": 2370 }, { "epoch": 0.6730769230769231, "grad_norm": 8.204373359680176, "learning_rate": 1.5512820512820516e-05, "loss": 0.3215, "step": 2380 }, { "epoch": 0.6759049773755657, "grad_norm": 5.2875895500183105, "learning_rate": 1.5493966817496232e-05, "loss": 0.3004, "step": 2390 }, { "epoch": 0.6787330316742082, "grad_norm": 4.722002029418945, "learning_rate": 1.547511312217195e-05, "loss": 0.3495, "step": 2400 }, { "epoch": 0.6815610859728507, "grad_norm": 3.777385711669922, "learning_rate": 1.5456259426847665e-05, "loss": 0.314, "step": 2410 }, { "epoch": 0.6843891402714932, "grad_norm": 4.804584503173828, "learning_rate": 1.5437405731523382e-05, "loss": 0.302, "step": 2420 }, { "epoch": 0.6872171945701357, "grad_norm": 1.9814542531967163, "learning_rate": 1.54185520361991e-05, "loss": 0.32, "step": 2430 }, { "epoch": 0.6900452488687783, "grad_norm": 4.671655178070068, "learning_rate": 1.5399698340874812e-05, "loss": 0.3471, "step": 2440 }, { "epoch": 0.6928733031674208, "grad_norm": 4.3465776443481445, "learning_rate": 1.538084464555053e-05, "loss": 0.4079, "step": 2450 }, { "epoch": 0.6957013574660633, "grad_norm": 5.087115287780762, "learning_rate": 1.5361990950226245e-05, "loss": 0.2957, "step": 2460 }, { "epoch": 0.6985294117647058, "grad_norm": 4.124098777770996, "learning_rate": 1.5343137254901962e-05, "loss": 0.3573, "step": 2470 }, { "epoch": 0.7013574660633484, "grad_norm": 4.266404628753662, "learning_rate": 1.532428355957768e-05, "loss": 0.3054, "step": 2480 }, { "epoch": 0.704185520361991, "grad_norm": 3.325258731842041, "learning_rate": 1.5305429864253395e-05, "loss": 0.3675, "step": 2490 }, { "epoch": 0.7070135746606335, "grad_norm": 2.9218814373016357, "learning_rate": 1.528657616892911e-05, "loss": 0.3175, "step": 2500 }, { "epoch": 0.709841628959276, "grad_norm": 4.399160385131836, "learning_rate": 1.5267722473604828e-05, "loss": 0.3239, "step": 2510 }, { "epoch": 0.7126696832579186, "grad_norm": 4.460221290588379, "learning_rate": 1.5248868778280543e-05, "loss": 0.3827, "step": 2520 }, { "epoch": 0.7154977375565611, "grad_norm": 3.0739834308624268, "learning_rate": 1.523001508295626e-05, "loss": 0.3725, "step": 2530 }, { "epoch": 0.7183257918552036, "grad_norm": 2.8812670707702637, "learning_rate": 1.5211161387631976e-05, "loss": 0.2495, "step": 2540 }, { "epoch": 0.7211538461538461, "grad_norm": 6.949345588684082, "learning_rate": 1.5192307692307693e-05, "loss": 0.3632, "step": 2550 }, { "epoch": 0.7239819004524887, "grad_norm": 3.124908685684204, "learning_rate": 1.517345399698341e-05, "loss": 0.3681, "step": 2560 }, { "epoch": 0.7268099547511312, "grad_norm": 4.435882091522217, "learning_rate": 1.5154600301659126e-05, "loss": 0.3279, "step": 2570 }, { "epoch": 0.7296380090497737, "grad_norm": 3.6505391597747803, "learning_rate": 1.5135746606334843e-05, "loss": 0.3487, "step": 2580 }, { "epoch": 0.7324660633484162, "grad_norm": 3.057103395462036, "learning_rate": 1.511689291101056e-05, "loss": 0.3086, "step": 2590 }, { "epoch": 0.7352941176470589, "grad_norm": 2.988297462463379, "learning_rate": 1.5098039215686276e-05, "loss": 0.2901, "step": 2600 }, { "epoch": 0.7381221719457014, "grad_norm": 8.121850967407227, "learning_rate": 1.5079185520361993e-05, "loss": 0.3656, "step": 2610 }, { "epoch": 0.7409502262443439, "grad_norm": 3.4862985610961914, "learning_rate": 1.506033182503771e-05, "loss": 0.3144, "step": 2620 }, { "epoch": 0.7437782805429864, "grad_norm": 2.3046765327453613, "learning_rate": 1.5041478129713424e-05, "loss": 0.2498, "step": 2630 }, { "epoch": 0.746606334841629, "grad_norm": 3.606008529663086, "learning_rate": 1.502262443438914e-05, "loss": 0.3449, "step": 2640 }, { "epoch": 0.7494343891402715, "grad_norm": 3.494842767715454, "learning_rate": 1.5003770739064857e-05, "loss": 0.2784, "step": 2650 }, { "epoch": 0.752262443438914, "grad_norm": 5.306181907653809, "learning_rate": 1.4984917043740574e-05, "loss": 0.2783, "step": 2660 }, { "epoch": 0.7550904977375565, "grad_norm": 3.1774604320526123, "learning_rate": 1.496606334841629e-05, "loss": 0.2216, "step": 2670 }, { "epoch": 0.7579185520361991, "grad_norm": 5.226140022277832, "learning_rate": 1.4947209653092007e-05, "loss": 0.386, "step": 2680 }, { "epoch": 0.7607466063348416, "grad_norm": 3.7945973873138428, "learning_rate": 1.4928355957767724e-05, "loss": 0.3213, "step": 2690 }, { "epoch": 0.7635746606334841, "grad_norm": 3.4387052059173584, "learning_rate": 1.490950226244344e-05, "loss": 0.3386, "step": 2700 }, { "epoch": 0.7664027149321267, "grad_norm": 3.023867607116699, "learning_rate": 1.4890648567119157e-05, "loss": 0.3507, "step": 2710 }, { "epoch": 0.7692307692307693, "grad_norm": 5.2512640953063965, "learning_rate": 1.4871794871794874e-05, "loss": 0.3141, "step": 2720 }, { "epoch": 0.7720588235294118, "grad_norm": 3.79915452003479, "learning_rate": 1.485294117647059e-05, "loss": 0.3441, "step": 2730 }, { "epoch": 0.7748868778280543, "grad_norm": 1.8601824045181274, "learning_rate": 1.4834087481146307e-05, "loss": 0.3662, "step": 2740 }, { "epoch": 0.7777149321266968, "grad_norm": 2.1231563091278076, "learning_rate": 1.4815233785822024e-05, "loss": 0.3365, "step": 2750 }, { "epoch": 0.7805429864253394, "grad_norm": 5.087540149688721, "learning_rate": 1.479638009049774e-05, "loss": 0.3339, "step": 2760 }, { "epoch": 0.7833710407239819, "grad_norm": 6.3354668617248535, "learning_rate": 1.4777526395173457e-05, "loss": 0.317, "step": 2770 }, { "epoch": 0.7861990950226244, "grad_norm": 3.519740581512451, "learning_rate": 1.4758672699849172e-05, "loss": 0.3237, "step": 2780 }, { "epoch": 0.7890271493212669, "grad_norm": 2.299184560775757, "learning_rate": 1.4739819004524888e-05, "loss": 0.289, "step": 2790 }, { "epoch": 0.7918552036199095, "grad_norm": 6.4508490562438965, "learning_rate": 1.4720965309200605e-05, "loss": 0.3214, "step": 2800 }, { "epoch": 0.794683257918552, "grad_norm": 1.989512324333191, "learning_rate": 1.4702111613876322e-05, "loss": 0.2949, "step": 2810 }, { "epoch": 0.7975113122171946, "grad_norm": 5.373081684112549, "learning_rate": 1.4683257918552036e-05, "loss": 0.3365, "step": 2820 }, { "epoch": 0.8003393665158371, "grad_norm": 2.989363193511963, "learning_rate": 1.4664404223227753e-05, "loss": 0.2495, "step": 2830 }, { "epoch": 0.8031674208144797, "grad_norm": 4.9633660316467285, "learning_rate": 1.464555052790347e-05, "loss": 0.2815, "step": 2840 }, { "epoch": 0.8059954751131222, "grad_norm": 6.031944274902344, "learning_rate": 1.4626696832579186e-05, "loss": 0.2632, "step": 2850 }, { "epoch": 0.8088235294117647, "grad_norm": 3.689105987548828, "learning_rate": 1.4607843137254903e-05, "loss": 0.3192, "step": 2860 }, { "epoch": 0.8116515837104072, "grad_norm": 3.926541805267334, "learning_rate": 1.458898944193062e-05, "loss": 0.3569, "step": 2870 }, { "epoch": 0.8144796380090498, "grad_norm": 4.753978252410889, "learning_rate": 1.4570135746606336e-05, "loss": 0.3957, "step": 2880 }, { "epoch": 0.8173076923076923, "grad_norm": 5.156829833984375, "learning_rate": 1.4551282051282051e-05, "loss": 0.3271, "step": 2890 }, { "epoch": 0.8201357466063348, "grad_norm": 3.1717522144317627, "learning_rate": 1.4532428355957768e-05, "loss": 0.3337, "step": 2900 }, { "epoch": 0.8229638009049773, "grad_norm": 2.9101099967956543, "learning_rate": 1.4513574660633484e-05, "loss": 0.3331, "step": 2910 }, { "epoch": 0.8257918552036199, "grad_norm": 3.014803171157837, "learning_rate": 1.4494720965309201e-05, "loss": 0.303, "step": 2920 }, { "epoch": 0.8286199095022625, "grad_norm": 6.68400764465332, "learning_rate": 1.4475867269984918e-05, "loss": 0.3414, "step": 2930 }, { "epoch": 0.831447963800905, "grad_norm": 4.039949417114258, "learning_rate": 1.4457013574660634e-05, "loss": 0.3107, "step": 2940 }, { "epoch": 0.8342760180995475, "grad_norm": 5.4491286277771, "learning_rate": 1.443815987933635e-05, "loss": 0.3094, "step": 2950 }, { "epoch": 0.8371040723981901, "grad_norm": 4.456144332885742, "learning_rate": 1.4419306184012067e-05, "loss": 0.3137, "step": 2960 }, { "epoch": 0.8399321266968326, "grad_norm": 3.682917594909668, "learning_rate": 1.4400452488687784e-05, "loss": 0.3221, "step": 2970 }, { "epoch": 0.8427601809954751, "grad_norm": 4.826881408691406, "learning_rate": 1.43815987933635e-05, "loss": 0.3828, "step": 2980 }, { "epoch": 0.8455882352941176, "grad_norm": 4.945711135864258, "learning_rate": 1.4362745098039217e-05, "loss": 0.2887, "step": 2990 }, { "epoch": 0.8484162895927602, "grad_norm": 6.562948226928711, "learning_rate": 1.4343891402714934e-05, "loss": 0.3222, "step": 3000 }, { "epoch": 0.8484162895927602, "eval_accuracy": 0.8736875596563792, "eval_loss": 0.32356539368629456, "eval_runtime": 126.3914, "eval_samples_per_second": 99.469, "eval_steps_per_second": 3.109, "step": 3000 }, { "epoch": 0.8512443438914027, "grad_norm": 5.110568523406982, "learning_rate": 1.432503770739065e-05, "loss": 0.3145, "step": 3010 }, { "epoch": 0.8540723981900452, "grad_norm": 4.4614081382751465, "learning_rate": 1.4306184012066367e-05, "loss": 0.3098, "step": 3020 }, { "epoch": 0.8569004524886877, "grad_norm": 3.0560202598571777, "learning_rate": 1.4287330316742084e-05, "loss": 0.3271, "step": 3030 }, { "epoch": 0.8597285067873304, "grad_norm": 5.284294605255127, "learning_rate": 1.4268476621417799e-05, "loss": 0.3461, "step": 3040 }, { "epoch": 0.8625565610859729, "grad_norm": 3.1206512451171875, "learning_rate": 1.4249622926093515e-05, "loss": 0.2705, "step": 3050 }, { "epoch": 0.8653846153846154, "grad_norm": 4.308442115783691, "learning_rate": 1.4230769230769232e-05, "loss": 0.3691, "step": 3060 }, { "epoch": 0.8682126696832579, "grad_norm": 2.637321710586548, "learning_rate": 1.4211915535444948e-05, "loss": 0.3388, "step": 3070 }, { "epoch": 0.8710407239819005, "grad_norm": 4.938368797302246, "learning_rate": 1.4193061840120665e-05, "loss": 0.288, "step": 3080 }, { "epoch": 0.873868778280543, "grad_norm": 4.269132137298584, "learning_rate": 1.4174208144796382e-05, "loss": 0.3347, "step": 3090 }, { "epoch": 0.8766968325791855, "grad_norm": 4.967940807342529, "learning_rate": 1.4155354449472098e-05, "loss": 0.318, "step": 3100 }, { "epoch": 0.879524886877828, "grad_norm": 4.122420787811279, "learning_rate": 1.4136500754147815e-05, "loss": 0.3139, "step": 3110 }, { "epoch": 0.8823529411764706, "grad_norm": 3.85040545463562, "learning_rate": 1.4117647058823532e-05, "loss": 0.3544, "step": 3120 }, { "epoch": 0.8851809954751131, "grad_norm": 5.834123134613037, "learning_rate": 1.4098793363499248e-05, "loss": 0.3717, "step": 3130 }, { "epoch": 0.8880090497737556, "grad_norm": 3.2261178493499756, "learning_rate": 1.4079939668174965e-05, "loss": 0.3071, "step": 3140 }, { "epoch": 0.8908371040723982, "grad_norm": 5.519437789916992, "learning_rate": 1.4061085972850678e-05, "loss": 0.2519, "step": 3150 }, { "epoch": 0.8936651583710408, "grad_norm": 2.5292046070098877, "learning_rate": 1.4042232277526395e-05, "loss": 0.3543, "step": 3160 }, { "epoch": 0.8964932126696833, "grad_norm": 3.737870454788208, "learning_rate": 1.4023378582202111e-05, "loss": 0.3036, "step": 3170 }, { "epoch": 0.8993212669683258, "grad_norm": 4.248650550842285, "learning_rate": 1.4004524886877828e-05, "loss": 0.321, "step": 3180 }, { "epoch": 0.9021493212669683, "grad_norm": 3.5133938789367676, "learning_rate": 1.3985671191553544e-05, "loss": 0.354, "step": 3190 }, { "epoch": 0.9049773755656109, "grad_norm": 2.819633722305298, "learning_rate": 1.3966817496229261e-05, "loss": 0.2789, "step": 3200 }, { "epoch": 0.9078054298642534, "grad_norm": 3.6485252380371094, "learning_rate": 1.3947963800904978e-05, "loss": 0.3348, "step": 3210 }, { "epoch": 0.9106334841628959, "grad_norm": 4.469762325286865, "learning_rate": 1.3929110105580694e-05, "loss": 0.2611, "step": 3220 }, { "epoch": 0.9134615384615384, "grad_norm": 5.00715970993042, "learning_rate": 1.3910256410256411e-05, "loss": 0.3701, "step": 3230 }, { "epoch": 0.916289592760181, "grad_norm": 3.802788734436035, "learning_rate": 1.3891402714932128e-05, "loss": 0.4052, "step": 3240 }, { "epoch": 0.9191176470588235, "grad_norm": 3.6908090114593506, "learning_rate": 1.3872549019607844e-05, "loss": 0.3163, "step": 3250 }, { "epoch": 0.9219457013574661, "grad_norm": 4.198665142059326, "learning_rate": 1.385369532428356e-05, "loss": 0.3004, "step": 3260 }, { "epoch": 0.9247737556561086, "grad_norm": 5.080460071563721, "learning_rate": 1.3834841628959277e-05, "loss": 0.3444, "step": 3270 }, { "epoch": 0.9276018099547512, "grad_norm": 5.11644983291626, "learning_rate": 1.3815987933634994e-05, "loss": 0.3238, "step": 3280 }, { "epoch": 0.9304298642533937, "grad_norm": 7.753527641296387, "learning_rate": 1.379713423831071e-05, "loss": 0.3969, "step": 3290 }, { "epoch": 0.9332579185520362, "grad_norm": 3.2283082008361816, "learning_rate": 1.3778280542986426e-05, "loss": 0.3186, "step": 3300 }, { "epoch": 0.9360859728506787, "grad_norm": 5.4364094734191895, "learning_rate": 1.3759426847662142e-05, "loss": 0.2786, "step": 3310 }, { "epoch": 0.9389140271493213, "grad_norm": 4.061675071716309, "learning_rate": 1.3740573152337859e-05, "loss": 0.2909, "step": 3320 }, { "epoch": 0.9417420814479638, "grad_norm": 2.8919031620025635, "learning_rate": 1.3721719457013575e-05, "loss": 0.3572, "step": 3330 }, { "epoch": 0.9445701357466063, "grad_norm": 2.643793821334839, "learning_rate": 1.3702865761689292e-05, "loss": 0.2582, "step": 3340 }, { "epoch": 0.9473981900452488, "grad_norm": 2.6080071926116943, "learning_rate": 1.3684012066365009e-05, "loss": 0.3299, "step": 3350 }, { "epoch": 0.9502262443438914, "grad_norm": 3.8307015895843506, "learning_rate": 1.3665158371040725e-05, "loss": 0.3193, "step": 3360 }, { "epoch": 0.9530542986425339, "grad_norm": 5.132751941680908, "learning_rate": 1.3646304675716442e-05, "loss": 0.3029, "step": 3370 }, { "epoch": 0.9558823529411765, "grad_norm": 2.5157196521759033, "learning_rate": 1.3627450980392158e-05, "loss": 0.2851, "step": 3380 }, { "epoch": 0.958710407239819, "grad_norm": 2.9101061820983887, "learning_rate": 1.3608597285067875e-05, "loss": 0.2542, "step": 3390 }, { "epoch": 0.9615384615384616, "grad_norm": 4.939927101135254, "learning_rate": 1.3589743589743592e-05, "loss": 0.2904, "step": 3400 }, { "epoch": 0.9643665158371041, "grad_norm": 3.6113576889038086, "learning_rate": 1.3570889894419308e-05, "loss": 0.2654, "step": 3410 }, { "epoch": 0.9671945701357466, "grad_norm": 7.237005710601807, "learning_rate": 1.3552036199095025e-05, "loss": 0.2636, "step": 3420 }, { "epoch": 0.9700226244343891, "grad_norm": 4.309847354888916, "learning_rate": 1.3533182503770742e-05, "loss": 0.3095, "step": 3430 }, { "epoch": 0.9728506787330317, "grad_norm": 3.404597520828247, "learning_rate": 1.3514328808446458e-05, "loss": 0.3147, "step": 3440 }, { "epoch": 0.9756787330316742, "grad_norm": 2.2847061157226562, "learning_rate": 1.3495475113122173e-05, "loss": 0.2536, "step": 3450 }, { "epoch": 0.9785067873303167, "grad_norm": 3.670473337173462, "learning_rate": 1.347662141779789e-05, "loss": 0.4246, "step": 3460 }, { "epoch": 0.9813348416289592, "grad_norm": 3.955064296722412, "learning_rate": 1.3457767722473606e-05, "loss": 0.3578, "step": 3470 }, { "epoch": 0.9841628959276018, "grad_norm": 4.502097129821777, "learning_rate": 1.3438914027149323e-05, "loss": 0.2467, "step": 3480 }, { "epoch": 0.9869909502262444, "grad_norm": 2.4463083744049072, "learning_rate": 1.3420060331825038e-05, "loss": 0.2968, "step": 3490 }, { "epoch": 0.9898190045248869, "grad_norm": 4.400903224945068, "learning_rate": 1.3401206636500754e-05, "loss": 0.2767, "step": 3500 }, { "epoch": 0.9926470588235294, "grad_norm": 3.8190836906433105, "learning_rate": 1.3382352941176471e-05, "loss": 0.2868, "step": 3510 }, { "epoch": 0.995475113122172, "grad_norm": 6.496269702911377, "learning_rate": 1.3363499245852188e-05, "loss": 0.2562, "step": 3520 }, { "epoch": 0.9983031674208145, "grad_norm": 6.07765531539917, "learning_rate": 1.3344645550527904e-05, "loss": 0.3454, "step": 3530 }, { "epoch": 1.001131221719457, "grad_norm": 2.3242061138153076, "learning_rate": 1.3325791855203621e-05, "loss": 0.2916, "step": 3540 }, { "epoch": 1.0039592760180995, "grad_norm": 4.1568922996521, "learning_rate": 1.3306938159879338e-05, "loss": 0.2674, "step": 3550 }, { "epoch": 1.006787330316742, "grad_norm": 4.240556240081787, "learning_rate": 1.3288084464555052e-05, "loss": 0.2308, "step": 3560 }, { "epoch": 1.0096153846153846, "grad_norm": 3.1726534366607666, "learning_rate": 1.3269230769230769e-05, "loss": 0.2158, "step": 3570 }, { "epoch": 1.012443438914027, "grad_norm": 2.371945381164551, "learning_rate": 1.3250377073906486e-05, "loss": 0.2726, "step": 3580 }, { "epoch": 1.0152714932126696, "grad_norm": 4.0892744064331055, "learning_rate": 1.3231523378582202e-05, "loss": 0.2854, "step": 3590 }, { "epoch": 1.0180995475113122, "grad_norm": 4.087936878204346, "learning_rate": 1.3212669683257919e-05, "loss": 0.2819, "step": 3600 }, { "epoch": 1.0209276018099547, "grad_norm": 2.393385171890259, "learning_rate": 1.3193815987933636e-05, "loss": 0.2647, "step": 3610 }, { "epoch": 1.0237556561085972, "grad_norm": 5.088064193725586, "learning_rate": 1.3174962292609352e-05, "loss": 0.3036, "step": 3620 }, { "epoch": 1.0265837104072397, "grad_norm": 5.494114875793457, "learning_rate": 1.3156108597285069e-05, "loss": 0.2903, "step": 3630 }, { "epoch": 1.0294117647058822, "grad_norm": 1.7145814895629883, "learning_rate": 1.3137254901960785e-05, "loss": 0.2035, "step": 3640 }, { "epoch": 1.032239819004525, "grad_norm": 5.571091651916504, "learning_rate": 1.3118401206636502e-05, "loss": 0.2725, "step": 3650 }, { "epoch": 1.0350678733031675, "grad_norm": 2.1600940227508545, "learning_rate": 1.3099547511312219e-05, "loss": 0.2664, "step": 3660 }, { "epoch": 1.03789592760181, "grad_norm": 5.715987205505371, "learning_rate": 1.3080693815987935e-05, "loss": 0.288, "step": 3670 }, { "epoch": 1.0407239819004526, "grad_norm": 3.7473366260528564, "learning_rate": 1.3061840120663652e-05, "loss": 0.3043, "step": 3680 }, { "epoch": 1.043552036199095, "grad_norm": 6.1543121337890625, "learning_rate": 1.3042986425339369e-05, "loss": 0.3095, "step": 3690 }, { "epoch": 1.0463800904977376, "grad_norm": 2.4978766441345215, "learning_rate": 1.3024132730015085e-05, "loss": 0.2721, "step": 3700 }, { "epoch": 1.0492081447963801, "grad_norm": 6.851878643035889, "learning_rate": 1.3005279034690802e-05, "loss": 0.2912, "step": 3710 }, { "epoch": 1.0520361990950227, "grad_norm": 3.8570425510406494, "learning_rate": 1.2986425339366517e-05, "loss": 0.3127, "step": 3720 }, { "epoch": 1.0548642533936652, "grad_norm": 7.417280197143555, "learning_rate": 1.2967571644042233e-05, "loss": 0.2707, "step": 3730 }, { "epoch": 1.0576923076923077, "grad_norm": 4.451798915863037, "learning_rate": 1.294871794871795e-05, "loss": 0.2387, "step": 3740 }, { "epoch": 1.0605203619909502, "grad_norm": 3.9390320777893066, "learning_rate": 1.2929864253393667e-05, "loss": 0.2626, "step": 3750 }, { "epoch": 1.0633484162895928, "grad_norm": 1.990342617034912, "learning_rate": 1.2911010558069383e-05, "loss": 0.2694, "step": 3760 }, { "epoch": 1.0661764705882353, "grad_norm": 3.4424543380737305, "learning_rate": 1.28921568627451e-05, "loss": 0.3381, "step": 3770 }, { "epoch": 1.0690045248868778, "grad_norm": 1.7783031463623047, "learning_rate": 1.2873303167420816e-05, "loss": 0.1981, "step": 3780 }, { "epoch": 1.0718325791855203, "grad_norm": 3.8346874713897705, "learning_rate": 1.2854449472096533e-05, "loss": 0.2624, "step": 3790 }, { "epoch": 1.0746606334841629, "grad_norm": 5.832867622375488, "learning_rate": 1.283559577677225e-05, "loss": 0.2581, "step": 3800 }, { "epoch": 1.0774886877828054, "grad_norm": 4.65895414352417, "learning_rate": 1.2816742081447966e-05, "loss": 0.3619, "step": 3810 }, { "epoch": 1.080316742081448, "grad_norm": 3.93692946434021, "learning_rate": 1.279788838612368e-05, "loss": 0.3296, "step": 3820 }, { "epoch": 1.0831447963800904, "grad_norm": 2.544408082962036, "learning_rate": 1.2779034690799396e-05, "loss": 0.2733, "step": 3830 }, { "epoch": 1.085972850678733, "grad_norm": 4.070341110229492, "learning_rate": 1.2760180995475113e-05, "loss": 0.2303, "step": 3840 }, { "epoch": 1.0888009049773755, "grad_norm": 3.7344400882720947, "learning_rate": 1.274132730015083e-05, "loss": 0.2809, "step": 3850 }, { "epoch": 1.091628959276018, "grad_norm": 5.270275592803955, "learning_rate": 1.2722473604826546e-05, "loss": 0.2894, "step": 3860 }, { "epoch": 1.0944570135746607, "grad_norm": 4.697700500488281, "learning_rate": 1.2703619909502263e-05, "loss": 0.2382, "step": 3870 }, { "epoch": 1.0972850678733033, "grad_norm": 3.1016902923583984, "learning_rate": 1.2684766214177979e-05, "loss": 0.2545, "step": 3880 }, { "epoch": 1.1001131221719458, "grad_norm": 3.6058175563812256, "learning_rate": 1.2665912518853696e-05, "loss": 0.2747, "step": 3890 }, { "epoch": 1.1029411764705883, "grad_norm": 6.918750286102295, "learning_rate": 1.2647058823529412e-05, "loss": 0.2541, "step": 3900 }, { "epoch": 1.1057692307692308, "grad_norm": 4.158249855041504, "learning_rate": 1.2628205128205129e-05, "loss": 0.2514, "step": 3910 }, { "epoch": 1.1085972850678734, "grad_norm": 5.783833980560303, "learning_rate": 1.2609351432880846e-05, "loss": 0.2547, "step": 3920 }, { "epoch": 1.1114253393665159, "grad_norm": 3.524967670440674, "learning_rate": 1.2590497737556562e-05, "loss": 0.2767, "step": 3930 }, { "epoch": 1.1142533936651584, "grad_norm": 2.39933705329895, "learning_rate": 1.2571644042232279e-05, "loss": 0.2359, "step": 3940 }, { "epoch": 1.117081447963801, "grad_norm": 4.107085704803467, "learning_rate": 1.2552790346907995e-05, "loss": 0.244, "step": 3950 }, { "epoch": 1.1199095022624435, "grad_norm": 6.81174898147583, "learning_rate": 1.2533936651583712e-05, "loss": 0.2579, "step": 3960 }, { "epoch": 1.122737556561086, "grad_norm": 1.8872418403625488, "learning_rate": 1.2515082956259429e-05, "loss": 0.2101, "step": 3970 }, { "epoch": 1.1255656108597285, "grad_norm": 3.872263193130493, "learning_rate": 1.2496229260935144e-05, "loss": 0.3009, "step": 3980 }, { "epoch": 1.128393665158371, "grad_norm": 2.7092275619506836, "learning_rate": 1.247737556561086e-05, "loss": 0.3053, "step": 3990 }, { "epoch": 1.1312217194570136, "grad_norm": 6.832910537719727, "learning_rate": 1.2458521870286577e-05, "loss": 0.2339, "step": 4000 }, { "epoch": 1.1312217194570136, "eval_accuracy": 0.8789373210308622, "eval_loss": 0.31826069951057434, "eval_runtime": 126.5036, "eval_samples_per_second": 99.381, "eval_steps_per_second": 3.107, "step": 4000 }, { "epoch": 1.134049773755656, "grad_norm": 4.47673225402832, "learning_rate": 1.2439668174962293e-05, "loss": 0.2778, "step": 4010 }, { "epoch": 1.1368778280542986, "grad_norm": 5.049123287200928, "learning_rate": 1.242081447963801e-05, "loss": 0.3089, "step": 4020 }, { "epoch": 1.1397058823529411, "grad_norm": 3.6429476737976074, "learning_rate": 1.2401960784313727e-05, "loss": 0.2583, "step": 4030 }, { "epoch": 1.1425339366515836, "grad_norm": 4.532712936401367, "learning_rate": 1.2383107088989443e-05, "loss": 0.276, "step": 4040 }, { "epoch": 1.1453619909502262, "grad_norm": 2.8139843940734863, "learning_rate": 1.236425339366516e-05, "loss": 0.2866, "step": 4050 }, { "epoch": 1.1481900452488687, "grad_norm": 3.5737717151641846, "learning_rate": 1.2345399698340877e-05, "loss": 0.2609, "step": 4060 }, { "epoch": 1.1510180995475112, "grad_norm": 3.4656126499176025, "learning_rate": 1.2326546003016593e-05, "loss": 0.2063, "step": 4070 }, { "epoch": 1.1538461538461537, "grad_norm": 7.43180513381958, "learning_rate": 1.230769230769231e-05, "loss": 0.3614, "step": 4080 }, { "epoch": 1.1566742081447963, "grad_norm": 2.853827476501465, "learning_rate": 1.2288838612368026e-05, "loss": 0.3006, "step": 4090 }, { "epoch": 1.1595022624434388, "grad_norm": 4.522756099700928, "learning_rate": 1.2269984917043743e-05, "loss": 0.3122, "step": 4100 }, { "epoch": 1.1623303167420815, "grad_norm": 3.891043186187744, "learning_rate": 1.225113122171946e-05, "loss": 0.3056, "step": 4110 }, { "epoch": 1.165158371040724, "grad_norm": 2.7950618267059326, "learning_rate": 1.2232277526395176e-05, "loss": 0.3195, "step": 4120 }, { "epoch": 1.1679864253393666, "grad_norm": 3.972943067550659, "learning_rate": 1.2213423831070891e-05, "loss": 0.3315, "step": 4130 }, { "epoch": 1.170814479638009, "grad_norm": 10.891520500183105, "learning_rate": 1.2194570135746608e-05, "loss": 0.276, "step": 4140 }, { "epoch": 1.1736425339366516, "grad_norm": 4.51856803894043, "learning_rate": 1.2175716440422323e-05, "loss": 0.2355, "step": 4150 }, { "epoch": 1.1764705882352942, "grad_norm": 2.575591564178467, "learning_rate": 1.215686274509804e-05, "loss": 0.195, "step": 4160 }, { "epoch": 1.1792986425339367, "grad_norm": 3.9654879570007324, "learning_rate": 1.2138009049773756e-05, "loss": 0.2505, "step": 4170 }, { "epoch": 1.1821266968325792, "grad_norm": 5.328989028930664, "learning_rate": 1.2119155354449473e-05, "loss": 0.2326, "step": 4180 }, { "epoch": 1.1849547511312217, "grad_norm": 3.573969841003418, "learning_rate": 1.2100301659125189e-05, "loss": 0.2302, "step": 4190 }, { "epoch": 1.1877828054298643, "grad_norm": 4.914673328399658, "learning_rate": 1.2081447963800906e-05, "loss": 0.3016, "step": 4200 }, { "epoch": 1.1906108597285068, "grad_norm": 3.4880571365356445, "learning_rate": 1.2062594268476622e-05, "loss": 0.2811, "step": 4210 }, { "epoch": 1.1934389140271493, "grad_norm": 3.827131509780884, "learning_rate": 1.2043740573152339e-05, "loss": 0.2431, "step": 4220 }, { "epoch": 1.1962669683257918, "grad_norm": 4.236039161682129, "learning_rate": 1.2024886877828056e-05, "loss": 0.2951, "step": 4230 }, { "epoch": 1.1990950226244343, "grad_norm": 4.934144973754883, "learning_rate": 1.200603318250377e-05, "loss": 0.2739, "step": 4240 }, { "epoch": 1.2019230769230769, "grad_norm": 6.265100479125977, "learning_rate": 1.1987179487179487e-05, "loss": 0.2298, "step": 4250 }, { "epoch": 1.2047511312217194, "grad_norm": 2.0881166458129883, "learning_rate": 1.1968325791855204e-05, "loss": 0.245, "step": 4260 }, { "epoch": 1.207579185520362, "grad_norm": 8.104879379272461, "learning_rate": 1.194947209653092e-05, "loss": 0.2787, "step": 4270 }, { "epoch": 1.2104072398190044, "grad_norm": 4.176904201507568, "learning_rate": 1.1930618401206637e-05, "loss": 0.2805, "step": 4280 }, { "epoch": 1.213235294117647, "grad_norm": 4.049289226531982, "learning_rate": 1.1911764705882354e-05, "loss": 0.3075, "step": 4290 }, { "epoch": 1.2160633484162897, "grad_norm": 1.6341451406478882, "learning_rate": 1.189291101055807e-05, "loss": 0.2664, "step": 4300 }, { "epoch": 1.2188914027149322, "grad_norm": 7.3927388191223145, "learning_rate": 1.1874057315233787e-05, "loss": 0.3117, "step": 4310 }, { "epoch": 1.2217194570135748, "grad_norm": 1.8302087783813477, "learning_rate": 1.1855203619909503e-05, "loss": 0.333, "step": 4320 }, { "epoch": 1.2245475113122173, "grad_norm": 2.7119219303131104, "learning_rate": 1.183634992458522e-05, "loss": 0.313, "step": 4330 }, { "epoch": 1.2273755656108598, "grad_norm": 6.975501537322998, "learning_rate": 1.1817496229260937e-05, "loss": 0.2833, "step": 4340 }, { "epoch": 1.2302036199095023, "grad_norm": 2.1703109741210938, "learning_rate": 1.1798642533936653e-05, "loss": 0.2671, "step": 4350 }, { "epoch": 1.2330316742081449, "grad_norm": 3.932482957839966, "learning_rate": 1.177978883861237e-05, "loss": 0.2358, "step": 4360 }, { "epoch": 1.2358597285067874, "grad_norm": 2.7635726928710938, "learning_rate": 1.1760935143288087e-05, "loss": 0.3196, "step": 4370 }, { "epoch": 1.23868778280543, "grad_norm": 2.945617914199829, "learning_rate": 1.1742081447963803e-05, "loss": 0.286, "step": 4380 }, { "epoch": 1.2415158371040724, "grad_norm": 4.623812675476074, "learning_rate": 1.1723227752639518e-05, "loss": 0.2605, "step": 4390 }, { "epoch": 1.244343891402715, "grad_norm": 3.3469064235687256, "learning_rate": 1.1704374057315235e-05, "loss": 0.2515, "step": 4400 }, { "epoch": 1.2471719457013575, "grad_norm": 6.414296627044678, "learning_rate": 1.1685520361990951e-05, "loss": 0.3239, "step": 4410 }, { "epoch": 1.25, "grad_norm": 5.76809549331665, "learning_rate": 1.1666666666666668e-05, "loss": 0.2534, "step": 4420 }, { "epoch": 1.2528280542986425, "grad_norm": 5.739138603210449, "learning_rate": 1.1647812971342385e-05, "loss": 0.2617, "step": 4430 }, { "epoch": 1.255656108597285, "grad_norm": 3.76336407661438, "learning_rate": 1.1628959276018101e-05, "loss": 0.2809, "step": 4440 }, { "epoch": 1.2584841628959276, "grad_norm": 3.3274784088134766, "learning_rate": 1.1610105580693818e-05, "loss": 0.2785, "step": 4450 }, { "epoch": 1.26131221719457, "grad_norm": 3.9663026332855225, "learning_rate": 1.1591251885369534e-05, "loss": 0.2404, "step": 4460 }, { "epoch": 1.2641402714932126, "grad_norm": 3.5841290950775146, "learning_rate": 1.1572398190045251e-05, "loss": 0.2417, "step": 4470 }, { "epoch": 1.2669683257918551, "grad_norm": 3.4801056385040283, "learning_rate": 1.1553544494720966e-05, "loss": 0.372, "step": 4480 }, { "epoch": 1.2697963800904977, "grad_norm": 4.876957416534424, "learning_rate": 1.1534690799396683e-05, "loss": 0.2947, "step": 4490 }, { "epoch": 1.2726244343891402, "grad_norm": 5.119454860687256, "learning_rate": 1.1515837104072397e-05, "loss": 0.3047, "step": 4500 }, { "epoch": 1.2754524886877827, "grad_norm": 4.234288215637207, "learning_rate": 1.1496983408748114e-05, "loss": 0.3204, "step": 4510 }, { "epoch": 1.2782805429864252, "grad_norm": 2.9957668781280518, "learning_rate": 1.147812971342383e-05, "loss": 0.2602, "step": 4520 }, { "epoch": 1.2811085972850678, "grad_norm": 2.333770990371704, "learning_rate": 1.1459276018099547e-05, "loss": 0.1949, "step": 4530 }, { "epoch": 1.2839366515837103, "grad_norm": 4.577385425567627, "learning_rate": 1.1440422322775264e-05, "loss": 0.3381, "step": 4540 }, { "epoch": 1.2867647058823528, "grad_norm": 4.607064723968506, "learning_rate": 1.142156862745098e-05, "loss": 0.268, "step": 4550 }, { "epoch": 1.2895927601809956, "grad_norm": 4.690824031829834, "learning_rate": 1.1402714932126697e-05, "loss": 0.2716, "step": 4560 }, { "epoch": 1.292420814479638, "grad_norm": 4.504805564880371, "learning_rate": 1.1383861236802414e-05, "loss": 0.2324, "step": 4570 }, { "epoch": 1.2952488687782806, "grad_norm": 5.126098155975342, "learning_rate": 1.136500754147813e-05, "loss": 0.3124, "step": 4580 }, { "epoch": 1.2980769230769231, "grad_norm": 4.265847206115723, "learning_rate": 1.1346153846153847e-05, "loss": 0.2522, "step": 4590 }, { "epoch": 1.3009049773755657, "grad_norm": 6.03093957901001, "learning_rate": 1.1327300150829564e-05, "loss": 0.1997, "step": 4600 }, { "epoch": 1.3037330316742082, "grad_norm": 5.404048442840576, "learning_rate": 1.130844645550528e-05, "loss": 0.2789, "step": 4610 }, { "epoch": 1.3065610859728507, "grad_norm": 3.481680393218994, "learning_rate": 1.1289592760180997e-05, "loss": 0.3238, "step": 4620 }, { "epoch": 1.3093891402714932, "grad_norm": 5.523119926452637, "learning_rate": 1.1270739064856713e-05, "loss": 0.2349, "step": 4630 }, { "epoch": 1.3122171945701357, "grad_norm": 8.244139671325684, "learning_rate": 1.125188536953243e-05, "loss": 0.2466, "step": 4640 }, { "epoch": 1.3150452488687783, "grad_norm": 4.985035419464111, "learning_rate": 1.1233031674208145e-05, "loss": 0.2634, "step": 4650 }, { "epoch": 1.3178733031674208, "grad_norm": 5.409512996673584, "learning_rate": 1.1214177978883862e-05, "loss": 0.2667, "step": 4660 }, { "epoch": 1.3207013574660633, "grad_norm": 3.6168251037597656, "learning_rate": 1.1195324283559578e-05, "loss": 0.2507, "step": 4670 }, { "epoch": 1.3235294117647058, "grad_norm": 4.121711730957031, "learning_rate": 1.1176470588235295e-05, "loss": 0.3009, "step": 4680 }, { "epoch": 1.3263574660633484, "grad_norm": 5.204695224761963, "learning_rate": 1.1157616892911011e-05, "loss": 0.3055, "step": 4690 }, { "epoch": 1.329185520361991, "grad_norm": 3.2947821617126465, "learning_rate": 1.1138763197586728e-05, "loss": 0.2476, "step": 4700 }, { "epoch": 1.3320135746606334, "grad_norm": 4.02095365524292, "learning_rate": 1.1119909502262445e-05, "loss": 0.2667, "step": 4710 }, { "epoch": 1.334841628959276, "grad_norm": 4.2972025871276855, "learning_rate": 1.1101055806938161e-05, "loss": 0.2281, "step": 4720 }, { "epoch": 1.3376696832579187, "grad_norm": 3.918163537979126, "learning_rate": 1.1082202111613878e-05, "loss": 0.2363, "step": 4730 }, { "epoch": 1.3404977375565612, "grad_norm": 4.1806206703186035, "learning_rate": 1.1063348416289595e-05, "loss": 0.2639, "step": 4740 }, { "epoch": 1.3433257918552037, "grad_norm": 2.949676990509033, "learning_rate": 1.1044494720965311e-05, "loss": 0.2625, "step": 4750 }, { "epoch": 1.3461538461538463, "grad_norm": 5.957220554351807, "learning_rate": 1.1025641025641028e-05, "loss": 0.2416, "step": 4760 }, { "epoch": 1.3489819004524888, "grad_norm": 12.7191162109375, "learning_rate": 1.1006787330316744e-05, "loss": 0.2752, "step": 4770 }, { "epoch": 1.3518099547511313, "grad_norm": 4.849847793579102, "learning_rate": 1.0987933634992461e-05, "loss": 0.295, "step": 4780 }, { "epoch": 1.3546380090497738, "grad_norm": 3.8798282146453857, "learning_rate": 1.0969079939668178e-05, "loss": 0.2744, "step": 4790 }, { "epoch": 1.3574660633484164, "grad_norm": 3.093064546585083, "learning_rate": 1.0950226244343893e-05, "loss": 0.332, "step": 4800 }, { "epoch": 1.3602941176470589, "grad_norm": 5.489840507507324, "learning_rate": 1.0931372549019607e-05, "loss": 0.3402, "step": 4810 }, { "epoch": 1.3631221719457014, "grad_norm": 3.8440115451812744, "learning_rate": 1.0912518853695324e-05, "loss": 0.2283, "step": 4820 }, { "epoch": 1.365950226244344, "grad_norm": 6.518070220947266, "learning_rate": 1.089366515837104e-05, "loss": 0.2859, "step": 4830 }, { "epoch": 1.3687782805429864, "grad_norm": 1.7918236255645752, "learning_rate": 1.0874811463046757e-05, "loss": 0.2879, "step": 4840 }, { "epoch": 1.371606334841629, "grad_norm": 5.9217424392700195, "learning_rate": 1.0855957767722474e-05, "loss": 0.3221, "step": 4850 }, { "epoch": 1.3744343891402715, "grad_norm": 4.664400100708008, "learning_rate": 1.083710407239819e-05, "loss": 0.3259, "step": 4860 }, { "epoch": 1.377262443438914, "grad_norm": 8.561564445495605, "learning_rate": 1.0818250377073907e-05, "loss": 0.2629, "step": 4870 }, { "epoch": 1.3800904977375565, "grad_norm": 5.151259422302246, "learning_rate": 1.0799396681749624e-05, "loss": 0.3046, "step": 4880 }, { "epoch": 1.382918552036199, "grad_norm": 5.32489538192749, "learning_rate": 1.078054298642534e-05, "loss": 0.2654, "step": 4890 }, { "epoch": 1.3857466063348416, "grad_norm": 4.306127071380615, "learning_rate": 1.0761689291101057e-05, "loss": 0.2518, "step": 4900 }, { "epoch": 1.3885746606334841, "grad_norm": 2.7166082859039307, "learning_rate": 1.0742835595776772e-05, "loss": 0.2278, "step": 4910 }, { "epoch": 1.3914027149321266, "grad_norm": 5.469938278198242, "learning_rate": 1.0723981900452489e-05, "loss": 0.2938, "step": 4920 }, { "epoch": 1.3942307692307692, "grad_norm": 5.4974260330200195, "learning_rate": 1.0705128205128205e-05, "loss": 0.2261, "step": 4930 }, { "epoch": 1.3970588235294117, "grad_norm": 2.7733094692230225, "learning_rate": 1.0686274509803922e-05, "loss": 0.3328, "step": 4940 }, { "epoch": 1.3998868778280542, "grad_norm": 3.532456398010254, "learning_rate": 1.0667420814479638e-05, "loss": 0.323, "step": 4950 }, { "epoch": 1.4027149321266967, "grad_norm": 5.4216227531433105, "learning_rate": 1.0648567119155355e-05, "loss": 0.3178, "step": 4960 }, { "epoch": 1.4055429864253393, "grad_norm": 5.761581897735596, "learning_rate": 1.0629713423831072e-05, "loss": 0.2391, "step": 4970 }, { "epoch": 1.4083710407239818, "grad_norm": 7.104434013366699, "learning_rate": 1.0610859728506788e-05, "loss": 0.2565, "step": 4980 }, { "epoch": 1.4111990950226243, "grad_norm": 5.054209232330322, "learning_rate": 1.0592006033182505e-05, "loss": 0.2805, "step": 4990 }, { "epoch": 1.4140271493212668, "grad_norm": 7.0140228271484375, "learning_rate": 1.0573152337858221e-05, "loss": 0.3011, "step": 5000 }, { "epoch": 1.4140271493212668, "eval_accuracy": 0.881880369074133, "eval_loss": 0.3024204969406128, "eval_runtime": 126.4534, "eval_samples_per_second": 99.42, "eval_steps_per_second": 3.108, "step": 5000 }, { "epoch": 1.4168552036199096, "grad_norm": 5.970915794372559, "learning_rate": 1.0554298642533938e-05, "loss": 0.2841, "step": 5010 }, { "epoch": 1.419683257918552, "grad_norm": 4.6934943199157715, "learning_rate": 1.0535444947209655e-05, "loss": 0.2451, "step": 5020 }, { "epoch": 1.4225113122171946, "grad_norm": 5.1019978523254395, "learning_rate": 1.0516591251885371e-05, "loss": 0.2622, "step": 5030 }, { "epoch": 1.4253393665158371, "grad_norm": 3.4515976905822754, "learning_rate": 1.0497737556561088e-05, "loss": 0.3172, "step": 5040 }, { "epoch": 1.4281674208144797, "grad_norm": 4.001848220825195, "learning_rate": 1.0478883861236805e-05, "loss": 0.2949, "step": 5050 }, { "epoch": 1.4309954751131222, "grad_norm": 3.414452075958252, "learning_rate": 1.046003016591252e-05, "loss": 0.2345, "step": 5060 }, { "epoch": 1.4338235294117647, "grad_norm": 6.0561747550964355, "learning_rate": 1.0441176470588236e-05, "loss": 0.3239, "step": 5070 }, { "epoch": 1.4366515837104072, "grad_norm": 2.448591470718384, "learning_rate": 1.0422322775263953e-05, "loss": 0.2031, "step": 5080 }, { "epoch": 1.4394796380090498, "grad_norm": 5.490105152130127, "learning_rate": 1.040346907993967e-05, "loss": 0.2607, "step": 5090 }, { "epoch": 1.4423076923076923, "grad_norm": 2.7472801208496094, "learning_rate": 1.0384615384615386e-05, "loss": 0.2412, "step": 5100 }, { "epoch": 1.4451357466063348, "grad_norm": 4.4468770027160645, "learning_rate": 1.0365761689291103e-05, "loss": 0.288, "step": 5110 }, { "epoch": 1.4479638009049773, "grad_norm": 1.942518949508667, "learning_rate": 1.0346907993966819e-05, "loss": 0.2592, "step": 5120 }, { "epoch": 1.4507918552036199, "grad_norm": 4.880716800689697, "learning_rate": 1.0328054298642536e-05, "loss": 0.2454, "step": 5130 }, { "epoch": 1.4536199095022624, "grad_norm": 3.7106387615203857, "learning_rate": 1.030920060331825e-05, "loss": 0.2863, "step": 5140 }, { "epoch": 1.456447963800905, "grad_norm": 5.332839488983154, "learning_rate": 1.0290346907993967e-05, "loss": 0.3325, "step": 5150 }, { "epoch": 1.4592760180995474, "grad_norm": 4.884565353393555, "learning_rate": 1.0271493212669684e-05, "loss": 0.2284, "step": 5160 }, { "epoch": 1.4621040723981902, "grad_norm": 4.775869846343994, "learning_rate": 1.0252639517345399e-05, "loss": 0.1897, "step": 5170 }, { "epoch": 1.4649321266968327, "grad_norm": 2.5493810176849365, "learning_rate": 1.0233785822021115e-05, "loss": 0.2919, "step": 5180 }, { "epoch": 1.4677601809954752, "grad_norm": 3.7652482986450195, "learning_rate": 1.0214932126696832e-05, "loss": 0.2795, "step": 5190 }, { "epoch": 1.4705882352941178, "grad_norm": 4.398680686950684, "learning_rate": 1.0196078431372549e-05, "loss": 0.2685, "step": 5200 }, { "epoch": 1.4734162895927603, "grad_norm": 2.400367498397827, "learning_rate": 1.0177224736048265e-05, "loss": 0.251, "step": 5210 }, { "epoch": 1.4762443438914028, "grad_norm": 3.4146950244903564, "learning_rate": 1.0158371040723982e-05, "loss": 0.2115, "step": 5220 }, { "epoch": 1.4790723981900453, "grad_norm": 4.488588809967041, "learning_rate": 1.0139517345399699e-05, "loss": 0.2405, "step": 5230 }, { "epoch": 1.4819004524886878, "grad_norm": 6.304666996002197, "learning_rate": 1.0120663650075415e-05, "loss": 0.3394, "step": 5240 }, { "epoch": 1.4847285067873304, "grad_norm": 2.8380801677703857, "learning_rate": 1.0101809954751132e-05, "loss": 0.2637, "step": 5250 }, { "epoch": 1.487556561085973, "grad_norm": 4.873356819152832, "learning_rate": 1.0082956259426848e-05, "loss": 0.2652, "step": 5260 }, { "epoch": 1.4903846153846154, "grad_norm": 5.6608123779296875, "learning_rate": 1.0064102564102565e-05, "loss": 0.2961, "step": 5270 }, { "epoch": 1.493212669683258, "grad_norm": 4.332230567932129, "learning_rate": 1.0045248868778282e-05, "loss": 0.2705, "step": 5280 }, { "epoch": 1.4960407239819005, "grad_norm": 5.802159309387207, "learning_rate": 1.0026395173453998e-05, "loss": 0.2599, "step": 5290 }, { "epoch": 1.498868778280543, "grad_norm": 3.019793748855591, "learning_rate": 1.0007541478129715e-05, "loss": 0.2623, "step": 5300 }, { "epoch": 1.5016968325791855, "grad_norm": 4.762251377105713, "learning_rate": 9.988687782805431e-06, "loss": 0.2585, "step": 5310 }, { "epoch": 1.504524886877828, "grad_norm": 6.202815055847168, "learning_rate": 9.969834087481146e-06, "loss": 0.2778, "step": 5320 }, { "epoch": 1.5073529411764706, "grad_norm": 3.872309684753418, "learning_rate": 9.950980392156863e-06, "loss": 0.3034, "step": 5330 }, { "epoch": 1.510180995475113, "grad_norm": 4.060298919677734, "learning_rate": 9.93212669683258e-06, "loss": 0.2884, "step": 5340 }, { "epoch": 1.5130090497737556, "grad_norm": 2.0391085147857666, "learning_rate": 9.913273001508296e-06, "loss": 0.2867, "step": 5350 }, { "epoch": 1.5158371040723981, "grad_norm": 4.735014915466309, "learning_rate": 9.894419306184013e-06, "loss": 0.2797, "step": 5360 }, { "epoch": 1.5186651583710407, "grad_norm": 4.086658000946045, "learning_rate": 9.87556561085973e-06, "loss": 0.2841, "step": 5370 }, { "epoch": 1.5214932126696832, "grad_norm": 4.3362040519714355, "learning_rate": 9.856711915535446e-06, "loss": 0.2566, "step": 5380 }, { "epoch": 1.5243212669683257, "grad_norm": 3.9439034461975098, "learning_rate": 9.837858220211161e-06, "loss": 0.2506, "step": 5390 }, { "epoch": 1.5271493212669682, "grad_norm": 4.754290580749512, "learning_rate": 9.819004524886878e-06, "loss": 0.2271, "step": 5400 }, { "epoch": 1.5299773755656108, "grad_norm": 4.914488792419434, "learning_rate": 9.800150829562594e-06, "loss": 0.3032, "step": 5410 }, { "epoch": 1.5328054298642533, "grad_norm": 3.0046920776367188, "learning_rate": 9.781297134238311e-06, "loss": 0.2123, "step": 5420 }, { "epoch": 1.5356334841628958, "grad_norm": 4.0427985191345215, "learning_rate": 9.762443438914027e-06, "loss": 0.2621, "step": 5430 }, { "epoch": 1.5384615384615383, "grad_norm": 6.442467212677002, "learning_rate": 9.743589743589744e-06, "loss": 0.247, "step": 5440 }, { "epoch": 1.5412895927601808, "grad_norm": 3.7217085361480713, "learning_rate": 9.72473604826546e-06, "loss": 0.3037, "step": 5450 }, { "epoch": 1.5441176470588234, "grad_norm": 7.558680534362793, "learning_rate": 9.705882352941177e-06, "loss": 0.3076, "step": 5460 }, { "epoch": 1.5469457013574661, "grad_norm": 3.152740240097046, "learning_rate": 9.687028657616894e-06, "loss": 0.2778, "step": 5470 }, { "epoch": 1.5497737556561086, "grad_norm": 3.996135711669922, "learning_rate": 9.66817496229261e-06, "loss": 0.3243, "step": 5480 }, { "epoch": 1.5526018099547512, "grad_norm": 3.837599039077759, "learning_rate": 9.649321266968327e-06, "loss": 0.2284, "step": 5490 }, { "epoch": 1.5554298642533937, "grad_norm": 4.957329750061035, "learning_rate": 9.630467571644044e-06, "loss": 0.2585, "step": 5500 }, { "epoch": 1.5582579185520362, "grad_norm": 4.0857133865356445, "learning_rate": 9.61161387631976e-06, "loss": 0.2947, "step": 5510 }, { "epoch": 1.5610859728506787, "grad_norm": 5.3217902183532715, "learning_rate": 9.592760180995477e-06, "loss": 0.3083, "step": 5520 }, { "epoch": 1.5639140271493213, "grad_norm": 6.3014326095581055, "learning_rate": 9.573906485671192e-06, "loss": 0.2514, "step": 5530 }, { "epoch": 1.5667420814479638, "grad_norm": 5.7632670402526855, "learning_rate": 9.555052790346909e-06, "loss": 0.2889, "step": 5540 }, { "epoch": 1.5695701357466063, "grad_norm": 3.6774861812591553, "learning_rate": 9.536199095022625e-06, "loss": 0.2933, "step": 5550 }, { "epoch": 1.5723981900452488, "grad_norm": 2.207911968231201, "learning_rate": 9.517345399698342e-06, "loss": 0.2594, "step": 5560 }, { "epoch": 1.5752262443438914, "grad_norm": 4.789866924285889, "learning_rate": 9.498491704374058e-06, "loss": 0.3103, "step": 5570 }, { "epoch": 1.5780542986425339, "grad_norm": 5.097392559051514, "learning_rate": 9.479638009049773e-06, "loss": 0.2757, "step": 5580 }, { "epoch": 1.5808823529411766, "grad_norm": 4.389581203460693, "learning_rate": 9.46078431372549e-06, "loss": 0.3006, "step": 5590 }, { "epoch": 1.5837104072398192, "grad_norm": 6.803945541381836, "learning_rate": 9.441930618401207e-06, "loss": 0.2912, "step": 5600 }, { "epoch": 1.5865384615384617, "grad_norm": 2.0034751892089844, "learning_rate": 9.423076923076923e-06, "loss": 0.2173, "step": 5610 }, { "epoch": 1.5893665158371042, "grad_norm": 3.0462636947631836, "learning_rate": 9.40422322775264e-06, "loss": 0.3155, "step": 5620 }, { "epoch": 1.5921945701357467, "grad_norm": 6.887737274169922, "learning_rate": 9.385369532428356e-06, "loss": 0.238, "step": 5630 }, { "epoch": 1.5950226244343892, "grad_norm": 7.331830978393555, "learning_rate": 9.366515837104073e-06, "loss": 0.3028, "step": 5640 }, { "epoch": 1.5978506787330318, "grad_norm": 3.274845600128174, "learning_rate": 9.34766214177979e-06, "loss": 0.2585, "step": 5650 }, { "epoch": 1.6006787330316743, "grad_norm": 6.801854133605957, "learning_rate": 9.328808446455506e-06, "loss": 0.256, "step": 5660 }, { "epoch": 1.6035067873303168, "grad_norm": 7.7837982177734375, "learning_rate": 9.309954751131223e-06, "loss": 0.2289, "step": 5670 }, { "epoch": 1.6063348416289593, "grad_norm": 8.501007080078125, "learning_rate": 9.29110105580694e-06, "loss": 0.2766, "step": 5680 }, { "epoch": 1.6091628959276019, "grad_norm": 4.016129493713379, "learning_rate": 9.272247360482656e-06, "loss": 0.2611, "step": 5690 }, { "epoch": 1.6119909502262444, "grad_norm": 5.062587738037109, "learning_rate": 9.253393665158373e-06, "loss": 0.2142, "step": 5700 }, { "epoch": 1.614819004524887, "grad_norm": 2.5895862579345703, "learning_rate": 9.23453996983409e-06, "loss": 0.2681, "step": 5710 }, { "epoch": 1.6176470588235294, "grad_norm": 5.066253662109375, "learning_rate": 9.215686274509804e-06, "loss": 0.2604, "step": 5720 }, { "epoch": 1.620475113122172, "grad_norm": 5.256166934967041, "learning_rate": 9.196832579185521e-06, "loss": 0.3009, "step": 5730 }, { "epoch": 1.6233031674208145, "grad_norm": 4.829041004180908, "learning_rate": 9.177978883861237e-06, "loss": 0.2614, "step": 5740 }, { "epoch": 1.626131221719457, "grad_norm": 4.902761459350586, "learning_rate": 9.159125188536954e-06, "loss": 0.2348, "step": 5750 }, { "epoch": 1.6289592760180995, "grad_norm": 5.516357421875, "learning_rate": 9.14027149321267e-06, "loss": 0.328, "step": 5760 }, { "epoch": 1.631787330316742, "grad_norm": 3.2983596324920654, "learning_rate": 9.121417797888387e-06, "loss": 0.1956, "step": 5770 }, { "epoch": 1.6346153846153846, "grad_norm": 7.548886775970459, "learning_rate": 9.102564102564104e-06, "loss": 0.2712, "step": 5780 }, { "epoch": 1.637443438914027, "grad_norm": 4.081298828125, "learning_rate": 9.083710407239819e-06, "loss": 0.2726, "step": 5790 }, { "epoch": 1.6402714932126696, "grad_norm": 6.161011695861816, "learning_rate": 9.064856711915535e-06, "loss": 0.2995, "step": 5800 }, { "epoch": 1.6430995475113122, "grad_norm": 4.223090171813965, "learning_rate": 9.046003016591252e-06, "loss": 0.3111, "step": 5810 }, { "epoch": 1.6459276018099547, "grad_norm": 7.8988728523254395, "learning_rate": 9.027149321266969e-06, "loss": 0.2875, "step": 5820 }, { "epoch": 1.6487556561085972, "grad_norm": 2.9701428413391113, "learning_rate": 9.008295625942685e-06, "loss": 0.2549, "step": 5830 }, { "epoch": 1.6515837104072397, "grad_norm": 6.37022066116333, "learning_rate": 8.989441930618402e-06, "loss": 0.3591, "step": 5840 }, { "epoch": 1.6544117647058822, "grad_norm": 4.708193302154541, "learning_rate": 8.970588235294119e-06, "loss": 0.2561, "step": 5850 }, { "epoch": 1.6572398190045248, "grad_norm": 5.106235027313232, "learning_rate": 8.951734539969835e-06, "loss": 0.2546, "step": 5860 }, { "epoch": 1.6600678733031673, "grad_norm": 4.135291576385498, "learning_rate": 8.932880844645552e-06, "loss": 0.2841, "step": 5870 }, { "epoch": 1.6628959276018098, "grad_norm": 5.418251991271973, "learning_rate": 8.914027149321268e-06, "loss": 0.2606, "step": 5880 }, { "epoch": 1.6657239819004523, "grad_norm": 7.133711338043213, "learning_rate": 8.895173453996983e-06, "loss": 0.3058, "step": 5890 }, { "epoch": 1.6685520361990949, "grad_norm": 3.556772470474243, "learning_rate": 8.8763197586727e-06, "loss": 0.271, "step": 5900 }, { "epoch": 1.6713800904977374, "grad_norm": 4.334698677062988, "learning_rate": 8.857466063348417e-06, "loss": 0.2695, "step": 5910 }, { "epoch": 1.6742081447963801, "grad_norm": 5.072098731994629, "learning_rate": 8.838612368024133e-06, "loss": 0.2845, "step": 5920 }, { "epoch": 1.6770361990950227, "grad_norm": 5.321040630340576, "learning_rate": 8.81975867269985e-06, "loss": 0.3065, "step": 5930 }, { "epoch": 1.6798642533936652, "grad_norm": 3.292698860168457, "learning_rate": 8.800904977375566e-06, "loss": 0.2547, "step": 5940 }, { "epoch": 1.6826923076923077, "grad_norm": 8.568231582641602, "learning_rate": 8.782051282051283e-06, "loss": 0.2547, "step": 5950 }, { "epoch": 1.6855203619909502, "grad_norm": 5.787846088409424, "learning_rate": 8.763197586727e-06, "loss": 0.3768, "step": 5960 }, { "epoch": 1.6883484162895928, "grad_norm": 4.789765357971191, "learning_rate": 8.744343891402716e-06, "loss": 0.2252, "step": 5970 }, { "epoch": 1.6911764705882353, "grad_norm": 6.947218418121338, "learning_rate": 8.725490196078433e-06, "loss": 0.2309, "step": 5980 }, { "epoch": 1.6940045248868778, "grad_norm": 3.733675956726074, "learning_rate": 8.70663650075415e-06, "loss": 0.2422, "step": 5990 }, { "epoch": 1.6968325791855203, "grad_norm": 4.800724506378174, "learning_rate": 8.687782805429864e-06, "loss": 0.2322, "step": 6000 }, { "epoch": 1.6968325791855203, "eval_accuracy": 0.8820394527521477, "eval_loss": 0.29341939091682434, "eval_runtime": 126.3457, "eval_samples_per_second": 99.505, "eval_steps_per_second": 3.111, "step": 6000 }, { "epoch": 1.6996606334841629, "grad_norm": 2.899115800857544, "learning_rate": 8.668929110105581e-06, "loss": 0.2858, "step": 6010 }, { "epoch": 1.7024886877828054, "grad_norm": 5.119002819061279, "learning_rate": 8.650075414781298e-06, "loss": 0.2373, "step": 6020 }, { "epoch": 1.7053167420814481, "grad_norm": 4.328557968139648, "learning_rate": 8.631221719457014e-06, "loss": 0.2857, "step": 6030 }, { "epoch": 1.7081447963800906, "grad_norm": 6.154530048370361, "learning_rate": 8.612368024132731e-06, "loss": 0.2563, "step": 6040 }, { "epoch": 1.7109728506787332, "grad_norm": 2.4150142669677734, "learning_rate": 8.593514328808446e-06, "loss": 0.2766, "step": 6050 }, { "epoch": 1.7138009049773757, "grad_norm": 5.834397315979004, "learning_rate": 8.574660633484162e-06, "loss": 0.2804, "step": 6060 }, { "epoch": 1.7166289592760182, "grad_norm": 5.142675876617432, "learning_rate": 8.555806938159879e-06, "loss": 0.2573, "step": 6070 }, { "epoch": 1.7194570135746607, "grad_norm": 4.238577842712402, "learning_rate": 8.536953242835596e-06, "loss": 0.235, "step": 6080 }, { "epoch": 1.7222850678733033, "grad_norm": 4.491209506988525, "learning_rate": 8.518099547511312e-06, "loss": 0.2605, "step": 6090 }, { "epoch": 1.7251131221719458, "grad_norm": 5.393953323364258, "learning_rate": 8.499245852187029e-06, "loss": 0.2804, "step": 6100 }, { "epoch": 1.7279411764705883, "grad_norm": 4.455014228820801, "learning_rate": 8.480392156862745e-06, "loss": 0.2453, "step": 6110 }, { "epoch": 1.7307692307692308, "grad_norm": 4.781386375427246, "learning_rate": 8.461538461538462e-06, "loss": 0.2194, "step": 6120 }, { "epoch": 1.7335972850678734, "grad_norm": 5.215591907501221, "learning_rate": 8.442684766214179e-06, "loss": 0.2602, "step": 6130 }, { "epoch": 1.7364253393665159, "grad_norm": 5.542301654815674, "learning_rate": 8.423831070889895e-06, "loss": 0.3245, "step": 6140 }, { "epoch": 1.7392533936651584, "grad_norm": 2.144392967224121, "learning_rate": 8.404977375565612e-06, "loss": 0.2445, "step": 6150 }, { "epoch": 1.742081447963801, "grad_norm": 3.160285711288452, "learning_rate": 8.386123680241329e-06, "loss": 0.2702, "step": 6160 }, { "epoch": 1.7449095022624435, "grad_norm": 4.129340171813965, "learning_rate": 8.367269984917045e-06, "loss": 0.2924, "step": 6170 }, { "epoch": 1.747737556561086, "grad_norm": 4.408333778381348, "learning_rate": 8.348416289592762e-06, "loss": 0.2364, "step": 6180 }, { "epoch": 1.7505656108597285, "grad_norm": 5.696101188659668, "learning_rate": 8.329562594268478e-06, "loss": 0.2445, "step": 6190 }, { "epoch": 1.753393665158371, "grad_norm": 4.723424434661865, "learning_rate": 8.310708898944195e-06, "loss": 0.2284, "step": 6200 }, { "epoch": 1.7562217194570136, "grad_norm": 4.272291660308838, "learning_rate": 8.29185520361991e-06, "loss": 0.3189, "step": 6210 }, { "epoch": 1.759049773755656, "grad_norm": 4.042122840881348, "learning_rate": 8.273001508295627e-06, "loss": 0.2649, "step": 6220 }, { "epoch": 1.7618778280542986, "grad_norm": 1.9126514196395874, "learning_rate": 8.254147812971343e-06, "loss": 0.255, "step": 6230 }, { "epoch": 1.7647058823529411, "grad_norm": 11.250100135803223, "learning_rate": 8.23529411764706e-06, "loss": 0.302, "step": 6240 }, { "epoch": 1.7675339366515836, "grad_norm": 4.978902816772461, "learning_rate": 8.216440422322776e-06, "loss": 0.3068, "step": 6250 }, { "epoch": 1.7703619909502262, "grad_norm": 4.657087802886963, "learning_rate": 8.197586726998491e-06, "loss": 0.2641, "step": 6260 }, { "epoch": 1.7731900452488687, "grad_norm": 4.440770626068115, "learning_rate": 8.178733031674208e-06, "loss": 0.2831, "step": 6270 }, { "epoch": 1.7760180995475112, "grad_norm": 2.723531484603882, "learning_rate": 8.159879336349925e-06, "loss": 0.2247, "step": 6280 }, { "epoch": 1.7788461538461537, "grad_norm": 4.28981351852417, "learning_rate": 8.141025641025641e-06, "loss": 0.2403, "step": 6290 }, { "epoch": 1.7816742081447963, "grad_norm": 4.748565673828125, "learning_rate": 8.122171945701358e-06, "loss": 0.2126, "step": 6300 }, { "epoch": 1.7845022624434388, "grad_norm": 5.226318359375, "learning_rate": 8.103318250377074e-06, "loss": 0.3272, "step": 6310 }, { "epoch": 1.7873303167420813, "grad_norm": 2.937812089920044, "learning_rate": 8.084464555052791e-06, "loss": 0.2276, "step": 6320 }, { "epoch": 1.7901583710407238, "grad_norm": 3.215853452682495, "learning_rate": 8.065610859728508e-06, "loss": 0.2406, "step": 6330 }, { "epoch": 1.7929864253393664, "grad_norm": 6.499160289764404, "learning_rate": 8.046757164404224e-06, "loss": 0.2915, "step": 6340 }, { "epoch": 1.7958144796380089, "grad_norm": 3.940803289413452, "learning_rate": 8.027903469079941e-06, "loss": 0.276, "step": 6350 }, { "epoch": 1.7986425339366516, "grad_norm": 2.177950859069824, "learning_rate": 8.009049773755657e-06, "loss": 0.2558, "step": 6360 }, { "epoch": 1.8014705882352942, "grad_norm": 7.705915451049805, "learning_rate": 7.990196078431374e-06, "loss": 0.2799, "step": 6370 }, { "epoch": 1.8042986425339367, "grad_norm": 5.586729526519775, "learning_rate": 7.97134238310709e-06, "loss": 0.224, "step": 6380 }, { "epoch": 1.8071266968325792, "grad_norm": 2.9311821460723877, "learning_rate": 7.952488687782806e-06, "loss": 0.2316, "step": 6390 }, { "epoch": 1.8099547511312217, "grad_norm": 3.5633130073547363, "learning_rate": 7.933634992458522e-06, "loss": 0.2298, "step": 6400 }, { "epoch": 1.8127828054298643, "grad_norm": 3.4238994121551514, "learning_rate": 7.914781297134239e-06, "loss": 0.2647, "step": 6410 }, { "epoch": 1.8156108597285068, "grad_norm": 9.544416427612305, "learning_rate": 7.895927601809955e-06, "loss": 0.3275, "step": 6420 }, { "epoch": 1.8184389140271493, "grad_norm": 2.8148701190948486, "learning_rate": 7.877073906485672e-06, "loss": 0.2131, "step": 6430 }, { "epoch": 1.8212669683257918, "grad_norm": 5.6752777099609375, "learning_rate": 7.858220211161389e-06, "loss": 0.3065, "step": 6440 }, { "epoch": 1.8240950226244343, "grad_norm": 6.207758903503418, "learning_rate": 7.839366515837105e-06, "loss": 0.3369, "step": 6450 }, { "epoch": 1.8269230769230769, "grad_norm": 2.1755306720733643, "learning_rate": 7.820512820512822e-06, "loss": 0.24, "step": 6460 }, { "epoch": 1.8297511312217196, "grad_norm": 4.380761623382568, "learning_rate": 7.801659125188537e-06, "loss": 0.2621, "step": 6470 }, { "epoch": 1.8325791855203621, "grad_norm": 7.944891452789307, "learning_rate": 7.782805429864253e-06, "loss": 0.2421, "step": 6480 }, { "epoch": 1.8354072398190047, "grad_norm": 6.696594715118408, "learning_rate": 7.76395173453997e-06, "loss": 0.2179, "step": 6490 }, { "epoch": 1.8382352941176472, "grad_norm": 5.534007549285889, "learning_rate": 7.745098039215687e-06, "loss": 0.2465, "step": 6500 }, { "epoch": 1.8410633484162897, "grad_norm": 4.6053290367126465, "learning_rate": 7.726244343891403e-06, "loss": 0.3311, "step": 6510 }, { "epoch": 1.8438914027149322, "grad_norm": 3.2913260459899902, "learning_rate": 7.70739064856712e-06, "loss": 0.2535, "step": 6520 }, { "epoch": 1.8467194570135748, "grad_norm": 5.70173454284668, "learning_rate": 7.688536953242837e-06, "loss": 0.2283, "step": 6530 }, { "epoch": 1.8495475113122173, "grad_norm": 6.683012962341309, "learning_rate": 7.669683257918553e-06, "loss": 0.2293, "step": 6540 }, { "epoch": 1.8523755656108598, "grad_norm": 4.2895612716674805, "learning_rate": 7.650829562594268e-06, "loss": 0.2013, "step": 6550 }, { "epoch": 1.8552036199095023, "grad_norm": 2.8891239166259766, "learning_rate": 7.631975867269985e-06, "loss": 0.2482, "step": 6560 }, { "epoch": 1.8580316742081449, "grad_norm": 5.462761402130127, "learning_rate": 7.613122171945701e-06, "loss": 0.3063, "step": 6570 }, { "epoch": 1.8608597285067874, "grad_norm": 4.3543806076049805, "learning_rate": 7.594268476621418e-06, "loss": 0.2519, "step": 6580 }, { "epoch": 1.86368778280543, "grad_norm": 5.1229681968688965, "learning_rate": 7.5754147812971346e-06, "loss": 0.2968, "step": 6590 }, { "epoch": 1.8665158371040724, "grad_norm": 1.8585267066955566, "learning_rate": 7.556561085972851e-06, "loss": 0.2208, "step": 6600 }, { "epoch": 1.869343891402715, "grad_norm": 4.255302429199219, "learning_rate": 7.537707390648568e-06, "loss": 0.2968, "step": 6610 }, { "epoch": 1.8721719457013575, "grad_norm": 4.815881729125977, "learning_rate": 7.518853695324284e-06, "loss": 0.3433, "step": 6620 }, { "epoch": 1.875, "grad_norm": 6.812479496002197, "learning_rate": 7.500000000000001e-06, "loss": 0.311, "step": 6630 }, { "epoch": 1.8778280542986425, "grad_norm": 3.9199917316436768, "learning_rate": 7.481146304675717e-06, "loss": 0.2767, "step": 6640 }, { "epoch": 1.880656108597285, "grad_norm": 4.117010593414307, "learning_rate": 7.462292609351433e-06, "loss": 0.2858, "step": 6650 }, { "epoch": 1.8834841628959276, "grad_norm": 4.636374473571777, "learning_rate": 7.44343891402715e-06, "loss": 0.2043, "step": 6660 }, { "epoch": 1.88631221719457, "grad_norm": 5.478713512420654, "learning_rate": 7.424585218702867e-06, "loss": 0.288, "step": 6670 }, { "epoch": 1.8891402714932126, "grad_norm": 4.690084457397461, "learning_rate": 7.405731523378583e-06, "loss": 0.2651, "step": 6680 }, { "epoch": 1.8919683257918551, "grad_norm": 2.4495575428009033, "learning_rate": 7.3868778280543e-06, "loss": 0.2651, "step": 6690 }, { "epoch": 1.8947963800904977, "grad_norm": 5.4684672355651855, "learning_rate": 7.3680241327300165e-06, "loss": 0.2834, "step": 6700 }, { "epoch": 1.8976244343891402, "grad_norm": 1.9919039011001587, "learning_rate": 7.349170437405732e-06, "loss": 0.2021, "step": 6710 }, { "epoch": 1.9004524886877827, "grad_norm": 4.975834846496582, "learning_rate": 7.330316742081448e-06, "loss": 0.3194, "step": 6720 }, { "epoch": 1.9032805429864252, "grad_norm": 4.014176368713379, "learning_rate": 7.311463046757165e-06, "loss": 0.2251, "step": 6730 }, { "epoch": 1.9061085972850678, "grad_norm": 7.0189409255981445, "learning_rate": 7.292609351432881e-06, "loss": 0.3062, "step": 6740 }, { "epoch": 1.9089366515837103, "grad_norm": 7.0651350021362305, "learning_rate": 7.273755656108598e-06, "loss": 0.2488, "step": 6750 }, { "epoch": 1.9117647058823528, "grad_norm": 7.110829830169678, "learning_rate": 7.2549019607843145e-06, "loss": 0.2226, "step": 6760 }, { "epoch": 1.9145927601809953, "grad_norm": 8.122304916381836, "learning_rate": 7.23604826546003e-06, "loss": 0.2236, "step": 6770 }, { "epoch": 1.9174208144796379, "grad_norm": 4.817609786987305, "learning_rate": 7.217194570135747e-06, "loss": 0.2935, "step": 6780 }, { "epoch": 1.9202488687782804, "grad_norm": 3.6452667713165283, "learning_rate": 7.1983408748114635e-06, "loss": 0.2711, "step": 6790 }, { "epoch": 1.9230769230769231, "grad_norm": 5.04451847076416, "learning_rate": 7.17948717948718e-06, "loss": 0.3383, "step": 6800 }, { "epoch": 1.9259049773755657, "grad_norm": 3.0769617557525635, "learning_rate": 7.160633484162897e-06, "loss": 0.2481, "step": 6810 }, { "epoch": 1.9287330316742082, "grad_norm": 2.4666669368743896, "learning_rate": 7.141779788838613e-06, "loss": 0.2713, "step": 6820 }, { "epoch": 1.9315610859728507, "grad_norm": 6.22195291519165, "learning_rate": 7.12292609351433e-06, "loss": 0.253, "step": 6830 }, { "epoch": 1.9343891402714932, "grad_norm": 5.916505336761475, "learning_rate": 7.104072398190046e-06, "loss": 0.3023, "step": 6840 }, { "epoch": 1.9372171945701357, "grad_norm": 3.696983575820923, "learning_rate": 7.085218702865762e-06, "loss": 0.3176, "step": 6850 }, { "epoch": 1.9400452488687783, "grad_norm": 4.350560665130615, "learning_rate": 7.066365007541479e-06, "loss": 0.2488, "step": 6860 }, { "epoch": 1.9428733031674208, "grad_norm": 4.9616498947143555, "learning_rate": 7.047511312217196e-06, "loss": 0.2901, "step": 6870 }, { "epoch": 1.9457013574660633, "grad_norm": 2.2549595832824707, "learning_rate": 7.028657616892911e-06, "loss": 0.2526, "step": 6880 }, { "epoch": 1.9485294117647058, "grad_norm": 3.205310821533203, "learning_rate": 7.009803921568628e-06, "loss": 0.2819, "step": 6890 }, { "epoch": 1.9513574660633484, "grad_norm": 5.102742671966553, "learning_rate": 6.990950226244344e-06, "loss": 0.2573, "step": 6900 }, { "epoch": 1.9541855203619911, "grad_norm": 2.78604793548584, "learning_rate": 6.97209653092006e-06, "loss": 0.1702, "step": 6910 }, { "epoch": 1.9570135746606336, "grad_norm": 3.8111801147460938, "learning_rate": 6.953242835595777e-06, "loss": 0.2963, "step": 6920 }, { "epoch": 1.9598416289592762, "grad_norm": 4.204692363739014, "learning_rate": 6.934389140271494e-06, "loss": 0.2989, "step": 6930 }, { "epoch": 1.9626696832579187, "grad_norm": 3.3682045936584473, "learning_rate": 6.91553544494721e-06, "loss": 0.2744, "step": 6940 }, { "epoch": 1.9654977375565612, "grad_norm": 5.661670207977295, "learning_rate": 6.896681749622927e-06, "loss": 0.27, "step": 6950 }, { "epoch": 1.9683257918552037, "grad_norm": 3.925750494003296, "learning_rate": 6.8778280542986434e-06, "loss": 0.2711, "step": 6960 }, { "epoch": 1.9711538461538463, "grad_norm": 5.467376232147217, "learning_rate": 6.858974358974359e-06, "loss": 0.3182, "step": 6970 }, { "epoch": 1.9739819004524888, "grad_norm": 7.46327543258667, "learning_rate": 6.840120663650076e-06, "loss": 0.336, "step": 6980 }, { "epoch": 1.9768099547511313, "grad_norm": 4.464349269866943, "learning_rate": 6.8212669683257924e-06, "loss": 0.333, "step": 6990 }, { "epoch": 1.9796380090497738, "grad_norm": 5.0763421058654785, "learning_rate": 6.802413273001509e-06, "loss": 0.2332, "step": 7000 }, { "epoch": 1.9796380090497738, "eval_accuracy": 0.8868119630925867, "eval_loss": 0.2794936001300812, "eval_runtime": 126.4211, "eval_samples_per_second": 99.445, "eval_steps_per_second": 3.109, "step": 7000 }, { "epoch": 1.9824660633484164, "grad_norm": 4.514822483062744, "learning_rate": 6.783559577677226e-06, "loss": 0.3259, "step": 7010 }, { "epoch": 1.9852941176470589, "grad_norm": 3.9309160709381104, "learning_rate": 6.764705882352942e-06, "loss": 0.2671, "step": 7020 }, { "epoch": 1.9881221719457014, "grad_norm": 3.7512924671173096, "learning_rate": 6.745852187028659e-06, "loss": 0.3025, "step": 7030 }, { "epoch": 1.990950226244344, "grad_norm": 5.162522792816162, "learning_rate": 6.7269984917043755e-06, "loss": 0.2556, "step": 7040 }, { "epoch": 1.9937782805429864, "grad_norm": 5.968090534210205, "learning_rate": 6.7081447963800904e-06, "loss": 0.245, "step": 7050 }, { "epoch": 1.996606334841629, "grad_norm": 7.264348983764648, "learning_rate": 6.689291101055807e-06, "loss": 0.274, "step": 7060 }, { "epoch": 1.9994343891402715, "grad_norm": 4.840837478637695, "learning_rate": 6.670437405731524e-06, "loss": 0.2381, "step": 7070 }, { "epoch": 2.002262443438914, "grad_norm": 3.3212857246398926, "learning_rate": 6.65158371040724e-06, "loss": 0.2576, "step": 7080 }, { "epoch": 2.0050904977375565, "grad_norm": 6.3086419105529785, "learning_rate": 6.632730015082957e-06, "loss": 0.2471, "step": 7090 }, { "epoch": 2.007918552036199, "grad_norm": 2.5110299587249756, "learning_rate": 6.613876319758673e-06, "loss": 0.2414, "step": 7100 }, { "epoch": 2.0107466063348416, "grad_norm": 4.115811824798584, "learning_rate": 6.595022624434389e-06, "loss": 0.1715, "step": 7110 }, { "epoch": 2.013574660633484, "grad_norm": 5.045820236206055, "learning_rate": 6.576168929110106e-06, "loss": 0.2494, "step": 7120 }, { "epoch": 2.0164027149321266, "grad_norm": 4.6321845054626465, "learning_rate": 6.5573152337858225e-06, "loss": 0.222, "step": 7130 }, { "epoch": 2.019230769230769, "grad_norm": 5.135430335998535, "learning_rate": 6.538461538461539e-06, "loss": 0.2206, "step": 7140 }, { "epoch": 2.0220588235294117, "grad_norm": 4.786893367767334, "learning_rate": 6.519607843137256e-06, "loss": 0.229, "step": 7150 }, { "epoch": 2.024886877828054, "grad_norm": 3.568856716156006, "learning_rate": 6.500754147812972e-06, "loss": 0.2235, "step": 7160 }, { "epoch": 2.0277149321266967, "grad_norm": 6.938755989074707, "learning_rate": 6.481900452488689e-06, "loss": 0.23, "step": 7170 }, { "epoch": 2.0305429864253393, "grad_norm": 4.014111042022705, "learning_rate": 6.463046757164405e-06, "loss": 0.2076, "step": 7180 }, { "epoch": 2.033371040723982, "grad_norm": 5.143094062805176, "learning_rate": 6.444193061840121e-06, "loss": 0.3276, "step": 7190 }, { "epoch": 2.0361990950226243, "grad_norm": 4.8052191734313965, "learning_rate": 6.425339366515838e-06, "loss": 0.2223, "step": 7200 }, { "epoch": 2.039027149321267, "grad_norm": 6.07175874710083, "learning_rate": 6.406485671191555e-06, "loss": 0.2514, "step": 7210 }, { "epoch": 2.0418552036199094, "grad_norm": 3.0855891704559326, "learning_rate": 6.38763197586727e-06, "loss": 0.2043, "step": 7220 }, { "epoch": 2.044683257918552, "grad_norm": 5.760570049285889, "learning_rate": 6.368778280542986e-06, "loss": 0.2051, "step": 7230 }, { "epoch": 2.0475113122171944, "grad_norm": 5.127667427062988, "learning_rate": 6.349924585218703e-06, "loss": 0.2141, "step": 7240 }, { "epoch": 2.050339366515837, "grad_norm": 2.886842727661133, "learning_rate": 6.331070889894419e-06, "loss": 0.1705, "step": 7250 }, { "epoch": 2.0531674208144794, "grad_norm": 5.108696937561035, "learning_rate": 6.312217194570136e-06, "loss": 0.2737, "step": 7260 }, { "epoch": 2.055995475113122, "grad_norm": 7.453789234161377, "learning_rate": 6.293363499245853e-06, "loss": 0.288, "step": 7270 }, { "epoch": 2.0588235294117645, "grad_norm": 3.700695514678955, "learning_rate": 6.274509803921569e-06, "loss": 0.2087, "step": 7280 }, { "epoch": 2.0616515837104075, "grad_norm": 3.475170612335205, "learning_rate": 6.255656108597286e-06, "loss": 0.182, "step": 7290 }, { "epoch": 2.06447963800905, "grad_norm": 3.636042833328247, "learning_rate": 6.2368024132730024e-06, "loss": 0.1856, "step": 7300 }, { "epoch": 2.0673076923076925, "grad_norm": 4.326310157775879, "learning_rate": 6.217948717948718e-06, "loss": 0.2071, "step": 7310 }, { "epoch": 2.070135746606335, "grad_norm": 4.5239105224609375, "learning_rate": 6.199095022624435e-06, "loss": 0.2045, "step": 7320 }, { "epoch": 2.0729638009049776, "grad_norm": 5.962629318237305, "learning_rate": 6.1802413273001514e-06, "loss": 0.2236, "step": 7330 }, { "epoch": 2.07579185520362, "grad_norm": 6.830577373504639, "learning_rate": 6.161387631975868e-06, "loss": 0.2435, "step": 7340 }, { "epoch": 2.0786199095022626, "grad_norm": 6.650877952575684, "learning_rate": 6.142533936651585e-06, "loss": 0.2273, "step": 7350 }, { "epoch": 2.081447963800905, "grad_norm": 9.387392044067383, "learning_rate": 6.123680241327301e-06, "loss": 0.2265, "step": 7360 }, { "epoch": 2.0842760180995477, "grad_norm": 7.404173374176025, "learning_rate": 6.104826546003018e-06, "loss": 0.1513, "step": 7370 }, { "epoch": 2.08710407239819, "grad_norm": 3.4944663047790527, "learning_rate": 6.085972850678733e-06, "loss": 0.2339, "step": 7380 }, { "epoch": 2.0899321266968327, "grad_norm": 3.5213699340820312, "learning_rate": 6.0671191553544494e-06, "loss": 0.2839, "step": 7390 }, { "epoch": 2.0927601809954752, "grad_norm": 4.182003974914551, "learning_rate": 6.048265460030166e-06, "loss": 0.2125, "step": 7400 }, { "epoch": 2.0955882352941178, "grad_norm": 6.472683429718018, "learning_rate": 6.029411764705883e-06, "loss": 0.1934, "step": 7410 }, { "epoch": 2.0984162895927603, "grad_norm": 3.89056658744812, "learning_rate": 6.010558069381599e-06, "loss": 0.1829, "step": 7420 }, { "epoch": 2.101244343891403, "grad_norm": 6.370733261108398, "learning_rate": 5.991704374057316e-06, "loss": 0.1888, "step": 7430 }, { "epoch": 2.1040723981900453, "grad_norm": 6.549925327301025, "learning_rate": 5.972850678733032e-06, "loss": 0.2399, "step": 7440 }, { "epoch": 2.106900452488688, "grad_norm": 6.536769866943359, "learning_rate": 5.953996983408748e-06, "loss": 0.2937, "step": 7450 }, { "epoch": 2.1097285067873304, "grad_norm": 5.718851566314697, "learning_rate": 5.935143288084465e-06, "loss": 0.1983, "step": 7460 }, { "epoch": 2.112556561085973, "grad_norm": 6.838066577911377, "learning_rate": 5.9162895927601815e-06, "loss": 0.2941, "step": 7470 }, { "epoch": 2.1153846153846154, "grad_norm": 3.4056811332702637, "learning_rate": 5.897435897435898e-06, "loss": 0.2191, "step": 7480 }, { "epoch": 2.118212669683258, "grad_norm": 5.439931392669678, "learning_rate": 5.878582202111615e-06, "loss": 0.2095, "step": 7490 }, { "epoch": 2.1210407239819005, "grad_norm": 6.081836700439453, "learning_rate": 5.859728506787331e-06, "loss": 0.1964, "step": 7500 }, { "epoch": 2.123868778280543, "grad_norm": 2.3146896362304688, "learning_rate": 5.840874811463048e-06, "loss": 0.266, "step": 7510 }, { "epoch": 2.1266968325791855, "grad_norm": 2.6987674236297607, "learning_rate": 5.822021116138764e-06, "loss": 0.2508, "step": 7520 }, { "epoch": 2.129524886877828, "grad_norm": 4.278384208679199, "learning_rate": 5.80316742081448e-06, "loss": 0.1764, "step": 7530 }, { "epoch": 2.1323529411764706, "grad_norm": 6.95686674118042, "learning_rate": 5.784313725490197e-06, "loss": 0.274, "step": 7540 }, { "epoch": 2.135180995475113, "grad_norm": 3.3586158752441406, "learning_rate": 5.765460030165913e-06, "loss": 0.2624, "step": 7550 }, { "epoch": 2.1380090497737556, "grad_norm": 3.704134702682495, "learning_rate": 5.746606334841629e-06, "loss": 0.2229, "step": 7560 }, { "epoch": 2.140837104072398, "grad_norm": 6.012093544006348, "learning_rate": 5.727752639517345e-06, "loss": 0.2215, "step": 7570 }, { "epoch": 2.1436651583710407, "grad_norm": 4.300053596496582, "learning_rate": 5.708898944193062e-06, "loss": 0.199, "step": 7580 }, { "epoch": 2.146493212669683, "grad_norm": 7.028651714324951, "learning_rate": 5.690045248868778e-06, "loss": 0.22, "step": 7590 }, { "epoch": 2.1493212669683257, "grad_norm": 5.363503456115723, "learning_rate": 5.671191553544495e-06, "loss": 0.1895, "step": 7600 }, { "epoch": 2.1521493212669682, "grad_norm": 4.580994129180908, "learning_rate": 5.652337858220212e-06, "loss": 0.1713, "step": 7610 }, { "epoch": 2.1549773755656108, "grad_norm": 7.074058532714844, "learning_rate": 5.633484162895928e-06, "loss": 0.2861, "step": 7620 }, { "epoch": 2.1578054298642533, "grad_norm": 6.180254936218262, "learning_rate": 5.614630467571645e-06, "loss": 0.2316, "step": 7630 }, { "epoch": 2.160633484162896, "grad_norm": 9.370762825012207, "learning_rate": 5.5957767722473614e-06, "loss": 0.2717, "step": 7640 }, { "epoch": 2.1634615384615383, "grad_norm": 4.996572017669678, "learning_rate": 5.576923076923077e-06, "loss": 0.2513, "step": 7650 }, { "epoch": 2.166289592760181, "grad_norm": 6.018435478210449, "learning_rate": 5.558069381598794e-06, "loss": 0.2279, "step": 7660 }, { "epoch": 2.1691176470588234, "grad_norm": 4.290647983551025, "learning_rate": 5.5392156862745104e-06, "loss": 0.2459, "step": 7670 }, { "epoch": 2.171945701357466, "grad_norm": 3.902825117111206, "learning_rate": 5.520361990950227e-06, "loss": 0.2181, "step": 7680 }, { "epoch": 2.1747737556561084, "grad_norm": 2.4550859928131104, "learning_rate": 5.501508295625944e-06, "loss": 0.2309, "step": 7690 }, { "epoch": 2.177601809954751, "grad_norm": 3.8267788887023926, "learning_rate": 5.48265460030166e-06, "loss": 0.2444, "step": 7700 }, { "epoch": 2.1804298642533935, "grad_norm": 2.1368167400360107, "learning_rate": 5.463800904977375e-06, "loss": 0.2044, "step": 7710 }, { "epoch": 2.183257918552036, "grad_norm": 4.121007919311523, "learning_rate": 5.444947209653092e-06, "loss": 0.193, "step": 7720 }, { "epoch": 2.1860859728506785, "grad_norm": 1.0247951745986938, "learning_rate": 5.4260935143288084e-06, "loss": 0.2452, "step": 7730 }, { "epoch": 2.1889140271493215, "grad_norm": 6.7461323738098145, "learning_rate": 5.407239819004525e-06, "loss": 0.2341, "step": 7740 }, { "epoch": 2.191742081447964, "grad_norm": 3.962465286254883, "learning_rate": 5.388386123680242e-06, "loss": 0.1699, "step": 7750 }, { "epoch": 2.1945701357466065, "grad_norm": 3.7287843227386475, "learning_rate": 5.369532428355958e-06, "loss": 0.1768, "step": 7760 }, { "epoch": 2.197398190045249, "grad_norm": 3.93239426612854, "learning_rate": 5.350678733031675e-06, "loss": 0.2383, "step": 7770 }, { "epoch": 2.2002262443438916, "grad_norm": 5.207613468170166, "learning_rate": 5.331825037707391e-06, "loss": 0.2282, "step": 7780 }, { "epoch": 2.203054298642534, "grad_norm": 3.9662837982177734, "learning_rate": 5.312971342383107e-06, "loss": 0.1616, "step": 7790 }, { "epoch": 2.2058823529411766, "grad_norm": 4.898771286010742, "learning_rate": 5.294117647058824e-06, "loss": 0.2013, "step": 7800 }, { "epoch": 2.208710407239819, "grad_norm": 7.645010948181152, "learning_rate": 5.2752639517345405e-06, "loss": 0.2478, "step": 7810 }, { "epoch": 2.2115384615384617, "grad_norm": 2.4150936603546143, "learning_rate": 5.256410256410257e-06, "loss": 0.1754, "step": 7820 }, { "epoch": 2.214366515837104, "grad_norm": 1.881043791770935, "learning_rate": 5.237556561085974e-06, "loss": 0.264, "step": 7830 }, { "epoch": 2.2171945701357467, "grad_norm": 6.877952575683594, "learning_rate": 5.21870286576169e-06, "loss": 0.2879, "step": 7840 }, { "epoch": 2.2200226244343892, "grad_norm": 3.3370893001556396, "learning_rate": 5.199849170437406e-06, "loss": 0.2312, "step": 7850 }, { "epoch": 2.2228506787330318, "grad_norm": 4.1501545906066895, "learning_rate": 5.180995475113123e-06, "loss": 0.2129, "step": 7860 }, { "epoch": 2.2256787330316743, "grad_norm": 4.085570335388184, "learning_rate": 5.162141779788839e-06, "loss": 0.1647, "step": 7870 }, { "epoch": 2.228506787330317, "grad_norm": 4.05198335647583, "learning_rate": 5.143288084464555e-06, "loss": 0.2338, "step": 7880 }, { "epoch": 2.2313348416289593, "grad_norm": 3.9560508728027344, "learning_rate": 5.124434389140272e-06, "loss": 0.3062, "step": 7890 }, { "epoch": 2.234162895927602, "grad_norm": 2.1549770832061768, "learning_rate": 5.105580693815988e-06, "loss": 0.2259, "step": 7900 }, { "epoch": 2.2369909502262444, "grad_norm": 2.7982289791107178, "learning_rate": 5.086726998491704e-06, "loss": 0.1782, "step": 7910 }, { "epoch": 2.239819004524887, "grad_norm": 4.951447010040283, "learning_rate": 5.067873303167421e-06, "loss": 0.2604, "step": 7920 }, { "epoch": 2.2426470588235294, "grad_norm": 5.907583713531494, "learning_rate": 5.049019607843137e-06, "loss": 0.2447, "step": 7930 }, { "epoch": 2.245475113122172, "grad_norm": 5.986253261566162, "learning_rate": 5.030165912518854e-06, "loss": 0.2829, "step": 7940 }, { "epoch": 2.2483031674208145, "grad_norm": 4.330525875091553, "learning_rate": 5.011312217194571e-06, "loss": 0.1908, "step": 7950 }, { "epoch": 2.251131221719457, "grad_norm": 5.337680816650391, "learning_rate": 4.992458521870287e-06, "loss": 0.2539, "step": 7960 }, { "epoch": 2.2539592760180995, "grad_norm": 7.187500476837158, "learning_rate": 4.973604826546004e-06, "loss": 0.2405, "step": 7970 }, { "epoch": 2.256787330316742, "grad_norm": 5.105306625366211, "learning_rate": 4.95475113122172e-06, "loss": 0.2616, "step": 7980 }, { "epoch": 2.2596153846153846, "grad_norm": 4.068017482757568, "learning_rate": 4.935897435897436e-06, "loss": 0.2233, "step": 7990 }, { "epoch": 2.262443438914027, "grad_norm": 2.9654664993286133, "learning_rate": 4.917043740573153e-06, "loss": 0.2187, "step": 8000 }, { "epoch": 2.262443438914027, "eval_accuracy": 0.8858574610244989, "eval_loss": 0.29285645484924316, "eval_runtime": 126.4151, "eval_samples_per_second": 99.45, "eval_steps_per_second": 3.109, "step": 8000 }, { "epoch": 2.2652714932126696, "grad_norm": 4.892025470733643, "learning_rate": 4.898190045248869e-06, "loss": 0.223, "step": 8010 }, { "epoch": 2.268099547511312, "grad_norm": 6.540407657623291, "learning_rate": 4.879336349924585e-06, "loss": 0.2356, "step": 8020 }, { "epoch": 2.2709276018099547, "grad_norm": 4.254669666290283, "learning_rate": 4.860482654600302e-06, "loss": 0.2295, "step": 8030 }, { "epoch": 2.273755656108597, "grad_norm": 2.9539434909820557, "learning_rate": 4.8416289592760185e-06, "loss": 0.2617, "step": 8040 }, { "epoch": 2.2765837104072397, "grad_norm": 6.981826305389404, "learning_rate": 4.822775263951735e-06, "loss": 0.2911, "step": 8050 }, { "epoch": 2.2794117647058822, "grad_norm": 4.400992393493652, "learning_rate": 4.803921568627452e-06, "loss": 0.2384, "step": 8060 }, { "epoch": 2.2822398190045248, "grad_norm": 6.687214374542236, "learning_rate": 4.785067873303168e-06, "loss": 0.2139, "step": 8070 }, { "epoch": 2.2850678733031673, "grad_norm": 2.111176013946533, "learning_rate": 4.766214177978885e-06, "loss": 0.2223, "step": 8080 }, { "epoch": 2.28789592760181, "grad_norm": 7.312646389007568, "learning_rate": 4.747360482654601e-06, "loss": 0.2631, "step": 8090 }, { "epoch": 2.2907239819004523, "grad_norm": 5.643038749694824, "learning_rate": 4.728506787330317e-06, "loss": 0.179, "step": 8100 }, { "epoch": 2.293552036199095, "grad_norm": 8.725652694702148, "learning_rate": 4.709653092006033e-06, "loss": 0.2362, "step": 8110 }, { "epoch": 2.2963800904977374, "grad_norm": 6.781122207641602, "learning_rate": 4.69079939668175e-06, "loss": 0.205, "step": 8120 }, { "epoch": 2.29920814479638, "grad_norm": 0.9392467141151428, "learning_rate": 4.671945701357466e-06, "loss": 0.2181, "step": 8130 }, { "epoch": 2.3020361990950224, "grad_norm": 1.8741260766983032, "learning_rate": 4.653092006033183e-06, "loss": 0.1588, "step": 8140 }, { "epoch": 2.3048642533936654, "grad_norm": 5.825664520263672, "learning_rate": 4.6342383107088995e-06, "loss": 0.2214, "step": 8150 }, { "epoch": 2.3076923076923075, "grad_norm": 4.3385701179504395, "learning_rate": 4.615384615384616e-06, "loss": 0.2024, "step": 8160 }, { "epoch": 2.3105203619909505, "grad_norm": 5.437368869781494, "learning_rate": 4.596530920060332e-06, "loss": 0.2341, "step": 8170 }, { "epoch": 2.3133484162895925, "grad_norm": 5.2032270431518555, "learning_rate": 4.5776772247360485e-06, "loss": 0.2639, "step": 8180 }, { "epoch": 2.3161764705882355, "grad_norm": 4.702691555023193, "learning_rate": 4.558823529411765e-06, "loss": 0.2153, "step": 8190 }, { "epoch": 2.3190045248868776, "grad_norm": 3.5364975929260254, "learning_rate": 4.539969834087482e-06, "loss": 0.1909, "step": 8200 }, { "epoch": 2.3218325791855206, "grad_norm": 2.7947473526000977, "learning_rate": 4.521116138763198e-06, "loss": 0.216, "step": 8210 }, { "epoch": 2.324660633484163, "grad_norm": 8.211967468261719, "learning_rate": 4.502262443438914e-06, "loss": 0.2122, "step": 8220 }, { "epoch": 2.3274886877828056, "grad_norm": 3.7828614711761475, "learning_rate": 4.483408748114631e-06, "loss": 0.2741, "step": 8230 }, { "epoch": 2.330316742081448, "grad_norm": 5.757340908050537, "learning_rate": 4.464555052790347e-06, "loss": 0.2854, "step": 8240 }, { "epoch": 2.3331447963800906, "grad_norm": 4.723744869232178, "learning_rate": 4.445701357466063e-06, "loss": 0.2508, "step": 8250 }, { "epoch": 2.335972850678733, "grad_norm": 4.520774841308594, "learning_rate": 4.42684766214178e-06, "loss": 0.2414, "step": 8260 }, { "epoch": 2.3388009049773757, "grad_norm": 4.983455181121826, "learning_rate": 4.407993966817496e-06, "loss": 0.2414, "step": 8270 }, { "epoch": 2.341628959276018, "grad_norm": 6.122417449951172, "learning_rate": 4.389140271493213e-06, "loss": 0.2177, "step": 8280 }, { "epoch": 2.3444570135746607, "grad_norm": 2.776017189025879, "learning_rate": 4.37028657616893e-06, "loss": 0.2133, "step": 8290 }, { "epoch": 2.3472850678733033, "grad_norm": 7.429429054260254, "learning_rate": 4.351432880844646e-06, "loss": 0.1915, "step": 8300 }, { "epoch": 2.350113122171946, "grad_norm": 7.583387851715088, "learning_rate": 4.332579185520363e-06, "loss": 0.2396, "step": 8310 }, { "epoch": 2.3529411764705883, "grad_norm": 8.560108184814453, "learning_rate": 4.313725490196079e-06, "loss": 0.2364, "step": 8320 }, { "epoch": 2.355769230769231, "grad_norm": 2.898757219314575, "learning_rate": 4.294871794871795e-06, "loss": 0.2685, "step": 8330 }, { "epoch": 2.3585972850678734, "grad_norm": 5.2947564125061035, "learning_rate": 4.276018099547512e-06, "loss": 0.2222, "step": 8340 }, { "epoch": 2.361425339366516, "grad_norm": 2.573645830154419, "learning_rate": 4.257164404223228e-06, "loss": 0.2335, "step": 8350 }, { "epoch": 2.3642533936651584, "grad_norm": 6.62631368637085, "learning_rate": 4.238310708898944e-06, "loss": 0.2325, "step": 8360 }, { "epoch": 2.367081447963801, "grad_norm": 5.814454555511475, "learning_rate": 4.219457013574661e-06, "loss": 0.2538, "step": 8370 }, { "epoch": 2.3699095022624435, "grad_norm": 6.129361152648926, "learning_rate": 4.2006033182503775e-06, "loss": 0.2395, "step": 8380 }, { "epoch": 2.372737556561086, "grad_norm": 5.893956184387207, "learning_rate": 4.181749622926094e-06, "loss": 0.2651, "step": 8390 }, { "epoch": 2.3755656108597285, "grad_norm": 6.977567672729492, "learning_rate": 4.162895927601811e-06, "loss": 0.2575, "step": 8400 }, { "epoch": 2.378393665158371, "grad_norm": 1.8976235389709473, "learning_rate": 4.144042232277527e-06, "loss": 0.199, "step": 8410 }, { "epoch": 2.3812217194570136, "grad_norm": 1.1803913116455078, "learning_rate": 4.125188536953243e-06, "loss": 0.2826, "step": 8420 }, { "epoch": 2.384049773755656, "grad_norm": 4.858994483947754, "learning_rate": 4.10633484162896e-06, "loss": 0.1937, "step": 8430 }, { "epoch": 2.3868778280542986, "grad_norm": 3.6424715518951416, "learning_rate": 4.087481146304676e-06, "loss": 0.2383, "step": 8440 }, { "epoch": 2.389705882352941, "grad_norm": 4.879428863525391, "learning_rate": 4.068627450980392e-06, "loss": 0.2187, "step": 8450 }, { "epoch": 2.3925339366515836, "grad_norm": 4.588160991668701, "learning_rate": 4.049773755656109e-06, "loss": 0.2134, "step": 8460 }, { "epoch": 2.395361990950226, "grad_norm": 3.9123332500457764, "learning_rate": 4.030920060331825e-06, "loss": 0.1968, "step": 8470 }, { "epoch": 2.3981900452488687, "grad_norm": 6.140926361083984, "learning_rate": 4.012066365007542e-06, "loss": 0.2356, "step": 8480 }, { "epoch": 2.401018099547511, "grad_norm": 2.6923718452453613, "learning_rate": 3.9932126696832585e-06, "loss": 0.2502, "step": 8490 }, { "epoch": 2.4038461538461537, "grad_norm": 3.490473508834839, "learning_rate": 3.974358974358974e-06, "loss": 0.2253, "step": 8500 }, { "epoch": 2.4066742081447963, "grad_norm": 3.2556686401367188, "learning_rate": 3.955505279034691e-06, "loss": 0.2228, "step": 8510 }, { "epoch": 2.409502262443439, "grad_norm": 5.598496437072754, "learning_rate": 3.9366515837104075e-06, "loss": 0.234, "step": 8520 }, { "epoch": 2.4123303167420813, "grad_norm": 4.937731742858887, "learning_rate": 3.917797888386124e-06, "loss": 0.2064, "step": 8530 }, { "epoch": 2.415158371040724, "grad_norm": 2.0519907474517822, "learning_rate": 3.898944193061841e-06, "loss": 0.2148, "step": 8540 }, { "epoch": 2.4179864253393664, "grad_norm": 4.925931453704834, "learning_rate": 3.880090497737557e-06, "loss": 0.2406, "step": 8550 }, { "epoch": 2.420814479638009, "grad_norm": 3.878779172897339, "learning_rate": 3.861236802413273e-06, "loss": 0.2159, "step": 8560 }, { "epoch": 2.4236425339366514, "grad_norm": 5.424575328826904, "learning_rate": 3.84238310708899e-06, "loss": 0.2202, "step": 8570 }, { "epoch": 2.426470588235294, "grad_norm": 4.764692306518555, "learning_rate": 3.8235294117647055e-06, "loss": 0.2238, "step": 8580 }, { "epoch": 2.4292986425339365, "grad_norm": 6.2886881828308105, "learning_rate": 3.8046757164404226e-06, "loss": 0.2258, "step": 8590 }, { "epoch": 2.4321266968325794, "grad_norm": 5.105391502380371, "learning_rate": 3.7858220211161388e-06, "loss": 0.2291, "step": 8600 }, { "epoch": 2.4349547511312215, "grad_norm": 3.7577686309814453, "learning_rate": 3.7669683257918554e-06, "loss": 0.2041, "step": 8610 }, { "epoch": 2.4377828054298645, "grad_norm": 2.689021587371826, "learning_rate": 3.748114630467572e-06, "loss": 0.2576, "step": 8620 }, { "epoch": 2.4406108597285066, "grad_norm": 3.162226438522339, "learning_rate": 3.7292609351432886e-06, "loss": 0.2342, "step": 8630 }, { "epoch": 2.4434389140271495, "grad_norm": 4.014715671539307, "learning_rate": 3.710407239819005e-06, "loss": 0.236, "step": 8640 }, { "epoch": 2.446266968325792, "grad_norm": 5.3587822914123535, "learning_rate": 3.6915535444947214e-06, "loss": 0.2328, "step": 8650 }, { "epoch": 2.4490950226244346, "grad_norm": 7.895315647125244, "learning_rate": 3.672699849170438e-06, "loss": 0.2333, "step": 8660 }, { "epoch": 2.451923076923077, "grad_norm": 8.392569541931152, "learning_rate": 3.653846153846154e-06, "loss": 0.2605, "step": 8670 }, { "epoch": 2.4547511312217196, "grad_norm": 3.8333370685577393, "learning_rate": 3.6349924585218704e-06, "loss": 0.2289, "step": 8680 }, { "epoch": 2.457579185520362, "grad_norm": 7.176278114318848, "learning_rate": 3.616138763197587e-06, "loss": 0.2119, "step": 8690 }, { "epoch": 2.4604072398190047, "grad_norm": 8.778523445129395, "learning_rate": 3.5972850678733032e-06, "loss": 0.2363, "step": 8700 }, { "epoch": 2.463235294117647, "grad_norm": 3.1572511196136475, "learning_rate": 3.57843137254902e-06, "loss": 0.2083, "step": 8710 }, { "epoch": 2.4660633484162897, "grad_norm": 6.948089122772217, "learning_rate": 3.5595776772247365e-06, "loss": 0.2337, "step": 8720 }, { "epoch": 2.4688914027149322, "grad_norm": 7.237654209136963, "learning_rate": 3.540723981900453e-06, "loss": 0.1903, "step": 8730 }, { "epoch": 2.4717194570135748, "grad_norm": 3.5161070823669434, "learning_rate": 3.5218702865761693e-06, "loss": 0.2003, "step": 8740 }, { "epoch": 2.4745475113122173, "grad_norm": 5.7288737297058105, "learning_rate": 3.5030165912518855e-06, "loss": 0.1979, "step": 8750 }, { "epoch": 2.47737556561086, "grad_norm": 6.921863079071045, "learning_rate": 3.484162895927602e-06, "loss": 0.2681, "step": 8760 }, { "epoch": 2.4802036199095023, "grad_norm": 1.5838019847869873, "learning_rate": 3.4653092006033183e-06, "loss": 0.186, "step": 8770 }, { "epoch": 2.483031674208145, "grad_norm": 6.464385986328125, "learning_rate": 3.446455505279035e-06, "loss": 0.2791, "step": 8780 }, { "epoch": 2.4858597285067874, "grad_norm": 4.105411529541016, "learning_rate": 3.4276018099547515e-06, "loss": 0.246, "step": 8790 }, { "epoch": 2.48868778280543, "grad_norm": 5.3756632804870605, "learning_rate": 3.408748114630468e-06, "loss": 0.2344, "step": 8800 }, { "epoch": 2.4915158371040724, "grad_norm": 3.4841089248657227, "learning_rate": 3.3898944193061843e-06, "loss": 0.1978, "step": 8810 }, { "epoch": 2.494343891402715, "grad_norm": 7.188533782958984, "learning_rate": 3.371040723981901e-06, "loss": 0.2737, "step": 8820 }, { "epoch": 2.4971719457013575, "grad_norm": 4.090082168579102, "learning_rate": 3.3521870286576167e-06, "loss": 0.2139, "step": 8830 }, { "epoch": 2.5, "grad_norm": 7.417943000793457, "learning_rate": 3.3333333333333333e-06, "loss": 0.2275, "step": 8840 }, { "epoch": 2.5028280542986425, "grad_norm": 3.605393648147583, "learning_rate": 3.31447963800905e-06, "loss": 0.2446, "step": 8850 }, { "epoch": 2.505656108597285, "grad_norm": 5.961788654327393, "learning_rate": 3.2956259426847666e-06, "loss": 0.2923, "step": 8860 }, { "epoch": 2.5084841628959276, "grad_norm": 4.26703405380249, "learning_rate": 3.2767722473604827e-06, "loss": 0.1962, "step": 8870 }, { "epoch": 2.51131221719457, "grad_norm": 4.207533359527588, "learning_rate": 3.2579185520361994e-06, "loss": 0.1995, "step": 8880 }, { "epoch": 2.5141402714932126, "grad_norm": 4.4618682861328125, "learning_rate": 3.239064856711916e-06, "loss": 0.172, "step": 8890 }, { "epoch": 2.516968325791855, "grad_norm": 5.302677631378174, "learning_rate": 3.2202111613876326e-06, "loss": 0.1653, "step": 8900 }, { "epoch": 2.5197963800904977, "grad_norm": 3.299323558807373, "learning_rate": 3.2013574660633484e-06, "loss": 0.2407, "step": 8910 }, { "epoch": 2.52262443438914, "grad_norm": 6.668271541595459, "learning_rate": 3.182503770739065e-06, "loss": 0.2283, "step": 8920 }, { "epoch": 2.5254524886877827, "grad_norm": 7.668635368347168, "learning_rate": 3.1636500754147816e-06, "loss": 0.2752, "step": 8930 }, { "epoch": 2.5282805429864252, "grad_norm": 1.711267113685608, "learning_rate": 3.1447963800904978e-06, "loss": 0.2136, "step": 8940 }, { "epoch": 2.5311085972850678, "grad_norm": 8.963603019714355, "learning_rate": 3.1259426847662144e-06, "loss": 0.205, "step": 8950 }, { "epoch": 2.5339366515837103, "grad_norm": 2.520670175552368, "learning_rate": 3.107088989441931e-06, "loss": 0.2131, "step": 8960 }, { "epoch": 2.536764705882353, "grad_norm": 8.796506881713867, "learning_rate": 3.0882352941176476e-06, "loss": 0.2969, "step": 8970 }, { "epoch": 2.5395927601809953, "grad_norm": 7.460408687591553, "learning_rate": 3.069381598793364e-06, "loss": 0.2432, "step": 8980 }, { "epoch": 2.542420814479638, "grad_norm": 9.012686729431152, "learning_rate": 3.0505279034690804e-06, "loss": 0.2707, "step": 8990 }, { "epoch": 2.5452488687782804, "grad_norm": 5.107896327972412, "learning_rate": 3.0316742081447962e-06, "loss": 0.2239, "step": 9000 }, { "epoch": 2.5452488687782804, "eval_accuracy": 0.888880050906777, "eval_loss": 0.288782000541687, "eval_runtime": 126.5084, "eval_samples_per_second": 99.377, "eval_steps_per_second": 3.107, "step": 9000 }, { "epoch": 2.5480769230769234, "grad_norm": 2.8435633182525635, "learning_rate": 3.012820512820513e-06, "loss": 0.2544, "step": 9010 }, { "epoch": 2.5509049773755654, "grad_norm": 4.109634876251221, "learning_rate": 2.9939668174962294e-06, "loss": 0.2508, "step": 9020 }, { "epoch": 2.5537330316742084, "grad_norm": 3.3078644275665283, "learning_rate": 2.975113122171946e-06, "loss": 0.2025, "step": 9030 }, { "epoch": 2.5565610859728505, "grad_norm": 6.037450790405273, "learning_rate": 2.9562594268476623e-06, "loss": 0.2347, "step": 9040 }, { "epoch": 2.5593891402714934, "grad_norm": 5.157569408416748, "learning_rate": 2.937405731523379e-06, "loss": 0.2684, "step": 9050 }, { "epoch": 2.5622171945701355, "grad_norm": 2.070380210876465, "learning_rate": 2.9185520361990955e-06, "loss": 0.2217, "step": 9060 }, { "epoch": 2.5650452488687785, "grad_norm": 2.0333659648895264, "learning_rate": 2.899698340874812e-06, "loss": 0.1901, "step": 9070 }, { "epoch": 2.5678733031674206, "grad_norm": 2.8762121200561523, "learning_rate": 2.880844645550528e-06, "loss": 0.2175, "step": 9080 }, { "epoch": 2.5707013574660635, "grad_norm": 3.8669402599334717, "learning_rate": 2.8619909502262445e-06, "loss": 0.2218, "step": 9090 }, { "epoch": 2.5735294117647056, "grad_norm": 5.87692403793335, "learning_rate": 2.843137254901961e-06, "loss": 0.2058, "step": 9100 }, { "epoch": 2.5763574660633486, "grad_norm": 3.9730098247528076, "learning_rate": 2.8242835595776773e-06, "loss": 0.2191, "step": 9110 }, { "epoch": 2.579185520361991, "grad_norm": 3.073633909225464, "learning_rate": 2.805429864253394e-06, "loss": 0.2499, "step": 9120 }, { "epoch": 2.5820135746606336, "grad_norm": 3.6937789916992188, "learning_rate": 2.7865761689291105e-06, "loss": 0.2499, "step": 9130 }, { "epoch": 2.584841628959276, "grad_norm": 4.838074207305908, "learning_rate": 2.767722473604827e-06, "loss": 0.1883, "step": 9140 }, { "epoch": 2.5876696832579187, "grad_norm": 6.562351226806641, "learning_rate": 2.7488687782805433e-06, "loss": 0.2019, "step": 9150 }, { "epoch": 2.590497737556561, "grad_norm": 3.512963056564331, "learning_rate": 2.7300150829562595e-06, "loss": 0.1934, "step": 9160 }, { "epoch": 2.5933257918552037, "grad_norm": 4.1841511726379395, "learning_rate": 2.7111613876319757e-06, "loss": 0.1807, "step": 9170 }, { "epoch": 2.5961538461538463, "grad_norm": 4.239630222320557, "learning_rate": 2.6923076923076923e-06, "loss": 0.2101, "step": 9180 }, { "epoch": 2.598981900452489, "grad_norm": 3.499694585800171, "learning_rate": 2.673453996983409e-06, "loss": 0.2152, "step": 9190 }, { "epoch": 2.6018099547511313, "grad_norm": 3.0219247341156006, "learning_rate": 2.6546003016591256e-06, "loss": 0.2311, "step": 9200 }, { "epoch": 2.604638009049774, "grad_norm": 4.168036937713623, "learning_rate": 2.6357466063348418e-06, "loss": 0.1943, "step": 9210 }, { "epoch": 2.6074660633484164, "grad_norm": 1.4795814752578735, "learning_rate": 2.6168929110105584e-06, "loss": 0.1786, "step": 9220 }, { "epoch": 2.610294117647059, "grad_norm": 1.5753957033157349, "learning_rate": 2.598039215686275e-06, "loss": 0.1892, "step": 9230 }, { "epoch": 2.6131221719457014, "grad_norm": 3.37406325340271, "learning_rate": 2.5791855203619916e-06, "loss": 0.1632, "step": 9240 }, { "epoch": 2.615950226244344, "grad_norm": 4.640278339385986, "learning_rate": 2.5603318250377074e-06, "loss": 0.239, "step": 9250 }, { "epoch": 2.6187782805429864, "grad_norm": 5.864749431610107, "learning_rate": 2.541478129713424e-06, "loss": 0.2349, "step": 9260 }, { "epoch": 2.621606334841629, "grad_norm": 4.219099521636963, "learning_rate": 2.5226244343891406e-06, "loss": 0.2298, "step": 9270 }, { "epoch": 2.6244343891402715, "grad_norm": 6.88966703414917, "learning_rate": 2.503770739064857e-06, "loss": 0.2096, "step": 9280 }, { "epoch": 2.627262443438914, "grad_norm": 3.7265114784240723, "learning_rate": 2.4849170437405734e-06, "loss": 0.1961, "step": 9290 }, { "epoch": 2.6300904977375565, "grad_norm": 3.687527656555176, "learning_rate": 2.46606334841629e-06, "loss": 0.2054, "step": 9300 }, { "epoch": 2.632918552036199, "grad_norm": 5.014760971069336, "learning_rate": 2.4472096530920062e-06, "loss": 0.2425, "step": 9310 }, { "epoch": 2.6357466063348416, "grad_norm": 8.167291641235352, "learning_rate": 2.428355957767723e-06, "loss": 0.2079, "step": 9320 }, { "epoch": 2.638574660633484, "grad_norm": 4.277304649353027, "learning_rate": 2.409502262443439e-06, "loss": 0.2205, "step": 9330 }, { "epoch": 2.6414027149321266, "grad_norm": 5.0269975662231445, "learning_rate": 2.3906485671191556e-06, "loss": 0.2586, "step": 9340 }, { "epoch": 2.644230769230769, "grad_norm": 4.617335796356201, "learning_rate": 2.371794871794872e-06, "loss": 0.2167, "step": 9350 }, { "epoch": 2.6470588235294117, "grad_norm": 3.6927714347839355, "learning_rate": 2.3529411764705885e-06, "loss": 0.2195, "step": 9360 }, { "epoch": 2.649886877828054, "grad_norm": 3.20468807220459, "learning_rate": 2.334087481146305e-06, "loss": 0.2495, "step": 9370 }, { "epoch": 2.6527149321266967, "grad_norm": 4.111125946044922, "learning_rate": 2.3152337858220213e-06, "loss": 0.1675, "step": 9380 }, { "epoch": 2.6555429864253393, "grad_norm": 3.872500419616699, "learning_rate": 2.2963800904977375e-06, "loss": 0.2614, "step": 9390 }, { "epoch": 2.658371040723982, "grad_norm": 5.960339069366455, "learning_rate": 2.277526395173454e-06, "loss": 0.2031, "step": 9400 }, { "epoch": 2.6611990950226243, "grad_norm": 7.735962390899658, "learning_rate": 2.2586726998491707e-06, "loss": 0.2164, "step": 9410 }, { "epoch": 2.664027149321267, "grad_norm": 4.943899154663086, "learning_rate": 2.2398190045248873e-06, "loss": 0.2322, "step": 9420 }, { "epoch": 2.6668552036199094, "grad_norm": 3.7775423526763916, "learning_rate": 2.2209653092006035e-06, "loss": 0.2238, "step": 9430 }, { "epoch": 2.669683257918552, "grad_norm": 6.782299995422363, "learning_rate": 2.2021116138763197e-06, "loss": 0.2141, "step": 9440 }, { "epoch": 2.6725113122171944, "grad_norm": 2.3152804374694824, "learning_rate": 2.1832579185520363e-06, "loss": 0.1729, "step": 9450 }, { "epoch": 2.6753393665158374, "grad_norm": 5.257414817810059, "learning_rate": 2.164404223227753e-06, "loss": 0.1875, "step": 9460 }, { "epoch": 2.6781674208144794, "grad_norm": 5.083720684051514, "learning_rate": 2.145550527903469e-06, "loss": 0.2721, "step": 9470 }, { "epoch": 2.6809954751131224, "grad_norm": 3.5238163471221924, "learning_rate": 2.1266968325791857e-06, "loss": 0.1752, "step": 9480 }, { "epoch": 2.6838235294117645, "grad_norm": 9.12520694732666, "learning_rate": 2.1078431372549023e-06, "loss": 0.2184, "step": 9490 }, { "epoch": 2.6866515837104075, "grad_norm": 3.9677796363830566, "learning_rate": 2.0889894419306185e-06, "loss": 0.2685, "step": 9500 }, { "epoch": 2.6894796380090495, "grad_norm": 8.702911376953125, "learning_rate": 2.0701357466063347e-06, "loss": 0.2143, "step": 9510 }, { "epoch": 2.6923076923076925, "grad_norm": 5.3467841148376465, "learning_rate": 2.0512820512820513e-06, "loss": 0.1466, "step": 9520 }, { "epoch": 2.6951357466063346, "grad_norm": 8.666280746459961, "learning_rate": 2.032428355957768e-06, "loss": 0.2221, "step": 9530 }, { "epoch": 2.6979638009049776, "grad_norm": 4.463994979858398, "learning_rate": 2.0135746606334846e-06, "loss": 0.2115, "step": 9540 }, { "epoch": 2.7007918552036196, "grad_norm": 8.998452186584473, "learning_rate": 1.9947209653092008e-06, "loss": 0.2286, "step": 9550 }, { "epoch": 2.7036199095022626, "grad_norm": 2.3983922004699707, "learning_rate": 1.975867269984917e-06, "loss": 0.1753, "step": 9560 }, { "epoch": 2.706447963800905, "grad_norm": 5.0769524574279785, "learning_rate": 1.9570135746606336e-06, "loss": 0.2029, "step": 9570 }, { "epoch": 2.7092760180995477, "grad_norm": 3.6228933334350586, "learning_rate": 1.93815987933635e-06, "loss": 0.2282, "step": 9580 }, { "epoch": 2.71210407239819, "grad_norm": 7.759435176849365, "learning_rate": 1.919306184012067e-06, "loss": 0.2395, "step": 9590 }, { "epoch": 2.7149321266968327, "grad_norm": 7.777573585510254, "learning_rate": 1.9004524886877828e-06, "loss": 0.1992, "step": 9600 }, { "epoch": 2.7177601809954752, "grad_norm": 4.795551300048828, "learning_rate": 1.8815987933634994e-06, "loss": 0.227, "step": 9610 }, { "epoch": 2.7205882352941178, "grad_norm": 4.623630046844482, "learning_rate": 1.8627450980392158e-06, "loss": 0.2199, "step": 9620 }, { "epoch": 2.7234162895927603, "grad_norm": 1.8060227632522583, "learning_rate": 1.8438914027149324e-06, "loss": 0.2662, "step": 9630 }, { "epoch": 2.726244343891403, "grad_norm": 4.0437798500061035, "learning_rate": 1.8250377073906486e-06, "loss": 0.2106, "step": 9640 }, { "epoch": 2.7290723981900453, "grad_norm": 2.629993200302124, "learning_rate": 1.806184012066365e-06, "loss": 0.2275, "step": 9650 }, { "epoch": 2.731900452488688, "grad_norm": 4.662147045135498, "learning_rate": 1.7873303167420816e-06, "loss": 0.1597, "step": 9660 }, { "epoch": 2.7347285067873304, "grad_norm": 7.3248066902160645, "learning_rate": 1.768476621417798e-06, "loss": 0.2161, "step": 9670 }, { "epoch": 2.737556561085973, "grad_norm": 5.798586845397949, "learning_rate": 1.7496229260935144e-06, "loss": 0.2557, "step": 9680 }, { "epoch": 2.7403846153846154, "grad_norm": 2.832303524017334, "learning_rate": 1.7307692307692308e-06, "loss": 0.2108, "step": 9690 }, { "epoch": 2.743212669683258, "grad_norm": 1.038588047027588, "learning_rate": 1.7119155354449475e-06, "loss": 0.2141, "step": 9700 }, { "epoch": 2.7460407239819005, "grad_norm": 6.463703155517578, "learning_rate": 1.6930618401206639e-06, "loss": 0.219, "step": 9710 }, { "epoch": 2.748868778280543, "grad_norm": 6.210083484649658, "learning_rate": 1.67420814479638e-06, "loss": 0.1784, "step": 9720 }, { "epoch": 2.7516968325791855, "grad_norm": 5.5614848136901855, "learning_rate": 1.6553544494720967e-06, "loss": 0.2848, "step": 9730 }, { "epoch": 2.754524886877828, "grad_norm": 6.321543216705322, "learning_rate": 1.636500754147813e-06, "loss": 0.1904, "step": 9740 }, { "epoch": 2.7573529411764706, "grad_norm": 2.9993443489074707, "learning_rate": 1.6176470588235297e-06, "loss": 0.2348, "step": 9750 }, { "epoch": 2.760180995475113, "grad_norm": 2.8095312118530273, "learning_rate": 1.5987933634992459e-06, "loss": 0.2194, "step": 9760 }, { "epoch": 2.7630090497737556, "grad_norm": 9.010799407958984, "learning_rate": 1.5799396681749623e-06, "loss": 0.274, "step": 9770 }, { "epoch": 2.765837104072398, "grad_norm": 4.045629501342773, "learning_rate": 1.561085972850679e-06, "loss": 0.2011, "step": 9780 }, { "epoch": 2.7686651583710407, "grad_norm": 7.133453845977783, "learning_rate": 1.5422322775263953e-06, "loss": 0.2241, "step": 9790 }, { "epoch": 2.771493212669683, "grad_norm": 4.382336616516113, "learning_rate": 1.5233785822021115e-06, "loss": 0.2694, "step": 9800 }, { "epoch": 2.7743212669683257, "grad_norm": 4.200496673583984, "learning_rate": 1.5045248868778281e-06, "loss": 0.2299, "step": 9810 }, { "epoch": 2.7771493212669682, "grad_norm": 3.4665303230285645, "learning_rate": 1.4856711915535445e-06, "loss": 0.1934, "step": 9820 }, { "epoch": 2.7799773755656108, "grad_norm": 5.625051498413086, "learning_rate": 1.4668174962292611e-06, "loss": 0.2565, "step": 9830 }, { "epoch": 2.7828054298642533, "grad_norm": 0.8546460866928101, "learning_rate": 1.4479638009049775e-06, "loss": 0.159, "step": 9840 }, { "epoch": 2.785633484162896, "grad_norm": 2.4043455123901367, "learning_rate": 1.429110105580694e-06, "loss": 0.1937, "step": 9850 }, { "epoch": 2.7884615384615383, "grad_norm": 5.863745212554932, "learning_rate": 1.4102564102564104e-06, "loss": 0.2213, "step": 9860 }, { "epoch": 2.791289592760181, "grad_norm": 4.6385722160339355, "learning_rate": 1.391402714932127e-06, "loss": 0.263, "step": 9870 }, { "epoch": 2.7941176470588234, "grad_norm": 6.428844928741455, "learning_rate": 1.3725490196078434e-06, "loss": 0.1866, "step": 9880 }, { "epoch": 2.7969457013574663, "grad_norm": 4.29943323135376, "learning_rate": 1.3536953242835596e-06, "loss": 0.2021, "step": 9890 }, { "epoch": 2.7997737556561084, "grad_norm": 3.2437448501586914, "learning_rate": 1.3348416289592762e-06, "loss": 0.1571, "step": 9900 }, { "epoch": 2.8026018099547514, "grad_norm": 3.756850481033325, "learning_rate": 1.3159879336349926e-06, "loss": 0.2301, "step": 9910 }, { "epoch": 2.8054298642533935, "grad_norm": 4.855559825897217, "learning_rate": 1.2971342383107092e-06, "loss": 0.2538, "step": 9920 }, { "epoch": 2.8082579185520364, "grad_norm": 4.502439498901367, "learning_rate": 1.2782805429864254e-06, "loss": 0.1862, "step": 9930 }, { "epoch": 2.8110859728506785, "grad_norm": 6.242438793182373, "learning_rate": 1.2594268476621418e-06, "loss": 0.145, "step": 9940 }, { "epoch": 2.8139140271493215, "grad_norm": 6.00593900680542, "learning_rate": 1.2405731523378584e-06, "loss": 0.2097, "step": 9950 }, { "epoch": 2.8167420814479636, "grad_norm": 10.398560523986816, "learning_rate": 1.2217194570135748e-06, "loss": 0.2605, "step": 9960 }, { "epoch": 2.8195701357466065, "grad_norm": 4.909145832061768, "learning_rate": 1.2028657616892912e-06, "loss": 0.2473, "step": 9970 }, { "epoch": 2.8223981900452486, "grad_norm": 4.879443168640137, "learning_rate": 1.1840120663650076e-06, "loss": 0.2071, "step": 9980 }, { "epoch": 2.8252262443438916, "grad_norm": 5.404385566711426, "learning_rate": 1.165158371040724e-06, "loss": 0.2138, "step": 9990 }, { "epoch": 2.8280542986425337, "grad_norm": 3.4741604328155518, "learning_rate": 1.1463046757164404e-06, "loss": 0.2502, "step": 10000 }, { "epoch": 2.8280542986425337, "eval_accuracy": 0.888880050906777, "eval_loss": 0.2902699112892151, "eval_runtime": 126.3428, "eval_samples_per_second": 99.507, "eval_steps_per_second": 3.111, "step": 10000 }, { "epoch": 2.8308823529411766, "grad_norm": 4.25987434387207, "learning_rate": 1.127450980392157e-06, "loss": 0.2242, "step": 10010 }, { "epoch": 2.833710407239819, "grad_norm": 7.45045280456543, "learning_rate": 1.1085972850678732e-06, "loss": 0.2791, "step": 10020 }, { "epoch": 2.8365384615384617, "grad_norm": 4.844043254852295, "learning_rate": 1.0897435897435899e-06, "loss": 0.2357, "step": 10030 }, { "epoch": 2.839366515837104, "grad_norm": 5.769428253173828, "learning_rate": 1.0708898944193063e-06, "loss": 0.2296, "step": 10040 }, { "epoch": 2.8421945701357467, "grad_norm": 7.023671627044678, "learning_rate": 1.0520361990950227e-06, "loss": 0.2318, "step": 10050 }, { "epoch": 2.8450226244343892, "grad_norm": 3.501164436340332, "learning_rate": 1.033182503770739e-06, "loss": 0.2456, "step": 10060 }, { "epoch": 2.8478506787330318, "grad_norm": 9.939863204956055, "learning_rate": 1.0143288084464557e-06, "loss": 0.2474, "step": 10070 }, { "epoch": 2.8506787330316743, "grad_norm": 5.502429962158203, "learning_rate": 9.954751131221719e-07, "loss": 0.2381, "step": 10080 }, { "epoch": 2.853506787330317, "grad_norm": 5.186315536499023, "learning_rate": 9.766214177978885e-07, "loss": 0.2141, "step": 10090 }, { "epoch": 2.8563348416289593, "grad_norm": 11.375553131103516, "learning_rate": 9.57767722473605e-07, "loss": 0.2459, "step": 10100 }, { "epoch": 2.859162895927602, "grad_norm": 4.658810615539551, "learning_rate": 9.389140271493213e-07, "loss": 0.1952, "step": 10110 }, { "epoch": 2.8619909502262444, "grad_norm": 2.7533957958221436, "learning_rate": 9.200603318250378e-07, "loss": 0.2113, "step": 10120 }, { "epoch": 2.864819004524887, "grad_norm": 2.1169681549072266, "learning_rate": 9.012066365007542e-07, "loss": 0.198, "step": 10130 }, { "epoch": 2.8676470588235294, "grad_norm": 5.239007472991943, "learning_rate": 8.823529411764707e-07, "loss": 0.1799, "step": 10140 }, { "epoch": 2.870475113122172, "grad_norm": 9.836233139038086, "learning_rate": 8.634992458521871e-07, "loss": 0.2345, "step": 10150 }, { "epoch": 2.8733031674208145, "grad_norm": 2.392709970474243, "learning_rate": 8.446455505279036e-07, "loss": 0.2205, "step": 10160 }, { "epoch": 2.876131221719457, "grad_norm": 2.652374267578125, "learning_rate": 8.257918552036199e-07, "loss": 0.155, "step": 10170 }, { "epoch": 2.8789592760180995, "grad_norm": 8.32434368133545, "learning_rate": 8.069381598793364e-07, "loss": 0.2262, "step": 10180 }, { "epoch": 2.881787330316742, "grad_norm": 5.847408771514893, "learning_rate": 7.880844645550528e-07, "loss": 0.2315, "step": 10190 }, { "epoch": 2.8846153846153846, "grad_norm": 2.832589864730835, "learning_rate": 7.692307692307694e-07, "loss": 0.2618, "step": 10200 }, { "epoch": 2.887443438914027, "grad_norm": 4.295781135559082, "learning_rate": 7.503770739064857e-07, "loss": 0.2084, "step": 10210 }, { "epoch": 2.8902714932126696, "grad_norm": 5.640412330627441, "learning_rate": 7.315233785822022e-07, "loss": 0.2891, "step": 10220 }, { "epoch": 2.893099547511312, "grad_norm": 7.115928649902344, "learning_rate": 7.126696832579186e-07, "loss": 0.2689, "step": 10230 }, { "epoch": 2.8959276018099547, "grad_norm": 4.832301139831543, "learning_rate": 6.938159879336351e-07, "loss": 0.1793, "step": 10240 }, { "epoch": 2.898755656108597, "grad_norm": 5.678529262542725, "learning_rate": 6.749622926093515e-07, "loss": 0.1599, "step": 10250 }, { "epoch": 2.9015837104072397, "grad_norm": 6.394534587860107, "learning_rate": 6.56108597285068e-07, "loss": 0.2382, "step": 10260 }, { "epoch": 2.9044117647058822, "grad_norm": 5.185941219329834, "learning_rate": 6.372549019607843e-07, "loss": 0.1801, "step": 10270 }, { "epoch": 2.9072398190045248, "grad_norm": 3.3339009284973145, "learning_rate": 6.184012066365008e-07, "loss": 0.2108, "step": 10280 }, { "epoch": 2.9100678733031673, "grad_norm": 4.131908416748047, "learning_rate": 5.995475113122173e-07, "loss": 0.1994, "step": 10290 }, { "epoch": 2.91289592760181, "grad_norm": 5.131499290466309, "learning_rate": 5.806938159879337e-07, "loss": 0.2334, "step": 10300 }, { "epoch": 2.9157239819004523, "grad_norm": 6.0886712074279785, "learning_rate": 5.618401206636501e-07, "loss": 0.1895, "step": 10310 }, { "epoch": 2.918552036199095, "grad_norm": 6.050991058349609, "learning_rate": 5.429864253393665e-07, "loss": 0.2218, "step": 10320 }, { "epoch": 2.9213800904977374, "grad_norm": 5.902265548706055, "learning_rate": 5.24132730015083e-07, "loss": 0.2539, "step": 10330 }, { "epoch": 2.9242081447963804, "grad_norm": 2.757305860519409, "learning_rate": 5.052790346907994e-07, "loss": 0.1548, "step": 10340 }, { "epoch": 2.9270361990950224, "grad_norm": 2.187263011932373, "learning_rate": 4.864253393665158e-07, "loss": 0.164, "step": 10350 }, { "epoch": 2.9298642533936654, "grad_norm": 6.808703899383545, "learning_rate": 4.675716440422323e-07, "loss": 0.2073, "step": 10360 }, { "epoch": 2.9326923076923075, "grad_norm": 7.97061014175415, "learning_rate": 4.4871794871794876e-07, "loss": 0.2322, "step": 10370 }, { "epoch": 2.9355203619909505, "grad_norm": 6.372758865356445, "learning_rate": 4.298642533936652e-07, "loss": 0.2257, "step": 10380 }, { "epoch": 2.9383484162895925, "grad_norm": 6.094609260559082, "learning_rate": 4.110105580693816e-07, "loss": 0.2583, "step": 10390 }, { "epoch": 2.9411764705882355, "grad_norm": 2.3653512001037598, "learning_rate": 3.921568627450981e-07, "loss": 0.2137, "step": 10400 }, { "epoch": 2.9440045248868776, "grad_norm": 2.020627737045288, "learning_rate": 3.733031674208145e-07, "loss": 0.1286, "step": 10410 }, { "epoch": 2.9468325791855206, "grad_norm": 8.650784492492676, "learning_rate": 3.5444947209653094e-07, "loss": 0.2233, "step": 10420 }, { "epoch": 2.9496606334841626, "grad_norm": 4.553081512451172, "learning_rate": 3.355957767722474e-07, "loss": 0.1869, "step": 10430 }, { "epoch": 2.9524886877828056, "grad_norm": 3.9334750175476074, "learning_rate": 3.167420814479638e-07, "loss": 0.2109, "step": 10440 }, { "epoch": 2.955316742081448, "grad_norm": 10.762858390808105, "learning_rate": 2.978883861236803e-07, "loss": 0.2151, "step": 10450 }, { "epoch": 2.9581447963800906, "grad_norm": 6.37054967880249, "learning_rate": 2.790346907993967e-07, "loss": 0.2099, "step": 10460 }, { "epoch": 2.960972850678733, "grad_norm": 4.642254829406738, "learning_rate": 2.6018099547511317e-07, "loss": 0.1753, "step": 10470 }, { "epoch": 2.9638009049773757, "grad_norm": 4.995135307312012, "learning_rate": 2.4132730015082957e-07, "loss": 0.1708, "step": 10480 }, { "epoch": 2.966628959276018, "grad_norm": 5.875439643859863, "learning_rate": 2.2247360482654603e-07, "loss": 0.2445, "step": 10490 }, { "epoch": 2.9694570135746607, "grad_norm": 0.8066132664680481, "learning_rate": 2.0361990950226246e-07, "loss": 0.1628, "step": 10500 }, { "epoch": 2.9722850678733033, "grad_norm": 9.494747161865234, "learning_rate": 1.847662141779789e-07, "loss": 0.1322, "step": 10510 }, { "epoch": 2.975113122171946, "grad_norm": 6.8470001220703125, "learning_rate": 1.6591251885369535e-07, "loss": 0.2345, "step": 10520 }, { "epoch": 2.9779411764705883, "grad_norm": 5.916505813598633, "learning_rate": 1.4705882352941178e-07, "loss": 0.2001, "step": 10530 }, { "epoch": 2.980769230769231, "grad_norm": 6.173225402832031, "learning_rate": 1.282051282051282e-07, "loss": 0.2196, "step": 10540 }, { "epoch": 2.9835972850678734, "grad_norm": 4.780458927154541, "learning_rate": 1.0935143288084465e-07, "loss": 0.2821, "step": 10550 }, { "epoch": 2.986425339366516, "grad_norm": 2.4266226291656494, "learning_rate": 9.04977375565611e-08, "loss": 0.2189, "step": 10560 }, { "epoch": 2.9892533936651584, "grad_norm": 3.3179211616516113, "learning_rate": 7.164404223227753e-08, "loss": 0.2647, "step": 10570 }, { "epoch": 2.992081447963801, "grad_norm": 4.998234272003174, "learning_rate": 5.279034690799398e-08, "loss": 0.1746, "step": 10580 }, { "epoch": 2.9949095022624435, "grad_norm": 5.929104804992676, "learning_rate": 3.393665158371041e-08, "loss": 0.1786, "step": 10590 }, { "epoch": 2.997737556561086, "grad_norm": 4.823156356811523, "learning_rate": 1.5082956259426848e-08, "loss": 0.217, "step": 10600 } ], "logging_steps": 10, "max_steps": 10608, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.132054385068024e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }